k-means
https://www.cnblogs.com/zy230530/p/7029025.html
k-means算法中的k表示将数据聚成k个簇,means表示取每个簇中所有数据点的均值作为该簇的中心(质心),即用每个簇的质心来描述该簇。k-means算法的原理比较简单,但也有缺陷:它可能收敛到局部最优解(局部最优的效果不如全局最优),并且在大规模数据集上收敛速度相对较慢。换句话说,k-means是一种受初始质心影响、只能保证收敛到局部最优的迭代算法。伪代码实现:
创建k个初始值作为初始质心(要位于数据边界内)
while 任意一个点的簇分配结果发生改变:
    遍历数据集中的每一个点:
        遍历k个质心:
            计算质心与数据点之间的距离
        将数据点分配到距离其最近的质心的簇
    遍历k个簇:
        计算每个簇中所有点的均值
        得到的k个均值更新为新的质心
https://www.cnblogs.com/pinard/p/6169370.html
R语言代码
library(tidyverse)
library(corrplot)
library(gridExtra)
library(GGally)
library(knitr)
# Load the wine data set and keep only the 13 attribute columns.
# NOTE(review): later plots refer to columns by name (e.g. Alcohol,
# Malic_Acid), so the file is assumed to carry a header row - confirm.
wines <- read.csv("/home/zwt/PycharmProjects/test/data/wine.data")
wines <- wines[, 2:14]  # drop column 1 (the wine class label)

# Quick inspection: first rows, last rows, per-column summary, structure.
head(wines)
tail(wines)
summary(wines)
str(wines)
# Histogram of every attribute, one facet per attribute.
# The 13 columns are stacked into long form (attribute name + value) so that
# a single ggplot call can facet over them. pivot_longer() is used in place
# of the superseded gather(). bins = 30 spells out the value geom_histogram()
# would otherwise pick by default while printing a message on every run.
wines %>%
  pivot_longer(cols = 1:13, names_to = "Attributes", values_to = "value") %>%
  ggplot(aes(x = value, fill = Attributes)) +
  geom_histogram(bins = 30, colour = "black", show.legend = FALSE) +
  # Each attribute has its own value range, so the x axis is freed per facet.
  facet_wrap(~Attributes, scales = "free_x") +
  labs(
    x = "Values", y = "Frequency",
    title = "Wines Attributes - Histograms"
  ) +
  theme_bw()
# Density plot of every attribute, one facet per attribute.
# The 13 columns are stacked into long form (attribute name + value);
# pivot_longer() is used in place of the superseded gather().
wines %>%
  pivot_longer(cols = 1:13, names_to = "Attributes", values_to = "value") %>%
  ggplot(aes(x = value, fill = Attributes)) +
  geom_density(colour = "black", alpha = 0.5, show.legend = FALSE) +
  # Each attribute has its own value range, so the x axis is freed per facet.
  facet_wrap(~Attributes, scales = "free_x") +
  labs(
    x = "Values", y = "Density",
    title = "Wines Attributes - Density plots"
  ) +
  theme_bw()
# Box plots of the attributes, ordered by their median and drawn horizontally.
# Only columns 1-4 and 6-12 are stacked; columns 5 and 13 are left out
# (presumably because their value range is far larger than the rest - confirm).
# pivot_longer() is used in place of the superseded gather().
wines %>%
  pivot_longer(
    cols = c(1:4, 6:12),
    names_to = "Attributes",
    values_to = "values"
  ) %>%
  ggplot(aes(
    x = reorder(Attributes, values, FUN = median),
    y = values,
    fill = Attributes
  )) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Wines Attributes - Boxplots") +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  ) +
  # ylim() sets scale limits: values outside [0, 50] would be dropped before
  # the box statistics are computed, not merely hidden.
  ylim(0, 50) +
  coord_flip()
# Correlation matrix of all attributes, drawn as ellipses in the upper
# triangle.
wines %>%
  cor() %>%
  corrplot(type = "upper", method = "ellipse", tl.cex = 0.9)

# Scatter plot of Total_Phenols against Flavanoids with a fitted straight
# line (lm) and no confidence band.
wines %>%
  ggplot(aes(x = Total_Phenols, y = Flavanoids)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle(
    "Wines Attributes",
    subtitle = "Relationship between Phenols and Flavanoids"
  ) +
  theme_bw()
# Standardise every attribute with scale() (default centring and scaling)
# and turn the resulting matrix back into a data frame.
winesNorm <- wines %>%
  scale() %>%
  as.data.frame()

# Alcohol vs Malic_Acid on the original data ...
p1 <- wines %>%
  ggplot(aes(x = Alcohol, y = Malic_Acid)) +
  geom_point() +
  ggtitle("Original data") +
  theme_bw()

# ... and the same two attributes after standardisation.
p2 <- winesNorm %>%
  ggplot(aes(x = Alcohol, y = Malic_Acid)) +
  geom_point() +
  ggtitle("Normalized data") +
  theme_bw()

# Show both scatter plots side by side.
grid.arrange(grobs = list(p1, p2), ncol = 2)
# k-means with two clusters on the standardised data. The seed is fixed
# because kmeans() chooses its starting centres at random.
set.seed(6666)
wines_k2 <- kmeans(winesNorm, centers = 2)

# Pairwise scatter plots of the first six original attributes, coloured by
# the assigned cluster.
wines %>%
  cbind(Cluster = as.factor(wines_k2$cluster)) %>%
  ggpairs(
    mapping = aes(colour = Cluster, alpha = 0.5),
    columns = 1:6,
    lower = list(continuous = "points"),
    upper = list(continuous = "blank"),
    axisLabels = "none",
    switch = "both"
  ) +
  theme_bw()

# Cluster assigned to each observation
wines_k2$cluster
# Matrix of cluster centres
wines_k2$centers
# Number of observations in each cluster
wines_k2$size
# Between-cluster sum of squares
wines_k2$betweenss
# Within-cluster sum of squares, one value per cluster
wines_k2$withinss
# Total within-cluster sum of squares
wines_k2$tot.withinss
# Total sum of squares
wines_k2$totss
# Elbow analysis: fit k-means for k = 1..10 and record, for each k, the
# between-cluster sum of squares (bss) and the total within-cluster sum of
# squares (wss).
k_values <- 1:10
bss <- numeric(length(k_values))
wss <- numeric(length(k_values))

set.seed(6666)
for (i in seq_along(k_values)) {
  # Fit once per k and read both statistics from that single fit. kmeans()
  # starts from random centres, so two separate calls can converge to
  # different partitions; bss and wss would then describe different
  # clusterings (and the work would be done twice).
  fit <- kmeans(winesNorm, centers = k_values[i])
  bss[i] <- fit$betweenss
  wss[i] <- fit$tot.withinss
}

elbow <- data.frame(k = k_values, bss = bss, wss = wss)

# Between-cluster sum of squares vs choice of k.
# ggplot() is used instead of qplot(), which is deprecated in recent ggplot2.
p3 <- ggplot(elbow, aes(x = k, y = bss)) +
  geom_point() +
  geom_line() +
  labs(x = "Number of clusters", y = "Between-cluster sum of squares") +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  theme_bw()

# Total within-cluster sum of squares vs choice of k.
p4 <- ggplot(elbow, aes(x = k, y = wss)) +
  geom_point() +
  geom_line() +
  labs(x = "Number of clusters", y = "Total within-cluster sum of squares") +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  theme_bw()

# Show both curves side by side.
grid.arrange(p3, p4, ncol = 2)
# k-means with three clusters on the standardised data. The seed is fixed
# because kmeans() chooses its starting centres at random.
set.seed(6666)
wines_k3 <- kmeans(winesNorm, centers = 3)

# Mean of every original (unscaled) attribute within each cluster.
aggregate(wines, by = list(wines_k3$cluster), FUN = mean)

# Pairwise scatter plots of the first six original attributes, coloured by
# the assigned cluster.
wines %>%
  cbind(Cluster = as.factor(wines_k3$cluster)) %>%
  ggpairs(
    mapping = aes(colour = Cluster, alpha = 0.5),
    columns = 1:6,
    lower = list(continuous = "points"),
    upper = list(continuous = "blank"),
    axisLabels = "none",
    switch = "both"
  ) +
  theme_bw()