k-means

https://www.cnblogs.com/zy230530/p/7029025.html

k-means算法中的k表示将数据聚为k个簇,means表示取每个簇中数据的均值作为该簇的中心(质心),即用每个簇的质心来描述该簇。k-means算法的原理比较简单,但它有缺陷:可能收敛到局部最优解(局部最优的效果不如全局最优),且在大规模数据集上收敛速度相对较慢。换句话说,k-means是一种结果受初始质心影响、只能保证收敛到局部最优的迭代算法。伪代码如下:

创建k个初始值作为初始质心(要位于数据边界内)
    while 任意一个点的簇分配结果发生改变:
        遍历数据集中的每一个点:
            遍历k个质心:
                计算质心与数据点之间的距离
            将数据点分配到距离其最近的质心的簇
        遍历k个簇:
            计算每个簇中所有点的均值
            得到的k个均值更新为新的质心

 https://www.cnblogs.com/pinard/p/6169370.html

R语言代码
library(tidyverse)
library(corrplot)
library(gridExtra)
library(GGally)
library(knitr)
# Load the wine data set
wines <- read.csv("/home/zwt/PycharmProjects/test/data/wine.data")
# Keep columns 2-14 only; the first column is the wine class label
wines <- wines[, 2:14]
# Quick inspection: first rows, last rows, per-column summary, structure
head(wines)
tail(wines)
summary(wines)
str(wines)
# Histogram of every attribute, one facet per attribute
wines %>%
  # pivot_longer() supersedes gather(); it yields the same
  # Attributes/value pairs, one row per (observation, attribute)
  pivot_longer(cols = 1:13, names_to = "Attributes", values_to = "value") %>%
  ggplot(aes(x = value, fill = Attributes)) +
  # bins = 30 is the geom_histogram() default; stating it explicitly keeps
  # the plot unchanged and avoids the "using `bins = 30`" message
  geom_histogram(bins = 30, colour = "black", show.legend = FALSE) +
  # Each attribute has its own value range, so free the x axis per facet
  facet_wrap(~Attributes, scales = "free_x") +
  labs(
    x = "Values", y = "Frequency",
    title = "Wines Attributes - Histograms"
  ) +
  theme_bw()
# Density plot of every attribute, one facet per attribute
wines %>%
  # pivot_longer() supersedes gather(); it yields the same
  # Attributes/value pairs, one row per (observation, attribute)
  pivot_longer(cols = 1:13, names_to = "Attributes", values_to = "value") %>%
  ggplot(aes(x = value, fill = Attributes)) +
  geom_density(colour = "black", alpha = 0.5, show.legend = FALSE) +
  # Each attribute has its own value range, so free the x axis per facet
  facet_wrap(~Attributes, scales = "free_x") +
  labs(
    x = "Values", y = "Density",
    title = "Wines Attributes - Density plots"
  ) +
  theme_bw()
# Boxplots of the attributes in columns 1-4 and 6-12, ordered by median
wines %>%
  # pivot_longer() supersedes gather(); columns 5 and 13 are left out of
  # the reshape and therefore out of the plot
  pivot_longer(
    cols = c(1:4, 6:12), names_to = "Attributes", values_to = "values"
  ) %>%
  ggplot(aes(
    x = reorder(Attributes, values, FUN = median),
    y = values,
    fill = Attributes
  )) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Wines Attributes - Boxplots") +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  ) +
  # Zoom via the coordinate system instead of ylim(): scale limits discard
  # observations outside 0-50 *before* the boxplot statistics are computed,
  # whereas coordinate limits only crop the drawn area
  coord_flip(ylim = c(0, 50))
# Correlation matrix of all attributes, drawn as ellipses (upper triangle)
corrplot(cor(wines), method = "ellipse", type = "upper", tl.cex = 0.9)
# Scatter plot of Total_Phenols against Flavanoids with a linear fit
# (se = FALSE: no confidence band)
ggplot(data = wines, mapping = aes(Total_Phenols, Flavanoids)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm") +
  ggtitle(
    "Wines Attributes",
    subtitle = "Relationship between Phenols and Flavanoids"
  ) +
  theme_bw()
# Normalise (standardise) every column with scale()
winesNorm <- as.data.frame(scale(wines))
# Original data (excerpt: two attributes only)
p1 <- ggplot(data = wines, mapping = aes(Alcohol, Malic_Acid)) +
  geom_point() +
  ggtitle("Original data") +
  theme_bw()
# The same two attributes after normalisation
p2 <- ggplot(data = winesNorm, mapping = aes(Alcohol, Malic_Acid)) +
  geom_point() +
  ggtitle("Normalized data") +
  theme_bw()
# Show both plots side by side
grid.arrange(p1, p2, ncol = 2)
# k-means with k = 2 on the normalised data; the seed fixes the random
# initial centres so the run is reproducible
set.seed(6666)
wines_k2 <- kmeans(winesNorm, centers = 2)
# Pairwise scatter plots of the first six original attributes, coloured by
# the assigned cluster
ggpairs(
  cbind(wines, Cluster = factor(wines_k2$cluster)),
  mapping = aes(colour = Cluster, alpha = 0.5),
  columns = 1:6,
  upper = list(continuous = "blank"),
  lower = list(continuous = "points"),
  axisLabels = "none",
  switch = "both"
) +
  theme_bw()
# Cluster each observation was assigned to
wines_k2$cluster
# Matrix of cluster centres
wines_k2$centers
# Number of observations in each cluster
wines_k2$size
# Between-cluster sum of squares
wines_k2$betweenss
# Within-cluster sum of squares, one value per cluster
wines_k2$withinss
# Total within-cluster sum of squares
wines_k2$tot.withinss
# Total sum of squares
wines_k2$totss
# Between-cluster (bss) and total within-cluster (wss) sums of squares for
# k = 1..10. Preallocated so the vectors are not grown inside the loop.
bss <- numeric(10)
wss <- numeric(10)

# Fix the random initial centres so the whole sweep is reproducible
set.seed(6666)

for (i in 1:10) {
  # Fit once per k and read both statistics from the same fit. Two separate
  # kmeans() calls start from different random centres, so their betweenss
  # and tot.withinss could describe different partitions (and each k would
  # be fitted twice).
  fit <- kmeans(winesNorm, centers = i)
  bss[i] <- fit$betweenss
  wss[i] <- fit$tot.withinss
}

# Between-cluster sum of squares vs choice of k.
# Built with ggplot() because qplot() is deprecated in current ggplot2; the
# layers (points + line), axis labels and breaks are unchanged.
p3 <- ggplot(
  data.frame(k = seq_along(bss), ss = bss),
  aes(x = k, y = ss)
) +
  geom_point() +
  geom_line() +
  labs(x = "Number of clusters", y = "Between-cluster sum of squares") +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  theme_bw()

# Total within-cluster sum of squares vs choice of k
p4 <- ggplot(
  data.frame(k = seq_along(wss), ss = wss),
  aes(x = k, y = ss)
) +
  geom_point() +
  geom_line() +
  labs(x = "Number of clusters", y = "Total within-cluster sum of squares") +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  theme_bw()

# Show both curves side by side
grid.arrange(p3, p4, ncol = 2)
# k-means with k = 3 on the normalised data; the seed fixes the random
# initial centres so the run is reproducible
set.seed(6666)
wines_k3 <- kmeans(winesNorm, centers = 3)
# Mean of every original (un-normalised) attribute within each cluster
aggregate(wines, by = list(wines_k3$cluster), FUN = mean)
# Pairwise scatter plots of the first six original attributes, coloured by
# the assigned cluster
ggpairs(
  cbind(wines, Cluster = factor(wines_k3$cluster)),
  mapping = aes(colour = Cluster, alpha = 0.5),
  columns = 1:6,
  upper = list(continuous = "blank"),
  lower = list(continuous = "points"),
  axisLabels = "none",
  switch = "both"
) +
  theme_bw()

 

 

posted @ 2019-04-17 20:25  zwtzz  阅读(2704)  评论(0编辑  收藏  举报