k-means二维向量版(C#)
以下代码以最简单的二维向量演示了k-means的处理流程
有一堆二维向量<x1,y1>,<x2,y2>..<xn,yn>,聚成两个类
聚类中心就是类中所有向量的均值<xm,ym>, xm = (x1+x2+..+xn)/2, ym = (y1+y2+..+yn)/2
每次聚类都重新计算各个向量和聚类中心的距离(用欧氏距离),将各个向量分配到离各自最近的类
然后比较本次的聚类中心是否与上次相等,如果相等,即表示所有类已经平衡,聚类结束.
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
using System;
using System.Collections.Generic;
using System.Text;
using System.Drawing;
namespace Clustering.Demo
{
/// <summary>
/// 利用点坐标(二维向量)来演示k-means算法
/// </summary>
class Kmeans
{
/// <summary>
/// 点的坐标数组(coordinates)
/// </summary>
private static readonly int[,] coords = new int[,]
{
{ 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 2 }, { 6, 6 }, { 7, 6 },
{ 8, 6 }, { 6, 7 }, { 7, 7 }, { 8, 7 }, { 9, 7 }, { 7, 8 }, { 8, 8 }, { 9, 8 }, { 8, 9 }, { 9, 9 },
//{ 12, 12 }, { 12, 13 }, { 13, 14 }, { 14, 14 }, { 14, 15 }, { 15, 16 }, { 16, 14 }, { 17, 15 }, { 16, 18 }, { 16, 17 },
};
/// <summary>
/// 聚类的数量
/// </summary>
private static readonly int k = 2;
/// <summary>
///
/// </summary>
public static void Cluster()
{
Point[] points = GetPoints(); //获取原始数据点
PointF[] means = new PointF[k]; //当前的聚类中心
PointF[] prevMeans = new PointF[k]; //之前的聚类中心
int[] pointAssigns = new int[points.Length]; //每个点所属的类
InitMeans(points, k, means); //随机选k个点作为聚类中心
PrintInit(points, k, means); //打印初始数据
int iter = 0; //迭代次数(iteration times)
while (true)
{
Classify(points, k, means, pointAssigns); //将每个点与各个聚类中心对比进行归类划分
int conv = 0; //收敛类数(convergent clusters)
for (int j = 0; j < k; j++)
{
means[j] = CalcMeans(points, pointAssigns, j); //重新计算聚类中心
if (Compare(means[j], prevMeans[j]))
conv++;
else
prevMeans[j] = means[j];
}
if (conv == k) //新旧聚类中心一样表示该聚类已收敛至平衡
{
break;
}
iter++;
PrintIter(points, k, means, pointAssigns, iter); //打印迭代数据
}
}
/// <summary>
/// 初始化点数组(根据自定义坐标)
/// </summary>
private static Point[] GetPoints()
{
int len = coords.GetLength(0);
Point[] points = new Point[len];
for (int i = 0; i < len; i++)
{
points[i] = new Point(coords[i, 0], coords[i, 1]);
}
return points;
}
/// <summary>
/// 随机选k个点作为聚类中心
/// </summary>
private static void InitMeans(Point[] points, int k, PointF[] means)
{
Random random = new Random();
for (int i = 0; i < k; i++)
{
means[i] = points[random.Next(points.Length)];
}
}
/// <summary>
/// 将每个点与各个聚类中心对比进行归类划分
/// </summary>
private static void Classify(Point[] points, int k, PointF[] means, int[] pointAssigns)
{
for (int i = 0; i < points.Length; i++)
{
double minDist = double.MaxValue; //最短距离
for (int j = 0; j < k; j++)
{
double dist = Distance(points[i], means[j]);
if (dist < minDist)
{
minDist = dist;
pointAssigns[i] = j;
}
}
//Console.WriteLine("{0}归入类{1}", points[i], pointAssigns[i]);
}
}
#region 欧氏距离(Euclidean distance)
/// <summary>
/// 计算欧氏距离(勾股定理的多维空间扩展)
/// </summary>
/// <remarks>
/// 勾股定理: x^2 + y^2 = z^2
/// 欧氏距离: distance = sqrt(x1^2 + x2^2 + ... + xn^2)
/// 欧氏距离的变换式: x1^2 + x2^2 + ... + xn^2 = distance^2
///
/// 二维空间的两点p1(x1,y1)和p(x2,y2),两点间距离dist=sqrt((x1-x2)^2+(y1-y2)^2)
/// </remarks>
private static double Distance(PointF p1, PointF p2)
{
double pow2X = Math.Pow(p1.X - p2.X, 2);
double pow2Y = Math.Pow(p1.Y - p2.Y, 2);
return Math.Sqrt(pow2X + pow2Y);
}
#endregion
/// <summary>
/// 计算新的聚类中心(均值)
/// </summary>
/// <remarks>
/// meanX = (x1 + x2 + ... + xn) / n
/// meanY = (y1 + y2 + ... + yn) / n
/// </remarks>
private static PointF CalcMeans(Point[] points, int[] pointAssigns, int j)
{
PointF mean = new PointF();
int n = 0;
for (int i = 0; i < points.Length; i++)
{
if (pointAssigns[i] == j)
{
mean.X += points[i].X;
mean.Y += points[i].Y;
n++;
}
}
mean.X /= (float)n;
mean.Y /= (float)n;
return mean;
}
/// <summary>
/// 比较两个聚类中心是否相等
/// </summary>
private static bool Compare(PointF a, PointF b)
{
if (((int)(a.X * 10) == (int)(b.X * 10)) && ((int)(a.Y * 10) == (int)(b.Y * 10)))
{
return true;
}
else
{
return false;
}
}
#region 打印数据(print datas)
/// <summary>
/// 打印初始数据
/// </summary>
private static void PrintInit(Point[] points, int k, PointF[] means)
{
Console.WriteLine("总共{0}个样本:", points.Length);
for (int i = 0; i < points.Length; i++)
{
Console.WriteLine("{0}, {1}", points[i].X, points[i].Y);
}
Console.WriteLine("\n初始化时随机选取k个样本作为聚类中心:");
for (int i = 0; i < k; i++)
{
Console.WriteLine("聚类{0}的中心: {1}, {2}", i, means[i].X, means[i].Y);
}
}
/// <summary>
/// 打印迭代数据
/// </summary>
private static void PrintIter(Point[] points, int k, PointF[] means, int[] pointAssigns, int iter)
{
Console.WriteLine("\n\n--------第{0}次迭代的结果--------", iter);
for (int j = 0; j < k; j++)
{
Console.WriteLine("\n第{0}个类的成员:", j);
for (int i = 0; i < points.Length; i++)
{
if (pointAssigns[i] == j)
{
Console.WriteLine("{0}, {1}", points[i].X, points[i].Y);
}
}
Console.WriteLine("均值(聚类中心): {0}, {1}", means[j].X, means[j].Y);
}
}
#endregion
}
}
using System.Collections.Generic;
using System.Text;
using System.Drawing;
namespace Clustering.Demo
{
/// <summary>
/// 利用点坐标(二维向量)来演示k-means算法
/// </summary>
class Kmeans
{
/// <summary>
/// 点的坐标数组(coordinates)
/// </summary>
private static readonly int[,] coords = new int[,]
{
{ 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 2 }, { 6, 6 }, { 7, 6 },
{ 8, 6 }, { 6, 7 }, { 7, 7 }, { 8, 7 }, { 9, 7 }, { 7, 8 }, { 8, 8 }, { 9, 8 }, { 8, 9 }, { 9, 9 },
//{ 12, 12 }, { 12, 13 }, { 13, 14 }, { 14, 14 }, { 14, 15 }, { 15, 16 }, { 16, 14 }, { 17, 15 }, { 16, 18 }, { 16, 17 },
};
/// <summary>
/// 聚类的数量
/// </summary>
private static readonly int k = 2;
/// <summary>
///
/// </summary>
public static void Cluster()
{
Point[] points = GetPoints(); //获取原始数据点
PointF[] means = new PointF[k]; //当前的聚类中心
PointF[] prevMeans = new PointF[k]; //之前的聚类中心
int[] pointAssigns = new int[points.Length]; //每个点所属的类
InitMeans(points, k, means); //随机选k个点作为聚类中心
PrintInit(points, k, means); //打印初始数据
int iter = 0; //迭代次数(iteration times)
while (true)
{
Classify(points, k, means, pointAssigns); //将每个点与各个聚类中心对比进行归类划分
int conv = 0; //收敛类数(convergent clusters)
for (int j = 0; j < k; j++)
{
means[j] = CalcMeans(points, pointAssigns, j); //重新计算聚类中心
if (Compare(means[j], prevMeans[j]))
conv++;
else
prevMeans[j] = means[j];
}
if (conv == k) //新旧聚类中心一样表示该聚类已收敛至平衡
{
break;
}
iter++;
PrintIter(points, k, means, pointAssigns, iter); //打印迭代数据
}
}
/// <summary>
/// 初始化点数组(根据自定义坐标)
/// </summary>
private static Point[] GetPoints()
{
int len = coords.GetLength(0);
Point[] points = new Point[len];
for (int i = 0; i < len; i++)
{
points[i] = new Point(coords[i, 0], coords[i, 1]);
}
return points;
}
/// <summary>
/// 随机选k个点作为聚类中心
/// </summary>
private static void InitMeans(Point[] points, int k, PointF[] means)
{
Random random = new Random();
for (int i = 0; i < k; i++)
{
means[i] = points[random.Next(points.Length)];
}
}
/// <summary>
/// 将每个点与各个聚类中心对比进行归类划分
/// </summary>
private static void Classify(Point[] points, int k, PointF[] means, int[] pointAssigns)
{
for (int i = 0; i < points.Length; i++)
{
double minDist = double.MaxValue; //最短距离
for (int j = 0; j < k; j++)
{
double dist = Distance(points[i], means[j]);
if (dist < minDist)
{
minDist = dist;
pointAssigns[i] = j;
}
}
//Console.WriteLine("{0}归入类{1}", points[i], pointAssigns[i]);
}
}
#region 欧氏距离(Euclidean distance)
/// <summary>
/// 计算欧氏距离(勾股定理的多维空间扩展)
/// </summary>
/// <remarks>
/// 勾股定理: x^2 + y^2 = z^2
/// 欧氏距离: distance = sqrt(x1^2 + x2^2 + ... + xn^2)
/// 欧氏距离的变换式: x1^2 + x2^2 + ... + xn^2 = distance^2
///
/// 二维空间的两点p1(x1,y1)和p(x2,y2),两点间距离dist=sqrt((x1-x2)^2+(y1-y2)^2)
/// </remarks>
private static double Distance(PointF p1, PointF p2)
{
double pow2X = Math.Pow(p1.X - p2.X, 2);
double pow2Y = Math.Pow(p1.Y - p2.Y, 2);
return Math.Sqrt(pow2X + pow2Y);
}
#endregion
/// <summary>
/// 计算新的聚类中心(均值)
/// </summary>
/// <remarks>
/// meanX = (x1 + x2 + ... + xn) / n
/// meanY = (y1 + y2 + ... + yn) / n
/// </remarks>
private static PointF CalcMeans(Point[] points, int[] pointAssigns, int j)
{
PointF mean = new PointF();
int n = 0;
for (int i = 0; i < points.Length; i++)
{
if (pointAssigns[i] == j)
{
mean.X += points[i].X;
mean.Y += points[i].Y;
n++;
}
}
mean.X /= (float)n;
mean.Y /= (float)n;
return mean;
}
/// <summary>
/// 比较两个聚类中心是否相等
/// </summary>
private static bool Compare(PointF a, PointF b)
{
if (((int)(a.X * 10) == (int)(b.X * 10)) && ((int)(a.Y * 10) == (int)(b.Y * 10)))
{
return true;
}
else
{
return false;
}
}
#region 打印数据(print datas)
/// <summary>
/// 打印初始数据
/// </summary>
private static void PrintInit(Point[] points, int k, PointF[] means)
{
Console.WriteLine("总共{0}个样本:", points.Length);
for (int i = 0; i < points.Length; i++)
{
Console.WriteLine("{0}, {1}", points[i].X, points[i].Y);
}
Console.WriteLine("\n初始化时随机选取k个样本作为聚类中心:");
for (int i = 0; i < k; i++)
{
Console.WriteLine("聚类{0}的中心: {1}, {2}", i, means[i].X, means[i].Y);
}
}
/// <summary>
/// 打印迭代数据
/// </summary>
private static void PrintIter(Point[] points, int k, PointF[] means, int[] pointAssigns, int iter)
{
Console.WriteLine("\n\n--------第{0}次迭代的结果--------", iter);
for (int j = 0; j < k; j++)
{
Console.WriteLine("\n第{0}个类的成员:", j);
for (int i = 0; i < points.Length; i++)
{
if (pointAssigns[i] == j)
{
Console.WriteLine("{0}, {1}", points[i].X, points[i].Y);
}
}
Console.WriteLine("均值(聚类中心): {0}, {1}", means[j].X, means[j].Y);
}
}
#endregion
}
}