聚类算法是ML中一个重要分支,一般采用unsupervised learning进行学习,聚类算法分为K-Means, K-Medoids, GMM, Spectral clustering,Ncut五个算法;本文将实现K-eans算法。
K-Means算法:
1. 将数据分为k个非空子集
2. 计算每个类中心点(k-means<centroid>中心点是所有点的average),记为seed point
3. 将每个object聚类到最近seed point
4. 返回2,当聚类结果不再变化的时候stop
复杂度:
O(kndt)
-计算两点间距离:d
-指定类:O(kn) ,k是类数
-迭代次数上限:t
KMeans.h
1 #include "stdafx.h" 2 #include <iostream> 3 using namespace std; 4 template <typename Type> 5 class KMeans{ 6 public: 7 KMeans(const size_t nd =0,const int nk=0,const float precision = 0.0001):m_ndataNumbers(nd),\ 8 m_nkNumbers(nk),m_iterations(0),m_datas(NULL),m_center(NULL),m_precision(precision){ 9 m_datas = new Type[m_ndataNumbers]; 10 m_center = new Type[m_nkNumbers]; 11 } 12 KMeans(Type[],Type[],const size_t ,const int,const float); 13 ~KMeans(){ 14 delete[]m_datas; 15 delete[]m_center; 16 } 17 18 Type* getDatas()const; // get the datas 19 Type* getCenter()const; // get the centers 20 int iterationTimes()const; // iteration times 21 void kmeans(); // carry out k-means 22 void printCenter(); // cout center 23 private: 24 float dataDivide(Type* , size_t*); // data divide 25 void changeCenters(Type* , size_t*); // change centers 26 private: 27 Type *m_datas; 28 Type *m_center; 29 const size_t m_ndataNumbers; //data numbers 30 const int m_nkNumbers; // center numbers 31 const float m_precision; // end iteration precision 32 int m_iterations; // carry out times 33 }; 34 // initialize the datas and the center 35 template <typename Type> 36 KMeans<Type>::KMeans( Type datas[] ,Type center[],const size_t nd,const int nk,const float precision):\ 37 m_ndataNumbers(nd),m_nkNumbers(nk),m_iterations(0),m_center(NULL),m_datas(NULL),m_precision(precision){ 38 m_datas = new Type[m_ndataNumbers]; 39 m_center=new Type[m_nkNumbers]; 40 for(size_t i = 0 ; i<m_ndataNumbers ;i++){ 41 m_datas[i] = datas[i]; 42 } 43 for(int i = 0 ; i<m_nkNumbers ; i++){ 44 m_center[i] = center[i]; 45 } 46 } 47 template <typename Type> 48 Type* KMeans<Type> ::getDatas()const{ 49 return this->m_datas; 50 } 51 // get the center 52 template <typename Type> 53 Type* KMeans<Type> ::getCenter()const{ 54 return this->m_center; 55 } 56 // get iteration times 57 template<typename Type> 58 int KMeans<Type>::iterationTimes()const{ 59 return this->m_iterations; 60 } 61 // carry out kmeans 62 template<typename Type> 63 void KMeans<Type>::kmeans(){ 64 float previous = 0; // 65 float current = 1; 66 size_t *numbers = new size_t[m_nkNumbers]; // record every cluster datas 67 Type *sumvalues = new Type[m_nkNumbers]; // record every cluster values 68 while((current-previous)>m_precision){ 69 // initialize zero 70 for(int i = 0 ; i<m_nkNumbers ; i++){ 71 numbers[i] = 0; 72 sumvalues[i] =0; 73 } 74 previous = current; 75 current = dataDivide(sumvalues,numbers); 76 changeCenters(sumvalues,numbers); 77 m_iterations++; 78 79 } 80 delete[] numbers; 81 delete[] sumvalues; 82 } 83 // data divide 84 template <typename Type> 85 float KMeans<Type>::dataDivide(Type*sumvalues , size_t*numbers){ 86 float dist = 0.0; 87 for(size_t i = 0 ; i<m_ndataNumbers ; i++){ 88 float d = sqrt(float(m_datas[i]-m_center[0])); 89 int pos = 0; 90 for(int j =1 ; j <m_nkNumbers ; j++){ 91 if(d > sqrt(float(m_datas[i]-m_center[j]))){ 92 d = sqrt(float(m_datas[i]-m_center[j])); 93 pos = j; 94 } 95 dist+=d; 96 sumvalues[pos]+=m_datas[i]; 97 numbers[pos]++; 98 } 99 } 100 return dist; 101 } 102 // change the center 103 template<typename Type> 104 void KMeans<Type>::changeCenters(Type*sumvalues , size_t*numbers){ 105 for(int i=0 ; i<m_nkNumbers ; i++){ 106 if(numbers[i]==0)continue; 107 m_center[i] = sumvalues[i]/numbers[i]; 108 } 109 } 110 template<typename Type> 111 void KMeans<Type>::printCenter(){ 112 for(int i = 0 ; i<m_nkNumbers ;i++){ 113 cout << "center " << i <<": "<< m_center[i] <<endl; 114 } 115 cout <<endl; 116 }