K-Medoide聚类算法原理及其实现Java代码
K-中心点算法是对K-Means均值算法的改进,由于样本数据可能具有很大的极端值对象,这些数据会严重的扭曲数据的分布,而平方误差和的使用可能会更加恶化这一影响。而k-Medoide算法不是选取簇中对象的均值作为质心,而是在每一个簇内选出一个实际的对象来代表该簇,这个对象就称之为簇的中心点。
算法实现步骤:
1.任意选择k个对象作为k个中心点
2.计算每个非中心点的对象到每个中心点之间的距离
3.把每一个非中心点对象分配到距离它最近的的中心点所代表的簇中
4.在每一个聚簇中按照顺序依次选取点,计算该点到当前聚簇中所有点的距离之和,最终距离之和最小的点则视为新的中心点
5 重复步骤2-4直到各个聚簇的中心点不在发生改变。
Java代码如下:
package com.kmedoids;
import java.awt.font.TextHitInfo;
import java.util.ArrayList;
public class Cluster {
private String clusterName;//类簇名
private Medoid medoid;//类簇的质点
private ArrayList<DataPoint> dataPoints;//类簇中个样本点
//Generate Constructor using Fileds
//构造方法完成簇类的初始化工作
public Cluster(String clusterName){
//super();
this.clusterName = clusterName;
this.medoid = null;
dataPoints = new ArrayList<DataPoint>();
}
public void setMedoid(Medoid c){
this.medoid = c;
}
public Medoid getMedoid(){
return this.medoid;
}
//添加样本点
public void addDataPoint(DataPoint dp){
dp.setCluster(this);
this.dataPoints.add(dp);
}
public ArrayList<DataPoint> getDataPoints() {
// TODO Auto-generated method stub
return this.dataPoints;
}
public void removeDataPoint(DataPoint dp){
this.dataPoints.remove(dp);
}
public int getNumDataPoints(){
return dataPoints.size();
}
public DataPoint getDataPoint(int pos){
return (DataPoint)this.dataPoints.get(pos);
}
public String getName(){
return this.clusterName;
}
}
/*******************************************/
package com.kmedoids;
import java.util.ArrayList;
public class DataPoint{
private double dimension[];//样本点的维度
private String PointName;//样本点的名字
private Cluster cluster;//类簇
private double euDt;//样本点到质点的距离
public DataPoint(double[] dimension, String pointName) {
//super();
this.dimension = dimension;
//PointName = pointName;
this.PointName = pointName;
this.cluster = null;
}
public Cluster getCluster() {
return cluster;
}
public void setCluster(Cluster cluster) {
this.cluster = cluster;
}
//计算欧几里得距离
public double calEuclideanDistanceSum(){
double sum = 0.0;
Cluster cluster = this.getCluster();
ArrayList<DataPoint> dataPoints = cluster.getDataPoints();
for(int i=0;i<dataPoints.size();i++){
double []dims = dataPoints.get(i).getDimensioin();
for(int j=0;j<dims.length;j++){
double temp = Math.pow(dims[j]-this.dimension[j],2);
sum += temp;
}
}
return Math.sqrt(sum);
}
public double[] getDimensioin() {
// TODO Auto-generated method stub
return this.dimension;
}
public double testEuclideanDistance(Medoid md){
double sum = 0.0;
double [] cDim = md.getDimension();
for(int i=0;i<dimension.length;i++){
double temp = Math.pow(dimension[i]-cDim[i], 2);
sum += temp;
}
return Math.sqrt(sum);
}
public String getPointName() {
return this.PointName;
}
public void setPointName(String pointName) {
PointName = pointName;
}
public double getCurrentEudt(){
return this.euDt;
}
}
/*******************************************************/
package com.kmedoids;
import java.util.ArrayList;
import javax.xml.crypto.Data;
public class Medoid{
private double dimension[];//质点的维度
private Cluster cluster;//所属类簇
private double etdDisSum;//Medoid到本类簇中的所有的所有欧氏距离之和
public Medoid(double dimension[]){
this.dimension=dimension;
}
public double[] getDimension() {
return dimension;
}
public void setDimension(double[] dimension) {
this.dimension = dimension;
}
public Cluster getCluster() {
return cluster;
}
public void setCluster(Cluster cluster) {
this.cluster = cluster;
}
//取代价最小的函数点
public void calcMedoid(){
calcEtdDisSum();
double minEucDisSum = this.etdDisSum;
ArrayList<DataPoint> dps = this.cluster.getDataPoints();
for(int i = 0;i<dps.size();i++){
//get()方法获得类簇上面的指定的点
double tempeucDisSum = dps.get(i).calEuclideanDistanceSum();
if (tempeucDisSum<minEucDisSum){
dimension = dps.get(i).getDimensioin();
minEucDisSum=tempeucDisSum;
}
}
}
//计算该Medoid到同类簇所有样本点的欧氏距离和
private void calcEtdDisSum() {
// TODO Auto-generated method stub
double sum = 0.0;
Cluster cluster = this.getCluster();
ArrayList<DataPoint> dataPoints = cluster.getDataPoints();
for(int i=0;i<dataPoints.size();i++){
double [] dims = dataPoints.get(i).getDimensioin();
for(int j=0;j<dims.length;j++){
double temp = Math.abs(dims[j]-this.dimension[j]);
sum += temp;
}
}
etdDisSum = sum;
}
}
/*****************************************************/
package com.kmedoids;
import java.util.ArrayList;
import javax.xml.crypto.Data;
public class ClusterAnalysis{
private Cluster[] clusters;//所有类簇
private int miter;//迭代次数
//所有样本点
private ArrayList<DataPoint> dataPoints = new ArrayList<DataPoint>();
//维度
private int dimNum;
public ClusterAnalysis(int k,int iter, ArrayList<DataPoint> dataPoints, int dimNum) {
//super();
clusters = new Cluster[k];//类簇的种类数
for(int i=0;i<k;i++){
//调用Cluster的public方法
clusters[i]=new Cluster("Cluster: "+ i);
}
this.miter = iter;
this.dataPoints = dataPoints;
this.dimNum = dimNum;
}
public int getIterations(){
return miter;
}
public ArrayList<DataPoint>[] getClusterOutput(){
ArrayList<DataPoint> data[]= new ArrayList[clusters.length];
for(int i=0;i<clusters.length;i++){
data[i] = clusters[i].getDataPoints();
}
return data;
}
public void startAnalysis(double[][] medoids){
setInitiaMedoids(medoids);
double[][]newMedoids = medoids;
double[][]oldMedoids = new double[medoids.length][this.dimNum];
while(!isEqual(oldMedoids,newMedoids)){
//每次迭代开始时清空各类簇的点
for(int m=0;m<clusters.length;m++){
clusters[m].getDataPoints().clear();
}
for(int j =0;j<dataPoints.size();j++){
int clusterIndex = 0;
double minDistance = Double.MAX_VALUE;
//判断样本点属于哪一个类簇
for(int k=0;k<clusters.length;k++){
double eucDistance = dataPoints.get(j).testEuclideanDistance(clusters[k].getMedoid());
if(eucDistance<minDistance){
minDistance = eucDistance;
clusterIndex = k;
}
}
//将样本点添加到该类簇中
clusters[clusterIndex].addDataPoint(dataPoints.get(j));
}
//重新计算各类簇的质点
for(int m=0;m<clusters.length;m++){
clusters[m].getMedoid().calcMedoid();
}
// medoids是一个二维数组
for(int i=0;i<medoids.length;i++){
for(int j =0 ;j<this.dimNum;j++){
oldMedoids[i][j] = newMedoids[i][j];
}
}
for(int n=0;n<clusters.length;n++){
newMedoids[n]=clusters[n].getMedoid().getDimension();
}
this.miter ++;
}
}
private boolean isEqual(double[][] oldMedoids, double[][] newMedoids) {
// TODO Auto-generated method stub
boolean flag = false;
for(int i=0;i<oldMedoids.length;i++){
for(int j=0;j<newMedoids[i].length;j++){
if(oldMedoids[i][j] != newMedoids[i][j]){
return flag;
}
}
}
flag = true;
return flag;
}
private void setInitiaMedoids(double[][] medoids) {
// TODO Auto-generated method stub
for(int n =0;n<clusters.length;n++){
Medoid medoid = new Medoid(medoids[n]);
clusters[n].setMedoid(medoid);
medoid.setCluster(clusters[n]);
}
}
}
/***************************************************/
package com.kmedoids;
import java.util.ArrayList;
import java.util.Iterator;
public class TestMain{
public static void main(String[] args) {
ArrayList<DataPoint> dataPoints = new ArrayList<DataPoint>();
double[] a={2,3};
double[] b={2,4};
double[] c={1,4};
double[] d={1,3};
double[] e={2,2};
double[] f={3,2};
double[] g={8,7};
double[] h={8,6};
double[] i={7,7};
double[] j={7,6};
double[] k={8,5};
// double[] gg={18,7};
// double[] hh={8,16};
// double[] ii={7,17};
// double[] jj={7,16};
// double[] kk={8,51};
double[] l={100,200};//孤立点
double[] m={8,20};
double[] n={8,19};
double[] o={7,18};
double[] p={7,17};
double[] q={7,20};
dataPoints.add(new DataPoint(a,"a"));
dataPoints.add(new DataPoint(b,"b"));
dataPoints.add(new DataPoint(c,"c"));
dataPoints.add(new DataPoint(d,"d"));
dataPoints.add(new DataPoint(e,"e"));
dataPoints.add(new DataPoint(f,"f"));
dataPoints.add(new DataPoint(g,"g"));
dataPoints.add(new DataPoint(h,"h"));
dataPoints.add(new DataPoint(i,"i"));
dataPoints.add(new DataPoint(j,"j"));
dataPoints.add(new DataPoint(k,"k"));
// dataPoints.add(new DataPoint(gg,"gg"));
// dataPoints.add(new DataPoint(hh,"hh"));
// dataPoints.add(new DataPoint(ii,"ii"));
// dataPoints.add(new DataPoint(jj,"jj"));
// dataPoints.add(new DataPoint(kk,"kk"));
dataPoints.add(new DataPoint(l,"l"));
dataPoints.add(new DataPoint(m,"m"));
dataPoints.add(new DataPoint(n,"n"));
dataPoints.add(new DataPoint(o,"o"));
dataPoints.add(new DataPoint(p,"p"));
dataPoints.add(new DataPoint(q,"q"));
//设置初始k值 初始迭代次数 样本点 样本维度
ClusterAnalysis ca = new ClusterAnalysis(5, 0, dataPoints, 2);
double[][] cen ={
//初始中心点的具体值是哪一个
//K-Medoide算法的初始质心是原样本中的点
{8,7},{8,6},{7,7},{8,19},{7,20}
};
ca.startAnalysis(cen);
ArrayList<DataPoint>[]v = ca.getClusterOutput();
for(int dti=0;dti<v.length;dti++){
ArrayList tempV = v[dti];
System.out.println("类簇类别"+dti+":");
Iterator iter = tempV.iterator();
//如果iter内还有元素可以进行迭代 则返回true
while(iter.hasNext())
{
DataPoint dpTemp =(DataPoint)iter.next();
System.out.printf(dpTemp.getPointName()+"\t");
}
System.out.println();
}
}
}
/********************************************/
运行结果: