Parallel K-Means
The K-Means Algorithm
System environment: Ubuntu 16.04 running in a VM
I. Experiment Content and Requirements
Write and implement the K-means algorithm in the Eclipse environment.
II. Experiment Data and Objective
1. Experiment Data
The data comes from http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data. The data file is iris.txt (the Iris dataset), about 4.6 KB in size. Each line represents one data point: the numeric values are the object's attribute values and the trailing string is its type (see the sample lines below). There are 3 classes in total, with 50 records each.
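For reference, the first few lines of the file look like this (four numeric attributes followed by the class label):
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa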
In HDFS the input file is stored at /user/hk/KMeans_in/iris.txt, visible under the hadoopTest connection in Eclipse's DFS Locations view.
2. Experiment Objective
Given the number of cluster centers (3) and the number of iterations I as input, implement the K-means algorithm with MapReduce to cluster the experiment data, and then check how many objects of each type end up in the same cluster. The ideal result is 3 clusters, with the objects in each cluster almost all of the same type.
III. Implementation Approach
1. Feasibility analysis
When K-Means processes a data point, it only needs the information about the current cluster centers (cluster ID, number of points in the cluster, attribute values of the cluster center); it needs no information about any other data point. Since the points do not affect one another, the work can be parallelized with Hadoop.
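The center update is equally parallel-friendly: the new center of a cluster is the mean of the points assigned to it, and that mean can be assembled from partial results computed on different nodes. Writing n_j and m_j for the point count and partial mean produced on node j (standard centroid algebra, stated here only for reference):

new_center = (x_1 + x_2 + ... + x_n) / n        (x_1 ... x_n: the n points assigned to the cluster)
           = (n_1*m_1 + n_2*m_2 + ...) / (n_1 + n_2 + ...)

This weighted accumulation is exactly what the Combiner and Reducer in Section IV implement.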
2. Design of the MapReduce-parallelized K-Means algorithm:
(1) Distribute the point objects across the MapReduce nodes; each node computes only on its own share of the data.
(2) Each Map node reads the cluster centers produced by the previous iteration and decides which cluster each of its data points belongs to.
(3) The Reduce node accumulates the data points belonging to each cluster, computes the new cluster center, and writes it to the output file with the corresponding sequence number, which becomes the input of the next iteration.
In short, Map finds the cluster ID that each point belongs to, and Reduce merges the points with the same cluster ID into a new cluster object; the per-iteration record flow is summarized below.
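One iteration's record flow, using the key/value types of the implementation in Section IV:

Map:     <line offset, data line>               ->  <clusterID, Cluster(ID, point count 1, the point itself)>
Combine: <clusterID, list of partial Clusters>  ->  <clusterID, a merged partial Cluster>
Reduce:  <clusterID, list of partial Clusters>  ->  Cluster(ID, total point count, new center), one line per cluster in the next cluster file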
3. Code implementation flow:
(1) Initialization: based on the given K, randomly select K point objects from the source data as the initial cluster centers, build the cluster objects (cluster ID, number of points in the cluster, attribute values of the cluster center), and write them to the output file (the record format is shown right after this list).
(2) Iteration: traverse the source data points, compare each point against the cluster records in the previous iteration's output file, assign it to the cluster with the shortest Euclidean distance, compute the new cluster objects needed by the next iteration, and write them to the output file with the corresponding index so the next iteration can read them.
(3) Final output: traverse the source data points once more, assign each of them to a cluster according to the cluster objects produced by the last iteration, and output the clustering result.
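Between steps, the cluster objects are exchanged as text lines produced by Cluster.toString(), i.e. clusterID,numOfPoints,attr1,attr2,attr3,attr4,type. For illustration only (the actual values depend on the run):

1,0,5.1,3.5,1.4,0.2,Iris-setosa     (an initial record in cluster-0: a randomly chosen data point, point count still 0)
1,50,5.0,3.4,1.46,0.24,null         (a record written by a later iteration; computed centers carry no type, so the field prints as null)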
IV. Code Implementation
(1) The Point class
package edu.hk.kmeans;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.io.Writable;

public class Point implements Writable {
    // attribute values of the object
    ArrayList<Double> values;
    // class label (e.g. Iris-setosa); kept only for display, not serialized
    String type;

    public Point() {
        values = new ArrayList<Double>();
    }

    public Point(ArrayList<Double> tempValues) {
        values = tempValues;
    }

    // Parse one line of the data file: comma-separated attribute values followed by the type string
    public Point(String line) {
        String[] valueString = line.split(",");
        values = new ArrayList<Double>();
        for (int i = 0; i < valueString.length - 1; i++) {
            values.add(Double.parseDouble(valueString[i]));
        }
        type = valueString[valueString.length - 1];
    }

    public void setValues(ArrayList<Double> tempValue) {
        values = tempValue;
    }

    public ArrayList<Double> getValues() {
        return values;
    }

    // Convert the attribute values (and the type) back into a comma-separated string
    public String toString() {
        String s = new String();
        for (int i = 0; i < values.size(); i++) {
            s += (values.get(i) + ",");
        }
        s += type;
        return s;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(values.size());
        for (int i = 0; i < values.size(); i++) {
            out.writeDouble(values.get(i));
        }
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        int size = 0;
        values = new ArrayList<Double>();
        if ((size = in.readInt()) != 0) {
            for (int i = 0; i < size; i++) {
                values.add(in.readDouble());
            }
        }
    }
}
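A small usage sketch (not part of the original report; the PointDemo class name is made up for illustration) showing how a data line is parsed by the string constructor above:

package edu.hk.kmeans;

// Hypothetical demo class, only to illustrate Point's string constructor and toString().
public class PointDemo {
    public static void main(String[] args) {
        Point p = new Point("5.1,3.5,1.4,0.2,Iris-setosa");
        System.out.println(p);             // prints: 5.1,3.5,1.4,0.2,Iris-setosa
        System.out.println(p.getValues()); // prints: [5.1, 3.5, 1.4, 0.2]
    }
}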
(2) The Cluster class
package edu.hk.kmeans;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

// A cluster: its ID, the number of points it contains, and its center
public class Cluster implements Writable {
    // cluster ID
    private int clusterID;
    // number of points in the cluster
    private long numOfPoints;
    // cluster center
    private Point center;

    public Cluster() {
        this.setClusterID(-1);
        this.setNumOfPoints(0);
        this.setCenter(new Point());
    }

    public Cluster(int clusterID, Point center) {
        this.setClusterID(clusterID);
        this.setNumOfPoints(0);
        this.setCenter(center);
    }

    // Parse a text record of the form "clusterID,numOfPoints,attr1,attr2,...,type"
    public Cluster(String line) {
        String[] value = line.split(",", 3);
        clusterID = Integer.parseInt(value[0]);
        numOfPoints = Long.parseLong(value[1]);
        center = new Point(value[2]);
    }

    public String toString() {
        String result = String.valueOf(clusterID) + "," + String.valueOf(numOfPoints) + "," + center.toString();
        return result;
    }

    public int getClusterID() {
        return clusterID;
    }

    public void setClusterID(int clusterID) {
        this.clusterID = clusterID;
    }

    public long getNumOfPoints() {
        return numOfPoints;
    }

    public void setNumOfPoints(long numOfPoints) {
        this.numOfPoints = numOfPoints;
    }

    public Point getCenter() {
        return center;
    }

    public void setCenter(Point center) {
        this.center = center;
    }
    // public void observeInstance(Point instance){
    //     try {
    //         Point sum = center.multiply(numOfPoints).add(instance);
    //         numOfPoints++;
    //         center = sum.divide(numOfPoints);
    //     } catch (Exception e) {
    //         e.printStackTrace();
    //     }
    // }

    public void write(DataOutput out) throws IOException {
        out.writeInt(clusterID);
        out.writeLong(numOfPoints);
        center.write(out);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        clusterID = in.readInt();
        numOfPoints = in.readLong();
        center.readFields(in);
    }

    public static void main(String[] args) {
        new Cluster("2,1,3,4,1,4");
    }
}
(3) The RandomClusterGenerator class
package edu.hk.kmeans;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

// Randomly generates the K initial clusters
public final class RandomClusterGenerator {
    private int k;
    private FileStatus[] fileList;
    private FileSystem fs;
    private ArrayList<Cluster> kClusters;
    private Configuration conf;

    // Random cluster generator: reads the source data and picks k initial centers
    public RandomClusterGenerator(Configuration conf, String filePath, int k) {
        this.k = k;
        try {
            fs = FileSystem.get(URI.create(filePath), conf);
            fileList = fs.listStatus((new Path(filePath)));
            // Empty list with an initial capacity of k; kClusters.size() is 0 at this point
            kClusters = new ArrayList<Cluster>(k);
            this.conf = conf;
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Read the data and build the initial clusters
    public void generateInitialCluster(String destinationPath) {
        Text line = new Text();
        FSDataInputStream fsi = null;
        try {
            for (int i = 0; i < fileList.length; i++) {
                int count = 0;
                fsi = fs.open(fileList[i].getPath());
                LineReader lineReader = new LineReader(fsi, conf);
                while (lineReader.readLine(line) > 0) { // keep reading while there is data
                    System.out.println("read a line:" + line);
                    if (line.toString().length() == 0) {
                        continue;
                    }
                    // Each line holds the attribute values of one object
                    Point point = new Point(line.toString());
                    // count runs from 0 to 149
                    makeDecision(point, count);
                    count++;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                fsi.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // Write the chosen centers back to a file
        writeBackToFile(destinationPath);
    }

    // Decide whether to add the point as a new center or to replace an existing one
    public void makeDecision(Point point) {
        if (kClusters.size() < k) {
            // Fewer than k centers chosen so far: the point becomes a new center
            Cluster cluster = new Cluster(kClusters.size() + 1, point);
            kClusters.add(cluster);
        } else {
            // All k centers already chosen: randomly replace one of them
            int choice = randomChoose(k);
            if (!(choice == -1)) {
                int id = kClusters.get(choice).getClusterID();
                kClusters.remove(choice);
                Cluster cluster = new Cluster(id, point);
                kClusters.add(cluster);
            }
        }
    }

    // Variant used here: the 150 iris records are split into blocks of 50 (one per class),
    // and one random representative is kept from each block
    public void makeDecision(Point point, int count) {
        // split is 0, 1 or 2
        int split = count / 50;
        if (kClusters.size() == split) {
            // No center has been chosen for this block yet: add one
            Cluster cluster = new Cluster(kClusters.size() + 1, point);
            kClusters.add(cluster);
        } else {
            // A center for this block already exists: replace it with a certain probability
            int choice = randomChoose(50);
            if (!(choice == -1)) {
                int id = split + 1;
                kClusters.remove(split);
                Cluster cluster = new Cluster(id, point);
                kClusters.add(cluster);
            }
        }
    }

    /* With probability 1/(k+1), return an integer in [0, k-1]; with probability k/(k+1), return -1. */
    public int randomChoose(int k) {
        Random random = new Random();
        if (random.nextInt(k + 1) == 0) {
            return new Random().nextInt(k);
        } else
            return -1;
    }

    // Write the generated centers to a file so the first iteration can read them
    public void writeBackToFile(String destinationPath) {
        Path path = new Path(destinationPath + "cluster-0/clusters");
        FSDataOutputStream fsi = null;
        try {
            fsi = fs.create(path);
            for (Cluster cluster : kClusters) {
                fsi.write((cluster.toString() + "\n").getBytes());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                fsi.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
(4) The KMeans class
package edu.hk.kmeans;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class KMeans {
    public static Integer valueNumber = 0;

    public static class KMeansMapper extends Mapper<LongWritable, Text, IntWritable, Cluster> {
        // Cluster objects produced by the previous iteration
        private ArrayList<Cluster> kClusters = new ArrayList<Cluster>();

        // setup() is run by the MapReduce framework exactly once per Mapper task, before any map()
        // call, so the cluster centers are loaded once here instead of being re-read for every
        // input line, which would be wasteful.
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            valueNumber = Integer.parseInt(context.getConfiguration().get("valueNumber"));
            if (valueNumber == 0) {
                System.out.println("Invalid number of object attributes!");
                return;
            }
            FileSystem fs = FileSystem.get(context.getConfiguration());
            // Read the cluster centers written by the previous iteration
            FileStatus[] fileList = fs.listStatus(new Path(context.getConfiguration().get("clusterPath")));
            BufferedReader in = null;
            FSDataInputStream fsi = null;
            String line = null;
            for (int i = 0; i < fileList.length; i++) {
                if (!fileList[i].isDirectory()) {
                    fsi = fs.open(fileList[i].getPath());
                    in = new BufferedReader(new InputStreamReader(fsi, "UTF-8"));
                    while ((line = in.readLine()) != null) {
                        System.out.println("read a cluster line:" + line);
                        // Each line is "clusterID,numOfPoints,center attribute values"
                        Cluster cluster = new Cluster(line);
                        kClusters.add(cluster);
                    }
                }
            }
            in.close();
            fsi.close();
        }

        // For every point in the input, find the nearest cluster ID and emit a single-point
        // cluster keyed by that ID; many of these partial clusters share the same cluster ID.
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // One line of the source data
            Point point = new Point(value.toString());
            if (point.getValues().size() != valueNumber) {
                System.out.println("Malformed record!");
                return;
            }
            int clusterID;
            try {
                // Find the ID of the nearest cluster center
                clusterID = getNearestClusterID(point);
                if (clusterID == -1)
                    throw new InterruptedException("Failed to find the nearest cluster!");
                else {
                    // Build a cluster whose center is the current point and whose point count is 1
                    Cluster cluster = new Cluster(clusterID, point);
                    cluster.setNumOfPoints(1);
                    System.out.println("Map out cluster:" + cluster.toString());
                    // Output: <cluster ID, cluster with one point whose center is the point itself>
                    context.write(new IntWritable(clusterID), cluster);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        // Return the ID of the nearest cluster (shortest Euclidean distance)
        public int getNearestClusterID(Point point) throws Exception {
            int clusterID = -1;
            double minDis = Double.MAX_VALUE;
            for (Cluster cluster : kClusters) {
                // Reset the distance accumulator for every candidate cluster
                double newDis = 0.0;
                for (int i = 0; i < point.getValues().size(); i++) {
                    newDis += Math.pow((cluster.getCenter().getValues().get(i) - point.getValues().get(i)), 2);
                }
                newDis = Math.sqrt(newDis);
                if (newDis < minDis) {
                    clusterID = cluster.getClusterID();
                    minDis = newDis;
                }
            }
            return clusterID;
        }
    }

    /*
     *    Map        Map        Map
     *     |          |          |
     *  Combiner   Combiner   Combiner   (merges the partial clusters that share an ID;
     *     |          |          |        Hadoop may apply the Combiner zero or more times per map task)
     *     |__________|__________|
     *                |
     *             Reducer
     */
    // The Combiner reduces the amount of data shipped to the Reducer. Its input matches the Map
    // output and its output matches the Reduce input, so all three use <IntWritable, Cluster>.
    public static class KMeansCombiner extends Reducer<IntWritable, Cluster, IntWritable, Cluster> {
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Read the attribute count from the job configuration rather than relying on the
            // Mapper's static field, which is only visible when Map and Combine share a JVM.
            valueNumber = Integer.parseInt(context.getConfiguration().get("valueNumber"));
        }

        public void reduce(IntWritable key, Iterable<Cluster> value, Context context) throws IOException, InterruptedException {
            // Accumulators for the attribute values
            ArrayList<Double> values = new ArrayList<Double>();
            for (int i = 0; i < valueNumber; i++) {
                values.add(0.0);
            }
            long numOfPoints = 0;
            // Accumulate the clusters sharing this ID; each partial center is weighted by its point
            // count so the result stays correct even if the Combiner is applied more than once.
            for (Cluster cluster : value) {
                numOfPoints += cluster.getNumOfPoints();
                for (int i = 0; i < values.size(); i++) {
                    values.set(i, values.get(i) + cluster.getCenter().getValues().get(i) * cluster.getNumOfPoints());
                    System.out.println("Combiner value" + i + ": " + values.get(i));
                }
            }
            System.out.println(numOfPoints);
            if (numOfPoints > 0) {
                // The partial center is the accumulated sum divided by the number of points
                for (int i = 0; i < values.size(); i++) {
                    values.set(i, values.get(i) / numOfPoints);
                }
            }
            // Emit one partial cluster: <cluster ID, (ID, point count, partial mean of the centers)>
            Cluster cluster = new Cluster(key.get(), new Point(values));
            cluster.setNumOfPoints(numOfPoints);
            System.out.println("Combiner merged cluster:" + cluster.toString());
            context.write(key, cluster);
        }
    }

    // The Reducer merges the Combiner outputs of the different map tasks; several of them may
    // carry the same key, so the final weighted average is computed here and one new cluster is
    // written out per cluster ID.
    public static class KMeansReducer extends Reducer<IntWritable, Cluster, NullWritable, Cluster> {
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Same reasoning as in the Combiner: read the attribute count from the configuration.
            valueNumber = Integer.parseInt(context.getConfiguration().get("valueNumber"));
        }

        public void reduce(IntWritable key, Iterable<Cluster> value, Context context) throws IOException, InterruptedException {
            // Accumulators for the attribute values
            ArrayList<Double> values = new ArrayList<Double>();
            for (int i = 0; i < valueNumber; i++) {
                values.add(0.0);
            }
            long numOfPoints = 0;
            for (Cluster cluster : value) {
                numOfPoints += cluster.getNumOfPoints();
                // Weighted accumulation of the partial centers
                for (int i = 0; i < values.size(); i++) {
                    values.set(i, values.get(i) + cluster.getCenter().getValues().get(i) * cluster.getNumOfPoints());
                    System.out.println("Reducer value" + i + ": " + values.get(i));
                }
            }
            if (numOfPoints > 0) {
                // New center: accumulated sum divided by the total number of points
                for (int i = 0; i < values.size(); i++) {
                    values.set(i, values.get(i) / numOfPoints);
                }
            }
            Cluster cluster = new Cluster(key.get(), new Point(values));
            cluster.setNumOfPoints(numOfPoints);
            System.out.println("Reducer cluster:" + cluster.toString());
            context.write(NullWritable.get(), cluster);
        }
    }
}
(5) The KMeansClustersView class
package edu.hk.kmeans;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class KMeansClustersView {
    public static class KMeansClusterMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
        // Cluster centers produced by the last iteration
        private ArrayList<Cluster> kClusters = new ArrayList<Cluster>();

        // Load the cluster centers from the final iteration's output
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            FileSystem fs = FileSystem.get(context.getConfiguration());
            FileStatus[] fileList = fs.listStatus(new Path(context.getConfiguration().get("clusterPath")));
            BufferedReader in = null;
            FSDataInputStream fsi = null;
            String line = null;
            for (int i = 0; i < fileList.length; i++) {
                if (!fileList[i].isDirectory()) {
                    fsi = fs.open(fileList[i].getPath());
                    in = new BufferedReader(new InputStreamReader(fsi, "UTF-8"));
                    while ((line = in.readLine()) != null) {
                        System.out.println("read a line:" + line);
                        Cluster cluster = new Cluster(line);
                        kClusters.add(cluster);
                    }
                }
            }
            in.close();
            fsi.close();
        }

        // For every object in the source data, find its nearest cluster and emit <cluster ID, data line>
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            Point instance = new Point(value.toString());
            int id;
            try {
                id = getNearestClusterID(instance);
                if (id == -1)
                    throw new InterruptedException("id == -1");
                else {
                    context.write(new IntWritable(id), value);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        // Return the ID of the nearest cluster (shortest Euclidean distance)
        public int getNearestClusterID(Point point) throws Exception {
            int clusterID = -1;
            double minDis = Double.MAX_VALUE;
            for (Cluster cluster : kClusters) {
                // Reset the distance accumulator for every candidate cluster
                double newDis = 0.0;
                for (int i = 0; i < point.getValues().size(); i++) {
                    newDis += Math.pow((cluster.getCenter().getValues().get(i) - point.getValues().get(i)), 2);
                }
                newDis = Math.sqrt(newDis);
                if (newDis < minDis) {
                    clusterID = cluster.getClusterID();
                    minDis = newDis;
                }
            }
            return clusterID;
        }
    }

    // Collect all data lines that share a cluster ID and output them together with a count
    public static class KMeansClusterReducer extends Reducer<IntWritable, Text, Text, Text> {
        public void reduce(IntWritable key, Iterable<Text> value, Context context) throws IOException, InterruptedException {
            String str = "\n";
            int count = 0;
            for (Text textInstance : value) {
                str = str + textInstance.toString() + "\n";
                count++;
            }
            context.write(new Text("Cluster " + key + " contains " + count + " objects:"), new Text(str));
        }
    }
}
(6) The KMeansController class
package edu.hk.kmeans;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class KMeansController {
    private int k;
    private int iterationNum;
    private String sourcePath;
    private String outputPath;
    private Configuration conf;

    public KMeansController(int k, int iterationNum, String sourcePath, String outputPath, Configuration conf) {
        this.k = k;
        this.iterationNum = iterationNum;
        this.sourcePath = sourcePath;
        this.outputPath = outputPath;
        this.conf = conf;
    }

    // Iteratively recompute the cluster centers
    public void clusterCenterJob() throws IOException, InterruptedException, ClassNotFoundException {
        // Run iterationNum iterations; each reads the previous iteration's centers and computes new ones
        for (int i = 0; i < iterationNum; i++) {
            Job clusterCenterJob = Job.getInstance(conf, "clusterCenterJob" + i);
            clusterCenterJob.setJarByClass(KMeans.class);
            clusterCenterJob.getConfiguration().set("clusterPath", outputPath + "/cluster-" + i + "/");
            clusterCenterJob.setMapperClass(KMeans.KMeansMapper.class);
            clusterCenterJob.setMapOutputKeyClass(IntWritable.class);
            clusterCenterJob.setMapOutputValueClass(Cluster.class);
            clusterCenterJob.setCombinerClass(KMeans.KMeansCombiner.class);
            clusterCenterJob.setReducerClass(KMeans.KMeansReducer.class);
            clusterCenterJob.setOutputKeyClass(NullWritable.class);
            clusterCenterJob.setOutputValueClass(Cluster.class);
            FileInputFormat.addInputPath(clusterCenterJob, new Path(sourcePath));
            FileOutputFormat.setOutputPath(clusterCenterJob, new Path(outputPath + "/cluster-" + (i + 1) + "/"));
            clusterCenterJob.waitForCompletion(true);
            System.out.println("finished!");
        }
    }

    // Re-classify the original data according to the centers from the last iteration
    public void KMeansClusterJob() throws IOException, InterruptedException, ClassNotFoundException {
        Job kMeansClusterJob = Job.getInstance(conf, "KMeansClusterJob");
        kMeansClusterJob.setJarByClass(KMeansClustersView.class);
        // The last clusterCenterJob iteration (i = iterationNum - 1) wrote its centers to
        // cluster-iterationNum, so that is the directory to read here.
        kMeansClusterJob.getConfiguration().set("clusterPath", outputPath + "/cluster-" + iterationNum + "/");
        kMeansClusterJob.setMapperClass(KMeansClustersView.KMeansClusterMapper.class);
        kMeansClusterJob.setMapOutputKeyClass(IntWritable.class);
        kMeansClusterJob.setMapOutputValueClass(Text.class);
        kMeansClusterJob.setReducerClass(KMeansClustersView.KMeansClusterReducer.class);
        kMeansClusterJob.setOutputKeyClass(Text.class);
        kMeansClusterJob.setOutputValueClass(Text.class);
        // kMeansClusterJob.setNumReduceTasks(0);
        FileInputFormat.addInputPath(kMeansClusterJob, new Path(sourcePath));
        FileOutputFormat.setOutputPath(kMeansClusterJob, new Path(outputPath + "/clusteredInstances" + "/"));
        kMeansClusterJob.waitForCompletion(true);
        System.out.println("finished!");
    }

    // Generate the K initial cluster centers and write them to the cluster-0 file
    public void generateInitialCluster() {
        // Build the initial-cluster generator from the configuration, the source data path and k
        RandomClusterGenerator generator = new RandomClusterGenerator(conf, sourcePath, k);
        // Generate the initial clusters
        generator.generateInitialCluster(outputPath + "/");
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        System.out.println("start");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000");
        conf.set("mapred.job.tracker", "localhost:9001");
        conf.set("valueNumber", "4");
        int k = 3;            // Integer.parseInt(args[0]); number of cluster centers
        int iterationNum = 3; // Integer.parseInt(args[1]); number of iterations
        String[] myArgs = { "KMeans_in", "KMeans_out" };
        String sourcePath = myArgs[0];
        String outputPath = myArgs[1];
        KMeansController controller = new KMeansController(k, iterationNum, sourcePath, outputPath, conf);
        // Read the input data and generate the initial clusters
        controller.generateInitialCluster();
        System.out.println("initial cluster finished");
        controller.clusterCenterJob();
        controller.KMeansClusterJob();
    }
}
V. Experiment Results
The results depend on the number of iterations and on the initial cluster centers chosen at the start.