KMeans聚类算法Hadoop实现
Assistance.java 辅助类,功能详见注释
package KMeans;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

import java.io.IOException;
import java.util.*;

/**
 * Helper routines shared by the KMeans driver, mapper and reducer:
 * reading cluster centers from HDFS, deleting previous job output,
 * and testing the iteration termination condition.
 */
public class Assistance {

    /**
     * Reads cluster-center records from HDFS.
     * Each line is "centerId<TAB>dim1 dim2 ...". Hadoop writes a tab between
     * key and value, so the tab is normalized to a space before splitting;
     * element 0 of every returned row is therefore the center id and the
     * remaining elements are the coordinates.
     *
     * @param inputpath HDFS path of the center file
     * @return one row per center: [id, dim1, dim2, ...]
     */
    public static List<ArrayList<Float>> getCenters(String inputpath) {
        List<ArrayList<Float>> result = new ArrayList<ArrayList<Float>>();
        Configuration conf = new Configuration();
        FSDataInputStream fsIn = null;
        try {
            FileSystem hdfs = FileSystem.get(conf);
            fsIn = hdfs.open(new Path(inputpath));
            LineReader lineIn = new LineReader(fsIn, conf);
            Text line = new Text();
            while (lineIn.readLine(line) > 0) {
                // Replace the key/value tab with a space so one split handles
                // both separators.
                String[] fields = line.toString().replace("\t", " ").split(" ");
                ArrayList<Float> tmplist = new ArrayList<Float>(fields.length);
                for (int i = 0; i < fields.length; ++i) {
                    tmplist.add(Float.parseFloat(fields[i]));
                }
                result.add(tmplist);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // BUG FIX: the stream was leaked when readLine/parseFloat threw;
            // close it unconditionally.
            if (fsIn != null) {
                try {
                    fsIn.close();
                } catch (IOException ignored) {
                    // best-effort close on the cleanup path
                }
            }
        }
        return result;
    }

    /** Recursively deletes the output of the previous MapReduce iteration. */
    public static void deleteLastResult(String path) {
        Configuration conf = new Configuration();
        try {
            FileSystem hdfs = FileSystem.get(conf);
            hdfs.delete(new Path(path), true);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes the summed squared distance between the centers of two
     * consecutive iterations and decides whether iteration should stop.
     * When not converged, the old-center file is replaced by the new one
     * via a local-disk round trip so the next iteration reads it.
     *
     * @param oldpath   HDFS path of the previous iteration's centers
     * @param newpath   HDFS path of this iteration's centers
     * @param k         number of clusters
     * @param threshold convergence threshold on the summed squared distance
     * @return true when the centers moved less than the threshold
     */
    public static boolean isFinished(String oldpath, String newpath, int k, float threshold)
            throws IOException {
        List<ArrayList<Float>> oldcenters = Assistance.getCenters(oldpath);
        List<ArrayList<Float>> newcenters = Assistance.getCenters(newpath);
        float distance = 0;
        // Element 0 of each row is the center id, so coordinates start at j = 1.
        for (int i = 0; i < k; ++i) {
            for (int j = 1; j < oldcenters.get(i).size(); ++j) {
                float tmp = Math.abs(oldcenters.get(i).get(j) - newcenters.get(i).get(j));
                distance += Math.pow(tmp, 2);
            }
        }
        System.out.println("Distance = " + distance + " Threshold = " + threshold);
        if (distance < threshold) {
            return true;
        }
        // Not converged: promote the new centers to be next iteration's old
        // centers. BUG FIX: oldpath was deleted twice (deleteLastResult(oldpath)
        // followed by hdfs.delete(oldpath)); a single delete suffices.
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(conf);
        hdfs.copyToLocalFile(new Path(newpath), new Path("/home/hadoop/class/oldcenter.data"));
        hdfs.delete(new Path(oldpath), true);
        hdfs.moveFromLocalFile(new Path("/home/hadoop/class/oldcenter.data"), new Path(oldpath));
        return false;
    }
}
package KMeans;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;

/**
 * Driver for the iterative KMeans job. Repeatedly submits the center-update
 * MapReduce job until the centers move less than the threshold or 10
 * iterations have run, then runs one final job that labels every sample
 * with its nearest center.
 */
public class KMeansDriver {

    /**
     * Expected arguments (after generic options):
     * &lt;in&gt; &lt;out&gt; &lt;oldcenters&gt; &lt;newcenters&gt; &lt;k&gt; &lt;threshold&gt;
     */
    public static void main(String[] args) throws Exception {
        // BUG FIX: arguments were parsed with GenericOptionsParser inside the
        // loop but the while-condition indexed the RAW args array, so any
        // generic option (-D, -fs, ...) shifted the indices. Parse once and
        // use the parsed array everywhere.
        Configuration parseConf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(parseConf, args).getRemainingArgs();
        if (otherArgs.length != 6) {
            System.err.println("Usage: <int> <out> <oldcenters> <newcenters> <k> <threshold>");
            System.exit(2);
        }
        int repeated = 0;
        // Keep submitting jobs until the centers converge or 10 iterations pass.
        do {
            Configuration conf = new Configuration();
            conf.set("centerpath", otherArgs[2]); // read by KMeansMapper
            conf.set("kpath", otherArgs[4]);      // number of clusters, read by KMeansMapper
            Job job = new Job(conf, "KMeansCluster");
            job.setJarByClass(KMeansDriver.class);
            Path in = new Path(otherArgs[0]);
            Path out = new Path(otherArgs[1]);
            FileInputFormat.addInputPath(job, in);
            FileSystem fs = FileSystem.get(conf);
            if (fs.exists(out)) {
                // Remove stale output so the job can write to the same path.
                fs.delete(out, true);
            }
            FileOutputFormat.setOutputPath(job, out);
            job.setMapperClass(KMeansMapper.class);
            job.setReducerClass(KMeansReducer.class);
            job.setOutputKeyClass(IntWritable.class);
            job.setOutputValueClass(Text.class);
            job.waitForCompletion(true);
            ++repeated;
            System.out.println("We have repeated " + repeated + " times.");
        } while (repeated < 10
                && !Assistance.isFinished(otherArgs[2], otherArgs[3],
                        Integer.parseInt(otherArgs[4]), Float.parseFloat(otherArgs[5])));
        // Final pass: assign every sample to its cluster using the converged centers.
        Cluster(otherArgs);
    }

    /**
     * Runs a single labeling job: each sample is emitted with the index of its
     * nearest (final) center. No Reducer class is set, so the default identity
     * reducer passes the mapper output through unchanged.
     */
    public static void Cluster(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 6) {
            System.err.println("Usage: <int> <out> <oldcenters> <newcenters> <k> <threshold>");
            System.exit(2);
        }
        conf.set("centerpath", otherArgs[2]);
        conf.set("kpath", otherArgs[4]);
        Job job = new Job(conf, "KMeansCluster");
        job.setJarByClass(KMeansDriver.class);
        Path in = new Path(otherArgs[0]);
        Path out = new Path(otherArgs[1]);
        FileInputFormat.addInputPath(job, in);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        // NOTE(review): the identity reducer still shuffles/sorts the data;
        // job.setNumReduceTasks(0) would make this a cheaper map-only job,
        // but it changes the output file naming (part-m-*) — confirm before
        // switching.
        job.setMapperClass(KMeansMapper.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        job.waitForCompletion(true);
    }
}
KMeansMapper.java
package KMeans;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Assigns each input sample (a line of space-separated floats) to the
 * nearest cluster center and emits (centerIndex, sampleLine).
 */
public class KMeansMapper extends Mapper<Object, Text, IntWritable, Text> {

    // Cluster centers and k, loaded once per task in setup().
    private List<ArrayList<Float>> centers;
    private int k;

    /**
     * Loads the centers from HDFS once per map task.
     * BUG FIX: previously Assistance.getCenters() was called inside map(),
     * re-reading the center file from HDFS for EVERY input record.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        centers = Assistance.getCenters(context.getConfiguration().get("centerpath"));
        k = Integer.parseInt(context.getConfiguration().get("kpath"));
    }

    /**
     * Emits (index of nearest center, original sample line) using squared
     * Euclidean distance.
     */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split(" ");
        float minDist = Float.MAX_VALUE;
        int centerIndex = k; // only stays k if k == 0 (no centers at all)
        // Coordinates of a center row start at index 1 (index 0 is the id).
        for (int i = 0; i < k; ++i) {
            float currentDist = 0;
            for (int j = 0; j < fields.length; ++j) {
                float tmp = Math.abs(centers.get(i).get(j + 1) - Float.parseFloat(fields[j]));
                currentDist += Math.pow(tmp, 2);
            }
            if (minDist > currentDist) {
                minDist = currentDist;
                centerIndex = i;
            }
        }
        context.write(new IntWritable(centerIndex), new Text(value));
    }
}
KMeansReducer.java
package KMeans;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Recomputes each cluster center as the component-wise mean of all samples
 * assigned to it. Output line format: "centerId<TAB>dim1 dim2 ...".
 */
public class KMeansReducer extends Reducer<IntWritable, Text, IntWritable, Text> {

    /**
     * @param key     the cluster index emitted by the mapper
     * @param value   all sample lines assigned to this cluster
     * @param context sink for the new center record
     */
    public void reduce(IntWritable key, Iterable<Text> value, Context context)
            throws IOException, InterruptedException {
        // Parse every sample of this cluster into a row of floats.
        List<ArrayList<Float>> samples = new ArrayList<ArrayList<Float>>();
        for (Text val : value) {
            String[] fields = val.toString().split(" ");
            ArrayList<Float> point = new ArrayList<Float>(fields.length);
            for (int i = 0; i < fields.length; ++i) {
                point.add(Float.parseFloat(fields[i]));
            }
            samples.add(point);
        }
        if (samples.isEmpty()) {
            // Defensive: Hadoop never calls reduce with an empty group, but
            // guard against it rather than throwing IndexOutOfBounds below.
            return;
        }
        // Component-wise mean. BUG FIX: O(d^2) string concatenation in a loop
        // replaced with StringBuilder; the emitted text is identical because
        // append(float) formats via Float.toString, same as string concat.
        StringBuilder newCenter = new StringBuilder();
        int dims = samples.get(0).size();
        for (int i = 0; i < dims; ++i) {
            float sum = 0;
            for (int j = 0; j < samples.size(); ++j) {
                sum += samples.get(j).get(i);
            }
            float mean = sum / samples.size();
            if (i > 0) {
                newCenter.append(' ');
            }
            newCenter.append(mean);
        }
        context.write(key, new Text(newCenter.toString()));
    }
}
作业运行情况:
hadoop@shaobo-ThinkPad-E420:~/class$ hadoop jar KMeans.jar KMeans.KMeansDriver input/iris.data output input/oldcenter.data output/part-r-00000 3 0.0001 14/04/17 16:15:50 INFO input.FileInputFormat: Total input paths to process : 1 14/04/17 16:15:51 INFO mapred.JobClient: Running job: job_201404171511_0012 14/04/17 16:15:52 INFO mapred.JobClient: map 0% reduce 0% 14/04/17 16:16:07 INFO mapred.JobClient: map 100% reduce 0% 14/04/17 16:16:19 INFO mapred.JobClient: map 100% reduce 100% 14/04/17 16:16:24 INFO mapred.JobClient: Job complete: job_201404171511_0012 14/04/17 16:16:24 INFO mapred.JobClient: Counters: 25 14/04/17 16:16:24 INFO mapred.JobClient: Job Counters 14/04/17 16:16:24 INFO mapred.JobClient: Launched reduce tasks=1 14/04/17 16:16:24 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=12041 14/04/17 16:16:24 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 14/04/17 16:16:24 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 14/04/17 16:16:24 INFO mapred.JobClient: Launched map tasks=1 14/04/17 16:16:24 INFO mapred.JobClient: Data-local map tasks=1 14/04/17 16:16:24 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=10030 14/04/17 16:16:24 INFO mapred.JobClient: File Output Format Counters 14/04/17 16:16:24 INFO mapred.JobClient: Bytes Written=125 14/04/17 16:16:24 INFO mapred.JobClient: FileSystemCounters 14/04/17 16:16:24 INFO mapred.JobClient: FILE_BYTES_READ=3306 14/04/17 16:16:24 INFO mapred.JobClient: HDFS_BYTES_READ=11214 14/04/17 16:16:24 INFO mapred.JobClient: FILE_BYTES_WRITTEN=48901 14/04/17 16:16:24 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=125 14/04/17 16:16:24 INFO mapred.JobClient: File Input Format Counters 14/04/17 16:16:24 INFO mapred.JobClient: Bytes Read=2550 14/04/17 16:16:24 INFO mapred.JobClient: Map-Reduce Framework 14/04/17 16:16:24 INFO mapred.JobClient: Reduce input groups=3 14/04/17 16:16:24 INFO mapred.JobClient: Map output materialized bytes=3306 14/04/17 16:16:24 
INFO mapred.JobClient: Combine output records=0 14/04/17 16:16:24 INFO mapred.JobClient: Map input records=150 14/04/17 16:16:24 INFO mapred.JobClient: Reduce shuffle bytes=0 14/04/17 16:16:24 INFO mapred.JobClient: Reduce output records=3 14/04/17 16:16:24 INFO mapred.JobClient: Spilled Records=300 14/04/17 16:16:24 INFO mapred.JobClient: Map output bytes=3000 14/04/17 16:16:24 INFO mapred.JobClient: Combine input records=0 14/04/17 16:16:24 INFO mapred.JobClient: Map output records=150 14/04/17 16:16:24 INFO mapred.JobClient: SPLIT_RAW_BYTES=114 14/04/17 16:16:24 INFO mapred.JobClient: Reduce input records=150 We have repeated 1 times. Distance = 0.35025704 Threshold = 1.0E-4 14/04/17 16:16:24 INFO input.FileInputFormat: Total input paths to process : 1 14/04/17 16:16:25 INFO mapred.JobClient: Running job: job_201404171511_0013 14/04/17 16:16:26 INFO mapred.JobClient: map 0% reduce 0% 14/04/17 16:16:40 INFO mapred.JobClient: map 100% reduce 0% 14/04/17 16:16:52 INFO mapred.JobClient: map 100% reduce 100% 14/04/17 16:16:57 INFO mapred.JobClient: Job complete: job_201404171511_0013 14/04/17 16:16:57 INFO mapred.JobClient: Counters: 25 14/04/17 16:16:57 INFO mapred.JobClient: Job Counters 14/04/17 16:16:57 INFO mapred.JobClient: Launched reduce tasks=1 14/04/17 16:16:57 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=12077 14/04/17 16:16:57 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 14/04/17 16:16:57 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 14/04/17 16:16:57 INFO mapred.JobClient: Launched map tasks=1 14/04/17 16:16:57 INFO mapred.JobClient: Data-local map tasks=1 14/04/17 16:16:57 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=10048 14/04/17 16:16:57 INFO mapred.JobClient: File Output Format Counters 14/04/17 16:16:57 INFO mapred.JobClient: Bytes Written=116 14/04/17 16:16:57 INFO mapred.JobClient: FileSystemCounters 14/04/17 16:16:57 INFO mapred.JobClient: FILE_BYTES_READ=3306 
14/04/17 16:16:57 INFO mapred.JobClient: HDFS_BYTES_READ=21414 14/04/17 16:16:57 INFO mapred.JobClient: FILE_BYTES_WRITTEN=48901 14/04/17 16:16:57 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=116 14/04/17 16:16:57 INFO mapred.JobClient: File Input Format Counters 14/04/17 16:16:57 INFO mapred.JobClient: Bytes Read=2550 14/04/17 16:16:57 INFO mapred.JobClient: Map-Reduce Framework 14/04/17 16:16:57 INFO mapred.JobClient: Reduce input groups=3 14/04/17 16:16:57 INFO mapred.JobClient: Map output materialized bytes=3306 14/04/17 16:16:57 INFO mapred.JobClient: Combine output records=0 14/04/17 16:16:57 INFO mapred.JobClient: Map input records=150 14/04/17 16:16:57 INFO mapred.JobClient: Reduce shuffle bytes=3306 14/04/17 16:16:57 INFO mapred.JobClient: Reduce output records=3 14/04/17 16:16:57 INFO mapred.JobClient: Spilled Records=300 14/04/17 16:16:57 INFO mapred.JobClient: Map output bytes=3000 14/04/17 16:16:57 INFO mapred.JobClient: Combine input records=0 14/04/17 16:16:57 INFO mapred.JobClient: Map output records=150 14/04/17 16:16:57 INFO mapred.JobClient: SPLIT_RAW_BYTES=114 14/04/17 16:16:57 INFO mapred.JobClient: Reduce input records=150 We have repeated 2 times. 
Distance = 0.006297064 Threshold = 1.0E-4 14/04/17 16:16:57 INFO input.FileInputFormat: Total input paths to process : 1 14/04/17 16:16:58 INFO mapred.JobClient: Running job: job_201404171511_0014 14/04/17 16:16:59 INFO mapred.JobClient: map 0% reduce 0% 14/04/17 16:17:14 INFO mapred.JobClient: map 100% reduce 0% 14/04/17 16:17:25 INFO mapred.JobClient: map 100% reduce 100% 14/04/17 16:17:30 INFO mapred.JobClient: Job complete: job_201404171511_0014 14/04/17 16:17:30 INFO mapred.JobClient: Counters: 25 14/04/17 16:17:30 INFO mapred.JobClient: Job Counters 14/04/17 16:17:30 INFO mapred.JobClient: Launched reduce tasks=1 14/04/17 16:17:30 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=12046 14/04/17 16:17:30 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 14/04/17 16:17:30 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 14/04/17 16:17:30 INFO mapred.JobClient: Launched map tasks=1 14/04/17 16:17:30 INFO mapred.JobClient: Data-local map tasks=1 14/04/17 16:17:30 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=10051 14/04/17 16:17:30 INFO mapred.JobClient: File Output Format Counters 14/04/17 16:17:30 INFO mapred.JobClient: Bytes Written=116 14/04/17 16:17:30 INFO mapred.JobClient: FileSystemCounters 14/04/17 16:17:30 INFO mapred.JobClient: FILE_BYTES_READ=3306 14/04/17 16:17:30 INFO mapred.JobClient: HDFS_BYTES_READ=20064 14/04/17 16:17:30 INFO mapred.JobClient: FILE_BYTES_WRITTEN=48901 14/04/17 16:17:30 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=116 14/04/17 16:17:30 INFO mapred.JobClient: File Input Format Counters 14/04/17 16:17:30 INFO mapred.JobClient: Bytes Read=2550 14/04/17 16:17:30 INFO mapred.JobClient: Map-Reduce Framework 14/04/17 16:17:30 INFO mapred.JobClient: Reduce input groups=3 14/04/17 16:17:30 INFO mapred.JobClient: Map output materialized bytes=3306 14/04/17 16:17:30 INFO mapred.JobClient: Combine output records=0 14/04/17 16:17:30 INFO mapred.JobClient: Map input 
records=150 14/04/17 16:17:30 INFO mapred.JobClient: Reduce shuffle bytes=0 14/04/17 16:17:30 INFO mapred.JobClient: Reduce output records=3 14/04/17 16:17:30 INFO mapred.JobClient: Spilled Records=300 14/04/17 16:17:30 INFO mapred.JobClient: Map output bytes=3000 14/04/17 16:17:30 INFO mapred.JobClient: Combine input records=0 14/04/17 16:17:30 INFO mapred.JobClient: Map output records=150 14/04/17 16:17:30 INFO mapred.JobClient: SPLIT_RAW_BYTES=114 14/04/17 16:17:30 INFO mapred.JobClient: Reduce input records=150 We have repeated 3 times. Distance = 0.0 Threshold = 1.0E-4 14/04/17 16:17:30 INFO input.FileInputFormat: Total input paths to process : 1 14/04/17 16:17:30 INFO mapred.JobClient: Running job: job_201404171511_0015 14/04/17 16:17:31 INFO mapred.JobClient: map 0% reduce 0% 14/04/17 16:17:47 INFO mapred.JobClient: map 100% reduce 0% 14/04/17 16:17:59 INFO mapred.JobClient: map 100% reduce 100% 14/04/17 16:18:04 INFO mapred.JobClient: Job complete: job_201404171511_0015 14/04/17 16:18:04 INFO mapred.JobClient: Counters: 25 14/04/17 16:18:04 INFO mapred.JobClient: Job Counters 14/04/17 16:18:04 INFO mapred.JobClient: Launched reduce tasks=1 14/04/17 16:18:04 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=12036 14/04/17 16:18:04 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 14/04/17 16:18:04 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 14/04/17 16:18:04 INFO mapred.JobClient: Launched map tasks=1 14/04/17 16:18:04 INFO mapred.JobClient: Data-local map tasks=1 14/04/17 16:18:04 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=10050 14/04/17 16:18:04 INFO mapred.JobClient: File Output Format Counters 14/04/17 16:18:04 INFO mapred.JobClient: Bytes Written=2700 14/04/17 16:18:04 INFO mapred.JobClient: FileSystemCounters 14/04/17 16:18:04 INFO mapred.JobClient: FILE_BYTES_READ=3306 14/04/17 16:18:04 INFO mapred.JobClient: HDFS_BYTES_READ=20064 14/04/17 16:18:04 INFO mapred.JobClient: 
FILE_BYTES_WRITTEN=48717 14/04/17 16:18:04 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=2700 14/04/17 16:18:04 INFO mapred.JobClient: File Input Format Counters 14/04/17 16:18:04 INFO mapred.JobClient: Bytes Read=2550 14/04/17 16:18:04 INFO mapred.JobClient: Map-Reduce Framework 14/04/17 16:18:04 INFO mapred.JobClient: Reduce input groups=3 14/04/17 16:18:04 INFO mapred.JobClient: Map output materialized bytes=3306 14/04/17 16:18:04 INFO mapred.JobClient: Combine output records=0 14/04/17 16:18:04 INFO mapred.JobClient: Map input records=150 14/04/17 16:18:04 INFO mapred.JobClient: Reduce shuffle bytes=0 14/04/17 16:18:04 INFO mapred.JobClient: Reduce output records=150 14/04/17 16:18:04 INFO mapred.JobClient: Spilled Records=300 14/04/17 16:18:04 INFO mapred.JobClient: Map output bytes=3000 14/04/17 16:18:04 INFO mapred.JobClient: Combine input records=0 14/04/17 16:18:04 INFO mapred.JobClient: Map output records=150 14/04/17 16:18:04 INFO mapred.JobClient: SPLIT_RAW_BYTES=114 14/04/17 16:18:04 INFO mapred.JobClient: Reduce input records=150