Computing TF-IDF Values with MapReduce
Input data: article ID and article content, tab-separated:
3823890378201539 今天约了姐妹去逛街吃美食,周末玩得很开心啊! ...... ......
Result data:
3823890378201539 开心:0.28558719539400335 吃:0.21277211221173534 了:0.1159152517783012 美食:0.29174432675350614 去:0.18044286652763497 玩:0.27205714412756765 啊:0.26272169358877784 姐妹:0.3983823545319593 逛街:0.33320559604063593 得很:0.45170136842118586 周末:0.2672478858982343 今天:0.16923426566752778 约:0.0946874743049455 ...... ......
The whole computation is completed in two steps.
Step 1 generates three kinds of output files:
1. Use a word segmenter to split each article's content into individual terms, and record the total number of terms in the article (for details on using the segmentation tool, see TF-IDF; a standalone sketch of this segmentation step follows the list below).
Result:
今天_3823890378201539 A:1,B:13, 周末_3823890378201539 A:1,B:13, 得很_3823890378201539 A:1,B:13, 约_3823890378201539 B:13,A:1, ......
2. Record how many articles each term appears in.
Result:
今天 118 周末 33 约 311 ......
3. Record the total number of articles.
Result:
counter 1065
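The segmentation in item 1 is easiest to see outside of MapReduce first. The sketch below is a minimal standalone version of what Step1's mapper does for a single article: it splits the text with IK Analyzer and counts terms. It relies on the same IKSegmenter/Lexeme API that appears in Step1.java further down; the class name SegmentSketch is purely illustrative, and the IK Analyzer jar is assumed to be on the classpath.

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

public class SegmentSketch {
    public static void main(String[] args) throws IOException {
        // sample article content from the input data above
        String content = "今天约了姐妹去逛街吃美食,周末玩得很开心啊!";
        Map<String, Integer> termCount = new HashMap<String, Integer>();
        long total = 0; // total number of terms in this article (the "B" value)

        IKSegmenter seg = new IKSegmenter(new StringReader(content), true); // true = smart mode
        Lexeme lexeme;
        while ((lexeme = seg.next()) != null) {
            String word = lexeme.getLexemeText();
            Integer old = termCount.get(word);
            termCount.put(word, old == null ? 1 : old + 1); // per-term count (the "A" value)
            total++;
        }

        System.out.println("total terms = " + total);
        System.out.println(termCount);
    }
}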
Step 2 loads the contents of files 2 and 3 into the distributed cache, and uses them together with file 1 in a second MapReduce job to compute the TF-IDF values.
Loading into the cache works when the data is reasonably small; if the data is too large, this approach should not be used.
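Concretely, for a term t in article d the second job computes

    tfidf(t, d) = (A / B) * ln(N / df(t))

where A is the number of times t occurs in d (the "A:" value), B is the total number of terms in d (the "B:" value), df(t) is the number of articles containing t (file 2), and N is the total number of articles (the counter in file 3). Plugging in the sample numbers above for 今天 in article 3823890378201539: A = 1, B = 13, df = 118, N = 1065, so tfidf = (1/13) * ln(1065/118) ≈ 0.1692, which matches the 今天:0.16923426566752778 entry in the result data.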
Source code
Step1.java:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

/**
 * Created by Edward on 2016/7/21.
 */
public class Step1 {

    public static void main(String[] args)
    {
        //access hdfs's user
        //System.setProperty("HADOOP_USER_NAME","root");

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://node1:8020");

        try {
            FileSystem fs = FileSystem.get(conf);

            Job job = Job.getInstance(conf);
            job.setJarByClass(Step1.class);
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            job.setPartitionerClass(FilterPartition.class);

            // specify the map output key and value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            // number of reduce tasks
            job.setNumReduceTasks(4);

            FileInputFormat.addInputPath(job, new Path("/test/tfidf/input"));

            Path path = new Path("/test/tfidf/output");
            if (fs.exists(path)) { // delete the output directory if it already exists
                fs.delete(path, true);
            }
            FileOutputFormat.setOutputPath(job, path);

            boolean b = job.waitForCompletion(true);
            if (b) {
                System.out.println("OK");
            }

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            Map<String, Integer> map = new HashMap<String, Integer>();

            String[] str = value.toString().split("\t");
            StringReader stringReader = new StringReader(str[1]);
            IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);
            Lexeme lexeme = null;
            long count = 0L;
            while ((lexeme = ikSegmenter.next()) != null) {
                String word = lexeme.getLexemeText();
                if (map.containsKey(word)) {
                    map.put(word, map.get(word) + 1);
                }
                else {
                    map.put(word, 1);
                }
                count++;
            }
            for (Entry<String, Integer> entry : map.entrySet())
            {
                context.write(new Text(entry.getKey() + "_" + str[0]), new Text("A:" + entry.getValue())); // A: number of times the term occurs in this article
                context.write(new Text(entry.getKey() + "_" + str[0]), new Text("B:" + count));            // B: total number of terms in this article
                context.write(new Text(entry.getKey()), new Text("1"));                                    // +1 per article the term appears in (document frequency)
            }
            context.write(new Text("counter"), new Text(1 + "")); // article counter
        }
    }

    public static class MyReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

            // total number of articles
            if (key.toString().equals("counter")) {
                long sum = 0L;
                for (Text v : values)
                {
                    sum += Long.parseLong(v.toString());
                }
                context.write(key, new Text(sum + ""));
            }
            else {
                if (key.toString().contains("_")) {
                    StringBuilder stringBuilder = new StringBuilder();
                    for (Text v : values) {
                        stringBuilder.append(v.toString());
                        stringBuilder.append(",");
                    }
                    context.write(key, new Text(stringBuilder.toString()));
                }
                else { // number of articles the term appears in
                    long sum = 0L;
                    for (Text v : values)
                    {
                        sum += Long.parseLong(v.toString());
                    }
                    context.write(key, new Text(sum + ""));
                }
            }
        }
    }
}
FilterPartition.java:
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

/**
 * Created by Edward on 2016/7/22.
 */
public class FilterPartition extends HashPartitioner<Text, Text> {

    @Override
    public int getPartition(Text key, Text value, int numReduceTasks) {

        // the article counter goes to the last partition
        if (key.toString().contains("counter"))
        {
            return numReduceTasks - 1;
        }

        // "term_articleId" keys are hashed over the first numReduceTasks-2 partitions
        if (key.toString().contains("_"))
        {
            return super.getPartition(key, value, numReduceTasks - 2);
        }
        else
        {
            // plain term keys (document frequency) all go to partition numReduceTasks-2
            return numReduceTasks - 2;
        }
    }
}
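With job.setNumReduceTasks(4) in Step1, this partitioner lays the first job's output out as follows: keys of the form term_articleId are hashed across part-r-00000 and part-r-00001, plain term keys (the per-term document counts) all land in part-r-00002, and the counter record lands in part-r-00003. That is why Step2 below adds exactly part-r-00002 and part-r-00003 to the distributed cache.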
Step2.java:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

/**
 * Created by Edward on 2016/7/22.
 */
public class Step2 {
    public static void main(String[] args)
    {
        //access hdfs's user
        //System.setProperty("HADOOP_USER_NAME","root");

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://node1:8020");

        try {
            FileSystem fs = FileSystem.get(conf);

            Job job = Job.getInstance(conf);
            job.setJarByClass(Step2.class);
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);

            // specify the map output key and value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            // distributed cache: every worker node can read these files locally
            // number of articles each term appears in
            job.addCacheFile(new Path("/test/tfidf/output/part-r-00002").toUri());
            // total number of articles
            job.addCacheFile(new Path("/test/tfidf/output/part-r-00003").toUri());

            FileInputFormat.addInputPath(job, new Path("/test/tfidf/output"));

            Path path = new Path("/test/tfidf/output1");
            if (fs.exists(path)) { // delete the output directory if it already exists
                fs.delete(path, true);
            }
            FileOutputFormat.setOutputPath(job, path);

            boolean b = job.waitForCompletion(true);
            if (b) {
                System.out.println("OK");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }


    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {

        // term -> number of articles it appears in
        public static Map<String, Double> dfmap = new HashMap<String, Double>();

        // "counter" -> total number of articles
        public static Map<String, Double> totalmap = new HashMap<String, Double>();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            URI[] cacheFiles = context.getCacheFiles();
            Path pArtNum = new Path(cacheFiles[0].getPath());
            Path pArtTotal = new Path(cacheFiles[1].getPath());

            // load how many articles each term appears in
            BufferedReader buffer = new BufferedReader(new FileReader(pArtNum.getName()));
            String line = null;
            while ((line = buffer.readLine()) != null) {
                String[] str = line.split("\t");
                dfmap.put(str[0], Double.parseDouble(str[1]));
            }

            // load the total number of articles
            buffer = new BufferedReader(new FileReader(pArtTotal.getName()));
            line = null;
            while ((line = buffer.readLine()) != null) {
                String[] str = line.split("\t");
                totalmap.put(str[0], Double.parseDouble(str[1]));
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String[] strings = value.toString().split("\t");
            String k = strings[0];

            if (k.contains("counter")) {
                // skip the article-count record
            }
            else if (k.contains("_")) {
                String word = k.split("_")[0];
                String[] info = strings[1].split(",");
                String n = null;   // term frequency in this article (A)
                String num = null; // total number of terms in this article (B)
                if (info[0].contains("A")) {
                    n = info[0].substring(info[0].indexOf(":") + 1);
                    num = info[1].substring(info[1].indexOf(":") + 1);
                }
                if (info[0].contains("B")) {
                    num = info[0].substring(info[0].indexOf(":") + 1);
                    n = info[1].substring(info[1].indexOf(":") + 1);
                }
                // TF-IDF = (n / num) * ln(totalArticles / articlesContainingWord)
                double result = (Double.parseDouble(n) / Double.parseDouble(num))
                        * Math.log(totalmap.get("counter") / dfmap.get(word));
                System.out.println("n=" + Double.parseDouble(n));
                System.out.println("num=" + Double.parseDouble(num));
                System.out.println("counter=" + totalmap.get("counter"));
                System.out.println("wordnum=" + dfmap.get(word));
                context.write(new Text(k.split("_")[1]), new Text(word + ":" + result));
            }
            else {
                // skip the document-frequency records
            }
        }
    }

    public static class MyReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

            // concatenate all term:tfidf pairs for one article
            StringBuilder stringBuilder = new StringBuilder();
            for (Text t : values) {
                stringBuilder.append(t.toString());
                stringBuilder.append("\t");
            }
            context.write(key, new Text(stringBuilder.toString()));
        }
    }
}
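As a quick sanity check of the whole pipeline, the published numbers can be reproduced locally. The tiny snippet below (illustrative only, not part of either job) recomputes the TF-IDF of 今天 from the sample values in the outputs above and should print roughly 0.16923426566752778.

public class TfIdfCheck {
    public static void main(String[] args) {
        double a = 1;    // occurrences of 今天 in article 3823890378201539 (A)
        double b = 13;   // total terms in that article (B)
        double df = 118; // articles containing 今天
        double n = 1065; // total number of articles (counter)
        double tfidf = (a / b) * Math.log(n / df);
        System.out.println(tfidf); // ≈ 0.16923426566752778, matching the result data
    }
}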