hadoop模板分析

下面是一个 Hadoop MapReduce 程序的模板，各部分的解释都写在代码注释里。代码是从网上找的。
 1 package hadoop_homework;

 6 import java.util.ArrayList;
 7 import java.io.IOException;
 8 import java.util.Iterator;
 9 import java.util.StringTokenizer;
10 import org.apache.hadoop.conf.Configuration;
11 import org.apache.hadoop.fs.Path;
12 import org.apache.hadoop.io.IntWritable;
13 import org.apache.hadoop.io.LongWritable;
14 import org.apache.hadoop.io.Text;
15 import org.apache.hadoop.mapreduce.Job;
16 import org.apache.hadoop.mapreduce.Mapper;
17 import org.apache.hadoop.mapreduce.Reducer;
18 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
19 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
20 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
21 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
22 import org.apache.hadoop.util.GenericOptionsParser;
// 这行以上的东西基本都是差不多的。
23 public class wordCount{
24     public static class Map extends
26             Mapper<LongWritable, Text, Text, IntWritable> {
　　　　　　/*前两个参数基本不用管，有的程序是把LongWritable改为Object。后两个是map产生的那些个对的两个值。这里的意思是(Text, IntWritable)本map程序产生的是
　　　　　　key值是Text类型，value是IntWritable类型的。这里可以改类型。如果不嫌麻烦，可以都改成Text，反正数字转字符串
　　　　　　或者反过来，都不过是一个方法的问题而已。
　　　　　　*/28          
29         public void map(LongWritable key, Text value, Context context)
30                 throws IOException, InterruptedException {
　　　　　　/*这里的map函数基本就可以理解为获取文件里的每一行。基本上算是吧。
　　　　　　前两个参数要注意和第26行那里匹配。*/

32             String line = value.toString();
33         /*基本上第一句话都是这个。是每一次从文件里获取的那一行。*/
　　　　　　/*从这句话开始，你就可以以line为原材料，开始编辑自己的逻辑了*/   
 　　　　　　　　
　　　　　　　　context.write(new Text(""), new IntWritable(1));
　　　　　　　　/*map的输出，也就是结果的记录。注意，这里的write方法参数要和26行的匹配。*/
51         }
52     }
53     
54     public static class Reduce extends
57             Reducer<Text, IntWritable, Text, IntWritable> {
　　　　　　　/*前两个是从前面来的context输入的格式，后两个是此次context输出的格式。基本可以这么认为*/


58         
59         public void reduce(Text key, Iterable<IntWritable> values,
60                 Context context) throws IOException, InterruptedException {
　　　　　　　　/*和map差不多。*/


61             Iterator<IntWritable> iterator = values.iterator();
　　　　　　　　/*iterator里的东西，是同一个key的所有组合的value值。所以要用迭代器来遍历里面的东西*/
63             while (iterator.hasNext()){
　　　　　　　　　　/*这里可以有自己的逻辑，比如累加啊之类的。*/
65             }

　　　　　　　　/*遍历完了之后还可以根据需要进行操作。*/
　　　　　　　　　
66             context.write(key, new IntWritable(sum));
　　　　　　　　/*和map方法一样。reduce的write输入，如果重复了的（。。。，。。。）组合会被去掉，所以可以用来去重。*/
　　　　　　　　
　　　　　　　　/*
　　　　　　　　从61行到66行，也可以用for(IntWritable t : values){     /*逻辑*/}来遍历
　　　　　　　　　　*/

67             
68         }
69     }
70     public static void main(String[] args) throws Exception {
71         Configuration conf = new Configuration();
72         /*以下代码几乎都是照搬的，每次都要小小地修改一下。具体道理我也不是太懂。照搬就好*/
73         conf.set("mapred.job.tracker", "localhost:9000");
74         String[] ioArgs = new String[] { "score_in", "score_out1" };/*这里是输入文件夹和输出文件夹的位置。都是本地的。*/
75         String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
76         if (otherArgs.length != 2) {
77             System.err.println("Usage: Score Average <in> <out>");
78             System.exit(2);
79         }
80         Job job = new Job(conf, "Score Average");
81         job.setJarByClass(friendCount.class);
82         // 设置Map、Combine和Reduce处理类
83         job.setMapperClass(Map.class);
84         job.setCombinerClass(Reduce.class);
85         job.setReducerClass(Reduce.class);/*这里的combiner可以不止一个。combiner我感觉一般是跟reducer一样的，就当是一次
　　　　　　　　　　　　　　　　　　　　　　　　　　　　reduce不完的东西，多次来reduce。所以一个程序不是只能有一个map和一个reduce两个类，
　　　　　　　　　　　　　　　　　　　　　　　　　　　　完全可以有更多。看需要吧。*/
86         // 设置输出类型
87         job.setOutputKeyClass(Text.class);
88         job.setOutputValueClass(IntWritable.class);/*这俩要和最终reducer匹配*/
89         // 将输入的数据集分割成小数据块splites，提供一个RecordReder的实现
90         job.setInputFormatClass(TextInputFormat.class);
91         // 提供一个RecordWriter的实现，负责数据输出
92         job.setOutputFormatClass(TextOutputFormat.class);
93         // 设置输入和输出目录
94         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
95         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
96         System.exit(job.waitForCompletion(true) ? 0 : 1);
97     }
98 }
posted @ 2016-10-22 23:01 编程浪剩阅读(305) 评论(1) 收藏举报
刷新页面返回顶部
编程浪剩

hadoop模板分析

公告