
  • MapReduce的整个过程分为map和reduce两个阶段,通俗的说就是map(映射)用来准备需要处理的数据,reduce(归纳)用来实际处理数据。当然,另外还需要一些用来调度作业的代码。


  • mapper接口是一个泛型接口,其中有四个类型参数。前两个参数是输入的一对键与值的类型,后两个参数是输出的一对键与值的类型。在默认的TextInputFormat下,输入键是该行在文件中的偏移量(长整数,类型为LongWritable),输入值是一行文本(类型为Text);输出键与值的类型由开发者自行定义。
  • 比如类 public static class Map extends Mapper<LongWritable, Text, Text, Text>
  • 其中的map方法定义为 public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException
  • 在老版本的hadoop中,map方法包括输入键与值,用来写入输出内容的OutputCollector实例与用来控制的reporter实例。新版本的hadoop把后两者合并为一个context(上下文)实例,完成同样的功能。


  • reducer接口同样是泛型接口,其中的四个参数,前两个是由map的输出产生的输入键与值,类型务必要与map的输出数据(叫做中间数据)类型相同;后两个是reduce处理后得到的结果键与值,类型由开发者自行定义。
  • 类的写法为 public static class Reduce extends Reducer<Text, Text, Text, Text>
  • 其中的reduce方法定义为 public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException


  • combine继承reducer接口,其中要重写的方法是reduce方法。combine本质上就是在本地执行的简单的reduce操作,以减少网络带宽占用。
  • 经过实际测试,在集群环境下执行指定程序时,经过combine进行本地处理后程序效率提高10%左右。


  • Configuration conf = new Configuration(); //不再使用JobConf类,而是用其父类Configuration进行配置
  • FileSystem.get(conf).delete(new Path(args[1]), true); //删除输出路径下的文件
  • String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); //从命令行获取参数。这是hadoop提供的辅助类,相关介绍点这里
  • Job job = new Job(conf, "product pv uv"); //配置一个新job
  • //以下都是字面含义
  • job.setJarByClass(ProductPvUv.class);
  • job.setMapperClass(Map.class);
  • job.setReducerClass(Reduce.class);
  • job.setMapOutputKeyClass(Text.class);
  • job.setMapOutputValueClass(Text.class);
  • job.setOutputKeyClass(Text.class);
  • job.setOutputValueClass(Text.class);
  • FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
  • FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
  • //至此准备工作完成
  • System.exit(job.waitForCompletion(true) ? 0 : 1); //job完成后退出:成功返回0,失败返回1



  • map的输入键是一个长整数偏移量,所以不能把输入键类型(第一个参数)定义为Text等非LongWritable类型,否则会出java.lang.ClassCastException: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.hadoop.io.Text异常(0.5天)
  • 变量名、方法名、类名要规范。
  • 程序要考虑容错。输入数据可能不规范,要做细节处理。
  1 package com;
  3 import java.io.DataInput;
  4 import java.io.DataOutput;
  5 import java.io.IOException;
  6 import java.io.PrintStream;
  7 import java.util.ArrayList;
  8 import java.util.Collections;
  9 import java.util.List;
 10 import java.util.regex.Matcher;
 11 import java.util.regex.Pattern;
 13 import org.apache.hadoop.conf.Configuration;
 14 import org.apache.hadoop.fs.FileSystem;
 15 import org.apache.hadoop.fs.Path;
 16 import org.apache.hadoop.io.LongWritable;
 17 import org.apache.hadoop.io.Text;
 18 import org.apache.hadoop.io.WritableComparable;
 19 import org.apache.hadoop.mapreduce.Job;
 20 import org.apache.hadoop.mapreduce.Mapper;
 21 import org.apache.hadoop.mapreduce.Reducer;
 22 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 23 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 24 import org.apache.hadoop.util.GenericOptionsParser;
 26 public class ProductPvUv {
 27     private static String item = new String();
 29     public static class Map extends Mapper<LongWritable, Text, Text, ValuePair> {
 30         @Override
 31         public void map(LongWritable key, Text value, Context context)
 32                 throws IOException, InterruptedException {
 33             Text word = new Text();
 34             ValuePair text = new ValuePair();
 35             String aLine = value.toString();
 36             String url = getAField(aLine, 5);
 37             Pattern p = Pattern.compile("(?<=\\b[pP]roduct_id=)\\d+\\b");
 38             Matcher m = p.matcher(url);
 39             if (!m.find()) return;
 40             String product_id = m.group();
 41             if (!item.isEmpty() && !item.equals(product_id)) return;
 42             word.set(product_id);
 43             String type = getAField(aLine, 10);
 45             if ("1".equals(type)) {
 46                 String permanent_id = getAField(aLine, 7);
 47                 if (permanent_id.equals("") || permanent_id.equals("0")
 48                         || permanent_id.equals("null")) {
 49                     String ip = getAField(aLine, 3);
 50                     String http_ua = getAField(aLine, 8);
 51                     permanent_id = ip.concat(http_ua);
 52                     permanent_id = (permanent_id.hashCode() & 0x7fffffff) + "";
 53                 } else {
 54                     Pattern pat = Pattern.compile("^\\d+$");
 55                     Matcher mat = pat.matcher(permanent_id);
 56                     if (!mat.find()) {
 57                         permanent_id = "0" + permanent_id.hashCode();
 58                     }
 59                 }
 60                 text.setPv(1);
 61                 text.setPermanent_id(permanent_id);
 62                 context.write(word, text);
 63             }
 64         }
 66     }
 68     public static class Reduce extends Reducer<Text, ValuePair, Text, Text> {
 69         @Override
 70         public void reduce(Text key, Iterable<ValuePair> values, Context context)
 71                 throws IOException, InterruptedException {
 73             Text word = new Text();
 74             Text text = new Text();
 75             int pv = 0, uv = 0;
 76             word = key;
 77             //System.out.println("***");
 78             List<ValuePair> list = new ArrayList<ValuePair>();
 79             for (ValuePair pv_Permanent_id : values) {
 80                 list.add(pv_Permanent_id);
 81             }
 82             Collections.sort(list);
 83             String lastPermanent_id = "";
 84             for (ValuePair pv_Permanent_id : list) {
 86                 String permanent_id = pv_Permanent_id.getPermanent_id();
 87                 int npv = pv_Permanent_id.getPv();
 88                 pv = npv;
 89                 //System.out.println(permanent_id + "\t" + lastPermanent_id);
 90                 if (!permanent_id.equals(lastPermanent_id)) {
 91                     uv++;
 92                     lastPermanent_id = permanent_id;
 93                     text.set(pv + "\t" + uv);
 94                     context.write(word, text);
 95                 }
 96             }
 97         }
 98     }
100     public static class Combine extends
101             Reducer<Text, ValuePair, Text, ValuePair> {
102         @Override
103         public void reduce(Text key, Iterable<ValuePair> values, Context context)
104                 throws IOException, InterruptedException {
105             int pv = 1;
106             //System.out.println("***");
107             List<ValuePair> list = new ArrayList<ValuePair>();
108             for (ValuePair pv_Permanent_id : values) {
109                 list.add(pv_Permanent_id);
110             }
112             Collections.sort(list);
113             String lastPermanent_id = "";
114             for (ValuePair pv_Permanent_id : list) {
115                 String permanent_id = pv_Permanent_id.getPermanent_id();
116                 if (!permanent_id.equals(lastPermanent_id)) {
117                     lastPermanent_id = permanent_id;
118                     ValuePair result = new ValuePair(pv, permanent_id);
119                     context.write(key, result);
120                     pv = 1;
121                 } else
122                     pv++;
123             }
124         }
125     }
127     private static class ValuePair implements WritableComparable<ValuePair> {
128         int pv;
129         String permanent_id;
131         public ValuePair() {
132             pv = 1;
133             permanent_id = new String();
134         }
136         public ValuePair(int npv, String npermanent_id) {
137             pv = npv;
138             permanent_id = npermanent_id;
139         }
141         @Override
142         public String toString() {
143             return permanent_id + pv;
144         }
146         @Override
147         public int compareTo(ValuePair v) {
148             int i = this.permanent_id.compareTo(v.permanent_id);
149             if (i > 0)
150                 return 1;
151             else if (i < 0)
152                 return -1;
153             else
154                 return 0;
155         }
157         @Override
158         public void readFields(DataInput in) throws IOException {
159             // TODO Auto-generated method stub
160             permanent_id = in.readUTF();
161             pv = in.readInt();
162         }
164         @Override
165         public void write(DataOutput out) throws IOException {
166             // TODO Auto-generated method stub
167             out.writeUTF(permanent_id);
168             out.writeInt(pv);
169         }
171         @Override
172         public int hashCode() {
173             final int prime = 31;
174             int result = 1;
175             result = prime * result + pv;
176             result = prime * result
177                     + ((permanent_id == null) ? 0 : permanent_id.hashCode());
178             return result;
179         }
181         @Override
182         public boolean equals(Object obj) {
183             if (this == obj)
184                 return true;
185             if (obj == null)
186                 return false;
187             if (getClass() != obj.getClass())
188                 return false;
189             ValuePair other = (ValuePair) obj;
190             if (pv != other.pv)
191                 return false;
192             if (permanent_id == null) {
193                 if (other.permanent_id != null)
194                     return false;
195             } else if (!permanent_id.equals(other.permanent_id))
196                 return false;
197             return true;
198         }
200         public int getPv() {
201             return pv;
202         }
204         public void setPv(int pv) {
205             this.pv = pv;
206         }
208         public String getPermanent_id() {
209             return permanent_id;
210         }
212         public void setPermanent_id(String permanent_id) {
213             this.permanent_id = permanent_id;
214         }
215     }
217     private static final class Timer {
218         private long startTime;
219         private long endTime;
221         public Timer() {
222             reset();
223         }
225         public void start() {
226             System.gc();
227             startTime = System.currentTimeMillis();
228         }
230         public void end() {
231             System.gc();
232             endTime = System.currentTimeMillis();
233         }
235         public long duration() {
236             return (endTime - startTime);
237         }
239         public void printDuration(PrintStream out) {
240             long elapsedTimeInSecond = duration() / 1000;
241             long remainderInMillis = duration() % 1000;
242             out.println("\nTotal execution time:" + elapsedTimeInSecond + "."
243                     + remainderInMillis + " seconds");
244         }
246         public void reset() {
247             startTime = 0;
248             endTime = 0;
249         }
250     }
252     public static void main(String[] args) throws Exception {
253         Timer timer = new Timer();
254         timer.start();
255         Configuration conf = new Configuration();
256         FileSystem.get(conf).delete(new Path(args[1]), true);
257         String[] otherArgs = new GenericOptionsParser(conf, args)
258                 .getRemainingArgs();
259         if (otherArgs.length == 3)
260             item = otherArgs[2];
261         Job job = new Job(conf, "product pv uv");
262         job.setJarByClass(ProductPvUv.class);
263         job.setMapperClass(Map.class);
264         job.setReducerClass(Reduce.class);
265         job.setCombinerClass(Combine.class);
266         job.setMapOutputKeyClass(Text.class);
267         job.setMapOutputValueClass(ValuePair.class);
268         job.setOutputKeyClass(Text.class);
269         job.setOutputValueClass(Text.class);
270         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
271         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
273         if (job.waitForCompletion(true)) {
274             timer.end();
275             timer.printDuration(System.out);
276             System.exit(0);
277         }
278     }
279 }
