Week 2
- A MapReduce job runs in two phases, map and reduce. Informally, map prepares (maps) the data to be processed, and reduce actually processes (aggregates) it. On top of these, some driver code is needed to configure and submit the job.
map
- The Mapper class is generic, with four type parameters: the first two are the types of the input key/value pair, the last two the types of the output key/value pair. With the default TextInputFormat, the input key is a long integer offset (of type LongWritable) and the input value is one line of text; the output key and value types are up to the developer.
- For example: public static class Map extends Mapper<LongWritable, Text, Text, Text>
- Its map method is declared as: public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
- In the old Hadoop API, the map method took the input key and value, an OutputCollector instance for writing output, and a Reporter instance for status reporting. The new API merges the latter two into a single Context instance that serves both purposes; a minimal sketch follows.
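For reference, a minimal new-API mapper sketch. This is a hypothetical word-count-style example, not this week's program; the class name TokenMap is made up, and the imports are the same org.apache.hadoop.* ones used in the full listing at the end of these notes.

```java
// A minimal new-API mapper sketch (illustrative, not the project code).
public static class TokenMap extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // key: byte offset of this line in the file; value: the line itself
        for (String token : value.toString().split("\\s+")) {
            if (!token.isEmpty()) {
                // context.write replaces the old OutputCollector.collect
                context.write(new Text(token), new Text("1"));
            }
        }
    }
}
```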
reduce
- The Reducer class is likewise generic with four type parameters. The first two are the input key/value produced by the map output (the so-called intermediate data), and their types must match the map output types; the last two are the result key/value after reducing, with types chosen by the developer.
- The class is declared as: public static class Reduce extends Reducer<Text, Text, Text, Text>
- Its reduce method is declared as: public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException; a sketch follows.
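A minimal reducer sketch matching the declaration above. It is illustrative only (the class name SumReduce is made up): it sums the "1" strings emitted by the mapper sketch in the map section.

```java
// A minimal reducer sketch (illustrative): sum the values under each key.
public static class SumReduce extends Reducer<Text, Text, Text, Text> {
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (Text v : values)       // every value that arrived under this key
            sum += Integer.parseInt(v.toString());
        context.write(key, new Text(String.valueOf(sum)));
    }
}
```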
combine
- A combiner extends the Reducer class and overrides its reduce method. It is essentially a small reduce run locally on the map side, cutting down the amount of data sent over the network; a generic sketch follows this list.
- In tests on the cluster, running the program with the combiner enabled made it roughly 10% faster.
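A generic sum-style combiner sketch (the class name LocalSum is hypothetical; this is not the project's Combine class, which appears in the full listing below). Its generics match the reducer's input types, because its output feeds the reducer.

```java
// Illustrative sum-style combiner: a Reducer run map-side.
public static class LocalSum extends Reducer<Text, Text, Text, Text> {
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        int partial = 0;
        for (Text v : values)
            partial += Integer.parseInt(v.toString());
        // Emit a partial sum; the real reducer finishes the aggregation.
        context.write(key, new Text(String.valueOf(partial)));
    }
}
// Registered with one driver call:
// job.setCombinerClass(LocalSum.class);
```

Because Hadoop may apply a combiner zero, one, or several times, this shortcut is only safe for operations that are associative and commutative (counts, sums, max, and the like).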
main
The driver (main method) sets up the following:
- Configuration conf = new Configuration(); // JobConf is no longer used; configuration goes through its parent class, Configuration
- FileSystem.get(conf).delete(new Path(args[1]), true); // remove any existing files under the output path so the job can be rerun
- String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); // read the remaining arguments from the command line; GenericOptionsParser is a helper class provided by Hadoop for handling generic options
- Job job = new Job(conf, "product pv uv"); // configure a new job
- // the calls below do what their names say
- job.setJarByClass(ProductPvUv.class);
- job.setMapperClass(Map.class);
- job.setReducerClass(Reduce.class);
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(Text.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
- FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
- // setup is now complete
- System.exit(job.waitForCompletion(true) ? 0 : 1); // exit once the job finishes
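Assuming the class is packaged into a jar (the jar name and paths below are hypothetical), the job is launched roughly like this; the optional third argument restricts the count to a single product_id:

```
hadoop jar productpvuv.jar com.ProductPvUv /logs/input /logs/output [product_id]
```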
tips
- The map input key is a long integer offset, so the input key type (the first type parameter) must not be declared as Text or any other non-LongWritable type; doing so throws java.lang.ClassCastException: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.hadoop.io.Text (this cost 0.5 days).
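For example (the class names here are hypothetical):

```java
// Wrong: with the default TextInputFormat the input key is a byte offset,
// so declaring it as Text fails at runtime with the exception above.
public static class BadMap extends Mapper<Text, Text, Text, Text> { /* ... */ }

// Right:
public static class GoodMap extends Mapper<LongWritable, Text, Text, Text> { /* ... */ }
```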
- Follow naming conventions for variables, methods, and classes.
- Code for fault tolerance: input data may be malformed, so handle the edge cases carefully, as in the sketch below.
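For instance, inside map() one might guard field access before trusting it (a sketch; the tab delimiter is an assumption about the log format):

```java
String[] fields = value.toString().split("\t");
if (fields.length <= 10)
    return;              // skip malformed records instead of failing the task
String url = fields[5];  // safe: the length was checked first
```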
Full source of the week's program. The getAField helper did not survive into these notes, so an assumed minimal version is included below and marked as such.

```java
package com;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class ProductPvUv {

    // Key under which the optional product_id filter travels in the job
    // configuration. A plain static field set in main() would only be
    // visible in local mode, not in the separate task JVMs on a cluster.
    private static final String ITEM_KEY = "productpvuv.item";

    // Assumed helper (the original did not survive into these notes):
    // return the index-th field of a tab-delimited log line, or "" if absent.
    private static String getAField(String line, int index) {
        String[] fields = line.split("\t");
        return index < fields.length ? fields[index] : "";
    }

    public static class Map extends Mapper<LongWritable, Text, Text, ValuePair> {
        private String item = "";

        @Override
        protected void setup(Context context) {
            // Read the optional product_id filter from the configuration.
            item = context.getConfiguration().get(ITEM_KEY, "");
        }

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            Text word = new Text();
            ValuePair text = new ValuePair();
            String aLine = value.toString();
            String url = getAField(aLine, 5);
            Pattern p = Pattern.compile("(?<=\\b[pP]roduct_id=)\\d+\\b");
            Matcher m = p.matcher(url);
            if (!m.find())
                return; // no product_id in the URL: skip the record
            String product_id = m.group();
            if (!item.isEmpty() && !item.equals(product_id))
                return; // a filter is set and this record does not match it
            word.set(product_id);
            String type = getAField(aLine, 10);

            if ("1".equals(type)) { // only page-view records are counted
                String permanent_id = getAField(aLine, 7);
                if (permanent_id.equals("") || permanent_id.equals("0")
                        || permanent_id.equals("null")) {
                    // No usable cookie id: derive one from ip + user agent.
                    String ip = getAField(aLine, 3);
                    String http_ua = getAField(aLine, 8);
                    permanent_id = ip.concat(http_ua);
                    permanent_id = (permanent_id.hashCode() & 0x7fffffff) + "";
                } else {
                    Pattern pat = Pattern.compile("^\\d+$");
                    Matcher mat = pat.matcher(permanent_id);
                    if (!mat.find()) {
                        // Normalize non-numeric ids through hashCode.
                        permanent_id = "0" + permanent_id.hashCode();
                    }
                }
                text.setPv(1);
                text.setPermanent_id(permanent_id);
                context.write(word, text);
            }
        }
    }

    public static class Reduce extends Reducer<Text, ValuePair, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<ValuePair> values, Context context)
                throws IOException, InterruptedException {
            int pv = 0, uv = 0;
            List<ValuePair> list = new ArrayList<ValuePair>();
            for (ValuePair pair : values) {
                // Hadoop reuses the object behind the values iterator, so
                // each element must be copied before it is stored.
                list.add(new ValuePair(pair.getPv(), pair.getPermanent_id()));
            }
            Collections.sort(list); // group identical permanent_ids together
            String lastPermanent_id = "";
            for (ValuePair pair : list) {
                pv += pair.getPv(); // accumulate total page views
                if (!pair.getPermanent_id().equals(lastPermanent_id)) {
                    uv++; // a new distinct visitor
                    lastPermanent_id = pair.getPermanent_id();
                }
            }
            // One output line per product: total pv and total uv.
            context.write(key, new Text(pv + "\t" + uv));
        }
    }

    public static class Combine extends
            Reducer<Text, ValuePair, Text, ValuePair> {
        @Override
        public void reduce(Text key, Iterable<ValuePair> values, Context context)
                throws IOException, InterruptedException {
            List<ValuePair> list = new ArrayList<ValuePair>();
            for (ValuePair pair : values) {
                list.add(new ValuePair(pair.getPv(), pair.getPermanent_id()));
            }
            Collections.sort(list);
            // Collapse each run of identical permanent_ids into a single
            // pair carrying the summed pv, so less data crosses the network.
            String lastPermanent_id = null;
            int pv = 0;
            for (ValuePair pair : list) {
                String permanent_id = pair.getPermanent_id();
                if (permanent_id.equals(lastPermanent_id)) {
                    pv += pair.getPv();
                } else {
                    if (lastPermanent_id != null)
                        context.write(key, new ValuePair(pv, lastPermanent_id));
                    lastPermanent_id = permanent_id;
                    pv = pair.getPv();
                }
            }
            if (lastPermanent_id != null) // flush the final run
                context.write(key, new ValuePair(pv, lastPermanent_id));
        }
    }

    private static class ValuePair implements WritableComparable<ValuePair> {
        int pv;
        String permanent_id;

        public ValuePair() {
            pv = 1;
            permanent_id = "";
        }

        public ValuePair(int npv, String npermanent_id) {
            pv = npv;
            permanent_id = npermanent_id;
        }

        @Override
        public String toString() {
            return permanent_id + pv;
        }

        @Override
        public int compareTo(ValuePair v) {
            // Sort by permanent_id only; pv does not affect the order.
            return this.permanent_id.compareTo(v.permanent_id);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            permanent_id = in.readUTF();
            pv = in.readInt();
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(permanent_id);
            out.writeInt(pv);
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + pv;
            result = prime * result
                    + ((permanent_id == null) ? 0 : permanent_id.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj)
                return true;
            if (obj == null || getClass() != obj.getClass())
                return false;
            ValuePair other = (ValuePair) obj;
            if (pv != other.pv)
                return false;
            if (permanent_id == null)
                return other.permanent_id == null;
            return permanent_id.equals(other.permanent_id);
        }

        public int getPv() {
            return pv;
        }

        public void setPv(int pv) {
            this.pv = pv;
        }

        public String getPermanent_id() {
            return permanent_id;
        }

        public void setPermanent_id(String permanent_id) {
            this.permanent_id = permanent_id;
        }
    }

    private static final class Timer {
        private long startTime;
        private long endTime;

        public Timer() {
            reset();
        }

        public void start() {
            System.gc();
            startTime = System.currentTimeMillis();
        }

        public void end() {
            System.gc();
            endTime = System.currentTimeMillis();
        }

        public long duration() {
            return endTime - startTime;
        }

        public void printDuration(PrintStream out) {
            // Zero-pad the millisecond remainder so 1005 ms prints as 1.005.
            out.printf("%nTotal execution time: %d.%03d seconds%n",
                    duration() / 1000, duration() % 1000);
        }

        public void reset() {
            startTime = 0;
            endTime = 0;
        }
    }

    public static void main(String[] args) throws Exception {
        Timer timer = new Timer();
        timer.start();
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        // Delete the output path so the job can be rerun; the parsed
        // arguments are used because generic options may precede them.
        FileSystem.get(conf).delete(new Path(otherArgs[1]), true);
        if (otherArgs.length == 3)
            conf.set(ITEM_KEY, otherArgs[2]); // optional product_id filter
        Job job = new Job(conf, "product pv uv");
        job.setJarByClass(ProductPvUv.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setCombinerClass(Combine.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ValuePair.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        if (job.waitForCompletion(true)) {
            timer.end();
            timer.printDuration(System.out);
            System.exit(0);
        }
        System.exit(1); // the job failed
    }
}
```