MapReduce清洗日志数据统计PV量
package mapreduce.webpv;

import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * MapReduce job that cleans tab-separated web-access log lines and counts
 * page views (PV) per province id.
 *
 * <p>Input:  one log record per line, at least 30 tab-separated fields,
 * where field index 1 is the URL and field index 23 is the province id.
 * Output: {@code <provinceId, pvCount>} pairs.
 *
 * <p>Malformed records are dropped and tallied under the counter group
 * {@code "WEBPVMAPPER_CUUNTERS"}. NOTE(review): the group name is
 * misspelled in the original job; it is kept byte-identical here because
 * renaming it would change the job's visible counter output.
 */
public class WebPvMapReduce extends Configured implements Tool {

    // step 1: Mapper — emits (provinceId, 1) for every valid log line.
    public static class WebPvMapper extends
            Mapper<LongWritable, Text, IntWritable, IntWritable> {

        // Reused output objects to avoid per-record allocation.
        private IntWritable mapOutputKey = new IntWritable();
        private IntWritable mapOutputValue = new IntWritable(1);

        /**
         * Parses one log line; increments a diagnostic counter and skips the
         * record when it is malformed, otherwise writes (provinceId, 1).
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // Split the raw line into tab-separated fields.
            String[] values = value.toString().split("\t");

            // BUGFIX: validate the field count BEFORE indexing into the
            // array. The original code read values[1] first, so any line
            // with fewer than 2 fields threw ArrayIndexOutOfBoundsException
            // and failed the task instead of being counted and skipped.
            if (30 > values.length) {
                // counter: record too short
                context.getCounter("WEBPVMAPPER_CUUNTERS", "LENGTH_LT_30")
                        .increment(1L);
                return;
            }

            // url (field index 1)
            String urlValue = values[1];
            if (StringUtils.isBlank(urlValue)) {
                // counter: missing URL
                context.getCounter("WEBPVMAPPER_CUUNTERS", "URL_BLANK")
                        .increment(1L);
                return;
            }

            // province id (field index 23)
            String provinceIdValue = values[23];
            if (StringUtils.isBlank(provinceIdValue)) {
                // counter: missing province id
                context.getCounter("WEBPVMAPPER_CUUNTERS", "PROVINCEID_BLANK")
                        .increment(1L);
                return;
            }

            // Parse the province id; narrow catch replaces the original
            // broad catch(Exception), and the dead Integer.MAX_VALUE
            // placeholder initialization is removed.
            int provinceId;
            try {
                provinceId = Integer.parseInt(provinceIdValue);
            } catch (NumberFormatException e) {
                // counter: province id not numeric
                context.getCounter("WEBPVMAPPER_CUUNTERS",
                        "PROVINCEID_NOT_NUMBER").increment(1L);
                return;
            }

            // map output key: province id, value: constant 1
            mapOutputKey.set(provinceId);
            context.write(mapOutputKey, mapOutputValue);
        }
    }

    // step 2: Reducer — sums the 1s emitted per province id.
    public static class WebPvReducer extends
            Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

        // Reused output object to avoid per-group allocation.
        private IntWritable outputValue = new IntWritable();

        /** Emits (provinceId, sum of counts) for each key group. */
        @Override
        protected void reduce(IntWritable key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }

    // step 3: Driver — wires the job together and submits it.
    /**
     * Configures and runs the job.
     *
     * @param args args[0] = input path, args[1] = output path
     * @return 0 on success, 1 on failure
     */
    public int run(String[] args) throws Exception {

        Configuration configuration = this.getConf();

        Job job = Job.getInstance(configuration, this.getClass()
                .getSimpleName());
        job.setJarByClass(WebPvMapReduce.class);

        // input
        Path inpath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inpath);

        // output
        Path outPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outPath);

        // Mapper
        job.setMapperClass(WebPvMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reducer
        job.setReducerClass(WebPvReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // submit job -> YARN (true = print progress to the client)
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {

        Configuration configuration = new Configuration();

        // BUGFIX: the original unconditionally overwrote args with
        // hard-coded HDFS paths, making command-line arguments dead.
        // The hard-coded paths are now only a fallback default.
        if (args.length < 2) {
            args = new String[] {
                    "hdfs://beifeng01:8020//user/beifeng01/mapreduce/input/testdata/2015082818",
                    "hdfs://beifeng01:8020//user/beifeng01/mapreduce/output1" };
        }

        int status = ToolRunner.run(configuration, new WebPvMapReduce(), args);

        // exit with the job's status code
        System.exit(status);
    }
}
查看结果
1 $ bin/hdfs dfs -text /user/beifeng01/mapreduce/output1/pa* 2 1 3527 3 2 1672 4 3 511 5 4 325 6 5 776 7 6 661 8 7 95 9 8 80 10 9 183 11 10 93 12 11 135 13 12 289 14 13 264 15 14 374 16 15 163 17 16 419 18 17 306 19 18 272 20 19 226 21 20 2861 22 21 124 23 22 38 24 23 96 25 24 100 26 25 20 27 26 157 28 27 49 29 28 21 30 29 85 31 30 42 32 32 173
posted on 2018-12-11 16:44 PerfectData 阅读(366) 评论(0) 编辑 收藏 举报