MapReduce清洗日志数据统计PV量

 

  1 package mapreduce.webpv;
  2 
  3 import java.io.IOException;
  4 import org.apache.commons.lang.StringUtils;
  5 import org.apache.hadoop.conf.Configuration;
  6 import org.apache.hadoop.conf.Configured;
  7 import org.apache.hadoop.fs.Path;
  8 import org.apache.hadoop.io.IntWritable;
  9 import org.apache.hadoop.io.LongWritable;
 10 import org.apache.hadoop.io.Text;
 11 import org.apache.hadoop.mapreduce.Job;
 12 import org.apache.hadoop.mapreduce.Mapper;
 13 import org.apache.hadoop.mapreduce.Reducer;
 14 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 15 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 16 import org.apache.hadoop.util.Tool;
 17 import org.apache.hadoop.util.ToolRunner;
 18 
 19 public class WebPvMapReduce extends Configured implements Tool {
 20 
 21     // step 1: Mapper
 22     public static class WebPvMapper extends
 23             Mapper<LongWritable, Text, IntWritable, IntWritable> {
 24         private IntWritable mapOutputKey = new IntWritable();
 25         private IntWritable mapOutputValue = new IntWritable(1);
 26 
 27         @Override
 28         public void map(LongWritable key, Text value, Context context)
 29                 throws IOException, InterruptedException {
 30 
 31             // line value
 32             String lineValue = value.toString();
 33 
 34             // spilt
 35             String[] values = lineValue.split("\t");
 36 
 37             // url
 38             String urlValue = values[1];
 39 
 40             if (StringUtils.isBlank(urlValue)) {
 41                 // conuter
 42                 context.getCounter("WEBPVMAPPER_CUUNTERS", "URL_BLANK")
 43                         .increment(1L);
 44                 return;
 45             }
 46 
 47             if (30 > values.length) {
 48 
 49                 // conuter
 50                 context.getCounter("WEBPVMAPPER_CUUNTERS", "LENGTH_LT_30")
 51                         .increment(1L);
 52 
 53                 return;
 54             }
 55 
 56             // province id
 57             String provinceIdValue = values[23];
 58 
 59             if (StringUtils.isBlank(provinceIdValue)) {
 60                 // conuter
 61                 context.getCounter("WEBPVMAPPER_CUUNTERS", "PROVINCEID_BLANK")
 62                         .increment(1L);
 63                 return;
 64             }
 65 
 66             Integer provinceId = Integer.MAX_VALUE;
 67             try {
 68                 provinceId = Integer.valueOf(provinceIdValue);
 69             } catch (Exception e) {
 70                 // conuter
 71                 context.getCounter("WEBPVMAPPER_CUUNTERS",
 72                         "PROVINCEID_NOT_NUMBER").increment(1L);
 73                 return;
 74             }
 75 
 76             // map outpu key
 77             mapOutputKey.set(provinceId);
 78 
 79             context.write(mapOutputKey, mapOutputValue);
 80         }
 81     }
 82 
 83     // step 2: Reducer
 84     public static class WebPvReducer extends
 85             Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
 86         private IntWritable outputValue = new IntWritable();
 87 
 88         @Override
 89         protected void reduce(IntWritable key, Iterable<IntWritable> values,
 90                 Context context) throws IOException, InterruptedException {
 91             // temp sum
 92             int sum = 0;
 93 
 94             // iterator
 95             for (IntWritable value : values) {
 96                 sum += value.get();
 97             }
 98 
 99             // set output
100             outputValue.set(sum);
101 
102             context.write(key, outputValue);
103         }
104     }
105 
106     // step 3: Driver
107     public int run(String[] args) throws Exception {
108 
109         Configuration configuration = this.getConf();
110 
111         Job job = Job.getInstance(configuration, this.getClass()
112                 .getSimpleName());
113         job.setJarByClass(WebPvMapReduce.class);
114 
115         // set job
116         // input
117         Path inpath = new Path(args[0]);
118         FileInputFormat.addInputPath(job, inpath);
119 
120         // output
121         Path outPath = new Path(args[1]);
122         FileOutputFormat.setOutputPath(job, outPath);
123 
124         // Mapper
125         job.setMapperClass(WebPvMapper.class);
126         job.setMapOutputKeyClass(IntWritable.class);
127         job.setMapOutputValueClass(IntWritable.class);
128 
129         // Reducer
130         job.setReducerClass(WebPvReducer.class);
131         job.setOutputKeyClass(IntWritable.class);
132         job.setOutputValueClass(IntWritable.class);
133 
134         // submit job -> YARN
135         boolean isSuccess = job.waitForCompletion(true);
136         return isSuccess ? 0 : 1;
137     }
138 
139     public static void main(String[] args) throws Exception {
140 
141         Configuration configuration = new Configuration();
142 
143         args = new String[] {
144                 "hdfs://beifeng01:8020//user/beifeng01/mapreduce/input/testdata/2015082818",
145                 "hdfs://beifeng01:8020//user/beifeng01/mapreduce/output1" };
146 
147         int status = ToolRunner.run(configuration, new WebPvMapReduce(), args);
148 
149         // exit program
150         System.exit(status);
151     }
152 }

 查看结果

 1 $ bin/hdfs dfs -text /user/beifeng01/mapreduce/output1/pa*
 2 1       3527
 3 2       1672
 4 3       511
 5 4       325
 6 5       776
 7 6       661
 8 7       95
 9 8       80
10 9       183
11 10      93
12 11      135
13 12      289
14 13      264
15 14      374
16 15      163
17 16      419
18 17      306
19 18      272
20 19      226
21 20      2861
22 21      124
23 22      38
24 23      96
25 24      100
26 25      20
27 26      157
28 27      49
29 28      21
30 29      85
31 30      42
32 32      173

 

posted on 2018-12-11 16:44  PerfectData  阅读(361)  评论(0)  编辑  收藏  举报

导航