Partitioner
使用自定义partitioner来处理手机上网日志信息
为什么要使用分区?
1.根据业务需要,产生多个输出文件
2.多个reduce任务并行运行,提高整体job的运行效率
1 package partitioner; 2 3 import java.io.DataInput; 4 import java.io.DataOutput; 5 import java.io.IOException; 6 7 import org.apache.hadoop.conf.Configuration; 8 import org.apache.hadoop.fs.Path; 9 import org.apache.hadoop.io.LongWritable; 10 import org.apache.hadoop.io.Text; 11 import org.apache.hadoop.io.Writable; 12 import org.apache.hadoop.mapreduce.Job; 13 import org.apache.hadoop.mapreduce.Mapper; 14 import org.apache.hadoop.mapreduce.Partitioner; 15 import org.apache.hadoop.mapreduce.Reducer; 16 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 18 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 19 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner; 21 /** 22 * 分区必须打包jar运行 23 * 24 */ 25 public class KpiApp { 26 static final String INPUT_PATH = "hdfs://chaoren:9000/wlan";//wlan是个文件夹,日志文件放在/wlan目录下 27 static final String OUT_PATH = "hdfs://chaoren:9000/out"; 28 29 public static void main(String[] args) throws Exception { 30 final Job job = new Job(new Configuration(), 31 KpiApp.class.getSimpleName()); 32 33 //打包运行 34 job.setJarByClass(KpiApp.class); 35 36 // 1.1 指定输入文件路径 37 FileInputFormat.setInputPaths(job, INPUT_PATH); 38 // 指定哪个类用来格式化输入文件 39 job.setInputFormatClass(TextInputFormat.class); 40 41 // 1.2指定自定义的Mapper类 42 job.setMapperClass(MyMapper.class); 43 // 指定输出<k2,v2>的类型 44 job.setMapOutputKeyClass(Text.class); 45 job.setMapOutputValueClass(KpiWritable.class); 46 47 // 1.3 指定分区类 48 job.setPartitionerClass(KpiPartitioner.class); 49 job.setNumReduceTasks(2);//分成两个区 50 51 // 1.4 TODO 排序、分区 52 53 // 1.5 TODO (可选)归约 54 55 // 2.2 指定自定义的reduce类 56 job.setReducerClass(MyReducer.class); 57 // 指定输出<k3,v3>的类型 58 job.setOutputKeyClass(Text.class); 59 job.setOutputValueClass(KpiWritable.class); 60 61 // 2.3 指定输出到哪里 62 FileOutputFormat.setOutputPath(job, new Path(OUT_PATH)); 63 // 设定输出文件的格式化类 64 
job.setOutputFormatClass(TextOutputFormat.class); 65 66 // 把代码提交给JobTracker执行 67 job.waitForCompletion(true); 68 } 69 70 static class KpiPartitioner extends HashPartitioner<Text, KpiWritable>{ 71 72 @Override 73 public int getPartition(Text key, KpiWritable value, int numReduceTasks) { 74 return (key.toString().length() == 11) ? 0 : 1;//0代表的是手机号码 1代表非手机号码 75 } 76 77 } 78 79 static class MyMapper extends Mapper<LongWritable, Text, Text, KpiWritable> { 80 protected void map( 81 LongWritable key, 82 Text value, 83 org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, KpiWritable>.Context context) 84 throws IOException, InterruptedException { 85 final String[] splited = value.toString().split("\t"); 86 final String msisdn = splited[1]; 87 final Text k2 = new Text(msisdn); 88 final KpiWritable v2 = new KpiWritable(splited[6], splited[7], 89 splited[8], splited[9]); 90 context.write(k2, v2); 91 }; 92 } 93 94 static class MyReducer extends 95 Reducer<Text, KpiWritable, Text, KpiWritable> { 96 /** 97 * @param k2 98 * 表示整个文件中不同的手机号码 99 * @param v2s 100 * 表示该手机号在不同时段的流量的集合 101 */ 102 protected void reduce( 103 Text k2, 104 java.lang.Iterable<KpiWritable> v2s, 105 org.apache.hadoop.mapreduce.Reducer<Text, KpiWritable, Text, KpiWritable>.Context context) 106 throws IOException, InterruptedException { 107 long upPackNum = 0L; 108 long downPackNum = 0L; 109 long upPayLoad = 0L; 110 long downPayLoad = 0L; 111 112 for (KpiWritable kpiWritable : v2s) { 113 upPackNum += kpiWritable.upPackNum; 114 downPackNum += kpiWritable.downPackNum; 115 upPayLoad += kpiWritable.upPayLoad; 116 downPayLoad += kpiWritable.downPayLoad; 117 } 118 119 final KpiWritable v3 = new KpiWritable(upPackNum + "", downPackNum 120 + "", upPayLoad + "", downPayLoad + ""); 121 context.write(k2, v3); 122 }; 123 } 124 } 125 126 class KpiWritable implements Writable { 127 long upPackNum; 128 long downPackNum; 129 long upPayLoad; 130 long downPayLoad; 131 132 public KpiWritable() { 133 } 134 135 public 
KpiWritable(String upPackNum, String downPackNum, String upPayLoad, 136 String downPayLoad) { 137 this.upPackNum = Long.parseLong(upPackNum); 138 this.downPackNum = Long.parseLong(downPackNum); 139 this.upPayLoad = Long.parseLong(upPayLoad); 140 this.downPayLoad = Long.parseLong(downPayLoad); 141 } 142 143 public void readFields(DataInput in) throws IOException { 144 this.upPackNum = in.readLong(); 145 this.downPackNum = in.readLong(); 146 this.upPayLoad = in.readLong(); 147 this.downPayLoad = in.readLong(); 148 } 149 150 public void write(DataOutput out) throws IOException { 151 out.writeLong(upPackNum); 152 out.writeLong(downPackNum); 153 out.writeLong(upPayLoad); 154 out.writeLong(downPayLoad); 155 } 156 157 @Override 158 public String toString() { 159 return upPackNum + "\t" + downPackNum + "\t" + upPayLoad + "\t" 160 + downPayLoad; 161 } 162 }
将上面代码打包导出,复制到Linux中,然后在命令行下执行并查看结果,也可以在chaoren:50030中查看到作业的相关情况
一起学习,一起进步