Hadoop Serialization
Serialization converts in-memory objects into a sequence of bytes that can be written to disk or sent across the network; deserialization is the reverse.
Hadoop serialization has three notable traits:
(1) Compact: serialized data takes up little space.
(2) Fast: reading and writing data carries little overhead.
(3) Interoperable: it works across languages, e.g. data serialized in Java can be deserialized in C++.
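To see the "compact" claim concretely, here is a minimal sketch (SerializationDemo is my own throwaway class): a LongWritable writes exactly the 8 raw bytes of the underlying long, with none of the header or class metadata that java.io.Serializable would attach.

import org.apache.hadoop.io.LongWritable;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class SerializationDemo {
    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);

        // Writable serialization: just the field data, nothing else
        new LongWritable(24681L).write(out);

        System.out.println(bytes.size()); // prints 8
    }
}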
Today I studied Hadoop serialization and worked through a case study. A custom serializable bean must follow these rules:
(1) It must implement the Writable interface.
(2) Deserialization instantiates the bean via reflection, so it needs a public no-argument constructor.
(3) Override the serialization method write().
(4) Override the deserialization method readFields().
(5) Fields must be deserialized in exactly the same order they were serialized.
(6) To make the result readable in the output file, override toString().
(7) If the custom bean travels as a key, it must also implement the Comparable interface (in Hadoop, typically via WritableComparable), because the shuffle phase of the MapReduce framework requires keys to be sortable; see the sketch below.
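For rule (7), the standard route is Hadoop's WritableComparable interface, which combines Writable with Comparable. Here is a minimal sketch of such a key bean (SortableFlowBean is a hypothetical name, not part of the case study below; descending order by total traffic is just one possible ordering):

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class SortableFlowBean implements WritableComparable<SortableFlowBean> {

    private long sumflow; // total traffic, used as the sort key

    public SortableFlowBean() { } // no-arg constructor for reflection

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(sumflow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.sumflow = in.readLong(); // same order as write()
    }

    @Override
    public int compareTo(SortableFlowBean o) {
        // The shuffle sorts keys with this method; here: descending by total traffic
        return Long.compare(o.sumflow, this.sumflow);
    }
}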
The mapper's output becomes the reducer's input. Hadoop only ships Writable wrappers for basic data types (Text, LongWritable, and so on), so to pass a whole object between mapper and reducer you have to wrap it in a custom bean, as the full example below does.
FlowBean
package com.mapreduce.Writable;

// Per-phone traffic statistics

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * 1. Implement the Writable interface
 * 2. Override the serialization and deserialization methods
 * 3. Provide a no-argument constructor
 * 4. Override toString()
 */
public class FlowBean implements Writable {

    private long upflow;   // upstream traffic
    private long downflow; // downstream traffic
    private long sumflow;  // total traffic

    public FlowBean() {
    }

    public long getUpflow() {
        return upflow;
    }

    public void setUpflow(long upflow) {
        this.upflow = upflow;
    }

    public long getDownflow() {
        return downflow;
    }

    public void setDownflow(long downflow) {
        this.downflow = downflow;
    }

    public long getSumflow() {
        return sumflow;
    }

    public void setSumflow(long sumflow) {
        this.sumflow = sumflow;
    }

    // Overload that computes the total itself, so later code can just call it
    public void setSumflow() {
        this.sumflow = this.upflow + this.downflow;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(upflow);
        dataOutput.writeLong(downflow);
        dataOutput.writeLong(sumflow);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // Must read the fields in the same order they were written
        this.upflow = dataInput.readLong();
        this.downflow = dataInput.readLong();
        this.sumflow = dataInput.readLong();
    }

    @Override
    public String toString() {
        return upflow + "\t" + downflow + "\t" + sumflow;
    }
}
FlowMapper
package com.mapreduce.Writable;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    private Text outkey = new Text();
    private FlowBean outvalue = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, FlowBean>.Context context)
            throws IOException, InterruptedException {
        // 1. Read one line, e.g.:
        // 1363157985066  13726230503  120.196.100.82  2481  24681  200
        String line = value.toString();

        // 2. Split it on tabs
        String[] split = line.split("\t");

        // 3. Grab the fields we want: phone number, upstream traffic, downstream traffic
        String phone = split[0];
        String upflow = split[split.length - 3];
        String downflow = split[split.length - 2];

        // 4. Wrap them
        outkey.set(phone);
        outvalue.setUpflow(Long.parseLong(upflow));
        outvalue.setDownflow(Long.parseLong(downflow));
        outvalue.setSumflow();

        // 5. Emit
        context.write(outkey, outvalue);
    }
}
FlowReducer
package com.mapreduce.Writable;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class FlowReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

    private FlowBean outvalue = new FlowBean();

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Reducer<Text, FlowBean, Text, FlowBean>.Context context)
            throws IOException, InterruptedException {
        // Accumulate over all values for this key
        long totalup = 0;
        long totaldown = 0;
        for (FlowBean value : values) {
            totalup += value.getUpflow();
            totaldown += value.getDownflow();
        }

        // Wrap the totals
        outvalue.setUpflow(totalup);
        outvalue.setDownflow(totaldown);
        outvalue.setSumflow();

        // Emit
        context.write(key, outvalue);
    }
}
FlowDriver
package com.mapreduce.Writable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class FlowDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // 1. Get the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2. Set the jar
        job.setJarByClass(FlowDriver.class);

        // 3. Wire up the mapper and reducer
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);

        // 4. Set the mapper's output key and value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);

        // 5. Set the final output key and value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path("D:\\hadoop\\inputflow"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\hadoop\\outputflow"));

        // 7. Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
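A note on running it: with the hard-coded local paths above, the driver runs straight from the IDE in Hadoop's local mode (the output directory must not already exist, or the job aborts). To submit it to a cluster you would swap in HDFS paths, package the classes into a jar, and launch it with the standard hadoop jar command, along these lines (the jar name here is a placeholder):

hadoop jar flow.jar com.mapreduce.Writable.FlowDriver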
Here is the data file I used, phone_data.txt (fields are tab-separated):
1363157985066 13726230503 120.196.100.82 2481 24681 200
1363157995052 13826544101 120.197.40.4 264 0 200
1363157991076 13926435656 120.196.100.99 132 1512 200
1363154400022 13926251106 120.197.40.4 240 0 200
1363157993044 18211575961 120.196.100.99 1527 2106 200
1363157995074 84138413 120.197.40.4 4116 1432 200
1363157993055 13560439658 120.196.100.99 1116 954 200
1363157995033 15920133257 120.197.40.4 3156 2936 200
1363157983019 13719199419 120.196.100.82 240 0 200
1363157984041 13660577991 120.197.40.4 6960 690 200
1363157973098 15013685858 120.197.40.4 3659 3538 200
1363157986029 15989002119 120.196.100.99 1938 180 200
1363157992093 13560439658 120.196.100.99 4938 2345 200
1363157986041 13480253104 120.197.40.4 180 2001 200
1363157984040 13602846565 120.197.40.4 1938 2910 200
1363157995093 13922314466 120.196.100.82 3008 3720 200
1363157982040 13502468823 120.196.100.99 7335 110349 200
1363157986072 18320173382 120.196.100.99 9531 2412 200
1363157990043 13925057413 120.196.100.55 11058 48243 200
1363157988072 13760778710 120.196.100.82 120 2300 500
1363157985066 13560436666 120.196.100.82 2481 24681 200
1363157993055 13560436666 120.196.100.99 954 2010 400
The output, which MapReduce sorts by key, looks like this (key, upstream, downstream, total):
1363154400022 240 0 240
1363157973098 3659 3538 7197
1363157982040 7335 110349 117684
1363157983019 240 0 240
1363157984040 1938 2910 4848
1363157984041 6960 690 7650
1363157985066 4962 49362 54324
1363157986029 1938 180 2118
1363157986041 180 2001 2181
1363157986072 9531 2412 11943
1363157988072 120 2300 2420
1363157990043 11058 48243 59301
1363157991076 132 1512 1644
1363157992093 4938 2345 7283
1363157993044 1527 2106 3633
1363157993055 2070 2964 5034
1363157995033 3156 2936 6092
1363157995052 264 0 264
1363157995074 4116 1432 5548
1363157995093 3008 3720 6728
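A quick spot-check: the key 1363157985066 appears on two input records, each with 2481 bytes up and 24681 bytes down, and its output line shows exactly 2481 + 2481 = 4962 up, 24681 + 24681 = 49362 down, and 54324 in total.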