Hadoop Study Notes (12): Hadoop Serialization

1. Introduction to Hadoop Serialization

Serialization: converting objects in memory into a sequence of bytes (or another form that can be transmitted over a network) so they can be stored on disk or sent across the network.

Deserialization: converting a received byte sequence, or data persisted on disk, back into objects in memory.

Characteristics of Hadoop serialization:

  • Compact: uses storage space efficiently;
  • Fast: little extra overhead when reading and writing data;
  • Extensible: can evolve as the communication protocol is upgraded;
  • Interoperable: supports interaction across multiple language environments.
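To make the write/read contract concrete, here is a minimal round-trip sketch (not from the original post) using Hadoop's built-in LongWritable. Note the compact, fixed-size encoding: exactly 8 bytes for a long.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import org.apache.hadoop.io.LongWritable;

public class WritableRoundTrip {
    public static void main(String[] args) throws Exception {
        // Serialize: Writable.write(DataOutput) emits the raw long, 8 bytes
        LongWritable original = new LongWritable(2481L);
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));
        System.out.println("serialized size = " + buffer.size()); // 8

        // Deserialize: Writable.readFields(DataInput) restores the value
        LongWritable restored = new LongWritable();
        restored.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(restored.get()); // 2481
    }
}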

2. Hadoop Example

Sample input data (tab-separated; the URL field may be empty):
1    13736230513    192.196.100.1    www.wx,tv.com    2481    24681    200
2    13846544121    192.196.100.2            264    0    200
3     13956435636    192.196.100.3            132    1512    200
4     13966251146    192.168.100.1            240    0    404
5     18271575951    192.168.100.2    www.wx,tv.com    1527    2106    200
6     84188413    192.168.100.3    www.wx,tv.com    4116    1432    200
7     13590439668    192.168.100.4            1116    954    200
8     15910133277    192.168.100.5    www.hao123.com    3156    2936    200
9     13729199489    192.168.100.6            240    0    200
10     13630577991    192.168.100.7    www.shouhu.com    6960    690    200
11     15043685818    192.168.100.8    www.baidu.com    3659    3538    200
12     15959002129    192.168.100.9    www.wx,tv.com    1938    180    500
13     13560439638    192.168.100.10            918    4938    200
14     13470253144    192.168.100.11            180    180    200
15     13682846555    192.168.100.12    www.qq.com    1938    2910    200
16     13992314666    192.168.100.13    www.gaga.com    3008    3720    200
17     13509468723    192.168.100.14    www.qinghua.com    7335    110349    404
18     18390173782    192.168.100.15    www.sogou.com    9531    2412    200
19     13975057813    192.168.100.16    www.baidu.com    11058    48243    200
20     13768778790    192.168.100.17            120    120    200
21     13568436656    192.168.100.18    www.alibaba.com    2481    24681    200
22     13568436656    192.168.100.19            1116    954    200

Write a program that computes the upstream traffic, downstream traffic, and total traffic for every phone number:

Input record format (tab-separated; an optional URL field sits between the IP and the traffic columns, and is empty in this record):

7      13560436666     120.196.100.99           1116          954                   200

id     phone number    network IP               upstream      downstream            HTTP status code

Expected output format:

13560436666             1116                954                          2070

phone number            upstream traffic    downstream traffic           total traffic

 

The implementation is as follows:

The serializable entity, FlowBean:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class FlowBean implements Writable {

    private long upFlow;   // upstream traffic
    private long downFlow; // downstream traffic
    private long sumFlow;  // total traffic

    // No-arg constructor, required so the framework can instantiate the bean
    // reflectively during deserialization
    public FlowBean() {
        super();
    }

    public FlowBean(long upFlow, long downFlow) {
        super();
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }

    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + sumFlow + "\t";
    }

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    // Serialization method
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    // Deserialization method
    @Override
    public void readFields(DataInput in) throws IOException {
        // Fields must be read in exactly the same order they were written
        upFlow = in.readLong();
        downFlow = in.readLong();
        sumFlow = in.readLong();
    }

    public void set(long sum_upflow, long sum_downflow) {
        upFlow = sum_upflow;
        downFlow = sum_downflow;
        sumFlow = sum_upflow + sum_downflow;
    }

}
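As a quick sanity check (not part of the original post), the bean can be round-tripped through a byte buffer. ReflectionUtils.newInstance is how the framework itself creates Writable values, which is why the no-arg constructor above is required:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;

public class FlowBeanRoundTrip {
    public static void main(String[] args) throws Exception {
        FlowBean original = new FlowBean(1116, 954);
        // write() emits the three longs in a fixed order (24 bytes total)
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));
        // The framework instantiates values reflectively via the no-arg constructor
        FlowBean restored = ReflectionUtils.newInstance(FlowBean.class, new Configuration());
        restored.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(restored); // 1116    954    2070
    }
}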

Mapper: FlowCountMapper

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    Text k = new Text();
    FlowBean v = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read one line
        String line = value.toString();
        // 2. Split on \t
        String[] fields = line.split("\t");
        // 3. Populate the key and value; the traffic columns are indexed from
        //    the END of the array because the URL field may be empty
        k.set(fields[1]);
        long upFlow = Long.parseLong(fields[fields.length - 3]);
        long downFlow = Long.parseLong(fields[fields.length - 2]);
        v.set(upFlow, downFlow);
        // 4. Emit
        context.write(k, v);
    }
}
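Why index from the end? Depending on whether the optional URL field is present, split("\t") yields a different number of fields, but the last three columns are always upstream, downstream, and status code. A small standalone sketch with hypothetical sample lines:

public class SplitDemo {
    public static void main(String[] args) {
        // Hypothetical records: one with a URL field, one without
        String withUrl    = "1\t13736230513\t192.196.100.1\twww.wx,tv.com\t2481\t24681\t200";
        String withoutUrl = "2\t13846544121\t192.196.100.2\t264\t0\t200";
        for (String line : new String[] { withUrl, withoutUrl }) {
            String[] fields = line.split("\t");
            // Counting from the end keeps the traffic columns stable
            System.out.println(fields[1]
                    + " up="   + fields[fields.length - 3]
                    + " down=" + fields[fields.length - 2]);
        }
    }
}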

Reducer: FlowCountReduce

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;


public class FlowCountReduce extends Reducer<Text, FlowBean, Text, FlowBean> {
    FlowBean v = new FlowBean();

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context)
            throws IOException, InterruptedException {
        // 1. Sum the upstream and downstream traffic for this phone number
        long sum_upflow = 0;
        long sum_downflow = 0;
        for (FlowBean flowBean : values) {
            sum_upflow += flowBean.getUpFlow();
            sum_downflow += flowBean.getDownFlow();
        }
        v.set(sum_upflow, sum_downflow);
        // 2. Emit
        context.write(key, v);
    }
}

Driver: FlowsumDriver

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FlowsumDriver {
    public static void main(String[] args) throws Exception {
        // Hardcoded local paths for testing in the IDE; remove this line to
        // take the paths from the command line instead
        args = new String[] { "E:/input1", "E:/output1" };
        // 1. Get the Job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        // 2. Set the jar by locating the driver class
        job.setJarByClass(FlowsumDriver.class);
        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReduce.class);
        // 4. Set the Map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        // 5. Set the final (Reduce) output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7. Submit the job and exit with its status
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
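To run the job on an actual cluster rather than locally, remove the hardcoded args line, package the classes into a jar, and submit it with the hadoop CLI (the jar name and HDFS paths below are placeholders):

hadoop jar flow-count.jar FlowsumDriver /input/flow /output/flow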

Running the program produces the following result file (one line per phone number: upstream, downstream, and total traffic). Note that 13568436656 appears twice in the input, so its two records are summed:

13470253144    180    180    360    
13509468723    7335    110349    117684    
13560439638    918    4938    5856    
13568436656    3597    25635    29232    
13590439668    1116    954    2070    
13630577991    6960    690    7650    
13682846555    1938    2910    4848    
13729199489    240    0    240    
13736230513    2481    24681    27162    
13768778790    120    120    240    
13846544121    264    0    264    
13956435636    132    1512    1644    
13966251146    240    0    240    
13975057813    11058    48243    59301    
13992314666    3008    3720    6728    
15043685818    3659    3538    7197    
15910133277    3156    2936    6092    
15959002129    1938    180    2118    
18271575951    1527    2106    3633    
18390173782    9531    2412    11943    
84188413    4116    1432    5548    

 
