自定义分区机制

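reduce任务数应与分区数保持一致：getPartition 的返回值必须落在 0 到 numReduceTasks-1 之间。reduce任务数少于分区数（且大于1）时会抛出 Illegal partition 异常；多于分区数时，多出的reduce任务只会产生空的输出文件。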

MyPartitioner类

package com.sxuek.partitiontest;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/*
自定义的分区类
自己控制map阶段输出的key=value数据发送到哪个分区去

分区类有两个泛型 是map阶段输出的key-value类型
 */
public class MyPartitioner extends Partitioner<Text, FlowBean> {
    public int getPartition(Text text, FlowBean flowBean, int i) {
        String head = text.toString().substring(0, 3);
        if ("134".equals(head)) {
            return 0;
        } else if ("135".equals(head)) {
            return 1;
        } else if ("136".equals(head)) {
            return 2;
        } else if ("137".equals(head)) {
            return 3;
        }
        return 4;
    }
}

Driver类

添加代码：
// 自定义分区需要设置的
job.setNumReduceTasks(5);
job.setPartitionerClass(MyPartitioner.class);

package com.sxuek.partitiontest;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * Driver that configures and submits the flow-statistics MapReduce job with
 * the custom {@link MyPartitioner}.
 */
public class FlowDriver {
    /** HDFS address used for both the job configuration and the FileSystem handle. */
    private static final String HDFS_URI = "hdfs://node1:9000";

    /**
     * Builds the job, removes any previous output directory, runs the job and
     * reports the outcome.
     *
     * <p>Prints {@code true}/{@code false} for success/failure and terminates
     * the JVM with exit status 0 on success or 1 on failure, so that shell
     * scripts can detect a failed job.
     *
     * @param args command-line arguments; not used
     * @throws IOException            if HDFS access or job submission fails
     * @throws URISyntaxException     if the HDFS URI is malformed
     * @throws InterruptedException   if the thread is interrupted while waiting
     * @throws ClassNotFoundException if a job class cannot be loaded
     */
    public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", HDFS_URI);

        boolean flag;
        // try-with-resources releases the HDFS handle once the job has finished,
        // including when job setup or execution throws.
        try (FileSystem fs = FileSystem.get(new URI(HDFS_URI), conf, "root")) {
            Job job = Job.getInstance(conf);
            job.setJarByClass(FlowDriver.class);

            job.setMapperClass(FlowMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(FlowBean.class);

            job.setReducerClass(FlowReducer.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(FlowBean.class);

            // Required for custom partitioning: MyPartitioner returns indices 0-4,
            // so five reduce tasks are needed.
            job.setNumReduceTasks(5);
            job.setPartitionerClass(MyPartitioner.class);

            FileInputFormat.setInputPaths(job, new Path("/phone_data.txt"));

            // The output directory must not exist when the job starts, so remove
            // whatever a previous run left behind.
            Path path = new Path("/output");
            if (fs.exists(path)) {
                fs.delete(path, true);
            }
            FileOutputFormat.setOutputPath(job, path);
            flag = job.waitForCompletion(true);
        }

        System.out.println(flag);
        System.exit(flag ? 0 : 1);
    }
}

FlowMapper

package com.sxuek.partitiontest;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper that turns one space-separated input line into a
 * (phone number, {@link FlowBean}) pair.
 */
public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    /** Fewest fields a line needs for every index read below to exist. */
    private static final int MIN_FIELDS = 3;

    /** Counter group and name used to report skipped input lines. */
    private static final String COUNTER_GROUP = "FlowMapper";
    private static final String MALFORMED_COUNTER = "MALFORMED_LINES";

    /**
     * Parses a single input line and emits the phone number with its traffic figures.
     *
     * <p>Lines with too few fields, or whose traffic fields are not valid
     * integers, are skipped and counted instead of failing the whole task.
     *
     * @param key     the input key supplied by the framework; not used
     * @param value   one line of input text
     * @param context the task context used to emit output and update counters
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] words = line.split(" ");

        // A blank or truncated line would otherwise throw
        // ArrayIndexOutOfBoundsException on the index accesses below.
        if (words.length < MIN_FIELDS) {
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        String phoneNumber = words[1];
        long upFlow;
        long downFlow;
        try {
            // NOTE(review): upFlow is read from the second-to-last field and
            // downFlow from the third-to-last. If the upstream column precedes
            // the downstream column in the input file, these two indices are
            // swapped - confirm against the data layout.
            upFlow = Long.parseLong(words[words.length - 2]);
            downFlow = Long.parseLong(words[words.length - 3]);
        } catch (NumberFormatException e) {
            // Non-numeric traffic field: skip the record rather than abort the task.
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        FlowBean flowBean = new FlowBean();
        flowBean.setPhoneNumber(phoneNumber);
        flowBean.setUpFlow(upFlow);
        flowBean.setDownFlow(downFlow);
        flowBean.setSumFlow(upFlow + downFlow);

        context.write(new Text(phoneNumber), flowBean);
    }
}

FlowReducer

package com.sxuek.partitiontest;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer that adds up all traffic records sharing one phone number and
 * emits a single summary {@link FlowBean} for it.
 */
public class FlowReducer extends Reducer<Text, FlowBean, NullWritable, FlowBean> {
    /**
     * Sums the upstream, downstream and total traffic of every record in the group.
     *
     * @param key     the phone number shared by all values in this group
     * @param values  the traffic records emitted by the mappers for this phone number
     * @param context the task context used to emit the summary record
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        String phone = key.toString();

        // Accumulate in primitives, then populate the result bean once.
        long totalUp = 0L;
        long totalDown = 0L;
        long totalSum = 0L;
        for (FlowBean record : values) {
            totalUp += record.getUpFlow();
            totalDown += record.getDownFlow();
            totalSum += record.getSumFlow();
        }

        FlowBean summary = new FlowBean();
        summary.setPhoneNumber(phone);
        summary.setUpFlow(totalUp);
        summary.setDownFlow(totalDown);
        summary.setSumFlow(totalSum);

        // The bean already carries the phone number, so no output key is needed.
        context.write(NullWritable.get(), summary);
    }
}

FlowBean

package com.sxuek.partitiontest;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * 1. Hadoop序列化有要求，如果是我们自定义的JavaBean对象，必须实现writable接口
 * 2. JavaBean可以当value也可以当key
 * 如果只当value只需要序列化即可
 * 如果当key必须还要实现比较接口，如果你只当reducer阶段的key不需要比较接口
 * map阶段输出的数据需要排序，为了让reducer获取数据的时候速度快一点
 */
public class FlowBean implements WritableComparable<FlowBean> {
    public int compareTo(FlowBean o) {
        return 0;
    }
    // long默认是null值，如果直接用new出来的对象相加，会报错
    private Long upFlow = 0L;
    private Long downFlow = 0L;
    private Long sumFlow = 0L;
    private String phoneNumber;

    /*
     如果我们想要将Javabean对象当作reduce阶段的输出，将JavaBean对象数据写出到文件中，
     那Hadoop默认情况下会将JavaBean对象的toString方法调用一下，
     然后将toString结果写出到文件中
     */
    @Override
    public String toString() {
        return phoneNumber + "\t" + upFlow + "\t" + downFlow + "\t" + sumFlow;
    }

    /**
     * 用于实现序列化与反序列化
     */
    public FlowBean() {

    }

    /**
     * javabean对象序列化写出的方法
     * @param dataOutput
     * @throws IOException
     */
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(phoneNumber);
        dataOutput.writeLong(upFlow);
        dataOutput.writeLong(downFlow);
        dataOutput.writeLong(sumFlow);
    }

    /**
     * javabean对象反序列化回来的方法
     * @param dataInput
     * @throws IOException
     */
    public void readFields(DataInput dataInput) throws IOException {
        phoneNumber = dataInput.readUTF();
        upFlow = dataInput.readLong();
        downFlow = dataInput.readLong();
        sumFlow = dataInput.readLong();
    }
    public String getPhoneNumber() {
        return phoneNumber;
    }

    public void setPhoneNumber(String phoneNumber) {
        this.phoneNumber = phoneNumber;
    }

    public Long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(Long upFlow) {
        this.upFlow = upFlow;
    }

    public Long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(Long downFlow) {
        this.downFlow = downFlow;
    }

    public Long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(Long sumFlow) {
        this.sumFlow = sumFlow;
    }

}

posted @ 2022-07-27 13:27 jsqup 阅读(24) 评论(0) 编辑收藏举报

刷新页面返回顶部

jsqup

自定义分区机制

分区数与reduce任务数必须一致

MyPartitioner类

Driver类

FlowMapper

FlowReducer

FlowBean

公告