Big Data: MapReduce Join Operations in a Hadoop Cluster
Requirements Analysis
We are given the following two input tables.
The order table:
| id | pid | amount |
| --- | --- | --- |
| 1001 | 01 | 1 |
| 1002 | 02 | 2 |
| 1003 | 03 | 3 |
| 1004 | 01 | 4 |
| 1005 | 02 | 5 |
| 1006 | 03 | 6 |
The pd table:
| pid | pname |
| --- | --- |
| 01 | 小米 |
| 02 | 华为 |
| 03 | 格力 |
The goal is to merge the product information from the pd table into the order table, matching on the product pid, producing the following result:
| id | pname | amount |
| --- | --- | --- |
| 1001 | 小米 | 1 |
| 1004 | 小米 | 4 |
| 1002 | 华为 | 2 |
| 1005 | 华为 | 5 |
| 1003 | 格力 | 3 |
| 1006 | 格力 | 6 |
Reduce Join
Create a TableBean class that contains all fields from both files, so that records from either file can be wrapped conveniently in the map phase:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class TableBean implements Writable {
private String id;
private String pid;
private Integer amount;
private String pname;
private String flag; // marks which table the record came from: "order" or "pd"
public TableBean() {
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public Integer getAmount() {
return amount;
}
public void setAmount(Integer amount) {
this.amount = amount;
}
public String getPname() {
return pname;
}
public void setPname(String pname) {
this.pname = pname;
}
public String getFlag() {
return flag;
}
public void setFlag(String flag) {
this.flag = flag;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(id);
dataOutput.writeUTF(pid);
dataOutput.writeInt(amount);
dataOutput.writeUTF(pname);
dataOutput.writeUTF(flag);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.id = dataInput.readUTF();
this.pid = dataInput.readUTF();
this.amount = dataInput.readInt();
this.pname = dataInput.readUTF();
this.flag = dataInput.readUTF();
}
@Override
public String toString() {
return id + '\t' + pname + '\t' + amount;
}
}
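One caveat about the write() method above: DataOutput.writeUTF cannot serialize null, which is why every String field of the bean must be set to at least an empty string before it is written. A minimal standalone sketch that reproduces the failure (the demo class name is hypothetical, not part of the original post):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Hypothetical demo: why TableBean's String fields must not stay null
public class WriteUtfNullDemo {
    public static void main(String[] args) throws IOException {
        DataOutputStream out = new DataOutputStream(new ByteArrayOutputStream());
        out.writeUTF("");   // fine: an empty string serializes normally
        out.writeUTF(null); // throws NullPointerException
    }
}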
In the map phase we tell records apart by the name of the file they come from. The setup method runs only once per map task, so we use it to fetch the file name; the map method then branches on that name. As noted above, the String fields must not be left as the default null.
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {
private String filename;
private Text outK = new Text();
private TableBean outV = new TableBean();
@Override
protected void setup(Mapper<LongWritable, Text, Text, TableBean>.Context context) throws IOException, InterruptedException {
// Initialization: record the name of the file this split comes from
FileSplit inputSplit = (FileSplit) context.getInputSplit();
filename = inputSplit.getPath().getName();
}
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, TableBean>.Context context) throws IOException, InterruptedException {
String line = value.toString();
// Determine which file this record belongs to
if (filename.contains("order")) {
String[] split = line.split("\t");
// Wrap key and value: key is pid, value carries the order fields
outK.set(split[1]);
outV.setId(split[0]);
outV.setPid(split[1]);
outV.setAmount(Integer.parseInt(split[2]));
outV.setPname("");
outV.setFlag("order");
} else {
String[] split = line.split("\t");
// Wrap key and value: key is pid, value carries the product fields
outK.set(split[0]);
outV.setId("");
outV.setPid(split[0]);
outV.setAmount(0);
outV.setPname(split[1]);
outV.setFlag("pd");
}
// Emit the record
context.write(outK, outV);
}
}
Because pid is the map output key, records that share a pid, from both tables, arrive in the same reduce call. The flag field tells us which table each record came from: order records are collected into a list, while the pd record supplies the pname, which is then assigned to every order record in the list. One important caveat: values is not an ordinary Java iterator. Hadoop reuses a single object for all elements, so a plain add(value) just stores the same reference over and over and will not produce the expected result.
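A minimal standalone sketch of the same pitfall outside Hadoop (the demo class is hypothetical and only mimics the reuse behavior):

import java.util.ArrayList;
import java.util.List;

// Hypothetical demo: mimics how Hadoop reuses one value object while iterating
public class ObjectReuseDemo {
    static class Bean { String pid; }

    public static void main(String[] args) {
        Bean reused = new Bean();                  // one object, reused for every element
        List<Bean> beans = new ArrayList<>();
        for (String pid : new String[]{"01", "02", "03"}) {
            reused.pid = pid;                      // the "iterator" overwrites the fields
            beans.add(reused);                     // stores the same reference each time
        }
        for (Bean b : beans) {
            System.out.println(b.pid);             // prints "03" three times
        }
    }
}

This is why the reducer below copies each value into a fresh bean with BeanUtils.copyProperties before adding it to the list.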
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {
@Override
protected void reduce(Text key, Iterable<TableBean> values, Reducer<Text, TableBean, TableBean, NullWritable>.Context context) throws IOException, InterruptedException {
ArrayList<TableBean> orderBeans = new ArrayList<>();
TableBean pBean = new TableBean();
for (TableBean value : values) {
if ("order".equals(value.getFlag())) {
TableBean tempTableBean = new TableBean();
try {
BeanUtils.copyProperties(tempTableBean, value);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
orderBeans.add(tempTableBean);
} else {
try {
BeanUtils.copyProperties(pBean, value);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
}
}
// Assign the product name to each order record and emit
for (TableBean orderBean : orderBeans) {
orderBean.setPname(pBean.getPname());
context.write(orderBean, NullWritable.get());
}
}
}
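The original post does not show a driver for the reduce-side join. A minimal sketch, assuming the TableMapper and TableReducer above (the class name and local paths are placeholders):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver for the reduce-side join; adjust the paths to your environment
public class TableDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(TableDriver.class);
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReducer.class);
        // Map output: pid as key, TableBean as value
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);
        // Final output: the joined bean as key, no value
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\hadoop\\input\\reducejoin"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\hadoop\\output\\reducejoin"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}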
Summary: when the data volume is very large, all of the join work lands on the reduce phase, which easily leads to data skew. To prevent this, the join can be moved into the map phase, because each map task processes only one block of data (128 MB by default), so the load stays evenly spread.
Map Join
Map Join suits scenarios where one table is very small and the other is very large.
The small table is cached on the Map side and the join logic is handled there, before the shuffle. This adds work on the Map side, relieves the data pressure on the Reduce side, and minimizes data skew.
It uses the DistributedCache mechanism:
(1) In the Mapper's setup phase, read the cached file into an in-memory collection.
(2) In the Driver class, register the cache file:
// Cache a regular file on the task nodes
job.addCacheFile(new URI("file:///e:/cache/pd.txt"));
// When running on a cluster, use an HDFS path instead
job.addCacheFile(new URI("hdfs://hadoop102:8020/cache/pd.txt"));
Worked Example
Mapper
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
private HashMap<String, String> pdMap = new HashMap<>();
private Text outK = new Text();
@Override
protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
// Fetch the cached file (pd.txt) and load its contents into the map
URI[] cacheFiles = context.getCacheFiles();
URI cacheFile = cacheFiles[0];
FileSystem fs = FileSystem.get(context.getConfiguration());
FSDataInputStream fis = fs.open(new Path(cacheFile));
// Read the data from the stream line by line
BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
String line;
while (StringUtils.isNotEmpty(line=reader.readLine())) {
// Split into pid and pname
String[] fields = line.split("\t");
pdMap.put(fields[0], fields[1]);
}
IOUtils.closeStream(reader);
}
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
// Process order.txt: look up pname by pid and emit the joined line
String line = value.toString();
String[] split = line.split("\t");
String pName = pdMap.get(split[1]);
outK.set(split[0] + "\t" + pName + "\t" + split[2]);
context.write(outK, NullWritable.get());
}
}
Driver
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Driver for the map-side join (class name assumed)
public class MapJoinDriver {
public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
Job job = Job.getInstance(new Configuration());
job.setMapperClass(MapJoinMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.addCacheFile(new URI("file:///D:/hadoop/input/mapjoincache/pd.txt"));
// No reduce phase is needed
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job, new Path("D:\\hadoop\\input\\mapjoin"));
FileOutputFormat.setOutputPath(job, new Path("D:\\hadoop\\output\\mapjoin"));
boolean b = job.waitForCompletion(true);
System.exit(b?0:1);
}
}
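Because setNumReduceTasks(0) skips the shuffle and reduce phases entirely, each mapper writes its joined lines straight to the output directory, and the result should match the merged table from the requirements section.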
This post originally appeared on cnblogs (author: sw-code). When reposting, please credit the original link: https://www.cnblogs.com/sw-code/p/16387043.html