Hadoop（十二）：多数据源Join统计

多数据源

定义：在一个MR任务中可能需要访问和处理两个甚至更多个的数据集。Hadoop由于没有像关系型数据库那样join的连接处理操作，所以多数据源的连接处理会比较复杂，需要程序员自己实现。

Map端数据连接
Reduce端数据连接
总共包含了四种小的连接方式。

Map端数据连接(map side join)

一个大文件(file1)和一个小文件(file2)进行join操作的情况，file2的内容可以放到内存中。
将小表复制多份，让每个map task内存中存在一份（比如存放到hash table/hash map中）.
然后只扫描大表：对于大表中的每一条记录key/value，在hash table中查找是否有相同的key的记录，如果有，则连接后输出即可。

使用方式概述：

直接使用job.addCacheFile(URI)进行添加file2的文件地址。
在map中使用context.getCacheFiles()来获取缓存文件的地址信息
指定的文件地址要求是hdfs上的文件地址，比如:hdfs://hh:8020/beifeng/cache.file

优点：实现简单，shuffle数据量少。

缺点：如果小文件的内容太多，无法存放到内存中，就无法实现，对环境要求比较严格。

package com.beifeng;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Demonstrates a map-side join.<br/>
 * file1 is the large table that is streamed through the mappers; file2 is the
 * small table that is registered as a cache file and loaded by every map task.
 *
 * @author gerry
 */
public class Demo1 {

    /**
     * Configures and submits the demo job.
     *
     * @param args command line arguments (unused)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        // Regular job configuration
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "test");
        job.setJarByClass(Demo1.class);
        job.setMapperClass(DemoMapper.class);
        // Other configuration omitted...

        // Register the small table (file2) as a cache file so every map task can read it
        job.addCacheFile(new Path("hdfs://hh:8020/beifeng/file2.txt").toUri());

        job.waitForCompletion(true);
    }

    /**
     * Mapper that joins every record of the large table against the in-memory
     * copy of the small table.
     *
     * @author gerry
     */
    public static class DemoMapper extends Mapper<Object, Text, Object, Object> {
        // In-memory copy of file2: key is the join field, value holds the remaining file2 fields
        private Map<Object, Object> cache = new HashMap<>();

        /**
         * Loads every registered cache file into {@link #cache} before any record is mapped.
         *
         * @param context task context used to look up the cache files and the configuration
         * @throws IOException if a cache file cannot be opened or closed
         * @throws InterruptedException propagated from the superclass
         */
        @Override
        protected void setup(Mapper<Object, Text, Object, Object>.Context context)
                throws IOException, InterruptedException {
            super.setup(context);

            URI[] uris = context.getCacheFiles(); // all cache file URIs configured on the job
            if (uris == null) {
                // No cache file was registered: leave the cache empty instead of failing with an NPE
                return;
            }
            for (URI uri : uris) {
                Path path = new Path(uri);
                // try-with-resources guarantees the stream is closed even when parsing fails
                try (InputStream is = path.getFileSystem(context.getConfiguration()).open(path)) {
                    // TODO parse the file content from "is" and store the result in cache
                }
            }
        }

        /**
         * Emits a joined record when the join key of the current file1 line is present in the cache.
         *
         * @param key input key supplied by the input format (unused)
         * @param value one line of file1
         * @param context task context used to emit the joined record
         * @throws IOException if the record cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Object, Object>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString(); // current line of file1 (the large file)
            // TODO extract the join key from line
            Object joinKey = null;
            if (this.cache.containsKey(joinKey)) {
                // file2 contains this join key, so the join succeeds and a record is emitted
                Object outputKey = null; // TODO build according to the business logic
                Object outputValue = null; // TODO build according to the business logic
                context.write(outputKey, outputValue);
            }
        }
    }
}

Reduce端数据连接(reduce side join)

在map阶段，map函数同时读取两个文件File1和File2，为了区分两种来源的key/value数据对.
- 对每条数据打一个标签（tag）,比如：tag=1表示来自文件File1，tag=2表示来自文件File2。
- 即：map阶段的主要任务是对不同文件中的数据打标签。
在reduce阶段，reduce函数获取key相同的来自File1和File2文件的value list，然后对于同一个key，对File1和File2中的数据进行join（笛卡尔乘积）。
- 即：reduce阶段进行实际的连接操作。
优点：实现简单。

缺点：shuffle数据量大，对网络要求比较严格。对reduce端的内存有要求。会涉及到二次排序(可选)。

package com.beifeng;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * Demonstrates a reduce-side join.
 *
 * @author gerry
 */
public class Demo2 {

    /**
     * Configures and submits the demo job.
     *
     * @param args command line arguments (unused)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "test");
        // Must reference this class so that the jar containing Demo2 is the one shipped
        job.setJarByClass(Demo2.class);
        // Other configuration...
        job.setMapperClass(DemoMapper.class);
        // Other configuration...
        // Configure the secondary-sort related parameters here
        job.waitForCompletion(true);
    }

    /**
     * Mapper that tags every record with the file it came from.
     *
     * @author gerry
     */
    public static class DemoMapper extends Mapper<Object, Text, MapperOutputKey, MapperOutputValue> {
        private MapperOutputValue ov = new MapperOutputValue();
        private MapperOutputKey ok = new MapperOutputKey();

        /**
         * Tags the current line with its source file and emits it under its join key.
         *
         * @param key input key supplied by the input format (unused)
         * @param value one line of file1 or file2
         * @param context task context used to find the input split and to emit the record
         * @throws IOException if the record cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString(); // current line of whichever input file this split belongs to
            FileSplit fs = (FileSplit) context.getInputSplit();
            if (fs.getPath().toString().contains("file1")) {
                // Record comes from file1

                // Variant A (secondary sort): the tag is carried by the key
                this.ok.tag = 1;
                this.ov.tag = 0;

                // Variant B (no secondary sort): the tag is carried by the value.
                // Both variants are shown for illustration; as written the later
                // assignments win, so keep only the pair that matches the reducer in use.
                this.ok.tag = 0;
                this.ov.tag = 1;

                // Common to both variants
                this.ok.joinKey = null; // TODO: extract the join field from line
                this.ov.file1Value = line; // payload of the file1 record
                this.ov.file2Value = null; // reset: the output object is reused between calls
            } else {
                // Record comes from file2

                // Variant A (secondary sort)
                this.ok.tag = 2;
                this.ov.tag = 0;
                // Variant B (no secondary sort); see the note in the file1 branch
                this.ok.tag = 0;
                this.ov.tag = 2;

                // Common to both variants
                this.ok.joinKey = null; // TODO: extract the join field from line
                this.ov.file2Value = line; // payload of the file2 record
                this.ov.file1Value = null; // reset: the output object is reused between calls
            }

            context.write(ok, ov);
        }
    }

    /**
     * Reducer for the secondary-sort variant. It expects the input to be partitioned and grouped
     * by joinKey and sorted by joinKey + tag: first by joinKey, then by tag, with tag 1 (file1)
     * before tag 2 (file2).
     *
     * @author gerry
     */
    public static class DemoReducer extends Reducer<MapperOutputKey, MapperOutputValue, Object, Object> {
        // Assumes a one-to-many join, or a few file1 records matching many file2 records,
        // so that only the file1 side has to be buffered
        private Set<Object> file1Cache = new HashSet<>();

        /**
         * Buffers the file1 values of the group and joins every following file2 value against them.
         *
         * @param key join key; its tag is read while iterating to tell file1 records from file2 records
         * @param values all values sharing the join key
         * @param context task context used to emit joined records
         * @throws IOException if a record cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(MapperOutputKey key, Iterable<MapperOutputValue> values, Context context)
                throws IOException, InterruptedException {
            this.file1Cache.clear(); // drop the previous group's data
            for (MapperOutputValue value : values) {
                if (key.tag == 1) {
                    // Still in the leading run of file1 records
                    this.file1Cache.add(value.file1Value);
                } else {
                    // tag != 1: all file1 records of this group have been read, join with file2
                    Object file2v = value.file2Value;
                    for (Object file1v : this.file1Cache) {
                        // TODO join file1v with file2v and emit the result
                    }
                }
            }
        }
    }

    /**
     * Reducer for the variant without secondary sort: the tag in the key is always 0 and the
     * value carries the tag instead, with the same meaning.
     *
     * @author gerry
     */
    public static class DemoReducer2 extends Reducer<MapperOutputKey, MapperOutputValue, Object, Object> {
        private Set<Object> file1Cache = new HashSet<>();
        private Set<Object> file2Cache = new HashSet<>();

        /**
         * Splits the values of the group by source file, then joins the two sides.
         *
         * @param key join key of the group
         * @param values all values sharing the join key, in no particular order
         * @param context task context used to emit joined records
         * @throws IOException if a record cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(MapperOutputKey key, Iterable<MapperOutputValue> values, Context context)
                throws IOException, InterruptedException {
            this.file1Cache.clear(); // drop the previous group's data
            this.file2Cache.clear();

            for (MapperOutputValue value : values) {
                if (value.tag == 1) {
                    this.file1Cache.add(value.file1Value);
                } else {
                    this.file2Cache.add(value.file2Value);
                }
            }

            // Cartesian product of both sides
            for (Object v1 : this.file1Cache) {
                for (Object v2 : this.file2Cache) {
                    // TODO join v1 with v2 and emit the result
                }
            }
        }
    }

    /**
     * Map output value: the payload of one record plus, for the variant without secondary sort,
     * the tag of the file it came from.
     */
    public static class MapperOutputValue implements Writable {
        public Object file1Value;
        public Object file2Value;
        public int tag; // only meaningful when secondary sort is not used

        @Override
        public void write(DataOutput out) throws IOException {
            // TODO serialize file1Value, file2Value and tag; nothing is written yet

        }

        @Override
        public void readFields(DataInput in) throws IOException {
            // TODO restore the fields in the same order write() emits them

        }

    }

    /**
     * Map output key: the join key plus, for the secondary-sort variant, the source tag.
     */
    public static class MapperOutputKey implements WritableComparable<MapperOutputKey> {
        public Object joinKey;
        public int tag; // source tag

        @Override
        public void write(DataOutput out) throws IOException {
            // TODO serialize joinKey and tag; nothing is written yet
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            // TODO restore the fields in the same order write() emits them
        }

        @Override
        public int compareTo(MapperOutputKey o) {
            // TODO compare by joinKey, then by tag; this placeholder treats all keys as equal
            return 0;
        }

    }
}

半连接 Semi join

为了解决reduce join高网络传输量的一种解决方案。目标是能够在map端过滤掉不会参加join操作的数据，则可以大大节省网络IO。
选取一个小表，假设是File1，将其参与join的key抽取出来，保存到文件File3中，File3文件一般很小，可以放到内存中。
在map阶段，使用job.addCacheFile方法将File3复制到各个执行Task节点上，然后将File2中不在File3中的key对应的记录过滤掉，剩下的reduce阶段的工作与reduce side join相同。

其实这种方式就是结合map side join和reduce side join的一种连接方式。

优点：降低shuffle网络传输量。

缺点：增加了编程复杂性，增加了对map端内存的压力。

package com.beifeng;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * Demonstrates a semi join.<br/>
 * file1 is joined with file2; the join keys of file1 are extracted beforehand into file3.<br/>
 * Normally the smaller of file1 and file2 is the one whose join keys are extracted into file3.
 *
 * @author gerry
 */
public class Demo3 {

    /**
     * Configures and submits the demo job.
     *
     * @param args command line arguments (unused)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "test");
        // Must reference this class so that the jar containing Demo3 is the one shipped
        job.setJarByClass(Demo3.class);
        // Other configuration...
        job.setMapperClass(DemoMapper.class);
        // Other configuration...
        // Configure the secondary sort needed by the reduce-side join here

        // Register the extracted join keys (file3) as a cache file
        job.addCacheFile(new Path("hdfs://hh:8020/beifeng/file3.txt").toUri());

        job.waitForCompletion(true);
    }

    /**
     * Mapper that tags records with their source file and drops file2 records whose join key
     * is not present in file3.
     *
     * @author gerry
     */
    public static class DemoMapper extends Mapper<Object, Text, MapperOutputKey, MapperOutputValue> {
        private MapperOutputValue ov = new MapperOutputValue();
        private MapperOutputKey ok = new MapperOutputKey();

        // Join keys loaded from file3
        private Set<Object> joinKeySet = new HashSet<>();

        /**
         * Loads every registered cache file into {@link #joinKeySet}.
         *
         * @param context task context used to look up the cache files and the configuration
         * @throws IOException if a cache file cannot be opened or closed
         * @throws InterruptedException declared by the overridden method
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            URI[] uris = context.getCacheFiles(); // all cache file URIs configured on the job
            if (uris == null) {
                // No cache file was registered: leave the key set empty instead of failing with an NPE
                return;
            }
            for (URI uri : uris) {
                Path path = new Path(uri);
                // try-with-resources guarantees the stream is closed even when parsing fails
                try (InputStream is = path.getFileSystem(context.getConfiguration()).open(path)) {
                    // TODO parse the join keys from "is" and add them to joinKeySet
                }
            }
        }

        /**
         * Tags the current line with its source file; file2 lines are emitted only when their
         * join key is contained in {@link #joinKeySet}.
         *
         * @param key input key supplied by the input format (unused)
         * @param value one line of file1 or file2
         * @param context task context used to find the input split and to emit the record
         * @throws IOException if the record cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString(); // current line of whichever input file this split belongs to
            FileSplit fs = (FileSplit) context.getInputSplit();
            if (fs.getPath().toString().contains("file1")) {
                // Record comes from file1

                // Variant A (secondary sort): the tag is carried by the key
                this.ok.tag = 1;
                this.ov.tag = 0;

                // Variant B (no secondary sort): the tag is carried by the value.
                // Both variants are shown for illustration; as written the later
                // assignments win, so keep only the pair that matches the reducer in use.
                this.ok.tag = 0;
                this.ov.tag = 1;

                // Common to both variants
                this.ok.joinKey = null; // TODO: extract the join field from line
                this.ov.file1Value = line; // payload of the file1 record
                this.ov.file2Value = null; // reset: the output object is reused between calls
                context.write(ok, ov);
            } else {
                // Record comes from file2
                Object joinKey = null; // TODO: extract the join field from line
                if (this.joinKeySet.contains(joinKey)) {
                    // The key exists in file3, so the record can take part in the join;
                    // all other file2 records are filtered out here, before the shuffle

                    // Variant A (secondary sort)
                    this.ok.tag = 2;
                    this.ov.tag = 0;
                    // Variant B (no secondary sort); see the note in the file1 branch
                    this.ok.tag = 0;
                    this.ov.tag = 2;

                    // Common to both variants
                    this.ok.joinKey = joinKey;
                    this.ov.file2Value = line; // payload of the file2 record
                    this.ov.file1Value = null; // reset: the output object is reused between calls
                    context.write(ok, ov);
                }
            }

        }
    }

    /**
     * Reducer for the secondary-sort variant. It expects the input to be partitioned and grouped
     * by joinKey and sorted by joinKey + tag: first by joinKey, then by tag, with tag 1 (file1)
     * before tag 2 (file2).
     *
     * @author gerry
     */
    public static class DemoReducer extends Reducer<MapperOutputKey, MapperOutputValue, Object, Object> {
        // Assumes a one-to-many join, or a few file1 records matching many file2 records,
        // so that only the file1 side has to be buffered
        private Set<Object> file1Cache = new HashSet<>();

        /**
         * Buffers the file1 values of the group and joins every following file2 value against them.
         *
         * @param key join key; its tag is read while iterating to tell file1 records from file2 records
         * @param values all values sharing the join key
         * @param context task context used to emit joined records
         * @throws IOException if a record cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(MapperOutputKey key, Iterable<MapperOutputValue> values, Context context)
                throws IOException, InterruptedException {
            this.file1Cache.clear(); // drop the previous group's data
            for (MapperOutputValue value : values) {
                if (key.tag == 1) {
                    // Still in the leading run of file1 records
                    this.file1Cache.add(value.file1Value);
                } else {
                    // tag != 1: all file1 records of this group have been read, join with file2
                    Object file2v = value.file2Value;
                    for (Object file1v : this.file1Cache) {
                        // TODO join file1v with file2v and emit the result
                    }
                }
            }
        }
    }

    /**
     * Reducer for the variant without secondary sort: the tag in the key is always 0 and the
     * value carries the tag instead, with the same meaning.
     *
     * @author gerry
     */
    public static class DemoReducer2 extends Reducer<MapperOutputKey, MapperOutputValue, Object, Object> {
        private Set<Object> file1Cache = new HashSet<>();
        private Set<Object> file2Cache = new HashSet<>();

        /**
         * Splits the values of the group by source file, then joins the two sides.
         *
         * @param key join key of the group
         * @param values all values sharing the join key, in no particular order
         * @param context task context used to emit joined records
         * @throws IOException if a record cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(MapperOutputKey key, Iterable<MapperOutputValue> values, Context context)
                throws IOException, InterruptedException {
            this.file1Cache.clear(); // drop the previous group's data
            this.file2Cache.clear();

            for (MapperOutputValue value : values) {
                if (value.tag == 1) {
                    this.file1Cache.add(value.file1Value);
                } else {
                    this.file2Cache.add(value.file2Value);
                }
            }

            // Cartesian product of both sides
            for (Object v1 : this.file1Cache) {
                for (Object v2 : this.file2Cache) {
                    // TODO join v1 with v2 and emit the result
                }
            }
        }
    }

    /**
     * Map output value: the payload of one record plus, for the variant without secondary sort,
     * the tag of the file it came from.
     */
    public static class MapperOutputValue implements Writable {
        public Object file1Value;
        public Object file2Value;
        public int tag; // only meaningful when secondary sort is not used

        @Override
        public void write(DataOutput out) throws IOException {
            // TODO serialize file1Value, file2Value and tag; nothing is written yet

        }

        @Override
        public void readFields(DataInput in) throws IOException {
            // TODO restore the fields in the same order write() emits them

        }

    }

    /**
     * Map output key: the join key plus, for the secondary-sort variant, the source tag.
     */
    public static class MapperOutputKey implements WritableComparable<MapperOutputKey> {
        public Object joinKey;
        public int tag; // source tag

        @Override
        public void write(DataOutput out) throws IOException {
            // TODO serialize joinKey and tag; nothing is written yet
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            // TODO restore the fields in the same order write() emits them
        }

        @Override
        public int compareTo(MapperOutputKey o) {
            // TODO compare by joinKey, then by tag; this placeholder treats all keys as equal
            return 0;
        }

    }
}

Reduce端连接+BloomFilter过滤(reduce side join + BloomFilter)

某些情况下，Semi Join抽取出来的小表的key集合在内存中仍然存放不下，这时候可以使用BloomFilter以节省空间。
BloomFilter的主要作用是判断元素是否在一个集合中。BloomFilter不会出现false negative(返回false，但是元素实际存在)，会出现少量的false positive(返回true，但是元素实际不存在)。
因而可能有一些不在小表中的记录没有过滤掉（但是在小表中的记录一定不会过滤掉），这没什么关系，不会影响最终运算结果，只不过增加了少量的网络IO而已。其他处理方式同半连接。
其实和半连接一样，只是在map阶段不是把Join key放在文件中，而是放在BloomFilter。

优点：降低shuffle网络传输量，降低对map端内存的压力。

缺点：增加了编程复杂性。

package com.beifeng;

import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;

/**
 * Demonstrates a semi join that filters with a BloomFilter.<br/>
 * The approach is the same as the plain semi join except for how the mapper filters records.
 *
 * @author gerry
 */
public class Demo4 {

    /**
     * Configures and submits the demo job.
     *
     * @param args command line arguments (unused)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "test");
        // Must reference this class so that the jar containing Demo4 is the one shipped
        job.setJarByClass(Demo4.class);
        // Other configuration...
        job.setMapperClass(DemoMapper.class);
        // Other configuration...
        // Configure the secondary sort needed by the reduce-side join here

        // Register the extracted join keys (file3) as a cache file
        job.addCacheFile(new Path("hdfs://hh:8020/beifeng/file3.txt").toUri());

        job.waitForCompletion(true);
    }

    /**
     * Mapper that tags records with their source file and drops file2 records whose join key
     * is rejected by the BloomFilter built from file3.
     *
     * @author gerry
     */
    public static class DemoMapper extends Mapper<Object, Text, MapperOutputKey, MapperOutputValue> {
        private MapperOutputValue ov = new MapperOutputValue();
        private MapperOutputKey ok = new MapperOutputKey();

        // Filter holding the join keys of file3
        private BloomFilter bf = null;

        /**
         * Builds the BloomFilter from the cache files in two passes: the first counts the keys so
         * the filter can be sized, the second adds them. Two passes avoid holding all keys in memory.
         *
         * <p>NOTE(review): assumes every cache file stores one join key per line - confirm
         * against the job that produces file3.
         *
         * @param context task context used to look up the cache files and the configuration
         * @throws IOException if a cache file cannot be read
         * @throws InterruptedException declared by the overridden method
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            URI[] uris = context.getCacheFiles(); // all cache file URIs configured on the job
            if (uris == null) {
                uris = new URI[0]; // no cache file registered: build an empty filter
            }
            Configuration conf = context.getConfiguration();

            // Pass 1: count the non-empty lines of all cache files
            int keyCount = 0;
            for (URI uri : uris) {
                try (BufferedReader reader = openReader(uri, conf)) {
                    String le;
                    while ((le = reader.readLine()) != null) {
                        if (!le.isEmpty()) {
                            keyCount++;
                        }
                    }
                }
            }

            // A vector size of 0 makes the filter divide by zero on the first add, so use at least 1.
            // The third argument (hash type) must be 0 or 1.
            this.bf = new BloomFilter(Math.max(1, keyCount), 2, 0);

            // Pass 2: add every join key to the filter
            for (URI uri : uris) {
                try (BufferedReader reader = openReader(uri, conf)) {
                    String le;
                    while ((le = reader.readLine()) != null) {
                        // Empty keys are skipped because the filter's hash rejects zero-length input
                        if (!le.isEmpty()) {
                            this.bf.add(new Key(le.getBytes(StandardCharsets.UTF_8)));
                        }
                    }
                }
            }
        }

        /** Opens a cache file as a UTF-8 line reader; the caller is responsible for closing it. */
        private static BufferedReader openReader(URI uri, Configuration conf) throws IOException {
            Path path = new Path(uri);
            return new BufferedReader(
                    new InputStreamReader(path.getFileSystem(conf).open(path), StandardCharsets.UTF_8));
        }

        /**
         * Tags the current line with its source file; file2 lines are emitted only when the
         * BloomFilter reports that their join key may be present in file3.
         *
         * @param key input key supplied by the input format (unused)
         * @param value one line of file1 or file2
         * @param context task context used to find the input split and to emit the record
         * @throws IOException if the record cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString(); // current line of whichever input file this split belongs to
            FileSplit fs = (FileSplit) context.getInputSplit();
            if (fs.getPath().toString().contains("file1")) {
                // Record comes from file1

                // Variant A (secondary sort): the tag is carried by the key
                this.ok.tag = 1;
                this.ov.tag = 0;

                // Variant B (no secondary sort): the tag is carried by the value.
                // Both variants are shown for illustration; as written the later
                // assignments win, so keep only the pair that matches the reducer in use.
                this.ok.tag = 0;
                this.ov.tag = 1;

                // Common to both variants
                this.ok.joinKey = null; // TODO: extract the join field from line
                this.ov.file1Value = line; // payload of the file1 record
                this.ov.file2Value = null; // reset: the output object is reused between calls
                context.write(ok, ov);
            } else {
                // Record comes from file2
                Object joinKey = null; // TODO: extract the join field from line
                // A missing or empty key can never match, and must not reach the filter
                String joinKeyText = joinKey == null ? "" : joinKey.toString();
                if (!joinKeyText.isEmpty()
                        && this.bf.membershipTest(new Key(joinKeyText.getBytes(StandardCharsets.UTF_8)))) {
                    // The filter reports a possible match, so the record takes part in the join;
                    // all other file2 records are filtered out here, before the shuffle

                    // Variant A (secondary sort)
                    this.ok.tag = 2;
                    this.ov.tag = 0;
                    // Variant B (no secondary sort); see the note in the file1 branch
                    this.ok.tag = 0;
                    this.ov.tag = 2;

                    // Common to both variants
                    this.ok.joinKey = joinKey;
                    this.ov.file2Value = line; // payload of the file2 record
                    this.ov.file1Value = null; // reset: the output object is reused between calls
                    context.write(ok, ov);
                }
            }

        }
    }

    /**
     * Reducer for the secondary-sort variant. It expects the input to be partitioned and grouped
     * by joinKey and sorted by joinKey + tag: first by joinKey, then by tag, with tag 1 (file1)
     * before tag 2 (file2).
     *
     * @author gerry
     */
    public static class DemoReducer extends Reducer<MapperOutputKey, MapperOutputValue, Object, Object> {
        // Assumes a one-to-many join, or a few file1 records matching many file2 records,
        // so that only the file1 side has to be buffered
        private Set<Object> file1Cache = new HashSet<>();

        /**
         * Buffers the file1 values of the group and joins every following file2 value against them.
         *
         * @param key join key; its tag is read while iterating to tell file1 records from file2 records
         * @param values all values sharing the join key
         * @param context task context used to emit joined records
         * @throws IOException if a record cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(MapperOutputKey key, Iterable<MapperOutputValue> values, Context context)
                throws IOException, InterruptedException {
            this.file1Cache.clear(); // drop the previous group's data
            for (MapperOutputValue value : values) {
                if (key.tag == 1) {
                    // Still in the leading run of file1 records
                    this.file1Cache.add(value.file1Value);
                } else {
                    // tag != 1: all file1 records of this group have been read, join with file2
                    Object file2v = value.file2Value;
                    for (Object file1v : this.file1Cache) {
                        // TODO join file1v with file2v and emit the result
                    }
                }
            }
        }
    }

    /**
     * Reducer for the variant without secondary sort: the tag in the key is always 0 and the
     * value carries the tag instead, with the same meaning.
     *
     * @author gerry
     */
    public static class DemoReducer2 extends Reducer<MapperOutputKey, MapperOutputValue, Object, Object> {
        private Set<Object> file1Cache = new HashSet<>();
        private Set<Object> file2Cache = new HashSet<>();

        /**
         * Splits the values of the group by source file, then joins the two sides.
         *
         * @param key join key of the group
         * @param values all values sharing the join key, in no particular order
         * @param context task context used to emit joined records
         * @throws IOException if a record cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(MapperOutputKey key, Iterable<MapperOutputValue> values, Context context)
                throws IOException, InterruptedException {
            this.file1Cache.clear(); // drop the previous group's data
            this.file2Cache.clear();

            for (MapperOutputValue value : values) {
                if (value.tag == 1) {
                    this.file1Cache.add(value.file1Value);
                } else {
                    this.file2Cache.add(value.file2Value);
                }
            }

            // Cartesian product of both sides
            for (Object v1 : this.file1Cache) {
                for (Object v2 : this.file2Cache) {
                    // TODO join v1 with v2 and emit the result
                }
            }
        }
    }

    /**
     * Map output value: the payload of one record plus, for the variant without secondary sort,
     * the tag of the file it came from.
     */
    public static class MapperOutputValue implements Writable {
        public Object file1Value;
        public Object file2Value;
        public int tag; // only meaningful when secondary sort is not used

        @Override
        public void write(DataOutput out) throws IOException {
            // TODO serialize file1Value, file2Value and tag; nothing is written yet

        }

        @Override
        public void readFields(DataInput in) throws IOException {
            // TODO restore the fields in the same order write() emits them

        }

    }

    /**
     * Map output key: the join key plus, for the secondary-sort variant, the source tag.
     */
    public static class MapperOutputKey implements WritableComparable<MapperOutputKey> {
        public Object joinKey;
        public int tag; // source tag

        @Override
        public void write(DataOutput out) throws IOException {
            // TODO serialize joinKey and tag; nothing is written yet
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            // TODO restore the fields in the same order write() emits them
        }

        @Override
        public int compareTo(MapperOutputKey o) {
            // TODO compare by joinKey, then by tag; this placeholder treats all keys as equal
            return 0;
        }

    }
}

案例：倒排索引

一般正常的索引是根据记录来确定属性值，但是在实际的应用中，常常有需要根据属性值确定记录的情况，这种情况下，一般的索引无法进行，这个时候就需要引入倒排索引(inverted index)。

直白来讲倒排索引就是根据值查询记录的索引方式。

案例：

3个txt，里面有一些单词。

要求使用MR任务输出一个统计结果，按单词排序，每个单词显示来源的txt文件名，有多个txt的按出现次数排序。
这就是一个倒排索引，根据文档中的单词映射文档地址的倒排索引。
其他要求如下：

各个文件需要根据指定的关键词创建倒排索引
去掉文件中特殊字符(!,.)和黑名单字符(其他文件指定)（涉及到ChainMapper和map side join）

计算关键词在文档中出现次数，最终结果按照次数从高到低进行排序；格式为keyword -> file1,file2,file3(依赖关系组合MR任务解决该要求)

思路

建立filter Mapper，过滤不需要的单词。
- 使用map side join实现。
建立Count Mapper记录每个单词和文档。
- 两个Mapper的链接使用链式MR结构实现。Chain MR
建立Count Reducer统计每个单词出现的文档和次数。
- 因为是按单词+路径统计次数，所以输入的键是单词+路径，输入的次数是1。
建立Sort Mapper获取Count 的结果。
- Sort MR是依赖于Count MR完成的，使用组合式MR（顺序式也可以）
对Mapper排序分组，得到最终结果。

Count

filter Mapper

package com.rzp.fieldindex;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;

/**
 * First mapper of the inverted-index chain: splits every input line into words, strips the
 * special characters, drops blacklisted words and emits (file path, word) pairs.
 * The blacklist is loaded from the job's cache files (map-side join).
 */
public class InvertedIndexFilterMapper extends Mapper<Object, Text, Text, Text> {

    // Characters removed from every token before it is checked against the blacklist
    private static final String[] removeChars = new String[] { "!", ",", "." };
    // Lower-cased blacklist words
    private Set<String> blackWordList = new HashSet<String>();
    // Reused output objects
    private Text filePath = new Text();
    private Text word = new Text();

    /**
     * Loads the blacklist, one word per line, from every registered cache file.
     *
     * @param context task context used to look up the cache files and the configuration
     * @throws IOException if a cache file cannot be read
     * @throws InterruptedException propagated from the superclass
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);

        Configuration conf = context.getConfiguration();
        URI[] uris = context.getCacheFiles();
        if (uris == null) {
            // No blacklist file was registered: run with an empty blacklist instead of failing with an NPE
            return;
        }
        for (URI uri : uris) {
            Path path = new Path(uri);
            // Decode explicitly as UTF-8 (the encoding Text uses) rather than the node's default charset
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(path.getFileSystem(conf).open(path), "UTF-8"))) {
                String line;
                while ((line = br.readLine()) != null) {
                    // Tokens never contain whitespace, so an untrimmed entry could never match
                    String entry = line.trim().toLowerCase();
                    if (!entry.isEmpty()) {
                        blackWordList.add(entry);
                    }
                }
            }
        }
    }

    /**
     * Emits one (file path, word) pair for every valid word of the line.
     *
     * @param key input key supplied by the input format (unused)
     * @param value one line of an input document
     * @param context task context used to find the input split and to emit the pairs
     * @throws IOException if a pair cannot be written
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        FileSplit fsplit = (FileSplit) context.getInputSplit();
        String path = fsplit.getPath().toString(); // path of the document being read
        this.filePath.set(path);

        String line = value.toString();
        StringTokenizer tokens = new StringTokenizer(line); // split on whitespace
        while (tokens.hasMoreTokens()) {
            String keyword = this.removeChar(tokens.nextToken());
            if (this.isValidKeyword(keyword)) {
                this.word.set(keyword);
                context.write(this.filePath, this.word);
            }
        }
    }

    /** Removes the special characters "!", "," and "." from the keyword. */
    private String removeChar(String keyword) {
        if (StringUtils.isNotBlank(keyword)) {
            for (String ch : removeChars) {
                keyword = keyword.replace(ch, "");
            }
        }
        return keyword;
    }

    /** Returns true when the keyword is not blank and not on the blacklist (case-insensitive). */
    private boolean isValidKeyword(String keyword) {
        return StringUtils.isNotBlank(keyword) && !this.blackWordList.contains(keyword.toLowerCase());
    }
}

CountMapper

package com.rzp.fieldindex;

import com.rzp.pojo.InvertedIndexCountMapperKey;
import com.rzp.pojo.InvertedIndexCountValue;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Second mapper of the chain: turns every (path, keyword) pair into a
 * composite (keyword, path) key with a count of 1.
 */
public class InvertedIndexCountMapper extends Mapper<Text, Text, InvertedIndexCountMapperKey, IntWritable> {
    // Reused output objects
    private InvertedIndexCountMapperKey outputKey = new InvertedIndexCountMapperKey();
    private IntWritable one = new IntWritable(1);

    /**
     * Emits the composite key for one occurrence of a keyword in a document.
     *
     * @param key path of the document
     * @param value keyword found in the document
     * @param context task context used to emit the pair
     */
    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        final String documentPath = key.toString();
        final String term = value.toString();

        outputKey.setPath(documentPath);
        outputKey.setKeyword(term);

        context.write(outputKey, one);
    }
}

Mapper输出key

package com.rzp.pojo;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Composite map output key made of a keyword and the path of the document it occurs in.
 * Keys are ordered by keyword first and by path second.
 */
public class InvertedIndexCountMapperKey implements WritableComparable <InvertedIndexCountMapperKey> {
    private String keyword;
    private String path;

    /**
     * Orders keys by keyword, then by path.
     *
     * @param o the key to compare with
     * @return a negative, zero or positive value as this key sorts before, equal to or after {@code o}
     */
    @Override
    public int compareTo(InvertedIndexCountMapperKey o) {
        int tmp = this.keyword.compareTo(o.keyword);
        if (tmp != 0) {
            return tmp;
        }
        return this.path.compareTo(o.path);
    }

    /** Serializes keyword and path, in that order. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.keyword);
        out.writeUTF(this.path);
    }

    /** Restores keyword and path in the order {@link #write(DataOutput)} emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.keyword = in.readUTF();
        this.path = in.readUTF();
    }

    /**
     * Two keys are equal when both keyword and path are equal.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof InvertedIndexCountMapperKey)) {
            return false;
        }
        InvertedIndexCountMapperKey other = (InvertedIndexCountMapperKey) obj;
        boolean sameKeyword = this.keyword == null ? other.keyword == null : this.keyword.equals(other.keyword);
        boolean samePath = this.path == null ? other.path == null : this.path.equals(other.path);
        return sameKeyword && samePath;
    }

    /**
     * Content-based hash. Hadoop's default HashPartitioner picks the reducer from the key's
     * hashCode(); the identity hash inherited from Object differs between instances and JVMs,
     * which would scatter equal keys over several reducers and split their counts.
     */
    @Override
    public int hashCode() {
        int result = this.keyword == null ? 0 : this.keyword.hashCode();
        result = 31 * result + (this.path == null ? 0 : this.path.hashCode());
        return result;
    }

    @Override
    public String toString() {
        return "InvertedIndexCountMapperKey{" +
                "keyword='" + keyword + '\'' +
                ", path='" + path + '\'' +
                '}';
    }

    public String getKeyword() {
        return keyword;
    }

    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    public String getPath() {
        return path;
    }

    public void setPath(String path) {
        this.path = path;
    }
}

Reduce

package com.rzp.fieldindex;

import com.rzp.pojo.InvertedIndexCountMapperKey;
import com.rzp.pojo.InvertedIndexCountValue;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Counts how many times a keyword occurs in a document and emits one
 * (keyword, path, count) record per composite key.
 */
public class InvertedIndexCountReducer extends Reducer<InvertedIndexCountMapperKey, IntWritable, NullWritable, InvertedIndexCountValue> {
    // Reused output object
    private InvertedIndexCountValue outputValue = new InvertedIndexCountValue();

    /**
     * Sums the partial counts of one (keyword, path) key and writes the total.
     *
     * @param key composite key identifying the keyword and the document
     * @param values partial counts for the key
     * @param context task context used to emit the record
     */
    @Override
    protected void reduce(InvertedIndexCountMapperKey key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Identity of the record comes straight from the key
        outputValue.setKeyword(key.getKeyword());
        outputValue.setPath(key.getPath());

        // Accumulate the occurrences
        int occurrences = 0;
        for (IntWritable partial : values) {
            occurrences = occurrences + partial.get();
        }
        outputValue.setCount(occurrences);

        context.write(NullWritable.get(), outputValue);
    }
}

Reduce输出value，重写我们需要的toString

package com.rzp.pojo;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Output value of the count job: a keyword, the path it was found in, and
 * the number of occurrences. {@link #toString()} renders the three fields
 * separated by tabs.
 */
public class InvertedIndexCountValue implements Writable {
    private String keyword; // the keyword
    private String path;
    private int count;

    /**
     * Writes keyword, path and count, in that order.
     *
     * @param out sink receiving the fields
     * @throws IOException if writing to {@code out} fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(keyword);
        out.writeUTF(path);
        out.writeInt(this.count);
    }

    /**
     * Reads keyword, path and count, in that order.
     *
     * @param in source to read the fields from
     * @throws IOException if reading from {@code in} fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        keyword = in.readUTF();
        path = in.readUTF();
        count = in.readInt();
    }

    /** Returns {@code keyword<TAB>path<TAB>count}. */
    @Override
    public String toString() {
        return keyword + "\t" + path + "\t" + count;
    }

    /** Returns the keyword. */
    public String getKeyword() {
        return this.keyword;
    }

    /** Sets the keyword. */
    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    /** Returns the path. */
    public String getPath() {
        return this.path;
    }

    /** Sets the path. */
    public void setPath(String path) {
        this.path = path;
    }

    /** Returns the occurrence count. */
    public int getCount() {
        return this.count;
    }

    /** Sets the occurrence count. */
    public void setCount(int count) {
        this.count = count;
    }
}

Count Runner

package com.rzp.fieldindex;

import com.rzp.pojo.InvertedIndexCountMapperKey;
import com.rzp.pojo.InvertedIndexCountValue;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;


/**
 * Builds the first job ("job1") of the pipeline: a chain of
 * {@code InvertedIndexFilterMapper} and {@code InvertedIndexCountMapper}
 * followed by {@code InvertedIndexCountReducer}, reading from a fixed input
 * directory and writing to {@link #outputPath}.
 */
public class InvertedIndexCountRunner {
    /** Output directory of this job. */
    public static final String outputPath = "D:\\Hoptest\\fieldindex\\out";
    private Configuration conf = null;

    /**
     * @param conf configuration the job is created from
     */
    public InvertedIndexCountRunner(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Creates and configures the count job.
     *
     * <p>Side effect: recursively deletes {@link #outputPath} if it exists,
     * so the job can be re-run.
     *
     * @return the configured, not yet submitted job
     * @throws IOException if the job cannot be created or the output
     *                     directory cannot be deleted
     */
    public Job creatJob() throws IOException {
        Job job = Job.getInstance(this.conf,"job1");
        // Identify the jar that carries the job classes, as the sort job does;
        // without it a non-local run may fail to find the mapper/reducer classes.
        job.setJarByClass(InvertedIndexCountRunner.class);

        ChainMapper.addMapper(job,InvertedIndexFilterMapper.class,Object.class, Text.class, Text.class,Text.class, new Configuration(false));
        ChainMapper.addMapper(job,InvertedIndexCountMapper.class,Text.class, Text.class, InvertedIndexCountMapperKey.class, IntWritable.class, new Configuration(false));
        ChainReducer.setReducer(job,InvertedIndexCountReducer.class,InvertedIndexCountMapperKey.class,IntWritable.class, NullWritable.class, InvertedIndexCountValue.class,new Configuration(false));
        job.setMapOutputKeyClass(InvertedIndexCountMapperKey.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(InvertedIndexCountValue.class);

        FileInputFormat.addInputPath(job,new Path("D:\\Hoptest\\fieldindex\\in"));
        Path outputDir = new Path(outputPath);
        // Remove any previous output before pointing the job at the directory.
        outputDir.getFileSystem(this.conf).delete(outputDir,true);
        FileOutputFormat.setOutputPath(job,outputDir);

        // Register the side-join file as a cache file.
        job.addCacheFile(new Path("D:\\Hoptest\\fieldindex\\in\\black_list.txt").toUri());
        return job;
    }
}

Sort

Mapper

package com.rzp.sort;

import com.rzp.pojo.InvertedIndexSortMapperKey;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Parses one tab-separated line with exactly three fields into a
 * ({@link InvertedIndexSortMapperKey}, {@link Text}) pair: field 0 becomes
 * the key's keyword, field 2 the key's count, and field 1 the output value.
 */
public class InvertedIndexSortMapper extends Mapper<Object, Text, InvertedIndexSortMapperKey,Text> {
    // Reused across map() calls; fully overwritten before each write.
    private InvertedIndexSortMapperKey outputKey = new InvertedIndexSortMapperKey();
    private Text outputValue = new Text();

    /**
     * @param key     ignored
     * @param value   input line, expected to hold three tab-separated fields
     * @param context used to emit the parsed pair
     * @throws RuntimeException      if the line does not split into exactly three fields
     * @throws NumberFormatException if the third field is not an integer
     */
    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] splits = line.split("\t");
        if (splits.length != 3) {
            // Include the offending record so bad input can be located.
            throw new RuntimeException("本条数据异常: " + line);
        }

        this.outputKey.setKeyword(splits[0]);
        // parseInt yields the primitive directly; no boxing round-trip.
        this.outputKey.setCount(Integer.parseInt(splits[2]));
        this.outputValue.set(splits[1]);
        context.write(this.outputKey, this.outputValue);
    }
}

Mapper输出key

package com.rzp.pojo;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Map output key of the sort job: a keyword plus a count. Keys order by
 * keyword first, then by count ascending.
 */
public class InvertedIndexSortMapperKey implements WritableComparable<InvertedIndexSortMapperKey> {
    private String keyword;
    private int count;

    /**
     * Compares by keyword, then by count.
     *
     * @param o key to compare against
     * @return negative, zero or positive per the {@link Comparable} contract
     */
    @Override
    public int compareTo(InvertedIndexSortMapperKey o) {
        int tmp = this.keyword.compareTo(o.keyword);
        if (tmp != 0) {
            return tmp;
        }
        return Integer.compare(this.count, o.count);
    }

    /**
     * Equality consistent with {@link #compareTo}: same keyword and same count.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof InvertedIndexSortMapperKey)) {
            return false;
        }
        InvertedIndexSortMapperKey other = (InvertedIndexSortMapperKey) obj;
        boolean sameKeyword = this.keyword == null
                ? other.keyword == null
                : this.keyword.equals(other.keyword);
        return sameKeyword && this.count == other.count;
    }

    /**
     * Hashes on the keyword only. This stays consistent with {@link #equals}
     * and gives every key of one keyword the same value-based hash, so
     * hash-based partitioning keeps a keyword's records together instead of
     * relying on identity hash codes.
     */
    @Override
    public int hashCode() {
        return this.keyword == null ? 0 : this.keyword.hashCode();
    }

    /** Writes the keyword, then the count. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.keyword);
        out.writeInt(this.count);
    }

    /** Reads the keyword, then the count. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.keyword = in.readUTF();
        this.count = in.readInt();
    }

    /** Returns {@code InvertedIndexSortMapperKey{keyword='...', count=N}}. */
    @Override
    public String toString() {
        return "InvertedIndexSortMapperKey{" +
                "keyword='" + keyword + '\'' +
                ", count=" + count +
                '}';
    }

    /** Returns the keyword. */
    public String getKeyword() {
        return keyword;
    }

    /** Sets the keyword. */
    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    /** Returns the count. */
    public int getCount() {
        return count;
    }

    /** Sets the count. */
    public void setCount(int count) {
        this.count = count;
    }
}

Reducer

package com.rzp.sort;

import com.rzp.pojo.InvertedIndexSortMapperKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Joins all values of one group into a single line of the form
 * {@code keyword -> v1,v2,...} and emits it as the output key with a
 * {@link NullWritable} value.
 */
public class InvertedIndexSortReducer extends Reducer<InvertedIndexSortMapperKey,Text, Text, NullWritable> {

    // Reused for every emitted line.
    private Text outputKey = new Text();

    /**
     * @param key     group key; its keyword is read before the values are iterated
     * @param values  values to join with commas, in iteration order
     * @param context used to emit the joined line
     */
    @Override
    protected void reduce(InvertedIndexSortMapperKey key, Iterable<Text> values, Context context)throws IOException, InterruptedException {
        StringBuilder line = new StringBuilder();
        line.append(key.getKeyword());
        line.append(" -> ");
        for (Text entry : values) {
            line.append(entry.toString());
            line.append(",");
        }

        // Drop the last character (the trailing comma after the final value).
        line.deleteCharAt(line.length() - 1);
        this.outputKey.set(line.toString());
        context.write(this.outputKey, NullWritable.get());
    }
}

Runner

package com.rzp.sort;

import com.rzp.fieldindex.InvertedIndexCountRunner;
import com.rzp.pojo.InvertedIndexSortMapperKey;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;


/**
 * Builds the second job ("job2") of the pipeline: reads the count job's
 * output directory, sorts with {@link InvertedIndexSortMapperKey} and groups
 * reducer input by keyword only.
 */
public class InvertedIndexSortRunner {
    /** Output directory of this job. */
    public static final String outputPath = "D:\\Hoptest\\fieldindex\\outsort";
    private Configuration conf;

    /**
     * @param conf configuration the job is created from
     */
    public InvertedIndexSortRunner(Configuration conf){
        this.conf = conf;
    }

    /**
     * Creates and configures the sort job.
     *
     * <p>Side effect: recursively deletes {@link #outputPath} if it exists.
     *
     * @return the configured, not yet submitted job
     * @throws IOException if the job cannot be created or the output
     *                     directory cannot be deleted
     */
    public Job creteJob() throws IOException {
        Job sortJob = Job.getInstance(this.conf, "job2");
        sortJob.setJarByClass(InvertedIndexSortRunner.class);

        // Map side.
        sortJob.setMapperClass(InvertedIndexSortMapper.class);
        sortJob.setMapOutputKeyClass(InvertedIndexSortMapperKey.class);
        sortJob.setMapOutputValueClass(Text.class);

        // Reduce side.
        sortJob.setReducerClass(InvertedIndexSortReducer.class);
        sortJob.setOutputKeyClass(Text.class);
        sortJob.setOutputValueClass(NullWritable.class);

        // Input is whatever the count job wrote.
        FileInputFormat.addInputPath(sortJob, new Path(InvertedIndexCountRunner.outputPath));

        Path target = new Path(outputPath);
        target.getFileSystem(this.conf).delete(target, true); // delete the folder
        FileOutputFormat.setOutputPath(sortJob, target);

        // Secondary-sort settings.
        sortJob.setGroupingComparatorClass(InvertedIndexGroupingComparable.class);
        sortJob.setNumReduceTasks(1); // must be set to one

        return sortJob;
    }

    /**
     * Custom grouping comparator: two keys belong to the same group when
     * their keywords compare equal; the count is ignored.
     */
    public static class InvertedIndexGroupingComparable extends WritableComparator {
        public InvertedIndexGroupingComparable() {
            super(InvertedIndexSortMapperKey.class, true);
        }

        @SuppressWarnings("rawtypes")
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            InvertedIndexSortMapperKey left = (InvertedIndexSortMapperKey) a;
            InvertedIndexSortMapperKey right = (InvertedIndexSortMapperKey) b;
            String leftKeyword = left.getKeyword();
            return leftKeyword.compareTo(right.getKeyword());
        }
    }
}

整合测试

package com.rzp.service;

import com.rzp.fieldindex.InvertedIndexCountRunner;
import com.rzp.sort.InvertedIndexSortRunner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * Entry point that runs the count job and then the sort job, with the sort
 * job declared as depending on the count job, under a {@link JobControl}.
 */
public class InvertedIndexRunner implements Tool {
    private Configuration conf = null;

    /**
     * Submits both jobs and waits until all are finished or one has failed.
     *
     * @param args unused
     * @return {@code 0} if no job failed, {@code 1} otherwise
     * @throws Exception if a job cannot be created or the wait is interrupted
     */
    @Override
    public int run(String[] args) throws Exception {
        ControlledJob cJob1 = transformJob(new InvertedIndexCountRunner(conf).creatJob());
        ControlledJob cJob2 = transformJob(new InvertedIndexSortRunner(conf).creteJob());
        cJob2.addDependingJob(cJob1);

        JobControl jc = new JobControl("InvertedIndex");
        jc.addJob(cJob1);
        jc.addJob(cJob2);

        new Thread(jc).start();

        try {
            while (!jc.allFinished() && jc.getFailedJobList().isEmpty()) {
                Thread.sleep(1000);
            }
        } finally {
            // Stop the controller even when the wait loop is interrupted.
            jc.stop();
        }

        boolean succeeded = jc.getFailedJobList().isEmpty();
        System.out.println("执行:" + (succeeded ? "成功" : "失败"));
        // Report failure to the caller instead of always returning success.
        return succeeded ? 0 : 1;
    }

    /**
     * Wraps {@code job} in a {@link ControlledJob} built from the job's own
     * configuration.
     */
    private ControlledJob transformJob(Job job) throws IOException {
        ControlledJob cJob = new ControlledJob(job.getConfiguration());
        cJob.setJob(job);
        return cJob;
    }

    /**
     * Stores {@code conf} after setting {@code mapreduce.framework.name} to
     * {@code local} on it (the passed object itself is modified).
     */
    @Override
    public void setConf(Configuration conf) {
        conf.set("mapreduce.framework.name","local");
        this.conf=conf;
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    /** Runs the tool and exits the JVM with its return code. */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new InvertedIndexRunner(), args));
    }
}

输出结果：

Count结果：

sort结果：

posted @ 2020-04-21 23:40 renzhongpei 阅读(183) 评论(0) 编辑 收藏 举报

刷新页面返回顶部