三、MapReduce编程实例

前文

MapReduce编程实例

@

前言

简介

讲解_Hadoop 中文网

Hadoop测试项目:HadoopDemo

注意事项

如果下载了 HadoopDemo 作为测试,并且用到其中的 HDFS_CRUD.java,
需要提前准备好 winutils,其版本最好与所使用的 Hadoop 版本相对应。

单词统计 WordCount

WordCountMapper.java

package top.rabbitcrows.hadoop.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Map phase of the word-count job: {@code Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>}.
 * <p>
 * KEYIN: type of the input key. With the default input component (the InputFormat), which
 * hands the job one line at a time, the key is the starting offset of that line, hence a
 * long value.
 * <p>
 * VALUEIN: type of the input value. With the default input component it is the content of
 * the line that was read, hence a string.
 * <p>
 * KEYOUT: type of the output key. In this example the key is the word, hence a string.
 * VALUEOUT: type of the output value. In this example it is the number of occurrences of
 * the word, hence an integer.
 * <p>
 * Data that crosses the network in a distributed system must be serialized, and the JDK's
 * built-in serialization is inefficient, so Hadoop's own serializable wrappers are used in
 * place of the JDK types: long -> LongWritable, String -> Text, Integer -> IntWritable.
 *
 * @author LEHOSO
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Count emitted for every single occurrence of a word; never mutated. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Output key, refilled for each token instead of allocating a new Text per word. */
    private final Text word = new Text();

    /**
     * Splits one input line on single spaces and emits {@code <word, 1>} for each word.
     * Called once per input {@code <k, v>} pair supplied by the input component.
     * <p>
     * Example: the line {@code hadoop hadoop spark} produces
     * {@code <hadoop,1> <hadoop,1> <spark,1>}.
     *
     * @param key     offset of the line (unused)
     * @param value   content of the line
     * @param context MapReduce context used to pass the pairs on to the reduce phase
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value,
                       Mapper<LongWritable, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        for (String token : line.split(" ")) {
            // A blank line or consecutive spaces make split(" ") yield empty strings;
            // those are not words and must not be counted.
            if (token.isEmpty()) {
                continue;
            }
            word.set(token);
            context.write(word, ONE);
        }
    }
}

WordCountReducer.java

package top.rabbitcrows.hadoop.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

//都要继承Reducer,这就是我们所说的编程模型,只需要套模板就行了

/**
 * Reduce phase of the word-count job.
 * <p>
 * KEYIN: input key type, matching the mapper's output key type; here the word.
 * <p>
 * VALUEIN: input value type, matching the mapper's output value type; here a count.
 * <p>
 * KEYOUT: output key type; here the word, as Text.
 * <p>
 * VALUEOUT: output value type; here the total number of occurrences of the word.
 *
 * @author LEHOSO
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Adds up all counts received for one word and emits {@code <word, total>}.
     * <p>
     * Example: the mapper turns the line {@code hadoop hadoop spark} into
     * {@code <hadoop,1> <hadoop,1> <spark,1>}; the pairs sharing the key {@code hadoop}
     * arrive here as one group, {@code <hadoop,[1,1]>}, with {@code key} being the key
     * common to the group and {@code value} iterating over the group's values.
     *
     * @param key     the word shared by every value in this group
     * @param value   the counts emitted for that word
     * @param context MapReduce context receiving the result
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> value,
                          Reducer<Text, IntWritable, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        int total = 0;
        // Walk the group explicitly and accumulate each partial count.
        java.util.Iterator<IntWritable> counts = value.iterator();
        while (counts.hasNext()) {
            total = total + counts.next().get();
        }
        IntWritable occurrences = new IntWritable(total);
        context.write(key, occurrences);
    }
}

WordCountCombiner.java

package top.rabbitcrows.hadoop.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Combiner of the word-count job: sums the counts it receives for one word and emits a
 * single {@code <word, partial sum>} pair, using the same key/value types on input and
 * output.
 */
public class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Performs the local aggregation for one word.
     *
     * @param key     the word
     * @param values  the counts received for that word
     * @param context MapReduce context receiving the partial sum
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
                          Reducer<Text, IntWritable, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        int partialSum = 0;
        for (IntWritable partial : values) {
            partialSum = partialSum + partial.get();
        }
        IntWritable combined = new IntWritable();
        combined.set(partialSum);
        context.write(key, combined);
    }
}

WordCountDriver.java

package top.rabbitcrows.hadoop.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Entry point of the word-count job. It assembles everything the job needs to run:
 * which Mapper, Combiner and Reducer classes to use, the key/value types, where the
 * input lives and where the output goes.
 *
 * @author LEHOSO
 */
public class WordCountDriver {

    /**
     * Configures the job, submits it, waits for it to finish and exits with status 0 on
     * success or 1 on failure.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job cannot be created, submitted or monitored
     */
    public static void main(String[] args) throws Exception {
        // Job settings: the framework name is set to "local".
        Configuration configuration = new Configuration();
        configuration.set("mapreduce.framework.name", "local");

        Job job = Job.getInstance(configuration);
        // Class used to locate the jar that contains the job.
        job.setJarByClass(WordCountDriver.class);

        // Input location and result location.
        Path inputDir = new Path("input/mr");
        Path outputDir = new Path("output/mr");
        FileInputFormat.setInputPaths(job, inputDir);
        FileOutputFormat.setOutputPath(job, outputDir);

        // Processing classes for the map, combine and reduce steps.
        job.setMapperClass(WordCountMapper.class);
        job.setCombinerClass(WordCountCombiner.class);
        job.setReducerClass(WordCountReducer.class);

        // Key/value types emitted by the mapper.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Key/value types emitted by the reducer.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Submit, print progress while running, and turn the result into an exit code.
        boolean succeeded = job.waitForCompletion(true);
        int exitCode = succeeded ? 0 : 1;
        System.exit(exitCode);
    }
}

MapReduce 经典案例——倒排索引

InvertedIndexMapper.java

package top.rabbitcrows.mr.InvertedIndex;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * Map phase of the inverted-index job: for every word of every input line it emits
 * {@code <word:fileName, "1">}, where {@code fileName} is the name of the file the
 * current input split belongs to.
 *
 * @author LEHOSO
 * @date 2021/11/2
 */
public class InvertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> {

    /**
     * Output key holding "word:fileName". Kept per mapper instance (not static) so that
     * mapper instances living in the same JVM never share this mutable buffer.
     */
    private final Text keyInfo = new Text();

    /** Output value: the term frequency of a single occurrence, always "1". */
    private final Text valueInfo = new Text("1");

    /**
     * Emits one {@code <word:fileName, "1">} pair per word of the line.
     *
     * @param key     offset of the line (unused)
     * @param value   content of the line
     * @param context MapReduce context; also provides the current input split
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = StringUtils.split(line, " ");
        // The split this line came from identifies the source file.
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String fileName = fileSplit.getPath().getName();
        for (String field : fields) {
            // The key is made of the word and the document name, e.g. "MapReduce:file1.txt".
            keyInfo.set(field + ":" + fileName);
            context.write(keyInfo, valueInfo);
        }
    }
}

InvertedIndexCombiner.java

package top.rabbitcrows.mr.InvertedIndex;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Combiner of the inverted-index job. It sums the term frequencies of one
 * "word:fileName" key and re-keys the record by word alone.
 * <p>
 * Input:  {@code <MapReduce:file3.txt, {1,1}>}
 * <br>
 * Output: {@code <MapReduce, file3.txt:2>}
 * <p>
 * NOTE(review): Hadoop documents that a combiner may be applied zero, one or several
 * times. Because this combiner changes the key, a record that has already been combined
 * no longer contains ':' in its key; such records are passed through unchanged instead of
 * failing. Counts for the same file produced by separate combiner passes are NOT merged
 * by this class, and a word that itself contains ':' is split at its first ':' — confirm
 * whether the input can trigger either case.
 *
 * @author LEHOSO
 * @date 2021/11/2
 */
public class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {

    /** Output key (the bare word); a separate object so the incoming key is never mutated. */
    private final Text word = new Text();

    /** Output value in the form "fileName:count". */
    private final Text info = new Text();

    /**
     * Turns {@code <word:fileName, counts>} into {@code <word, fileName:sum>}.
     *
     * @param key     "word:fileName", or a bare word if the record was already combined
     * @param values  term frequencies as decimal strings, or "fileName:count" entries if
     *                the record was already combined
     * @param context MapReduce context receiving the result
     * @throws IOException           if writing to the context fails
     * @throws InterruptedException  if the task is interrupted while writing
     * @throws NumberFormatException if a value of a "word:fileName" key is not an integer
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        String compositeKey = key.toString();
        int splitIndex = compositeKey.indexOf(":");
        if (splitIndex < 0) {
            // No separator: nothing to split or sum here, forward the pairs as they are
            // rather than throwing from parseInt/substring.
            for (Text value : values) {
                context.write(key, value);
            }
            return;
        }

        int sum = 0;    // accumulated term frequency
        for (Text value : values) {
            sum += Integer.parseInt(value.toString());
        }
        // New value: document name plus term frequency.
        info.set(compositeKey.substring(splitIndex + 1) + ":" + sum);
        // New key: the word only.
        word.set(compositeKey.substring(0, splitIndex));
        context.write(word, info);
    }
}

InvertedIndexReducer.java

package top.rabbitcrows.mr.InvertedIndex;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reduce phase of the inverted-index job: concatenates all values received for one key
 * into a single document list, each entry followed by ';'.
 * <p>
 * Input:  {@code <MapReduce, file3.txt:2>} (one value per entry)
 * <br>
 * Output: {@code <MapReduce, file1.txt:1;file2.txt:1;file3.txt:2;>}
 *
 * @author LEHOSO
 * @date 2021/11/2
 */
public class InvertedIndexReducer extends Reducer<Text, Text, Text, Text> {

    /** Output value, refilled for every key; per instance rather than static. */
    private final Text result = new Text();

    /**
     * Builds the document list for one key.
     *
     * @param key     the key shared by all values of the group
     * @param values  the entries to concatenate
     * @param context MapReduce context receiving {@code <key, list>}
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        // StringBuilder avoids re-copying the whole list on every append.
        StringBuilder fileList = new StringBuilder();
        for (Text value : values) {
            fileList.append(value.toString()).append(';');
        }
        result.set(fileList.toString());
        context.write(key, result);
    }
}

InvertedIndexDriver.java

package top.rabbitcrows.mr.InvertedIndex;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Entry point of the inverted-index job: wires the mapper, combiner and reducer together,
 * declares the key/value types, and sets the input and output locations.
 *
 * @author LEHOSO
 * @date 2021/11/2
 */
public class InvertedIndexDriver {

    /**
     * Configures the job, submits it, waits for it to finish and exits with status 0 on
     * success or 1 on failure.
     *
     * @param args command-line arguments (unused)
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a class configured on the job cannot be found
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(InvertedIndexDriver.class);

        job.setMapperClass(InvertedIndexMapper.class);
        job.setReducerClass(InvertedIndexReducer.class);
        job.setCombinerClass(InvertedIndexCombiner.class);

        // Key/value types emitted by the mapper.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Key/value types emitted by the reducer, declared explicitly so the job's
        // configuration matches what InvertedIndexReducer actually writes.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Location of the data to process.
        FileInputFormat.setInputPaths(job,
                new Path("input/InvertedIndex/"));
        // Location where the result is stored.
        FileOutputFormat.setOutputPath(job,
                new Path("output/InvertedIndex"));

        // Submit the job and print its progress while it runs.
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}

MapReduce 经典案例——数据去重

DedupMapper.java

package top.rabbitcrows.mr.dedup;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Map phase of the de-duplication job: emits every input line as the output key with an
 * empty (NullWritable) value, so identical lines end up under the same key.
 *
 * @author LEHOSO
 * @date 2021/11/5
 */
public class DedupMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /**
     * Emits {@code <line, null>} for the given line.
     * <p>
     * Example input:  {@code <0, 2021-11-1 a> <11, 2021-11-2 b>}
     * <br>
     * Example output: {@code <2021-11-1 a, null> <2021-11-2 b, null>}
     *
     * @param key     offset of the line (unused)
     * @param value   content of the line; written out as the key as-is
     * @param context MapReduce context receiving the pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        // The line is written directly; no shared field is needed to hold it.
        // NullWritable.get() supplies the empty value.
        context.write(value, NullWritable.get());
    }
}

DedupReducer.java

package top.rabbitcrows.mr.dedup;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reduce phase of the de-duplication job: writes each key it is called with exactly once,
 * paired with an empty (NullWritable) value, ignoring the grouped values.
 * <p>
 * Example input: {@code <2021-11-1 a, null> <2021-11-2 b, null> <2021-11-3 c, null>}
 *
 * @author LEHOSO
 * @date 2021/11/5
 */
public class DedupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    /**
     * Emits {@code <key, null>} for the group.
     *
     * @param key     the line shared by the group
     * @param values  the group's values (not read)
     * @param context MapReduce context receiving the pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Reducer<Text, NullWritable, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        NullWritable emptyValue = NullWritable.get();
        context.write(key, emptyValue);
    }
}

DedupDriver.java

package top.rabbitcrows.mr.dedup;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Entry point of the de-duplication job: wires DedupMapper and DedupReducer together,
 * declares the key/value types, and sets the input and output locations.
 *
 * @author LEHOSO
 * @date 2021/11/5
 */
public class DedupDriver {

    /**
     * Configures the job, submits it, waits for it to finish and exits with status 0 on
     * success or 1 on failure.
     *
     * @param args command-line arguments (unused)
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a class configured on the job cannot be found
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(DedupDriver.class);
        job.setMapperClass(DedupMapper.class);
        job.setReducerClass(DedupReducer.class);

        // Key/value types emitted by the mapper.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Key/value types emitted by the reducer; the value class must be declared here
        // as well, setting only the map output value leaves it at the framework default.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Location of the data to process.
        FileInputFormat.setInputPaths(job, new Path("input/Dedup"));

        // Location where the result is stored.
        FileOutputFormat.setOutputPath(job, new Path("output/Dedup"));

        // Report the job result through the exit code instead of discarding it.
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}

MapReduce 经典案例——TopN

TopNMapper.java

package top.rabbitcrows.mr.topN;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.TreeMap;

/**
 * Map phase of the Top-N job: keeps the {@value #TOP_N} largest distinct integers seen by
 * this mapper instance and emits them, each under a NullWritable key, once all input of
 * the task has been read.
 *
 * @author LEHOSO
 * @date 2021/11/5
 */
public class TopNMapper extends Mapper<LongWritable, Text, NullWritable, IntWritable> {

    /** Number of largest values to keep. */
    private static final int TOP_N = 5;

    /**
     * Candidate values in ascending order; only the keys are used, so equal numbers are
     * stored once.
     */
    private final TreeMap<Integer, String> repToRecordMap = new TreeMap<Integer, String>();

    /**
     * Parses the whitespace-separated integers of one line and merges them into the
     * running top-N set. Nothing is emitted here.
     * <p>
     * Example input: {@code <0, 10 3 8 7 6 5 1 2 9 4>}, {@code <xx, 11 12 17 14 15 20>}
     *
     * @param key     offset of the line (unused)
     * @param value   content of the line
     * @param context MapReduce context (unused)
     * @throws NumberFormatException if a non-empty token is not a valid integer
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, NullWritable, IntWritable>.Context context) throws IOException, InterruptedException {
        String line = value.toString();
        for (String num : line.split("\\s+")) {
            // Blank lines and leading whitespace yield an empty token; skipping it avoids
            // a NumberFormatException from parseInt("").
            if (num.isEmpty()) {
                continue;
            }
            repToRecordMap.put(Integer.parseInt(num), " ");
            if (repToRecordMap.size() > TOP_N) {
                // Ascending order: the first entry is the smallest candidate.
                repToRecordMap.pollFirstEntry();
            }
        }
    }

    /**
     * Emits the retained values after every line has been processed.
     *
     * @param context MapReduce context receiving {@code <null, value>} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void cleanup(Mapper<LongWritable, Text, NullWritable, IntWritable>.Context context) throws IOException, InterruptedException {
        // Failures propagate to the framework instead of being printed and ignored,
        // so a failed write cannot silently drop results.
        for (Integer i : repToRecordMap.keySet()) {
            context.write(NullWritable.get(), new IntWritable(i));
        }
    }
}

TopNReducer.java

package top.rabbitcrows.mr.topN;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Comparator;
import java.util.TreeMap;

/**
 * @author LEHOSO
 * @date 2021/11/5
 * @apinote
 */
public class TopNReducer extends Reducer<NullWritable, IntWritable, NullWritable, IntWritable> {

    private TreeMap<Integer, String> repToRecordMap = new TreeMap<Integer, String>(new Comparator<Integer>() {

        //返回一个基本类型的整型,谁大谁排后面.
        //返回负数表示:o1 小于o2
        //返回0表示:表示:o1和o2相等
        //返回正数表示:o1大于o2。
        public int compare(Integer a, Integer b) {
            return b - a;
        }
    });

    public void reduce(NullWritable key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        for (IntWritable value : values) {
            repToRecordMap.put(value.get(), " ");
            if (repToRecordMap.size() > 5) {
                repToRecordMap.remove(repToRecordMap.firstKey());
            }
        }
        for (Integer i : repToRecordMap.keySet()) {
            context.write(NullWritable.get(), new IntWritable(i));
        }
    }
}

TopNDriver.java

package top.rabbitcrows.mr.topN;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Entry point of the Top-N job: wires TopNMapper and TopNReducer together, uses a single
 * reduce task, declares the key/value types, and sets the input and output locations.
 *
 * @author LEHOSO
 * @date 2021/11/5
 */
public class TopNDriver {

    /**
     * Configures the job, submits it, waits for it to finish and exits with status 0 on
     * success or 1 on failure.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job cannot be created, submitted or monitored
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hand the configuration to the job; it was previously created but never used.
        Job job = Job.getInstance(conf);
        job.setJarByClass(TopNDriver.class);
        job.setMapperClass(TopNMapper.class);
        job.setReducerClass(TopNReducer.class);
        job.setNumReduceTasks(1);
        // Key type emitted by the map phase.
        job.setMapOutputKeyClass(NullWritable.class);
        // Value type emitted by the map phase.
        job.setMapOutputValueClass(IntWritable.class);
        // Key type emitted by the reduce phase.
        job.setOutputKeyClass(NullWritable.class);
        // Value type emitted by the reduce phase (was a repeated setMapOutputValueClass).
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path("input/TopN/num.txt"));
        FileOutputFormat.setOutputPath(job, new Path("output/TopN"));

        // Report the job result through the exit code, like the other drivers do,
        // rather than printing 0/1 and always exiting successfully.
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }

}

Github下载地址

[HadoopDemo](https://github.com/lehoso/HadoopDemo)

posted @ 2021-11-13 22:36  李好秀  阅读(265)  评论(0编辑  收藏  举报