Hadoop 学习之路 MapReduce编程案例

一，WordCount程序

　　案例分析

　　　需求：统计多个文件的每个单词的出现的次数。

　　　分析：

　　　　map阶段：将每一行文本数据变成<单词,1>这样的kv数据

　　　　 reduce阶段：将相同单词的一组kv数据进行聚合：累加所有的v

　　代码实现

　　　编码实现：Mapper类开发

package cn.edu360.mr.wc;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * KEYIN ：是map task读取到的数据的key的类型，是一行的起始偏移量Long
 * VALUEIN:是map task读取到的数据的value的类型，是一行的内容String
 * 
 * KEYOUT：是用户的自定义map方法要返回的结果kv数据的key的类型，在wordcount逻辑中，我们需要返回的是单词String
 * VALUEOUT:是用户的自定义map方法要返回的结果kv数据的value的类型，在wordcount逻辑中，我们需要返回的是整数Integer
 * 但是，在mapreduce中，map产生的数据需要传输给reduce，需要进行序列化和反序列化，而jdk中的原生序列化机制产生的数据量比较冗余，就会导致数据在mapreduce运行过程中传输效率低下
 * 所以，hadoop专门设计了自己的序列化机制，那么，mapreduce中传输的数据类型就必须实现hadoop自己的序列化接口
 * 
 * hadoop为jdk中的常用基本类型Long String Integer Float等数据类型封住了自己的实现了hadoop序列化接口的类型：LongWritable,Text,IntWritable,FloatWritable
 * @author ThinkPad
 *
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 切单词
        String line = value.toString();
        String[] words = line.split(" ");
        for(String word:words){
            context.write(new Text(word), new IntWritable(1));
            
        }
    }
}

Mapper

　　　编码实现：Reduce类开发

package cn.edu360.mr.wc;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
    
    
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
    
        
        int count = 0;
        
        Iterator<IntWritable> iterator = values.iterator();
        while(iterator.hasNext()){
            
            IntWritable value = iterator.next();
            count += value.get();
        }
        
        
        context.write(key, new IntWritable(count));
        
    }
    
    

}

Reduce类

　编码实现：提交任务类开发

package cn.edu360.mr.wc;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * 用于提交mapreduce job的客户端程序
 * 功能：
 *   1、封装本次job运行时所需要的必要参数
 *   2、跟yarn进行交互，将mapreduce程序成功的启动、运行
 * @author ThinkPad
 *
 */
public class JobSubmitter {
    
    public static void main(String[] args) throws Exception {
        
        // 在代码中设置JVM系统参数，用于给job对象来获取访问HDFS的用户身份
        System.setProperty("HADOOP_USER_NAME", "root");
        
        
        Configuration conf = new Configuration();
        // 1、设置job运行时要访问的默认文件系统
        conf.set("fs.defaultFS", "hdfs://hdp-01:9000");
        // 2、设置job提交到哪去运行
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "hdp-01");
        // 3、如果要从windows系统上运行这个job提交客户端程序，则需要加这个跨平台提交的参数
        conf.set("mapreduce.app-submission.cross-platform","true");
        
        Job job = Job.getInstance(conf);
        
        // 1、封装参数：jar包所在的位置
        job.setJar("d:/wc.jar");
        //job.setJarByClass(JobSubmitter.class);
        
        // 2、封装参数： 本次job所要调用的Mapper实现类、Reducer实现类
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);
        
        // 3、封装参数：本次job的Mapper实现类、Reducer实现类产生的结果数据的key、value类型
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        
        
        
        Path output = new Path("/wordcount/output");
        FileSystem fs = FileSystem.get(new URI("hdfs://hdp-01:9000"),conf,"root");
        if(fs.exists(output)){
            fs.delete(output, true);
        }
        
        // 4、封装参数：本次job要处理的输入数据集所在路径、最终结果的输出路径
        FileInputFormat.setInputPaths(job, new Path("/wordcount/input"));
        FileOutputFormat.setOutputPath(job, output);  // 注意：输出路径必须不存在
        
        
        // 5、封装参数：想要启动的reduce task的数量
        job.setNumReduceTasks(2);
        
        // 6、提交job给yarn
        boolean res = job.waitForCompletion(true);
        
        System.exit(res?0:-1);
        
    }
    
    

}

提交任务类

二，求WordCount的topN程序

　　案例分析

　　　需求：读取附件中的文件request.dat，求访问次数最多的topn个网站（只能有1个reduce worker）《全局TOPN》

　　　思路：　

　　　　　map阶段：解析数据，将域名作为key，1作为value

　　　　　reduce阶段：

　　　　　　　reduce方法中——对一个域名的一组1累加，然后将 <域名,总次数>放入一个成员变量Treemap中

　　　　　　　cleanup方法中——从treemap中挑出次数最高的n个域名作为结果输出

　　代码实现

　　　编码实现： PageCount类开发

package cn.edu360.mr.page.topn;

public class PageCount implements Comparable<PageCount>{
    
    private String page;
    private int count;
    
    public void set(String page, int count) {
        this.page = page;
        this.count = count;
    }
    
    public String getPage() {
        return page;
    }
    public void setPage(String page) {
        this.page = page;
    }
    public int getCount() {
        return count;
    }
    public void setCount(int count) {
        this.count = count;
    }

    @Override
    public int compareTo(PageCount o) {
        
        return o.getCount()-this.count==0?this.page.compareTo(o.getPage()):o.getCount()-this.count;
    }
    
    

}

PageCount

　　　编码实现：Mapper类开发

package cn.edu360.mr.page.topn;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class PageTopnMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
    
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] split = line.split(" ");
        context.write(new Text(split[1]), new IntWritable(1));
    }

}

Mapper

　　　编码实现：Reduce类开发

package cn.edu360.mr.page.topn;

import java.io.IOException;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class PageTopnReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
    
    TreeMap<PageCount, Object> treeMap = new TreeMap<>();
    
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
            Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        PageCount pageCount = new PageCount();
        pageCount.set(key.toString(), count);
        
        treeMap.put(pageCount,null);
        
    }
    
    
    @Override
    protected void cleanup(Context context)
            throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        int topn = conf.getInt("top.n", 5);
        
        
        Set<Entry<PageCount, Object>> entrySet = treeMap.entrySet();
        int i= 0;
        
        for (Entry<PageCount, Object> entry : entrySet) {
            context.write(new Text(entry.getKey().getPage()), new IntWritable(entry.getKey().getCount()));
            i++;
            if(i==topn) return;
        }
        
        
    }
    

}

Reduce

　　　编码实现：提交任务类开发

package cn.edu360.mr.page.topn;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSubmitter {

    public static void main(String[] args) throws Exception {

        /**
         * 通过加载classpath下的*-site.xml文件解析参数
         */
        Configuration conf = new Configuration();
        conf.addResource("xx-oo.xml");
        
        /**
         * 通过代码设置参数
         */
        //conf.setInt("top.n", 3);
        //conf.setInt("top.n", Integer.parseInt(args[0]));
        
        /**
         * 通过属性配置文件获取参数
         */
        /*Properties props = new Properties();
        props.load(JobSubmitter.class.getClassLoader().getResourceAsStream("topn.properties"));
        conf.setInt("top.n", Integer.parseInt(props.getProperty("top.n")));*/
        
        Job job = Job.getInstance(conf);

        job.setJarByClass(JobSubmitter.class);

        job.setMapperClass(PageTopnMapper.class);
        job.setReducerClass(PageTopnReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\url\\input"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\url\\output"));

        job.waitForCompletion(true);

    }



}

Submit

三，MapReduce中的自定义类型

　　案例分析

　　　需求：统计一下文件中，每一个用户所耗费的总上行流量，总下行流量，总流量

　　　思路：

　　　　　map阶段：将每一行按tab切分成各字段，提取其中的手机号作为输出key，流量信息封装到FlowBean对象中，作为输出的value　　

　　　　　reduce阶段：遍历一组数据的所有value（flowbean），进行累加，然后以手机号作为key输出，以总流量信息bean作为value输出

　　　　　注意：自定义类型如何实现hadoop的序列化接口。

　　　　　FlowBean这种自定义数据类型必须实现hadoop的序列化接口：Writable，实现其中的两个方法：

　　　　　readFields(in) 反序列化方法

　　　　　write(out) 序列化方法

　　代码实现

　　　编码实现： PageCount类开发

package cn.edu360.mr.flow;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * 本案例的功能：演示自定义数据类型如何实现hadoop的序列化接口
 * 1、该类一定要保留空参构造函数
 * 2、write方法中输出字段二进制数据的顺序  要与  readFields方法读取数据的顺序一致
 * 
 * @author ThinkPad
 *
 */
public class FlowBean implements Writable {

    private int upFlow;
    private int dFlow;
    private String phone;
    private int amountFlow;

    public FlowBean(){}
    
    public FlowBean(String phone, int upFlow, int dFlow) {
        this.phone = phone;
        this.upFlow = upFlow;
        this.dFlow = dFlow;
        this.amountFlow = upFlow + dFlow;
    }

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    public int getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(int upFlow) {
        this.upFlow = upFlow;
    }

    public int getdFlow() {
        return dFlow;
    }

    public void setdFlow(int dFlow) {
        this.dFlow = dFlow;
    }

    public int getAmountFlow() {
        return amountFlow;
    }

    public void setAmountFlow(int amountFlow) {
        this.amountFlow = amountFlow;
    }

    /**
     * hadoop系统在序列化该类的对象时要调用的方法
     */
    @Override
    public void write(DataOutput out) throws IOException {

        out.writeInt(upFlow);
        out.writeUTF(phone);
        out.writeInt(dFlow);
        out.writeInt(amountFlow);

    }

    /**
     * hadoop系统在反序列化该类的对象时要调用的方法
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readInt();
        this.phone = in.readUTF();
        this.dFlow = in.readInt();
        this.amountFlow = in.readInt();
    }

    @Override
    public String toString() {
         
        return this.phone + ","+this.upFlow +","+ this.dFlow +"," + this.amountFlow;
    }
    
}

FlowBean

　　　编码实现：Mapper类开发

package cn.edu360.mr.flow;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{
    
    
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        
        String line = value.toString();
        String[] fields = line.split("\t");
        
        String phone = fields[1];
        
        int upFlow = Integer.parseInt(fields[fields.length-3]);
        int dFlow = Integer.parseInt(fields[fields.length-2]);
        
        context.write(new Text(phone), new FlowBean(phone, upFlow, dFlow));
    }
    

}

Mapper

　　　编码实现：Reduce类开发

package cn.edu360.mr.flow;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{
    
    
    
    /**
     *  key：是某个手机号
     *  values：是这个手机号所产生的所有访问记录中的流量数据
     *  
     *  <135,flowBean1><135,flowBean2><135,flowBean3><135,flowBean4>
     */
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Reducer<Text, FlowBean, Text, FlowBean>.Context context)
            throws IOException, InterruptedException {

        int upSum = 0;
        int dSum = 0;
        
        for(FlowBean value:values){
            upSum += value.getUpFlow();
            dSum += value.getdFlow();
        }
        
        
        context.write(key, new FlowBean(key.toString(), upSum, dSum));
        
    }
    

}

Reduce

　　　编码实现：提交任务类开发

package cn.edu360.mr.flow;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSubmitter {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(JobSubmitter.class);

        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReducer.class);
        
        // 设置参数：maptask在做数据分区时，用哪个分区逻辑类  （如果不指定，它会用默认的HashPartitioner）
        job.setPartitionerClass(ProvincePartitioner.class);
        // 由于我们的ProvincePartitioner可能会产生6种分区号，所以，需要有6个reduce task来接收
        job.setNumReduceTasks(6);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        
    

        FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\flow\\input"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\flow\\province-output"));

        job.waitForCompletion(true);

    }

}

Submit

四，MapReduce中的自定义Partitioner

　　案例分析

　　　需求：统计每一个用户的总流量信息，并且按照其归属地，将统计结果输出在不同的文件中

　　　思路：

　　　我们知道分区不同对应到的reduce就不同，reduce不同输出的文件就不同，所以只要想办法让map端worker在将数据分区时，按照我们需要的按归属地划分。

　　　实现方式：自定义一个Partitioner

　　代码实现

　　　编码实现：FlowBean类开发

package cn.edu360.mr.flow;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * 本案例的功能：演示自定义数据类型如何实现hadoop的序列化接口
 * 1、该类一定要保留空参构造函数
 * 2、write方法中输出字段二进制数据的顺序  要与  readFields方法读取数据的顺序一致
 * 
 * @author ThinkPad
 *
 */
public class FlowBean implements Writable {

    private int upFlow;
    private int dFlow;
    private String phone;
    private int amountFlow;

    public FlowBean(){}
    
    public FlowBean(String phone, int upFlow, int dFlow) {
        this.phone = phone;
        this.upFlow = upFlow;
        this.dFlow = dFlow;
        this.amountFlow = upFlow + dFlow;
    }

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    public int getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(int upFlow) {
        this.upFlow = upFlow;
    }

    public int getdFlow() {
        return dFlow;
    }

    public void setdFlow(int dFlow) {
        this.dFlow = dFlow;
    }

    public int getAmountFlow() {
        return amountFlow;
    }

    public void setAmountFlow(int amountFlow) {
        this.amountFlow = amountFlow;
    }

    /**
     * hadoop系统在序列化该类的对象时要调用的方法
     */
    @Override
    public void write(DataOutput out) throws IOException {

        out.writeInt(upFlow);
        out.writeUTF(phone);
        out.writeInt(dFlow);
        out.writeInt(amountFlow);

    }

    /**
     * hadoop系统在反序列化该类的对象时要调用的方法
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readInt();
        this.phone = in.readUTF();
        this.dFlow = in.readInt();
        this.amountFlow = in.readInt();
    }

    @Override
    public String toString() {
         
        return this.phone + ","+this.upFlow +","+ this.dFlow +"," + this.amountFlow;
    }
    
}

FlowBean

　　　编码实现：Mapper类开发

package cn.edu360.mr.flow;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        
        String line = value.toString();
        String[] fields = line.split("\t");
        String phone = fields[1];
        int upFlow = Integer.parseInt(fields[fields.length-3]);
        int dFlow = Integer.parseInt(fields[fields.length-2]);
        context.write(new Text(phone), new FlowBean(phone, upFlow, dFlow));
    }
}

Mapper

　　　编码实现：Reduce类开发

package cn.edu360.mr.flow;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{
    /**
     *  key：是某个手机号
     *  values：是这个手机号所产生的所有访问记录中的流量数据
     *  
     *  <135,flowBean1><135,flowBean2><135,flowBean3><135,flowBean4>
     */
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Reducer<Text, FlowBean, Text, FlowBean>.Context context)
            throws IOException, InterruptedException {
        int upSum = 0;
        int dSum = 0;
        for(FlowBean value:values){
            upSum += value.getUpFlow();
            dSum += value.getdFlow();
        }
        context.write(key, new FlowBean(key.toString(), upSum, dSum));
    }
}

Reduce

　　　编码实现：Partitioner类开发

package cn.edu360.mr.flow;

import java.util.HashMap;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * 本类是提供给MapTask用的
 * MapTask通过这个类的getPartition方法，来计算它所产生的每一对kv数据该分发给哪一个reduce task
 * @author ThinkPad
 *
 */
public class ProvincePartitioner extends Partitioner<Text, FlowBean>{
    static HashMap<String,Integer> codeMap = new HashMap<>();
    static{
        codeMap.put("135", 0);
        codeMap.put("136", 1);
        codeMap.put("137", 2);
        codeMap.put("138", 3);
        codeMap.put("139", 4);
        
    }
    @Override
    public int getPartition(Text key, FlowBean value, int numPartitions) {
        Integer code = codeMap.get(key.toString().substring(0, 3));
        return code==null?5:code;
    }
}

Partitioner

　　　编码实现：Submiit类开发

package cn.edu360.mr.flow;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSubmitter {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(JobSubmitter.class);
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReducer.class);
        // 设置参数：maptask在做数据分区时，用哪个分区逻辑类  （如果不指定，它会用默认的HashPartitioner）
        job.setPartitionerClass(ProvincePartitioner.class);
        // 由于我们的ProvincePartitioner可能会产生6种分区号，所以，需要有6个reduce task来接收
        job.setNumReduceTasks(6);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\flow\\input"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\flow\\province-output"));

        job.waitForCompletion(true);

    }
}

Submit

五，MapReduce中的自定义排序

　　案例分析

　　　需求：统计request.dat中每个页面被访问的总次数，同时，要求输出结果文件中的数据按照次数大小倒序排序

　　　思路：

　　　　　关键点：

　　　　　mapreduce程序内置了一个排序机制：

　　　　　map worker 和reduce worker ，都会对数据按照key的大小来排序

　　　　　所以最终的输出结果中，一定是按照key有顺序的结果

　　　　　1、先写一个mr程序，将每个页面的访问总次数统计出来

　　　　　2、再写第二个mr程序：

　　　　　map阶段：读取第一个mr产生的结果文件，将每一条数据解析成一个java对象UrlCountBean（封装着一个url和它的总次数），然后将这个对象作为key，null作为value返回。

　　　　　　　注意：这个java对象要实现WritableComparable接口，以让worker可以调用对象的compareTo方法来进行排序

　　　　　reduce阶段：由于worker已经对收到的数据按照UrlCountBean的compareTo方法排了序，所以，在reduce方法中，只要将数据输出即可，最后的结果自然是按总次数大小的有序结果

　　代码实现

　　　编码实现：PageCount类开发

package cn.edu360.mr.page.count.sort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class PageCount implements WritableComparable<PageCount>{
    
    private String page;
    private int count;
    
    public void set(String page, int count) {
        this.page = page;
        this.count = count;
    }
    
    public String getPage() {
        return page;
    }
    public void setPage(String page) {
        this.page = page;
    }
    public int getCount() {
        return count;
    }
    public void setCount(int count) {
        this.count = count;
    }

    @Override
    public int compareTo(PageCount o) {
        
        return o.getCount()-this.count==0?this.page.compareTo(o.getPage()):o.getCount()-this.count;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.page);
        out.writeInt(this.count);
        
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.page= in.readUTF();
        this.count = in.readInt();
        
    }
    
    
    @Override
    public String toString() {
        return this.page + "," + this.count;
    }
    

}

PageCount

　　　编码实现：MapRed1类开发

package cn.edu360.mr.page.count.sort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PageCountStep1 {
    
    public static class PageCountStep1Mapper extends Mapper<LongWritable, Text, Text, IntWritable>{
        
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] split = line.split(" ");
            context.write(new Text(split[1]), new IntWritable(1));
        }
        
    }
    
    public static class PageCountStep1Reducer extends Reducer<Text, IntWritable, Text, IntWritable>{
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {

            int count = 0;
            for (IntWritable v : values) {
                count += v.get();
            }
            
            context.write(key, new IntWritable(count));
            
        }
    }
    
    public static void main(String[] args) throws Exception {
        /**
         * 通过加载classpath下的*-site.xml文件解析参数
         */
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(PageCountStep1.class);
        job.setMapperClass(PageCountStep1Mapper.class);
        job.setReducerClass(PageCountStep1Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\url\\input"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\url\\countout"));
        job.setNumReduceTasks(3);
        job.waitForCompletion(true);
        
    }
}

MapReduce1

　　　编码实现：MapRed2类开发

package cn.edu360.mr.page.count.sort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PageCountStep2 {
    public static class PageCountStep2Mapper extends Mapper<LongWritable, Text, PageCount, NullWritable>{
        
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, PageCount, NullWritable>.Context context)
                throws IOException, InterruptedException {
            
            String[] split = value.toString().split("\t");
            PageCount pageCount = new PageCount();
            pageCount.set(split[0], Integer.parseInt(split[1]));
            context.write(pageCount, NullWritable.get());
        }
    }
    
    public static class PageCountStep2Reducer extends Reducer<PageCount, NullWritable, PageCount, NullWritable>{
        @Override
        protected void reduce(PageCount key, Iterable<NullWritable> values,
                Reducer<PageCount, NullWritable, PageCount, NullWritable>.Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }
    
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(PageCountStep2.class);
        job.setMapperClass(PageCountStep2Mapper.class);
        job.setReducerClass(PageCountStep2Reducer.class);
        job.setMapOutputKeyClass(PageCount.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(PageCount.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\url\\countout"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\url\\sortout"));
        
        job.setNumReduceTasks(1);
        job.waitForCompletion(true);
    }
}

MapReduce2

六，MapReduce中的自定义GroupingComparator

　　案例分析

　　　需求：有如下数据(本案例用了排序控制、分区控制、分组控制)

　　　思路：

　　　　　map阶段：读取数据切分字段，封装数据到一个bean中作为key传输，key要按照成交金额比大小

　　　　　reduce阶段：利用自定义GroupingComparator将数据按订单id进行分组，然后在reduce方法中输出每组数据的前N条即可

　　代码实现

　　　　编码实现：OrderBean类开发

package cn.edu360.mr.order.topn.grouping;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;

import org.apache.hadoop.io.WritableComparable;

public class OrderBean implements WritableComparable<OrderBean>{

    private String orderId;
    private String userId;
    private String pdtName;
    private float price;
    private int number;
    private float amountFee;

    public void set(String orderId, String userId, String pdtName, float price, int number) {
        this.orderId = orderId;
        this.userId = userId;
        this.pdtName = pdtName;
        this.price = price;
        this.number = number;
        this.amountFee = price * number;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getPdtName() {
        return pdtName;
    }

    public void setPdtName(String pdtName) {
        this.pdtName = pdtName;
    }

    public float getPrice() {
        return price;
    }

    public void setPrice(float price) {
        this.price = price;
    }

    public int getNumber() {
        return number;
    }

    public void setNumber(int number) {
        this.number = number;
    }

    public float getAmountFee() {
        return amountFee;
    }

    public void setAmountFee(float amountFee) {
        this.amountFee = amountFee;
    }

    @Override
    public String toString() {
        return this.orderId + "," + this.userId + "," + this.pdtName + "," + this.price + "," + this.number + ","
                + this.amountFee;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.orderId);
        out.writeUTF(this.userId);
        out.writeUTF(this.pdtName);
        out.writeFloat(this.price);
        out.writeInt(this.number);

    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.userId = in.readUTF();
        this.pdtName = in.readUTF();
        this.price = in.readFloat();
        this.number = in.readInt();
        this.amountFee = this.price * this.number;
    }

    // 比较规则：先比总金额，如果相同，再比商品名称
    @Override
    public int compareTo(OrderBean o) {
        return this.orderId.compareTo(o.getOrderId())==0?Float.compare(o.getAmountFee(), this.getAmountFee()):this.orderId.compareTo(o.getOrderId());
        
    }

}

OrderBean

　　　　编码实现：Partitioner类开发

package cn.edu360.mr.order.topn.grouping;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class OrderIdPartitioner extends Partitioner<OrderBean, NullWritable>{

    @Override
    public int getPartition(OrderBean key, NullWritable value, int numPartitions) {
        // 按照订单中的orderid来分发数据
        return (key.getOrderId().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }

}

Partitioner

　　　　编码实现：GroupingComparator类开发

package cn.edu360.mr.order.topn.grouping;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class OrderIdGroupingComparator extends WritableComparator{
    
    public OrderIdGroupingComparator() {
        super(OrderBean.class,true);
    }
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        OrderBean o1 = (OrderBean) a;
        OrderBean o2 = (OrderBean) b;
        return o1.getOrderId().compareTo(o2.getOrderId());
    }
}

View Code

　　　　编码实现：MapperReduce类开发

package cn.edu360.mr.order.topn.grouping;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OrderTopn {

    public static class OrderTopnMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable>{
        OrderBean orderBean = new OrderBean();
        NullWritable v = NullWritable.get();
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, OrderBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            
            String[] fields = value.toString().split(",");
            
            orderBean.set(fields[0], fields[1], fields[2], Float.parseFloat(fields[3]), Integer.parseInt(fields[4]));
            
            context.write(orderBean,v);
        }
    }
    
    public static class OrderTopnReducer extends Reducer< OrderBean, NullWritable,  OrderBean, NullWritable>{
        
        /**
         * 虽然reduce方法中的参数key只有一个，但是只要迭代器迭代一次，key中的值就会变
         */
        @Override
        protected void reduce(OrderBean key, Iterable<NullWritable> values,
                Reducer<OrderBean, NullWritable, OrderBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            int i=0;
            for (NullWritable v : values) {
                context.write(key, v);
                if(++i==3) return;
            }
        }
    }
    
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration(); // 默认只加载core-default.xml core-site.xml
        conf.setInt("order.top.n", 2);
        Job job = Job.getInstance(conf);

        job.setJarByClass(OrderTopn.class);
        job.setMapperClass(OrderTopnMapper.class);
        job.setReducerClass(OrderTopnReducer.class);
        job.setPartitionerClass(OrderIdPartitioner.class);
        job.setGroupingComparatorClass(OrderIdGroupingComparator.class);
        job.setNumReduceTasks(2);
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\order\\input"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\order\\out-3"));

        job.waitForCompletion(true);
    }
}

MapReduce

七，MapReduce中的输入输出控制

　　案例分析

　　　需求：还是对案例3中的流量数据进行汇总，然后求出汇总结果中的流量最大的TOPN条

　　　步骤1

　　　　　思路：统计逻辑跟之前的流量统计一致：

　　　　　map：以手机号作为key，flowbean作为value

　　　　　注：步骤1输出的结果文件通过指定SequenceFileOutputFormat来产生SequenceFile文件；SequenceFile文件是hadoop定义的一种文件，里面存放的是大量key-value的对象序列化字节（文件头部还存放了key和value所属的类型名）；

　　　步骤2：

　　　　　思路：读取步骤1的SequenceFile结果文件，需要指定inputformatclass为SequenceFileInputFormat组件

　　　　　既然使用了这种输入组件，那么我们的map方法中直接就接收一对KEY-VALUE数据

　　　如何实现topn呢？

　　　通过把所有汇总数据发给同一个reduce端的worker，并且通过定义一个GroupingComparator来让这个worker把所有手机号的flowbean对象看成一组，调用一次reduce方法，我们就只需要在reduce方法中输出前n个即可

　　代码实现

　　　StepOne

package cn.edu360.mr.index.sequence;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;


public class IndexStepOne {

    public static class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // 产生 <hello-文件名，1> 
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // 从输入切片信息中获取当前正在处理的一行数据所属的文件
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String fileName = inputSplit.getPath().getName();

            String[] words = value.toString().split(" ");
            for (String w : words) {
                // 将"单词-文件名"作为key，1作为value，输出
                context.write(new Text(w + "-" + fileName), new IntWritable(1));
            }

        }
    }

    public static class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {

            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }

            context.write(key, new IntWritable(count));
        }
    }
    
    
    public static void main(String[] args) throws Exception{
        
        Configuration conf = new Configuration(); 
        
        Job job = Job.getInstance(conf);

        job.setJarByClass(IndexStepOne.class);

        job.setMapperClass(IndexStepOneMapper.class);
        job.setReducerClass(IndexStepOneReducer.class);

        job.setNumReduceTasks(3);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        
        // job.setOutputFormatClass(TextOutputFormat.class);  // 这是默认的输出组件
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        

        FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\index\\input"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\index\\out-seq-1"));

        job.waitForCompletion(true);
        
    }
}

StepOne

　　　StepTwo

package cn.edu360.mr.index.sequence;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IndexStepTwo {

    public static class IndexStepTwoMapper extends Mapper<Text, IntWritable, Text, Text> {

        @Override
        protected void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
            String[] split = key.toString().split("-");
            context.write(new Text(split[0]), new Text(split[1]+"-->"+value));
        }
    }

    public static class IndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {

        // 一组数据：  <hello,a.txt-->4> <hello,b.txt-->4> <hello,c.txt-->4>
        @Override
        protected void reduce(Text key, Iterable<Text> values,Context context)
                throws IOException, InterruptedException {
            // stringbuffer是线程安全的，stringbuilder是非线程安全的，在不涉及线程安全的场景下，stringbuilder更快
            StringBuilder sb = new StringBuilder();
            
            for (Text value : values) {
                sb.append(value.toString()).append("\t");
            }
            
            context.write(key, new Text(sb.toString()));
        }
    }
    
    
    public static void main(String[] args) throws Exception{
        
        Configuration conf = new Configuration(); // 默认只加载core-default.xml core-site.xml
        
        Job job = Job.getInstance(conf);

        job.setJarByClass(IndexStepTwo.class);

        job.setMapperClass(IndexStepTwoMapper.class);
        job.setReducerClass(IndexStepTwoReducer.class);

        job.setNumReduceTasks(1);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        
        // job.setInputFormatClass(TextInputFormat.class); 默认的输入组件
        job.setInputFormatClass(SequenceFileInputFormat.class);
        
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\index\\out1"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\index\\out2"));

        job.waitForCompletion(true);
        
    }
}

StepTwo

八，案例所需文档

点击下载

posted @ 2019-05-09 11:09 他山之石·玉阅读(848) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

他山之石

Hadoop 学习之路 MapReduce编程案例

一，WordCount程序

案例分析

代码实现

二，求WordCount的topN程序

案例分析

代码实现

三，MapReduce中的自定义类型

案例分析

代码实现

四，MapReduce中的自定义Partitioner

案例分析

代码实现

五，MapReduce中的自定义排序

案例分析

代码实现

六，MapReduce中的自定义GroupingComparator

案例分析

代码实现

七，MapReduce中的输入输出控制

案例分析

代码实现

八，案例所需文档

公告

　　案例分析

　　代码实现

　　案例分析

　　代码实现

　　案例分析

　　代码实现

　　案例分析

　　代码实现

　　案例分析

　　代码实现

　　案例分析

　　代码实现

　　案例分析

　　代码实现