MapReduce TopN（自主复习）

1.MyTopN 主程序

package com.littlepage.topn;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;

/**
 * Driver that configures and submits the TopN MapReduce job.
 *
 * <p>The job is wired with {@link TopNMapper}, {@link TopNPartitioner},
 * {@link TopNSortComparator}, {@link TopNGroupingComparator} and {@link TopNReducer}.
 * It is forced to run with the local framework rather than on a cluster.
 */
public class MyTopN {
    /**
     * Entry point.
     *
     * @param args generic Hadoop options followed by {@code <input path> <output path>}
     * @throws IOException            if the file system or job submission fails
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException   if the thread is interrupted while waiting for the job
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration(true);
        // Strip generic Hadoop options (-D, -fs, ...); what remains are the job's own arguments.
        String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (other.length < 2) {
            // Fail with a usage message instead of an ArrayIndexOutOfBoundsException below.
            System.err.println("Usage: MyTopN [generic options] <input path> <output path>");
            System.exit(2);
        }
        // Run with the local framework instead of submitting to a cluster.
        conf.set("mapreduce.framework.name", "local");
        // Allow submission from a client whose platform differs from the cluster's.
        conf.set("mapreduce.app-submission.cross-platform", "true");

        Job job = Job.getInstance(conf);
        job.setJarByClass(MyTopN.class);
        job.setJobName("TopN");

        // Input and output locations.
        TextInputFormat.addInputPath(job, new Path(other[0]));
        Path outPath = new Path(other[1]);
        // The job refuses to start if the output directory exists, so remove any previous run's output.
        if (outPath.getFileSystem(conf).exists(outPath)) {
            outPath.getFileSystem(conf).delete(outPath, true);
        }
        TextOutputFormat.setOutputPath(job, outPath);

        // Map side.
        job.setMapperClass(TopNMapper.class);
        job.setMapOutputKeyClass(TopNKey.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Partitioner: only has to guarantee that keys of the same group get the same partition number.
        job.setPartitionerClass(TopNPartitioner.class);
        job.setSortComparatorClass(TopNSortComparator.class);
        // No combiner is configured.

        // Reduce side.
        job.setReducerClass(TopNReducer.class);
        job.setGroupingComparatorClass(TopNGroupingComparator.class);

        // Propagate job success or failure to the caller through the process exit status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

2.TopNKey

package com.littlepage.topn;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Composite map-output key made of a calendar date (year, month, day) and the
 * reading stored in {@code template} (the comparators in this package treat it
 * as the temperature).
 *
 * <p>A custom key type has to provide serialization/deserialization and a
 * comparison, which is what {@link WritableComparable} requires.
 */
public class TopNKey implements WritableComparable<TopNKey> {

    private int year;
    private int month;
    private int day;
    private int template;

    /** @return the year component */
    public int getYear() {
        return this.year;
    }

    /** @param year the year component */
    public void setYear(int year) {
        this.year = year;
    }

    /** @return the month component */
    public int getMonth() {
        return this.month;
    }

    /** @param month the month component */
    public void setMonth(int month) {
        this.month = month;
    }

    /** @return the day component */
    public int getDay() {
        return this.day;
    }

    /** @param day the day component */
    public void setDay(int day) {
        this.day = day;
    }

    /** @return the reading carried by this key */
    public int getTemplate() {
        return this.template;
    }

    /** @param template the reading carried by this key */
    public void setTemplate(int template) {
        this.template = template;
    }

    /**
     * Serializes the four fields as ints in the order year, month, day, template.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(this.year);
        out.writeInt(this.month);
        out.writeInt(this.day);
        out.writeInt(this.template);
    }

    /**
     * Restores the four fields in the same order {@link #write(DataOutput)} emits them.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        year = in.readInt();
        month = in.readInt();
        day = in.readInt();
        template = in.readInt();
    }

    /**
     * Orders keys chronologically by year, then month, then day.
     * The {@code template} field does not take part in this ordering.
     */
    @Override
    public int compareTo(TopNKey that) {
        int byYear = Integer.compare(this.year, that.getYear());
        if (byYear != 0) {
            return byYear;
        }
        int byMonth = Integer.compare(this.month, that.getMonth());
        if (byMonth != 0) {
            return byMonth;
        }
        return Integer.compare(this.day, that.getDay());
    }
}

3.TopNMapper

package com.littlepage.topn;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

/**
 * Parses tab-separated input lines into a {@link TopNKey} (date plus reading)
 * and emits the reading again as the value.
 *
 * <p>Expected line layout: a date such as {@code 2019-6-1 22:22:22} in column 0
 * and an integer reading in column 2. Column 1 is not used.
 */
public class TopNMapper extends Mapper<LongWritable, Text,TopNKey, IntWritable> {
    // Output objects are reused across map() calls to avoid one allocation per record.
    TopNKey topNKey = new TopNKey();
    IntWritable intWritable = new IntWritable();

    // Per-instance (not static): SimpleDateFormat and Calendar are not thread-safe.
    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
    private final Calendar calendar = Calendar.getInstance();

    /**
     * Emits one (key, reading) pair per well-formed line. Lines with fewer than
     * three columns, an unparseable date or a non-integer reading are skipped
     * and counted under the {@code TopN / MALFORMED_RECORDS} counter.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one input line
     * @param context job context used to emit output and update counters
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = StringUtils.split(value.toString(), '\t');
        if (fields.length < 3) {
            context.getCounter("TopN", "MALFORMED_RECORDS").increment(1);
            return;
        }
        try {
            // Parse everything before touching the reused key so a bad line cannot leave it half-updated.
            Date date = dateFormat.parse(fields[0]);
            int template = Integer.parseInt(fields[2].trim());

            calendar.setTime(date);
            topNKey.setYear(calendar.get(Calendar.YEAR));
            // Calendar.MONTH is zero-based; store the human-readable 1..12 value.
            topNKey.setMonth(calendar.get(Calendar.MONTH) + 1);
            topNKey.setDay(calendar.get(Calendar.DAY_OF_MONTH));
            topNKey.setTemplate(template);
            intWritable.set(template);
            context.write(topNKey, intWritable);
        } catch (ParseException | NumberFormatException e) {
            // Best effort: skip the bad record but keep it visible in the job counters.
            context.getCounter("TopN", "MALFORMED_RECORDS").increment(1);
        }
    }
}

4.TopNReducer

package com.littlepage.topn;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

/**
 * For each reduce group, emits the first record and then the first following
 * record whose day differs from it, i.e. at most two records from two
 * different days.
 *
 * <p>With the sort and grouping comparators in this package, a group is one
 * (year, month) and records arrive in descending order of the reading, so the
 * output is the two highest readings of the month taken from distinct days.
 */
public class TopNReducer extends Reducer<TopNKey, IntWritable, Text,IntWritable> {
    // Output objects are reused across writes.
    Text rkey=new Text();
    IntWritable rval=new IntWritable();

    /**
     * Emits up to two {@code "year-month-day" -> reading} records for the group.
     *
     * @param key     group key; the framework updates its contents as {@code values} is iterated
     * @param values  values of the group; iterated only to advance {@code key}
     * @param context job context used to emit output
     */
    @Override
    protected void reduce(TopNKey key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // State is local so that every group starts fresh; keeping it in instance
        // fields made all groups after the first reuse the previous group's state.
        boolean firstEmitted = false;
        int firstDay = 0;
        for (IntWritable ignored : values) {
            if (!firstEmitted) {
                emit(key, context);
                firstEmitted = true;
                firstDay = key.getDay();
            } else if (key.getDay() != firstDay) {
                emit(key, context);
                // Two records from different days have been written; the group is done.
                break;
            }
        }
    }

    /** Writes the key's current date and reading as one output record. */
    private void emit(TopNKey key, Context context) throws IOException, InterruptedException {
        rkey.set(key.getYear()+"-"+key.getMonth()+"-"+key.getDay());
        rval.set(key.getTemplate());
        context.write(rkey,rval);
    }
}

5.TopNPartitioner 分区器，决定Map输出的每条记录交给哪个分区（即哪个Reduce任务）处理

package com.littlepage.topn;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Assigns map output to partitions by the key's year, so that all records of
 * one year (and therefore of one year-month group) land in the same partition.
 */
public class TopNPartitioner extends Partitioner<TopNKey,IntWritable> {
    /**
     * Design notes: keep the computation cheap, and partition on a coarser
     * dimension than the grouping key.
     *
     * @param key           map output key
     * @param value         map output value (unused)
     * @param numPartitions number of partitions
     * @return a partition number in {@code [0, numPartitions)}
     */
    @Override
    public int getPartition(TopNKey key, IntWritable value, int numPartitions) {
        // Mask the sign bit so a negative year can never yield a negative partition number;
        // for non-negative years this equals year % numPartitions.
        // Partitioning by year alone may cause data skew.
        return (key.getYear() & Integer.MAX_VALUE) % numPartitions;
    }

}

6.TopNSortComparator 排序比较器，Map输出先按年、月升序排序，同一年月内按温度降序排序

package com.littlepage.topn;

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Sort comparator for {@link TopNKey}: year ascending, then month ascending,
 * then reading ({@code getTemplate()}) descending. The day is not compared.
 */
public class TopNSortComparator extends WritableComparator {
    /** Registers {@link TopNKey} as the key class and requests key instances from the base class. */
    public  TopNSortComparator(){
        super(TopNKey.class,true);
    }

    /**
     * Compares two {@link TopNKey}s by year, month, then reading in reverse order.
     *
     * @param a first key, must be a {@link TopNKey}
     * @param b second key, must be a {@link TopNKey}
     * @return negative, zero or positive as {@code a} sorts before, with or after {@code b}
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        TopNKey left = (TopNKey) a;
        TopNKey right = (TopNKey) b;

        int byYear = Integer.compare(left.getYear(), right.getYear());
        if (byYear != 0) {
            return byYear;
        }
        int byMonth = Integer.compare(left.getMonth(), right.getMonth());
        if (byMonth != 0) {
            return byMonth;
        }
        // Operands swapped so that higher readings sort first.
        return Integer.compare(right.getTemplate(), left.getTemplate());
    }
}

7.TopNGroupingComparator 分组比较器，用于reduce的分组，每一个组是年月，进行reduce操作

package com.littlepage.topn;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Grouping comparator for {@link TopNKey}: two keys are treated as equal when
 * their year and month match, regardless of day and reading.
 */
public class TopNGroupingComparator extends WritableComparator {
    /** Registers {@link TopNKey} as the key class and requests key instances from the base class. */
    public TopNGroupingComparator() {
        super(TopNKey.class, true);
    }

    /**
     * Compares two {@link TopNKey}s by year, then by month.
     *
     * @param a first key, must be a {@link TopNKey}
     * @param b second key, must be a {@link TopNKey}
     * @return negative, zero or positive following year-then-month order
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        TopNKey left = (TopNKey) a;
        TopNKey right = (TopNKey) b;

        int byYear = Integer.compare(left.getYear(), right.getYear());
        if (byYear != 0) {
            return byYear;
        }
        return Integer.compare(left.getMonth(), right.getMonth());
    }
}

TopN案例是MapReduce的典型案例，需牢记

posted @ 2019-08-13 00:35 SteveYu 阅读(513) 评论(0) 编辑收藏举报

刷新页面返回顶部

Steve Yu

喜欢猫咪，阳光和你

MapReduce TopN（自主复习）

1.MyTopN 主程序

2.TopNKey

3.TopNMapper

4.TopNReducer

5.TopNPartitioner 分区规划，来划分Map之后的结果是存在哪个dn进行处理

6.TopNSortComparator 排序比较器，在Map中精确到月，按温度递减

7.TopNGroupingComparator 分组比较器，用于reduce的分组，每一个组是年月，进行reduce操作

公告