Combiner Programming, Reduce Join, Map Join, MapReduce Optimization Summary, Avoiding Data Skew with a Custom Partitioner, and MapReduce Custom Sorting

Combiner Programming
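The original notes leave this section empty, so here is a minimal sketch of how a Combiner could be wired in, counting records per class on the same student data the later examples use. The class names (CombinerExample, MyCombiner) and the output path are made up for illustration; the key point is that a Combiner's input and output types must match the reducer's input types, because the framework may apply it zero, one, or many times on the map side.

package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class CombinerExample {
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // As in Demo10Partitioner, the 5th comma-separated field is taken to be the class name
            String clazz = value.toString().split(",")[4];
            context.write(new Text(clazz), new IntWritable(1));
        }
    }

    // Runs on the map side after each spill and collapses the many (class, 1) pairs
    // into one partial count per class, shrinking the data that has to be shuffled
    public static class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(CombinerExample.class);
        job.setMapperClass(MyMapper.class);
        // Counting is commutative and associative, so the summing reducer can double as the Combiner
        job.setCombinerClass(MyCombiner.class);
        job.setReducerClass(MyCombiner.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("/student/input"));
        FileOutputFormat.setOutputPath(job, new Path("/student/combiner/output"));
        job.waitForCompletion(true);
    }
}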

Reduce Join
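This section is also empty in the original notes. A reduce-side join tags each record in the Mapper with the table it came from, then groups by the join key so the actual join happens in the Reducer. The sketch below assumes two comma-separated files whose first field is the student id, both placed in the same input directory; the file-name prefix "students", the paths, and the class name ReduceJoinExample are illustrative assumptions, not part of the original notes.

package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ReduceJoinExample {
    public static class JoinMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Tag each record with its source file so the reducer can tell the two tables apart
            String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
            String id = value.toString().split(",")[0];
            String tag = fileName.startsWith("students") ? "S#" : "C#";
            context.write(new Text(id), new Text(tag + value.toString()));
        }
    }

    public static class JoinReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // All records sharing the same id reach the same reducer; separate them by tag, then join
            String stuInfo = "";
            List<String> scores = new ArrayList<>();
            for (Text value : values) {
                String v = value.toString();
                if (v.startsWith("S#")) {
                    stuInfo = v.substring(2);
                } else {
                    scores.add(v.substring(2));
                }
            }
            for (String score : scores) {
                context.write(key, new Text(stuInfo + "|" + score));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(ReduceJoinExample.class);
        job.setMapperClass(JoinMapper.class);
        job.setReducerClass(JoinReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("/student/join/input"));
        FileOutputFormat.setOutputPath(job, new Path("/student/join/output"));
        job.waitForCompletion(true);
    }
}

A reduce join works for tables of any size but shuffles both of them; when one side is small enough to fit in memory, the map join below is usually cheaper.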

Map Join
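For a map-side join, the small table is loaded into memory in setup(), so only the large table streams through the map tasks and no shuffle of the small table is needed. The sketch below reads the small table straight from HDFS; the paths and the class name MapJoinExample are assumptions for illustration (an alternative is to ship the small file to each task with job.addCacheFile).

package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

public class MapJoinExample {
    public static class MapJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
        private final Map<String, String> studentById = new HashMap<>();

        @Override
        protected void setup(Context context) throws IOException {
            // Load the small table entirely into memory once per map task;
            // the join then happens on the map side and the small table is never shuffled
            FileSystem fs = FileSystem.get(context.getConfiguration());
            Path smallTable = new Path("/student/join/students.txt");
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(smallTable)))) {
                String line;
                while ((line = br.readLine()) != null) {
                    studentById.put(line.split(",")[0], line);
                }
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The big table (score records) streams through map(); look each id up in memory
            String id = value.toString().split(",")[0];
            String stuInfo = studentById.get(id);
            if (stuInfo != null) {
                context.write(new Text(id), new Text(stuInfo + "|" + value.toString()));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(MapJoinExample.class);
        job.setMapperClass(MapJoinMapper.class);
        // No reduce phase is needed for a pure map-side join
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("/student/join/scores"));
        FileOutputFormat.setOutputPath(job, new Path("/student/mapjoin/output"));
        job.waitForCompletion(true);
    }
}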

MapReduce Optimization Summary
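The original notes do not expand this section either. As a rough checklist, the driver-side settings below are the knobs the examples in this post touch most often. The property keys are standard Hadoop 2.x/3.x configuration names; the concrete values and the class name TuningExample are only illustrative.

package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;

import java.io.IOException;

public class TuningExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // 1. Compress map output to cut shuffle traffic (needs the Snappy native library on the cluster)
        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.setClass("mapreduce.map.output.compress.codec", SnappyCodec.class, CompressionCodec.class);
        // 2. Enlarge the map-side sort buffer (default 100 MB) to reduce the number of spills
        conf.setInt("mapreduce.task.io.sort.mb", 200);

        Job job = Job.getInstance(conf);
        // 3. Pre-aggregate on the map side (job.setCombinerClass) when the reduce logic
        //    is commutative and associative, as in the Combiner example above
        // 4. Match the number of reduce tasks to the data, e.g. one per class as in Demo10Partitioner
        job.setNumReduceTasks(12);
        // 5. Use a custom Partitioner (job.setPartitionerClass) or a map join
        //    so that hot keys do not all pile up on a single reducer
    }
}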

Avoiding Data Skew with a Custom Partitioner

# Each reduce task produces one output file

package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo10Partitioner {
    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            // Each input line is assumed to be comma-separated with the class name in the 5th field
            // (e.g. id,name,age,gender,class); keying by class groups all of a class's records together
            String clazz = value.toString().split(",")[4];
            context.write(new Text(clazz), value);
        }
    }

    public static class MyReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            // Write out every record of the class, then append the total number of records in that class
            int cnt = 0;
            for (Text value : values) {
                context.write(key, value);
                cnt += 1;
            }
            context.write(key, new Text(String.valueOf(cnt)));
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJobName("Demo10Partitioner");
        job.setJarByClass(Demo10Partitioner.class);

        // Set a custom number of reduce tasks (one per class)
        job.setNumReduceTasks(12);

        // Use the custom partitioner
        job.setPartitionerClass(MyPartitioner.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Configure the input and output paths
        FileInputFormat.addInputPath(job, new Path("/student/input"));
        // The output path must not be created in advance; the job fails if the directory already exists
        // Use the HDFS Java API to check whether the output path exists, and delete it if it does
        Path outPath = new Path("/student/partition/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        FileOutputFormat.setOutputPath(job, outPath);

        // Wait for the job to finish
        job.waitForCompletion(true);
    }
    /**
     * hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo10Partitioner
     */
}

class MyPartitioner extends Partitioner<Text, Text> {

    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        // Send each of the 12 class names to its own reduce task so that no single
        // reducer receives a disproportionate share of the records
        String clazz = key.toString();
        // Valid partitions are 0 to numPartitions - 1; fall back to the last partition for an
        // unexpected class name, since returning a value >= numPartitions would fail the task
        int partition = numPartitions - 1;
        switch (clazz) {
            case "理科一班":
                partition = 0;
                break;
            case "理科二班":
                partition = 1;
                break;
            case "理科三班":
                partition = 2;
                break;
            case "理科四班":
                partition = 3;
                break;
            case "理科五班":
                partition = 4;
                break;
            case "理科六班":
                partition = 5;
                break;
            case "文科一班":
                partition = 6;
                break;
            case "文科二班":
                partition = 7;
                break;
            case "文科三班":
                partition = 8;
                break;
            case "文科四班":
                partition = 9;
                break;
            case "文科五班":
                partition = 10;
                break;
            case "文科六班":
                partition = 11;
                break;
        }
        return partition;
    }
}

MapReduce Custom Sorting

package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class Demo11Sort {
    // Map side
    public static class MyMapper extends Mapper<LongWritable, Text, KeySort, Text> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, KeySort, Text>.Context context) throws IOException, InterruptedException {
            // Input lines are assumed to be comma-separated: id,name,class,score
            String[] split = value.toString().split(",");
            String id = split[0];
            String name = split[1];
            String clazz = split[2];
            String score = split[3];
            KeySort mapOutKey = new KeySort(clazz, Integer.valueOf(score));
            context.write(mapOutKey, new Text(id + "," + name));
        }
    }

    // Reduce side
    public static class MyReducer extends Reducer<KeySort, Text, Text, Text> {
        @Override
        protected void reduce(KeySort key, Iterable<Text> values, Reducer<KeySort, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            // Keys arrive already sorted by KeySort.compareTo: by class, then by score in descending order
            for (Text value : values) {
                context.write(new Text(key.clazz + "," + key.score), value);
            }
        }
    }

    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJobName("Demo11Sort");
        job.setJarByClass(Demo11Sort.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(KeySort.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Configure the input and output paths
        /**
         * hdfs dfs -mkdir -p /student/sort/input
         * hdfs dfs -put stuSumScore.txt.txt /student/sort/input
         */

        FileInputFormat.addInputPath(job, new Path("/student/sort/input"));
        // The output path must not be created in advance; the job fails if the directory already exists
        // Use the HDFS Java API to check whether the output path exists, and delete it if it does
        Path outPath = new Path("/student/sort/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        FileOutputFormat.setOutputPath(job, outPath);

        // Wait for the job to finish
        job.waitForCompletion(true);
    }
    /**
     * hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo11Sort
     */
}

/**
 * Custom sorting: the key type implements the WritableComparable interface
 */
class KeySort implements WritableComparable<KeySort> {
    String clazz;
    Integer score;

    public KeySort() {

    }

    public KeySort(String clazz, Integer score) {
        this.clazz = clazz;
        this.score = score;
    }

    // Custom sort rule: first by class name (ascending), then by score within each class (descending)
    @Override
    public int compareTo(KeySort o) {
        int clazzCompare = this.clazz.compareTo(o.clazz);
        if (clazzCompare != 0) {
            return clazzCompare;
        }
        // Negate the natural order of the score to sort scores in descending order within a class
        return -Integer.compare(this.score, o.score);
    }

    // Serialization: the fields must be written and read back in the same order
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(clazz);
        out.writeInt(score);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        clazz = in.readUTF();
        score = in.readInt();
    }
}
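One caveat with a custom key like KeySort: the sort job above relies on the default single reduce task, so the default HashPartitioner never has to split the keys. If the number of reducers were raised, KeySort should also override hashCode() (and equals()) so that the partitioner keeps a whole class on one reducer; a possible addition to the class above, shown as a sketch:

    @Override
    public int hashCode() {
        // Hash by class only, so the default HashPartitioner sends all of a class's records to one reducer
        return clazz.hashCode();
    }

    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof KeySort)) {
            return false;
        }
        KeySort other = (KeySort) obj;
        return clazz.equals(other.clazz) && score.equals(other.score);
    }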