Writing MapReduce Jobs -- Gender Counts, Total Scores, Joins, Map-Side Filtering, and Combiner Pre-Aggregation

The examples below are mainly based on the following two datasets.

#students.txt

/*
1500100001,施笑槐,22,女,文科六班
1500100002,吕金鹏,24,男,文科六班
1500100003,单乐蕊,22,女,理科六班
1500100004,葛德曜,24,男,理科三班
1500100005,宣谷芹,22,女,理科五班
1500100006,边昂雄,21,男,理科二班
1500100007,尚孤风,23,女,文科六班
1500100008,符半双,22,女,理科六班
1500100009,沈德昌,21,男,理科一班
1500100010,羿彦昌,23,男,理科六班
1500100011,宰运华,21,男,理科三班
1500100012,梁易槐,21,女,理科一班
1500100013,逯君昊,24,男,文科二班
1500100014,羿旭炎,23,男,理科五班
1500100015,宦怀绿,21,女,理科一班
1500100016,潘访烟,23,女,文科一班
1500100017,高芷天,21,女,理科五班
1500100018,骆怜雪,21,女,文科六班
1500100019,娄曦之,24,男,理科三班
1500100020,杭振凯,23,男,理科四班
1500100021,连鸿晖,22,男,理科六班
1500100022,薄运珧,23,男,文科四班
1500100023,东鸿畴,23,男,理科二班
1500100024,湛慕卉,22,女,文科二班
1500100025,翁飞昂,22,男,文科四班
……
*/

#score.txt

/*
1500100001,1000001,98
1500100001,1000002,5
1500100001,1000003,137
1500100001,1000004,29
1500100001,1000005,85
1500100001,1000006,52
1500100002,1000001,139
1500100002,1000002,102
1500100002,1000003,44
1500100002,1000004,18
1500100002,1000005,46
1500100002,1000006,91
1500100003,1000001,48
1500100003,1000002,132
1500100003,1000003,41
1500100003,1000007,32
1500100003,1000008,7
1500100003,1000009,99
1500100004,1000001,147
1500100004,1000002,69
1500100004,1000003,37
1500100004,1000007,87
1500100004,1000008,21
1500100004,1000009,60
1500100005,1000001,105
……
*/

Counting students by gender

package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo2GenderCnt {
    // Map side
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split(",");
            String gender = splits[3];
            // Emit the gender as the key and 1 as the value
            context.write(new Text(gender), new IntWritable(1));
        }
    }

    // Reduce side
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            int cnt = 0;
            // Count the number of students for this gender
            for (IntWritable value : values) {
                cnt += value.get();
            }
            context.write(key, new IntWritable(cnt));
        }
    }

    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // This name is shown in the YARN web UI
        job.setJobName("Demo2GenderCnt");
        job.setJarByClass(Demo2GenderCnt.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Configure the input and output paths
        FileInputFormat.addInputPath(job, new Path("/student/input"));
        // The output path must not be created in advance; the job fails if the directory already exists
        // Use the HDFS Java API to check whether the output path exists, and delete it if it does
        Path outPath = new Path("/student/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        FileOutputFormat.setOutputPath(job, outPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        /**
         * 1. Prepare the data: upload students.txt to /student/input on HDFS
         * hdfs dfs -mkdir -p /student/input
         * hdfs dfs -put students.txt /student/input/
         * 2. Submit the MapReduce job
         * hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo2GenderCnt
         * 3. View the logs / kill the job
         * yarn logs -applicationId application_1644480440500_0006
         * yarn application -kill application_1644480440500_0007
         */
    }
}
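
One small, optional tweak: the mapper above allocates a new Text and a new IntWritable for every input record. Because context.write serializes the pair immediately, a common MapReduce idiom is to reuse a single pair of Writable instances. A minimal sketch of the same mapper with object reuse (this variant is not part of the original job; the class name is illustrative):

    // Drop-in variant of MyMapper that reuses its output Writables instead of allocating per record
    public static class ReusingGenderMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final Text outKey = new Text();              // reused for every record
        private final IntWritable one = new IntWritable(1);  // the constant value 1

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split(",");
            outKey.set(splits[3]);        // gender column
            context.write(outKey, one);   // safe: the framework serializes the pair before map() runs again
        }
    }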

Computing each student's total score

package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo3SumScore {
    // Map side
    public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, LongWritable, IntWritable>.Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split(",");
            String id = splits[0];
            String score = splits[2];
            // Emit the student id as the key and the score as the value
            context.write(new LongWritable(Long.parseLong(id)), new IntWritable(Integer.parseInt(score)));
        }
    }

    // Reduce side
    public static class MyReducer extends Reducer<LongWritable, IntWritable, LongWritable, IntWritable> {
        @Override
        protected void reduce(LongWritable key, Iterable<IntWritable> values, Reducer<LongWritable, IntWritable, LongWritable, IntWritable>.Context context) throws IOException, InterruptedException {
            int sumScore = 0;
            // Sum up the student's scores
            for (IntWritable value : values) {
                sumScore += value.get();
            }
            context.write(key, new IntWritable(sumScore));
        }
    }

    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJobName("Demo3SumScore");
        job.setJarByClass(Demo3SumScore.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Configure the input and output paths
        FileInputFormat.addInputPath(job, new Path("/student/score/input"));
        // The output path must not be created in advance; the job fails if the directory already exists
        // Use the HDFS Java API to check whether the output path exists, and delete it if it does
        Path outPath = new Path("/student/score/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        FileOutputFormat.setOutputPath(job, outPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        /**
         * 1. Prepare the data: upload score.txt to /student/score/input on HDFS
         * hdfs dfs -mkdir -p /student/score/input
         * hdfs dfs -put score.txt /student/score/input/
         * 2. Submit the MapReduce job
         * hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo3SumScore
         * 3. View the logs / kill the job
         * yarn logs -applicationId application_1644480440500_0006
         * yarn application -kill application_1644480440500_0007
         */
    }
}
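
Because summation is associative and commutative, this job could also reuse its reducer as a combiner to shrink the shuffle: the reducer's output types (LongWritable, IntWritable) match the map output types, which is exactly what a combiner requires. An optional one-line addition to the driver (not part of the original post):

        // Optional: pre-aggregate partial score sums on the map side before the shuffle
        job.setCombinerClass(MyReducer.class);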

Joining the student data with the total scores

package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo4Join {
    public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, LongWritable, Text>.Context context) throws IOException, InterruptedException {
            // Work out which input file this line comes from
            String v = value.toString();
            if (v.contains(",")) {
                // Student record (comma-separated)
                String[] stuSplits = v.split(",");
                long id = Long.parseLong(stuSplits[0]);
                String name = stuSplits[1];
                String clazz = stuSplits[4];
                context.write(new LongWritable(id), new Text(name + "," + clazz + "|"));
            } else {
                // Total-score record (tab-separated output of Demo3SumScore)
                String[] sumScoreSplit = v.split("\t");
                context.write(new LongWritable(Long.parseLong(sumScoreSplit[0])), new Text(sumScoreSplit[1] + "#"));
            }

        }
    }

    public static class MyReducer extends Reducer<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void reduce(LongWritable key, Iterable<Text> values, Reducer<LongWritable, Text, LongWritable, Text>.Context context) throws IOException, InterruptedException {
            String stuV = "";
            String sumScoreV = "";
            for (Text value : values) {
                String v = value.toString();
                if (v.contains("|")) {
                    // Student record
                    stuV = v.replace("|", "");
                } else {
                    // Total-score record
                    sumScoreV = v.replace("#", "");
                }

            }
            context.write(key, new Text(stuV + "," + sumScoreV));

        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        // Set the key-value separator used by the text output format
        // ("mapred.textoutputformat.separator" is the deprecated name of this property)
        conf.set("mapreduce.output.textoutputformat.separator", ",");
        Job job = Job.getInstance(conf);
        job.setJobName("Demo4Join");
        job.setJarByClass(Demo4Join.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Configure the input and output paths (both the student data and the total-score output)
        FileInputFormat.addInputPath(job, new Path("/student/input"));
        FileInputFormat.addInputPath(job, new Path("/student/score/output"));
        // The output path must not be created in advance; the job fails if the directory already exists
        // Use the HDFS Java API to check whether the output path exists, and delete it if it does
        Path outPath = new Path("/student/join/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        FileOutputFormat.setOutputPath(job, outPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        /**
         * Submit the job (the output directory /student/join/output does not need to be
         * created in advance; the driver deletes it if it already exists):
         * hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo4Join
         */
    }
}
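
The mapper above tells the two inputs apart by checking whether the line contains a comma, which only works because Demo3SumScore wrote its output tab-separated. A more robust pattern is to give each input path its own mapper with MultipleInputs. A sketch under that assumption (the mapper class names below are illustrative, not part of the original code):

    // Additional imports on top of Demo4Join's:
    //   org.apache.hadoop.mapreduce.lib.input.MultipleInputs
    //   org.apache.hadoop.mapreduce.lib.input.TextInputFormat

    // Dedicated mapper for students.txt (comma-separated: id,name,age,gender,clazz)
    public static class StudentMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] s = value.toString().split(",");
            context.write(new LongWritable(Long.parseLong(s[0])), new Text(s[1] + "," + s[4] + "|"));
        }
    }

    // Dedicated mapper for the tab-separated output of Demo3SumScore (id \t totalScore)
    public static class SumScoreMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] s = value.toString().split("\t");
            context.write(new LongWritable(Long.parseLong(s[0])), new Text(s[1] + "#"));
        }
    }

    // In the driver, replace the two FileInputFormat.addInputPath calls with:
    //   MultipleInputs.addInputPath(job, new Path("/student/input"), TextInputFormat.class, StudentMapper.class);
    //   MultipleInputs.addInputPath(job, new Path("/student/score/output"), TextInputFormat.class, SumScoreMapper.class);
    // The existing MyReducer can be reused unchanged.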

Map-side filtering

package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo5MRFilter {
    public static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
            // Keep only the students of class 文科三班
            String clazz = value.toString().split(",")[4];
            if ("文科三班".equals(clazz)) {
                context.write(value, NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        // Set the key-value separator used by the text output format
        // ("mapred.textoutputformat.separator" is the deprecated name of this property)
        conf.set("mapreduce.output.textoutputformat.separator", ",");
        Job job = Job.getInstance(conf);
        job.setJobName("Demo5MRFilter");
        job.setJarByClass(Demo5MRFilter.class);

        job.setMapperClass(MyMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Map-only job: with zero reduce tasks the filtered records are written out directly, skipping the shuffle
        job.setNumReduceTasks(0);

        // Configure the input and output paths
        FileInputFormat.addInputPath(job, new Path("/student/input"));
        // The output path must not be created in advance; the job fails if the directory already exists
        // Use the HDFS Java API to check whether the output path exists, and delete it if it does
        Path outPath = new Path("/student/filter/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        FileOutputFormat.setOutputPath(job, outPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        /**
         * Submit the job (no need to create /student/filter/output in advance;
         * the driver deletes it if it already exists):
         * hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo5MRFilter
         */

    }
}
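
The class to keep ("文科三班") is hard-coded in the mapper. A common way to make it configurable is to put the value into the Configuration in the driver and read it back in Mapper.setup. A sketch of that pattern (the property name "filter.clazz" and the mapper name are made up for illustration):

    // Driver side, before Job.getInstance(conf):
    //   conf.set("filter.clazz", args[0]);   // e.g. pass the class name on the command line

    public static class ConfigurableFilterMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        private String targetClazz;

        @Override
        protected void setup(Context context) {
            // Read the target class from the job configuration, falling back to the original hard-coded value
            targetClazz = context.getConfiguration().get("filter.clazz", "文科三班");
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split(",");
            if (splits.length > 4 && targetClazz.equals(splits[4])) {
                context.write(value, NullWritable.get());
            }
        }
    }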

Combiner pre-aggregation

#A combiner is a reduce that runs on the map side: it pre-aggregates the map output before it is shuffled to the reducers, cutting down the data sent over the network

package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

// Count how many times each word occurs
public class Demo6WordCountCombiner {
    // Map phase
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        /**
         * @param key     Map input key -> the byte offset of the current line
         * @param value   Map input value -> one line of text
         * @param context the MapReduce context -> exposes runtime parameters and state, and sends the map output on to the reduce phase
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Map-side logic
            String vStr = value.toString();
            // Split the line on spaces to get the individual words
            String[] words = vStr.split(" ");

            // Turn every word into a key-value pair, for example:
            /**
             * hadoop hive hbase spark flink
             * ====>
             * hadoop 1
             * hive 1
             * hbase 1
             * spark 1
             * flink 1
             */
            for (String word : words) {
                Text keyOut = new Text(word);
                IntWritable valueOut = new IntWritable(1);
                // Emit the key-value pair through the context
                context.write(keyOut, valueOut);
            }
        }
    }

    // Custom combiner: the same aggregation logic as the reducer, run on the map side
    public static class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable>{
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Combiner logic: partially sum the counts produced by this map task
            int sum = 0; // running count for this word
            for (IntWritable value : values) {
                // Iterate over the values
                sum += value.get();
            }

            // Emit the partial count; it is fed to the reducer, not written to HDFS
            context.write(key, new IntWritable(sum));
        }
    }

    // Reduce phase
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        /**
         * @param key     one key of the grouped map output; here, a single word
         * @param values  all values that share this key, exposed as an iterable
         * @param context the MapReduce context, used here to write the result to HDFS
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Reduce logic
            int sum = 0; // total count for this word
            for (IntWritable value : values) {
                // Iterate over the values
                sum += value.get();
            }

            // Write the final count for this word to HDFS
            context.write(key, new IntWritable(sum));
        }
    }

    // Driver (wires the Map and Reduce phases together)
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        // Create the configuration
        Configuration conf = new Configuration();

        // Create a Job instance
        Job job = Job.getInstance(conf);
        // Basic job settings
        job.setJobName("Demo6WordCountCombiner");
        // Tell Hadoop which class to run for this job
        job.setJarByClass(Demo6WordCountCombiner.class);

        // Configure the Map side
        // Output key type of the map phase
        job.setMapOutputKeyClass(Text.class);
        // Output value type of the map phase
        job.setMapOutputValueClass(IntWritable.class);
        // Mapper class to run
        job.setMapperClass(MyMapper.class);

        // Set the combiner
        job.setCombinerClass(MyCombiner.class);

        // Configure the Reduce side
        // Output key type of the reduce phase
        job.setOutputKeyClass(Text.class);
        // Output value type of the reduce phase
        job.setOutputValueClass(IntWritable.class);
        // Reducer class to run
        job.setReducerClass(MyReducer.class);

        // Configure the input and output paths
        FileInputFormat.addInputPath(job, new Path("/wordCount/input"));
        // The output path must not be created in advance; the job fails if the directory already exists
        // Use the HDFS Java API to check whether the output path exists, and delete it if it does
        Path outPath = new Path("/wordCount/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        FileOutputFormat.setOutputPath(job, outPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        /**
         * 1. Prepare the data: upload words.txt to /wordCount/input on HDFS
         * hdfs dfs -mkdir -p /wordCount/input
         * hdfs dfs -put words.txt /wordCount/input
         * 2. Submit the MapReduce job
         * hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo6WordCountCombiner
         */

    }
}

#Note: a combiner only works when the aggregation can be applied safely to partial results, i.e. associative and commutative operations such as Max, Min and Sum; see the sketch below for how to handle an average
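
An average is the classic counter-example: plugging an averaging reducer in as the combiner would average partial averages and give a wrong result. The usual workaround is to carry (sum, count) pairs through the shuffle and divide only in the reducer. A rough sketch, assuming the mapper emits key -> "value,1" as a Text pair (the class names are illustrative; also needs org.apache.hadoop.io.DoubleWritable):

    // Combiner: merges "sum,count" pairs into another "sum,count" pair, so it can safely run any number of times
    public static class AvgCombiner extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            long sum = 0, cnt = 0;
            for (Text v : values) {
                String[] parts = v.toString().split(",");
                sum += Long.parseLong(parts[0]);
                cnt += Long.parseLong(parts[1]);
            }
            context.write(key, new Text(sum + "," + cnt));
        }
    }

    // Reducer: the division happens only here, exactly once per key
    public static class AvgReducer extends Reducer<Text, Text, Text, DoubleWritable> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            long sum = 0, cnt = 0;
            for (Text v : values) {
                String[] parts = v.toString().split(",");
                sum += Long.parseLong(parts[0]);
                cnt += Long.parseLong(parts[1]);
            }
            context.write(key, new DoubleWritable((double) sum / cnt));
        }
    }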
