Writing MapReduce Code -- Counting by Gender, Total Scores, Joins, Map-Side Filtering, and Combiner Pre-Aggregation
The examples below are based mainly on the following data:
#students.txt
/*
1500100001,施笑槐,22,女,文科六班
1500100002,吕金鹏,24,男,文科六班
1500100003,单乐蕊,22,女,理科六班
1500100004,葛德曜,24,男,理科三班
1500100005,宣谷芹,22,女,理科五班
1500100006,边昂雄,21,男,理科二班
1500100007,尚孤风,23,女,文科六班
1500100008,符半双,22,女,理科六班
1500100009,沈德昌,21,男,理科一班
1500100010,羿彦昌,23,男,理科六班
1500100011,宰运华,21,男,理科三班
1500100012,梁易槐,21,女,理科一班
1500100013,逯君昊,24,男,文科二班
1500100014,羿旭炎,23,男,理科五班
1500100015,宦怀绿,21,女,理科一班
1500100016,潘访烟,23,女,文科一班
1500100017,高芷天,21,女,理科五班
1500100018,骆怜雪,21,女,文科六班
1500100019,娄曦之,24,男,理科三班
1500100020,杭振凯,23,男,理科四班
1500100021,连鸿晖,22,男,理科六班
1500100022,薄运珧,23,男,文科四班
1500100023,东鸿畴,23,男,理科二班
1500100024,湛慕卉,22,女,文科二班
1500100025,翁飞昂,22,男,文科四班
……
*/
#score.txt
/*
1500100001,1000001,98
1500100001,1000002,5
1500100001,1000003,137
1500100001,1000004,29
1500100001,1000005,85
1500100001,1000006,52
1500100002,1000001,139
1500100002,1000002,102
1500100002,1000003,44
1500100002,1000004,18
1500100002,1000005,46
1500100002,1000006,91
1500100003,1000001,48
1500100003,1000002,132
1500100003,1000003,41
1500100003,1000007,32
1500100003,1000008,7
1500100003,1000009,99
1500100004,1000001,147
1500100004,1000002,69
1500100004,1000003,37
1500100004,1000007,87
1500100004,1000008,21
1500100004,1000009,60
1500100005,1000001,105
……
*/
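Both files are plain comma-separated text, and the mappers below address fields by position, so it is worth spelling the layout out once. A minimal sketch (the field labels are mine; "subject id" for the second score column is an assumption, not something stated in the original data):

    // Field layout assumed by the mappers in this post (labels are illustrative)
    String studentLine = "1500100001,施笑槐,22,女,文科六班";
    String[] s = studentLine.split(",");
    // s[0] = student id, s[1] = name, s[2] = age, s[3] = gender, s[4] = class

    String scoreLine = "1500100001,1000001,98";
    String[] c = scoreLine.split(",");
    // c[0] = student id, c[1] = subject id (assumed), c[2] = score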
Counting students by gender
package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo2GenderCnt {
    // Map side
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split(",");
            String gender = splits[3];
            // Use the gender as the key and 1 as the value
            context.write(new Text(gender), new IntWritable(1));
        }
    }

    // Reduce side
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            int cnt = 0;
            // Count the number of students of this gender
            for (IntWritable value : values) {
                cnt += value.get();
            }
            context.write(key, new IntWritable(cnt));
        }
    }

    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // This name is shown in the YARN web UI
        job.setJobName("Demo2GenderCnt");
        job.setJarByClass(Demo2GenderCnt.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Configure the input and output paths
        FileInputFormat.addInputPath(job, new Path("/student/input"));
        // The output path does not need to be created in advance; if the directory already exists, the job will fail
        // Use the HDFS Java API to check whether the output path exists and delete it if it does
        Path outPath = new Path("/student/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        /**
         * 1. Prepare the data: upload students.txt to /student/input on HDFS
         *    hdfs dfs -mkdir -p /student/input
         *    hdfs dfs -put students.txt /student/input/
         * 2. Submit the MapReduce job
         *    hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo2GenderCnt
         * 3. View logs / kill the job
         *    yarn logs -applicationId application_1644480440500_0006
         *    yarn application -kill application_1644480440500_0007
         */
    }
}
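The driver above hard-codes its input and output paths. If you would rather pass them on the command line, one common pattern is to implement Hadoop's Tool interface and submit through ToolRunner. The sketch below is not part of the original code; the class name Demo2GenderCntTool and the use of args[0]/args[1] are illustrative choices, and it simply reuses the Mapper and Reducer defined above:

package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical driver variant that reads the paths from the command line, e.g.
// hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo2GenderCntTool /student/input /student/output
public class Demo2GenderCntTool extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJobName("Demo2GenderCnt");
        job.setJarByClass(Demo2GenderCntTool.class);
        job.setMapperClass(Demo2GenderCnt.MyMapper.class);      // reuse the Mapper defined above
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(Demo2GenderCnt.MyReducer.class);    // reuse the Reducer defined above
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path from the command line
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path from the command line
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new Demo2GenderCntTool(), args));
    }
}

ToolRunner also parses generic options such as -D key=value, which is convenient for settings like the output separator used later in this post.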
Computing each student's total score
package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo3SumScore {
    // Map side
    public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, LongWritable, IntWritable>.Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split(",");
            String id = splits[0];
            String score = splits[2];
            // Use the student id as the key and the score as the value
            context.write(new LongWritable(Long.parseLong(id)), new IntWritable(Integer.parseInt(score)));
        }
    }

    // Reduce side
    public static class MyReducer extends Reducer<LongWritable, IntWritable, LongWritable, IntWritable> {
        @Override
        protected void reduce(LongWritable key, Iterable<IntWritable> values, Reducer<LongWritable, IntWritable, LongWritable, IntWritable>.Context context) throws IOException, InterruptedException {
            int sumScore = 0;
            // Sum up this student's total score
            for (IntWritable value : values) {
                sumScore += value.get();
            }
            context.write(key, new IntWritable(sumScore));
        }
    }

    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJobName("Demo3SumScore");
        job.setJarByClass(Demo3SumScore.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Configure the input and output paths
        FileInputFormat.addInputPath(job, new Path("/student/score/input"));
        // The output path does not need to be created in advance; if the directory already exists, the job will fail
        // Use the HDFS Java API to check whether the output path exists and delete it if it does
        Path outPath = new Path("/student/score/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        /**
         * 1. Prepare the data: upload score.txt to /student/score/input on HDFS
         *    hdfs dfs -mkdir -p /student/score/input
         *    hdfs dfs -put score.txt /student/score/input/
         * 2. Submit the MapReduce job
         *    hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo3SumScore
         * 3. View logs / kill the job
         *    yarn logs -applicationId application_1644480440500_0006
         *    yarn application -kill application_1644480440500_0007
         */
    }
}
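One detail that matters for the join in the next section: Demo3SumScore does not change the output separator, so TextOutputFormat writes each record as the key, a tab character, and then the value. Demo4Join's mapper later calls split("\t") and relies on exactly this format. For example, based on the six score rows shown above for student 1500100001 (98 + 5 + 137 + 29 + 85 + 52), the corresponding output line would look like this (the two columns are separated by a tab):

    1500100001	406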
Join (associating students with their total scores)
package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo4Join {
    // Map side: tag each record according to which file it came from
    public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, LongWritable, Text>.Context context) throws IOException, InterruptedException {
            // Work out which file this value came from
            String v = value.toString();
            if (v.contains(",")) {
                // Student record (comma-separated)
                String[] stuSplits = v.split(",");
                long id = Long.parseLong(stuSplits[0]);
                String name = stuSplits[1];
                String clazz = stuSplits[4];
                context.write(new LongWritable(id), new Text(name + "," + clazz + "|"));
            } else {
                // Total-score record (tab-separated output of Demo3SumScore)
                String[] sumScoreSplit = v.split("\t");
                context.write(new LongWritable(Long.parseLong(sumScoreSplit[0])), new Text(sumScoreSplit[1] + "#"));
            }
        }
    }

    // Reduce side: join the two record types that share the same student id
    public static class MyReducer extends Reducer<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void reduce(LongWritable key, Iterable<Text> values, Reducer<LongWritable, Text, LongWritable, Text>.Context context) throws IOException, InterruptedException {
            String stuV = "";
            String sumScoreV = "";
            for (Text value : values) {
                String v = value.toString();
                if (v.contains("|")) {
                    // Student record
                    stuV = v.replace("|", "");
                } else {
                    // Total-score record
                    sumScoreV = v.replace("#", "");
                }
            }
            context.write(key, new Text(stuV + "," + sumScoreV));
        }
    }

    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        // Set the key-value separator of the job output
        // (on newer Hadoop versions the property name is mapreduce.output.textoutputformat.separator)
        conf.set("mapred.textoutputformat.separator", ",");
        Job job = Job.getInstance(conf);
        job.setJobName("Demo4Join");
        job.setJarByClass(Demo4Join.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Configure the input and output paths (two inputs: the students file and the totals produced by Demo3SumScore)
        FileInputFormat.addInputPath(job, new Path("/student/input"));
        FileInputFormat.addInputPath(job, new Path("/student/score/output"));
        // The output path does not need to be created in advance; if the directory already exists, the job will fail
        // Use the HDFS Java API to check whether the output path exists and delete it if it does
        Path outPath = new Path("/student/join/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        /**
         * Submit the job (there is no need to create /student/join/output in advance;
         * the driver deletes it if it already exists)
         * hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo4Join
         */
    }
}
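The mapper above tells the two inputs apart by sniffing the record content: student lines contain commas, while the total-score lines produced by Demo3SumScore are tab-separated. An alternative that avoids content sniffing is MultipleInputs, which assigns a dedicated Mapper to each input path. The sketch below is not from the original post; it assumes the reducer and the rest of the driver stay exactly as in Demo4Join, and the mapper names are placeholders:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import java.io.IOException;

// Hypothetical mapper that only ever sees students.txt lines
public static class StudentMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] splits = value.toString().split(",");
        // Keep the same "name,class|" tagging so the existing reducer still works
        context.write(new LongWritable(Long.parseLong(splits[0])), new Text(splits[1] + "," + splits[4] + "|"));
    }
}

// Hypothetical mapper that only ever sees the tab-separated output of Demo3SumScore
public static class SumScoreMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] splits = value.toString().split("\t");
        context.write(new LongWritable(Long.parseLong(splits[0])), new Text(splits[1] + "#"));
    }
}

// In the driver, replace the two FileInputFormat.addInputPath calls and job.setMapperClass with:
MultipleInputs.addInputPath(job, new Path("/student/input"), TextInputFormat.class, StudentMapper.class);
MultipleInputs.addInputPath(job, new Path("/student/score/output"), TextInputFormat.class, SumScoreMapper.class);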
Map-side filtering
package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo5MRFilter {
    // Filter in the Mapper: emit only the records we want to keep
    public static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
            // Keep only the students in 文科三班 (Liberal Arts Class 3)
            String clazz = value.toString().split(",")[4];
            if ("文科三班".equals(clazz)) {
                context.write(value, NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        // Set the key-value separator of the job output
        // (on newer Hadoop versions the property name is mapreduce.output.textoutputformat.separator)
        conf.set("mapred.textoutputformat.separator", ",");
        Job job = Job.getInstance(conf);
        job.setJobName("Demo5MRFilter");
        job.setJarByClass(Demo5MRFilter.class);

        job.setMapperClass(MyMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Configure the input and output paths
        FileInputFormat.addInputPath(job, new Path("/student/input"));
        // The output path does not need to be created in advance; if the directory already exists, the job will fail
        // Use the HDFS Java API to check whether the output path exists and delete it if it does
        Path outPath = new Path("/student/filter/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        /**
         * Submit the job (there is no need to create /student/filter/output in advance;
         * the driver deletes it if it already exists)
         * hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo5MRFilter
         */
    }
}
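The class to keep, 文科三班, is hard-coded in the mapper. If you want to choose the class when submitting the job, one common approach is to pass it through the Configuration in the driver and read it back in the mapper's setup() method. This is a sketch, not part of the original code; the property name filter.clazz is made up for illustration:

// In the driver, before creating the Job (hypothetical property name):
conf.set("filter.clazz", "文科三班");

// A mapper variant that reads the target class from the job configuration:
public static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private String targetClazz;

    @Override
    protected void setup(Context context) {
        // Fall back to the original hard-coded value if the property is not set
        targetClazz = context.getConfiguration().get("filter.clazz", "文科三班");
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String clazz = value.toString().split(",")[4];
        if (targetClazz.equals(clazz)) {
            context.write(value, NullWritable.get());
        }
    }
}

Combined with the ToolRunner sketch shown earlier, the value could also be supplied on the command line as a generic option, e.g. -D filter.clazz=文科三班.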
Combiner pre-aggregation
# A combiner is a reduce that runs on the map side (local pre-aggregation before the shuffle)
package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

// Count how many times each word occurs
public class Demo6WordCountCombiner {
    // Map phase
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        /**
         * @param key     Map input key -> the byte offset of the line
         * @param value   Map input value -> one line of data
         * @param context the MapReduce context -> gives access to runtime parameters and state, and is used to send the Map output on to the Reduce phase
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Implement the map-side logic
            String vStr = value.toString();
            // Split on spaces to get the individual words
            String[] words = vStr.split(" ");
            // Turn each word into a k-v pair
            /**
             * hadoop hive hbase spark flink
             * ====>
             * hadoop 1
             * hive 1
             * hbase 1
             * spark 1
             * flink 1
             */
            for (String word : words) {
                Text keyOut = new Text(word);
                IntWritable valueOut = new IntWritable(1);
                // Send the k-v pair on through the context
                context.write(keyOut, valueOut);
            }
        }
    }

    // Custom Combiner
    public static class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Same logic as the Reducer, but it runs on the map side as a local pre-aggregation
            int sum = 0; // running count for the current word
            for (IntWritable value : values) {
                // Iterate over the values
                sum += value.get();
            }
            // Emit the partial count for this word (it becomes the input of the Reduce phase)
            context.write(key, new IntWritable(sum));
        }
    }

    // Reduce phase
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        /**
         * @param key     one key of the grouped Map output, i.e. a word
         * @param values  all values that share this key, provided as an iterable
         * @param context the MapReduce context, used here to write the result to HDFS
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Implement the reduce-side logic
            int sum = 0; // total count for the current word
            for (IntWritable value : values) {
                // Iterate over the values
                sum += value.get();
            }
            // Write the final count for this word to HDFS
            context.write(key, new IntWritable(sum));
        }
    }

    // Driver (wires the Map and Reduce phases together)
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // Create the configuration
        Configuration conf = new Configuration();
        // Create a Job instance
        Job job = Job.getInstance(conf);
        // Basic job configuration
        job.setJobName("Demo6WordCountCombiner");
        // Use this class to locate the jar that should be shipped to the cluster
        job.setJarByClass(Demo6WordCountCombiner.class);

        // Configure the Map side
        // Type of the Map output key
        job.setMapOutputKeyClass(Text.class);
        // Type of the Map output value
        job.setMapOutputValueClass(IntWritable.class);
        // Which class runs as the Map task
        job.setMapperClass(MyMapper.class);
        // Register the Combiner
        job.setCombinerClass(MyCombiner.class);

        // Configure the Reduce side
        // Type of the Reduce output key
        job.setOutputKeyClass(Text.class);
        // Type of the Reduce output value
        job.setOutputValueClass(IntWritable.class);
        // Which class runs as the Reduce task
        job.setReducerClass(MyReducer.class);

        // Configure the input and output paths
        FileInputFormat.addInputPath(job, new Path("/wordCount/input"));
        // The output path does not need to be created in advance; if the directory already exists, the job will fail
        // Use the HDFS Java API to check whether the output path exists and delete it if it does
        Path outPath = new Path("/wordCount/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        /**
         * 1. Prepare the data: upload words.txt to /wordCount/input on HDFS
         *    hdfs dfs -mkdir -p /wordCount/input
         *    hdfs dfs -put words.txt /wordCount/input
         * 2. Submit the MapReduce job
         *    hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo6WordCountCombiner
         */
    }
}
# Note: combiner pre-aggregation is suitable for aggregations such as Max, Min, and Sum.
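Since MyCombiner and MyReducer contain identical logic and summation is commutative and associative, the separate combiner class is not strictly required: the reducer itself can be registered as the combiner (this works here because its input and output types are both Text/IntWritable). The note above is also why an average cannot be pre-aggregated this way without extra bookkeeping. A minimal illustration:

// Equivalent configuration for this job: reuse the reducer as the combiner
job.setCombinerClass(MyReducer.class);

// Counter-example: averaging partial averages gives a wrong answer,
// e.g. avg(avg(1, 2), avg(3)) = avg(1.5, 3) = 2.25, but avg(1, 2, 3) = 2.
// To use a combiner for averages, the map side would have to emit (sum, count) pairs instead.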