In-Depth Understanding of MapReduce (Part 2)
1. MapReduce jobs without a reduce phase
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCount03 {
    public static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            // The sex field is the 4th comma-separated column
            String s = line.split(",")[3];
            if (s.equals("男")) {
                context.write(new Text(s), NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance();
        /**
         * Some jobs do not need a reduce (aggregation) phase.
         * The number of reduce tasks defaults to 1, so it must be set to 0 explicitly;
         * otherwise a default reducer still runs, even though it processes no data.
         */
        job.setNumReduceTasks(0);
        job.setJobName("mr03程序");
        job.setJarByClass(WordCount03.class);
        // Register the mapper and its output key/value types
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Input path
        Path in = new Path("/word");
        FileInputFormat.addInputPath(job, in);

        // Output path; delete it first if it already exists
        Path out = new Path("/output");
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);

        job.waitForCompletion(true);
    }
}
Note:
Some jobs only need the map phase; when no aggregation is required, the reduce phase can be dropped entirely. The number of reduce tasks defaults to 1, so it has to be set to 0 explicitly; otherwise a default reducer still runs, even though it does no useful work.
With 0 reduce tasks there is no shuffle/sort step, and the mapper output is written straight to the output directory as part-m-00000, part-m-00001, ... files instead of part-r-* files. For the mapper above, each output line is simply "男", one per matching record.
2. Join operations in MapReduce (joining two datasets)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;

public class WordCount04 {
    public static class JoinMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // 1. Get the path of the file this record comes from, via the InputSplit
            //    (context links the task to HDFS on the input side and, on the output side,
            //     to the reducer if there is one, otherwise directly to HDFS)
            InputSplit inputSplit = context.getInputSplit();
            FileSplit fs = (FileSplit) inputSplit;
            String url = fs.getPath().toString();
            // 2. Decide which source file the record belongs to
            if (url.contains("students")) { // record comes from students.txt
                // The student id is the key and also the join condition between the two files
                String id = value.toString().split(",")[0];
                // Tag records from different sources so the reducer can tell them apart
                String line = "*" + value.toString();
                context.write(new Text(id), new Text(line));
            } else { // record comes from score.txt
                String id = value.toString().split(",")[0];
                String line = "#" + value.toString();
                context.write(new Text(id), new Text(line));
            }
        }
    }

    public static class JoinReduce extends Reducer<Text, Text, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Buffers declared outside the loop
            String stuInfo = "";
            ArrayList<String> scores = new ArrayList<String>();
            // Separate the records: values mixes lines from students.txt and score.txt
            for (Text value : values) {
                String line = value.toString();
                if (line.startsWith("*")) { // student record
                    stuInfo = line.substring(1);
                } else {                    // score record
                    scores.add(line.substring(1));
                }
            }
            // Join the two tables: emit one line per score, prefixed with the student info
            for (String score : scores) {
                String subject = score.split(",")[1];
                String s = score.split(",")[2];
                String end = stuInfo + "," + subject + "," + s;
                context.write(new Text(end), NullWritable.get());
            }
            // Variant: join the two tables and sum the scores while joining
            // long sum = 0L;
            // for (String s : scores) {
            //     Integer sc = Integer.valueOf(s.split(",")[2]);
            //     sum += sc;
            // }
            // String end = stuInfo + "," + sum;
            // context.write(new Text(end), NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("Join MapReduce");
        job.setJarByClass(WordCount04.class);

        job.setMapperClass(JoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(JoinReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Input and output paths; delete the output directory if it already exists
        FileInputFormat.addInputPath(job, new Path("/word"));
        Path path = new Path("/output");
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);

        System.out.println("join job submitted, waiting for completion...");
        job.waitForCompletion(true);
    }
}
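For orientation, here is a hypothetical sample of the input the code assumes: two comma-separated files under /word, each with the student id in the first column (the field names and values below are illustrative, not the actual dataset):

    students.txt (id,name,age,sex,class):  1001,Tom,18,男,class01
    score.txt    (id,subject,score):       1001,math,95
                                           1001,english,80
    joined output:                         1001,Tom,18,男,class01,math,95
                                           1001,Tom,18,男,class01,english,80

Because both mappers key their output by the student id, all records for one student arrive at the same reduce() call, which is what makes the reduce-side join possible.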
3. Adding a Combiner in MapReduce
A combiner is a reduce-like operation that runs on the map side. Its purpose is to shrink the map output, reducing the amount of data transferred across the network during the shuffle and improving job performance.
A combiner only aggregates the output of a single map task; it never sees the combined output of all maps, so it cannot replace the reducer.
Without a combiner, all aggregation is left to the reduce phase, which is comparatively inefficient. With a combiner, each map task pre-aggregates its own output locally as soon as it finishes, so the reducers receive far fewer records. Note: the combiner's output becomes the reducer's input, and the framework may run the combiner zero, one, or several times per map task, so it must never change the final result. It therefore suits operations that are associative and commutative, such as summation or taking a maximum; computing an average directly is not suitable.
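To see why averaging breaks, suppose one map task produces the values 2 and 4 and another produces 6. Averaging in the combiner and again in the reducer gives avg(avg(2,4), avg(6)) = avg(3, 6) = 4.5, whereas the true average of 2, 4 and 6 is 4. The usual fix (not used in the code below) is to have the combiner emit partial (sum, count) pairs and let the reducer do the final division.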
Effect before and after the combine operation (screenshots omitted):
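When comparing the two runs, the job counters that reflect the combiner's effect are Combine input records, Combine output records, Reduce shuffle bytes and Reduce input records: with the combiner enabled, the combine counters become non-zero and the reduce-side numbers drop sharply, because each map task typically forwards only a few pre-aggregated records (here, at most one per sex value) instead of one record per input line.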
package com.shujia.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo07Combine {
    public static class CombineMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            // The sex field is the 4th comma-separated column; emit (sex, 1)
            String sex = line.split(",")[3];
            context.write(new Text(sex), new LongWritable(1));
        }
    }

    // The combine (pre-aggregation) step runs after the map phase and before the reduce phase
    public static class Combine extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static class CombineReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        // Create the job
        Job job = Job.getInstance();

        // Number of reduce tasks
        job.setNumReduceTasks(2);
        // Job name
        job.setJobName("Combine,性别统计");
        // Class containing main(), used to locate the jar
        job.setJarByClass(Demo07Combine.class);
        // Mapper and its output key/value types
        job.setMapperClass(CombineMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Register the combiner
        job.setCombinerClass(Combine.class);

        // Reducer and its output key/value types
        job.setReducerClass(CombineReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Input path
        Path in = new Path("/data");
        FileInputFormat.addInputPath(job, in);
        // Output path; delete it first if it already exists
        Path out = new Path("/output");
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);

        /**
         * Submitting the job:
         * 1. Package the project with maven (package) and upload the jar to the server
         * 2. Run it, e.g.:
         *    hadoop jar hadoop-mapreduce-examples-2.7.6.jar com.shujia.hadoop.Demo01WordCount /word /output
         */
        System.out.println("Combine job submitted, waiting for completion...");
        job.waitForCompletion(true);
    }
}
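Since the Combine and CombineReduce classes above are identical, the reducer class itself could be registered as the combiner, which works whenever the reducer's input and output key/value types are the same, as with this summation:

    job.setCombinerClass(CombineReduce.class);

This shortcut relies on summation being associative and commutative; for an operation like averaging, the combiner has to emit intermediate results (for example a partial sum and a count) rather than the final value, so a separate combiner class is required.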