MapReduce经典案例讲解
1.天气案例
1.1原始数据
1949-10-01 14:21:02 34c
1949-10-01 19:21:02 38c
1949-10-02 14:01:02 32c
1950-01-01 11:21:02 32c
1950-10-01 12:21:02 37c
1951-12-01 12:21:02 23c
1950-10-02 12:21:02 41c
1950-10-03 12:21:02 27c
1951-07-01 12:21:02 45c
1951-07-02 12:21:02 46c
1951-07-03 12:21:03 47c
1.2分析过程
查询出每年每月最高的温度
maptask --> 1949-10-01 14:21:02 34c
kvbuffer -->
key(Text):1949-10
value(IntWritable):34
分区
一定要保证每年每月的数据都在一个分区
排序
先按照分区,然后按照Key进行排序
reduceTask
key:(1949-10)
values:(34 38 32)
写出
查询出每年每月最高的(两个)温度
从values取出值排序,然后取出前两个不同的
查询出每年每月最高的两个温度在第几天
maptask --> 1949-10-01 14:21:02 34c
kvbuffer -->
key(Text):1949-10-10 1949-10-11
value(IntWritable):10-34 10-21 10-33 11-31
1.3创建类
1.3.1搭建环境
上传weather文件至Hadoop
创建java项目
配置文件
上传jar包
注意:创建mapper类时,一定要选择导入的jar包中 org.apache.hadoop.mapreduce 包下的 Mapper(不要选成其他包下的同名类)
1.3.2创建job类
package com.shsxt.rjw;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.sun.jersey.core.impl.provider.entity.XMLJAXBElementProvider.Text;
public class WeatherJob {
public static void main(String[] args) throws Exception{
//获取配置文件
Configuration configuration = new Configuration(true);
//创建job
Job job = Job.getInstance(configuration);
//设置job的名称
job.setJobName("weather");
//设置job的reduce的数量
job.setNumReduceTasks(2);
//设置job的输入路径
FileInputFormat.setInputPaths(job, new Path("/shsxt/java/weather"));
//设置job的输出路径
FileOutputFormat.setOutputPath(job, new Path("/shsxt/java/weather_result_" + System.currentTimeMillis()));
//设置mapper的输出key的类型
job.setMapOutputKeyClass(Text.class);
//设置mapper的输出value的类型
job.setMapOutputValueClass(IntWritable.class);
//设置mapper
job.setMapperClass(WeatherMapper.class);
//设置Reduce
job.setReducerClass(WeatherReducer.class);
//等待Job的完成
job.waitForCompletion(true);
}
}
1.3.3创建mapper类
package com.shsxt.rjw;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.io.Text;
public class WeatherMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
1.3.4创建reduce类
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WeatherReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
1.3.5项目打包上传到Hadoop上
hadoop jar GG.jar com.shsxt.rjw.WeatherJob
1.3.6启动MapReduce集群时
yarn-daemon.sh start resourcemanager
1.3.7常见的坑
1.导错包(例如 Text 必须导入 org.apache.hadoop.io.Text,而不是其他包下的同名类)
2.路径有问题