MapReduce经典案例讲解

1.天气案例

1.1原始数据

1949-10-01 14:21:02 34c 1949-10-01 19:21:02 38c 1949-10-02 14:01:02 32c 1950-01-01 11:21:02 32c 1950-10-01 12:21:02 37c 1951-12-01 12:21:02 23c 1950-10-02 12:21:02 41c 1950-10-03 12:21:02 27c 1951-07-01 12:21:02 45c 1951-07-02 12:21:02 46c 1951-07-03 12:21:03 47c

1.2分析过程

查询出每年每月最高的温度

maptask --> 1949-10-01 14:21:02 34c

kvbuffer -->

key(Text):1949-10

value(IntWritable):34

分区

一定要保证每年每月的数据都在一个分区

排序

先按照分区,然后按照Key进行排序

reduceTask

key:(1949-10)

values:(34 38 32 36)

写出

查询出每年每月最高的(两个)温度

从values取出值排序,然后取出前两个不同的

查询出每年每月最高的两个温度在第几天

maptask --> 1949-10-01 14:21:02 34c

kvbuffer -->

key(Text):1949-10-10 1949-10-11

value(IntWritable):10-34 10-21 10-33 11-31

1.3创建类

1.3.1搭建环境

上传weather文件至Hadoop

创建java项目

配置文件

上传jar包

注意:创建mapper类时一定要选择导入的资源包中的mapper仓库下的

1.3.2创建job类

package com.shsxt.rjw;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.sun.jersey.core.impl.provider.entity.XMLJAXBElementProvider.Text;

public class WeatherJob {

public static void main(String[] args) throws Exception{
//获取配置文件
Configuration configuration = new Configuration(true);
//创建job
Job job = Job.getInstance(configuration);
//设置job的名称
job.setJobName("weather");
//设置job的reduce的数量
job.setNumReduceTasks(2);
//设置job的输入路径
FileInputFormat.setInputPaths(job, new Path("/shsxt/java/weather"));
//设置job的输出路径
FileOutputFormat.setOutputPath(job, new Path("/shsxt/java/weather_result_" + System.currentTimeMillis()));
//设置mapper的输出key的类型
job.setMapOutputKeyClass(Text.class);
//设置mapper的输出value的类型
job.setMapOutputValueClass(IntWritable.class);
//设置mapper
job.setMapperClass(WeatherMapper.class);
//设置Reduce
job.setReducerClass(WeatherReducer.class);
//等待Job的完成
job.waitForCompletion(true);
}

}

1.3.3创建mapper类

package com.shsxt.rjw;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.io.Text;

public class WeatherMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * Parses one raw record of the form {@code "1949-10-01 14:21:02\t34c"}
     * and emits key = {@code "YYYY-MM"}, value = temperature as an int.
     * Malformed or blank lines are skipped instead of crashing the task.
     *
     * NOTE(review): this assumes datetime and temperature are tab-separated;
     * the sample data in the post renders with spaces — confirm the actual
     * delimiter of the uploaded file.
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // Split datetime from temperature on the tab delimiter
        String[] ss = value.toString().split("\t");
        if (ss.length < 2) {
            // Bad/blank record: originally this threw ArrayIndexOutOfBoundsException
            // and killed the whole map task; skip it instead
            return;
        }
        // "1949-10-01" -> year "1949", month "10"
        String[] date = ss[0].split("-");
        if (date.length < 2) {
            return;
        }
        Text outputKey = new Text(date[0] + "-" + date[1]);
        // "34c" -> 34 (strip the trailing unit marker and any stray whitespace)
        IntWritable outputValue = new IntWritable(Integer.parseInt(ss[1].replace("c", "").trim()));
        // Emit (year-month, temperature)
        context.write(outputKey, outputValue);
    }
}

1.3.4创建reduce类

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WeatherReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Emits the maximum temperature observed for the month identified by
     * {@code key}.
     *
     * The original implementation copied every temperature into a List and
     * sorted it just to read the last element — O(n log n) time and O(n)
     * memory per key. A single-pass running max is O(n) time and O(1) memory
     * and equally clear.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        boolean seen = false;
        // Single pass over this month's temperatures
        for (IntWritable v : values) {
            seen = true;
            int temperature = v.get();
            if (temperature > max) {
                max = temperature;
            }
        }
        // The framework never calls reduce() with an empty group, but guard
        // anyway so we never emit Integer.MIN_VALUE
        if (seen) {
            context.write(key, new IntWritable(max));
        }
    }
}

1.3.5项目打包上传到Hadoop上

hadoop jar GG.jar com.shsxt.rjw.WeatherJob

1.3.6启动MapReduce集群时

注意:必须启动resourcemanager

yarn-daemon.sh start resourcemanager

1.3.7常见的坑

1.导错包

2.路径有问题

 

posted @ 2020-01-02 16:09  数据阮小白  阅读(1022)  评论(0编辑  收藏  举报