MapReduce经典案例讲解

1.天气案例

1.1原始数据

1949-10-01 14:21:02 34c 1949-10-01 19:21:02 38c 1949-10-02 14:01:02 32c 1950-01-01 11:21:02 32c 1950-10-01 12:21:02 37c 1951-12-01 12:21:02 23c 1950-10-02 12:21:02 41c 1950-10-03 12:21:02 27c 1951-07-01 12:21:02 45c 1951-07-02 12:21:02 46c 1951-07-03 12:21:03 47c

1.2分析过程

查询出每年每月最高的温度

maptask --> 1949-10-01 14:21:02 34c

kvbuffer -->

key(Text):1949-10

value(IntWritable):34

分区

一定要保证每年每月的数据都在一个分区

排序

先按照分区,然后按照Key进行排序

reduceTask

key:(1949-10)

values:(34 38 32 36)

写出

查询出每年每月最高的(两个)温度

从values取出值排序,然后取出前两个不同的

查询出每年每月最高的两个温度在第几天

maptask --> 1949-10-01 14:21:02 34c

kvbuffer -->

key(Text):1949-10-10 1949-10-11

value(IntWritable):10-34 10-21 10-33 11-31

1.3创建类

1.3.1搭建环境

上传weather文件至Hadoop

创建java项目

配置文件

上传jar包

注意:创建mapper类时一定要选择导入的资源包中的mapper仓库下的

1.3.2创建job类

package com.shsxt.rjw;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.sun.jersey.core.impl.provider.entity.XMLJAXBElementProvider.Text;

public class WeatherJob {

public static void main(String[] args) throws Exception{
//获取配置文件
Configuration configuration = new Configuration(true);
//创建job
Job job = Job.getInstance(configuration);
//设置job的名称
job.setJobName("weather");
//设置job的reduce的数量
job.setNumReduceTasks(2);
//设置job的输入路径
FileInputFormat.setInputPaths(job, new Path("/shsxt/java/weather"));
//设置job的输出路径
FileOutputFormat.setOutputPath(job, new Path("/shsxt/java/weather_result_" + System.currentTimeMillis()));
//设置mapper的输出key的类型
job.setMapOutputKeyClass(Text.class);
//设置mapper的输出value的类型
job.setMapOutputValueClass(IntWritable.class);
//设置mapper
job.setMapperClass(WeatherMapper.class);
//设置Reduce
job.setReducerClass(WeatherReducer.class);
//等待Job的完成
job.waitForCompletion(true);
}

}

1.3.3创建mapper类

package com.shsxt.rjw;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.io.Text;

public class WeatherMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * Parses one raw record of the form {@code "1949-10-01 14:21:02\t34c"}
     * and emits key = {@code "YYYY-MM"}, value = temperature as an int.
     * Malformed or blank lines are skipped instead of crashing the task.
     *
     * NOTE(review): this assumes datetime and temperature are tab-separated;
     * the sample data in the post renders with spaces — confirm the actual
     * delimiter of the uploaded file.
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // Split datetime from temperature on the tab delimiter
        String[] ss = value.toString().split("\t");
        if (ss.length < 2) {
            // Bad/blank record: originally this threw ArrayIndexOutOfBoundsException
            // and killed the whole map task; skip it instead
            return;
        }
        // "1949-10-01" -> year "1949", month "10"
        String[] date = ss[0].split("-");
        if (date.length < 2) {
            return;
        }
        Text outputKey = new Text(date[0] + "-" + date[1]);
        // "34c" -> 34 (strip the trailing unit marker and any stray whitespace)
        IntWritable outputValue = new IntWritable(Integer.parseInt(ss[1].replace("c", "").trim()));
        // Emit (year-month, temperature)
        context.write(outputKey, outputValue);
    }
}

1.3.4创建reduce类

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WeatherReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Emits the maximum temperature observed for the month identified by
     * {@code key}.
     *
     * The original implementation copied every temperature into a List and
     * sorted it just to read the last element — O(n log n) time and O(n)
     * memory per key. A single-pass running max is O(n) time and O(1) memory
     * and equally clear.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        boolean seen = false;
        // Single pass over this month's temperatures
        for (IntWritable v : values) {
            seen = true;
            int temperature = v.get();
            if (temperature > max) {
                max = temperature;
            }
        }
        // The framework never calls reduce() with an empty group, but guard
        // anyway so we never emit Integer.MIN_VALUE
        if (seen) {
            context.write(key, new IntWritable(max));
        }
    }
}

1.3.5项目打包上传到Hadoop上

hadoop jar GG.jar com.shsxt.rjw.WeatherJob

1.3.6启动MapReduce集群时

注意:必须启动resourcemanager

yarn-daemon.sh start resourcemanager

1.3.7常见的坑

1.导错包

2.路径有问题

 

posted @ 2020-01-02 16:09  数据阮小白  阅读(1022)  评论(0编辑  收藏  举报