MapReduce Case Study 1: Weather Temperatures
1. Requirement
For each year and month in the weather records, find the two highest temperatures that occur on different days (i.e. the two hottest days of that month).
2. Approach
Use a composite key TQ (year, month, day, temperature). The mapper parses each line into a TQ key with the temperature as the value; a custom sort comparator orders records by year and month ascending and temperature descending; a grouping comparator treats records with the same year and month as one reduce group; the reducer then emits the first record of each group (the hottest day) and the first subsequent record that falls on a different day.
3. Code Implementation
3.1 MyWeather class code:
This class sets up the Hadoop configuration and registers the classes the job needs to load when it runs.
package com.hadoop.mr.weather;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyWeather {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration(true);
        Job job = Job.getInstance(conf);
        job.setJarByClass(MyWeather.class);

        //----------conf-----------------------
        //---begin Map:
        // input format class (default TextInputFormat)
        // job.setInputFormatClass(ooxx.class);
        // mapper class
        job.setMapperClass(TMapper.class);
        job.setMapOutputKeyClass(TQ.class);
        job.setMapOutputValueClass(IntWritable.class);
        // partitioner class
        job.setPartitionerClass(TPartitioner.class);
        // sort comparator class
        job.setSortComparatorClass(TSortComparator.class);
        // combiner class
        // job.setCombinerClass(TCombiner.class);
        //----end Map

        //----begin Reduce:
        // grouping comparator class
        job.setGroupingComparatorClass(TGroupingComparator.class);
        // reducer class
        job.setReducerClass(TReducer.class);
        //-----end Reduce

        // input path
        Path input = new Path("/data/tq/input");
        FileInputFormat.addInputPath(job, input);

        // output path
        Path output = new Path("/data/tq/output");
        if (output.getFileSystem(conf).exists(output)) {
            // delete the directory recursively if it already exists
            output.getFileSystem(conf).delete(output, true);
        }
        FileOutputFormat.setOutputPath(job, output);

        // number of reduce tasks, matching the number of partitions
        job.setNumReduceTasks(2);
        //-------------------------------------

        job.waitForCompletion(true);
    }
}
3.2 TMapper class code
This class extends Mapper; its main job is to preprocess each input line, parsing it into the composite key and the temperature value.
package com.hadoop.mr.weather;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;

// With TextInputFormat the input key is a LongWritable (byte offset) and the input value is Text.
public class TMapper extends Mapper<LongWritable, Text, TQ, IntWritable> {

    // reusable map output key/value objects
    TQ mkey = new TQ();                    // map output key
    IntWritable mval = new IntWritable();  // map output value

    @Override
    protected void map(LongWritable key, Text value,
            Mapper<LongWritable, Text, TQ, IntWritable>.Context context)
            throws IOException, InterruptedException {
        /**
         1949-10-01 14:21:02    34c
         1949-10-01 19:21:02    38c
         1949-10-02 14:01:02    36c
         1950-01-01 11:21:02    32c
         1950-10-01 12:21:02    37c
         **/
        try {
            // split the line on the tab character
            String[] strs = StringUtils.split(value.toString(), '\t');
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
            Date date = sdf.parse(strs[0]);
            Calendar cal = Calendar.getInstance();
            cal.setTime(date);
            mkey.setYear(cal.get(Calendar.YEAR));
            mkey.setMonth(cal.get(Calendar.MONTH) + 1); // Calendar months start at 0, so add 1
            mkey.setDay(cal.get(Calendar.DAY_OF_MONTH));
            // strip the trailing 'c' and parse the temperature as an int
            int wd = Integer.parseInt(strs[1].substring(0, strs[1].length() - 1));
            mkey.setWd(wd);
            mval.set(wd);
            context.write(mkey, mval);
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}
3.3 TQ class code
This class implements the WritableComparable interface; it defines the relevant fields and overrides the write, read, and compare methods.
package com.hadoop.mr.weather;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class TQ implements WritableComparable<TQ> {

    // fields
    private int year;
    private int month;
    private int day;
    private int wd; // temperature

    public int getYear() { return year; }
    public void setYear(int year) { this.year = year; }
    public int getMonth() { return month; }
    public void setMonth(int month) { this.month = month; }
    public int getDay() { return day; }
    public void setDay(int day) { this.day = day; }
    public int getWd() { return wd; }
    public void setWd(int wd) { this.wd = wd; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(year);
        out.writeInt(month);
        out.writeInt(day);
        out.writeInt(wd);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.year = in.readInt();
        this.month = in.readInt();
        this.day = in.readInt();
        this.wd = in.readInt();
    }

    @Override
    public int compareTo(TQ that) {
        // compareTo returns 0 if x == y, a value less than 0 if x < y, and a value greater than 0 if x > y
        // dates in ascending order: compare the years first
        int c1 = Integer.compare(this.year, that.getYear());
        if (c1 == 0) {
            // same year: compare the months
            int c2 = Integer.compare(this.month, that.getMonth());
            if (c2 == 0) {
                // same month: compare the days
                return Integer.compare(this.day, that.getDay());
            }
            return c2;
        }
        return c1;
    }
}
3.4 TPartitioner class code
This class defines how the map output is distributed across partitions, with the goal of avoiding data skew.
package com.hadoop.mr.weather;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class TPartitioner extends Partitioner<TQ, IntWritable> {

    // Rule of thumb for avoiding data skew: put the keys with little data together in one
    // reduce task group, and give the keys with large data volumes their own task group.
    @Override
    public int getPartition(TQ key, IntWritable value, int numPartitions) {
        return key.hashCode() % numPartitions;
    }
}
3.5 TSortComparator class code:
This class defines the sort comparator: year and month ascending, temperature descending.
package com.hadoop.mr.weather;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class TSortComparator extends WritableComparator {

    public TSortComparator() {
        super(TQ.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        TQ t1 = (TQ) a;
        TQ t2 = (TQ) b;
        int c1 = Integer.compare(t1.getYear(), t2.getYear());
        if (c1 == 0) {
            int c2 = Integer.compare(t1.getMonth(), t2.getMonth());
            if (c2 == 0) {
                // the minus sign sorts temperatures in descending order within the same year and month
                return -Integer.compare(t1.getWd(), t2.getWd());
            }
            return c2;
        }
        return c1;
    }
}
3.6 TGroupingComparator class code:
This class groups records by two dimensions: year and month.
package com.hadoop.mr.weather;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class TGroupingComparator extends WritableComparator {

    public TGroupingComparator() {
        super(TQ.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        TQ t1 = (TQ) a;
        TQ t2 = (TQ) b;
        int c1 = Integer.compare(t1.getYear(), t2.getYear());
        if (c1 == 0) {
            // same year: group by the month comparison
            return Integer.compare(t1.getMonth(), t2.getMonth());
        }
        return c1;
    }
}
3.7 TReducer class code
This class defines the format and content of the final output.
package com.hadoop.mr.weather;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TReducer extends Reducer<TQ, IntWritable, Text, IntWritable> {

    Text rkey = new Text();
    IntWritable rval = new IntWritable();

    @Override
    protected void reduce(TQ key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Records with "equal" keys (same year and month, per TGroupingComparator) form one group,
        // already sorted by temperature in descending order, e.g.:
        // 1970 01 01 88    88
        // 1970 01 11 78    78
        // 1970 01 21 68    68
        // 1970 01 01 58    58
        int flag = 0; // iteration counter
        int day = 0;
        for (IntWritable v : values) {
            if (flag == 0) {
                // first record of the group: the hottest day of the month
                // format the reduce key as 1970-01-01:88
                rkey.set(key.getYear() + "-" + key.getMonth() + "-" + key.getDay() + ":" + key.getWd());
                // set the reduce value to the temperature
                rval.set(key.getWd());
                flag++;
                day = key.getDay();
                context.write(rkey, rval);
            }
            // if a record has already been written and the current record is from a different day,
            // write it as the second result and stop iterating
            if (flag != 0 && day != key.getDay()) {
                rkey.set(key.getYear() + "-" + key.getMonth() + "-" + key.getDay() + ":" + key.getWd());
                rval.set(key.getWd());
                context.write(rkey, rval);
                break;
            }
        }
    }
}
4. Running the Job
4.1 Export the project as a jar and upload it to the server
4.2 Create the HDFS input directory
hdfs dfs -mkdir -p /data/tq/input
4.3 Upload the test file to the new HDFS directory
[root@node01 ~]# cat tq.txt
1949-10-01 14:21:02    34c
1949-10-01 19:21:02    38c
1949-10-02 14:01:02    36c
1950-01-01 11:21:02    32c
1950-10-01 12:21:02    37c
1951-12-01 12:21:02    23c
1950-10-02 12:21:02    41c
1950-10-03 12:21:02    27c
1951-07-01 12:21:02    45c
1951-07-02 12:21:02    46c
1951-07-03 12:21:03    47c
[root@node01 ~]# hdfs dfs -put tq.txt /data/tq/input
4.4 Run the job on the server
[root@node01 ~]# hadoop jar Myweather.jar com.hadoop.mr.weather.MyWeather
2018-12-29 22:42:01,101 INFO client.ConfiguredRMFailoverProxyProvider: Failing over to rm2
2018-12-29 22:42:01,484 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
2018-12-29 22:42:01,548 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /user/root/.staging/job_1546092355023_0004
2018-12-29 22:42:02,025 INFO input.FileInputFormat: Total input files to process : 1
2018-12-29 22:42:02,922 INFO mapreduce.JobSubmitter: number of splits:1
2018-12-29 22:42:02,975 INFO Configuration.deprecation: yarn.resourcemanager.zk-address is deprecated. Instead, use hadoop.zk.address
2018-12-29 22:42:02,976 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
2018-12-29 22:42:03,643 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1546092355023_0004
2018-12-29 22:42:03,644 INFO mapreduce.JobSubmitter: Executing with tokens: []
2018-12-29 22:42:03,932 INFO conf.Configuration: resource-types.xml not found
2018-12-29 22:42:03,932 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2018-12-29 22:42:04,012 INFO impl.YarnClientImpl: Submitted application application_1546092355023_0004
2018-12-29 22:42:04,064 INFO mapreduce.Job: The url to track the job: http://node04:8088/proxy/application_1546092355023_0004/
2018-12-29 22:42:04,065 INFO mapreduce.Job: Running job: job_1546092355023_0004
2018-12-29 22:42:13,301 INFO mapreduce.Job: Job job_1546092355023_0004 running in uber mode : false
2018-12-29 22:42:13,302 INFO mapreduce.Job: map 0% reduce 0%
2018-12-29 22:42:20,490 INFO mapreduce.Job: map 100% reduce 0%
2018-12-29 22:42:35,850 INFO mapreduce.Job: map 100% reduce 50%
2018-12-29 22:42:38,877 INFO mapreduce.Job: map 100% reduce 100%
2018-12-29 22:42:39,899 INFO mapreduce.Job: Job job_1546092355023_0004 completed successfully
2018-12-29 22:42:40,043 INFO mapreduce.Job: Counters: 53
    File System Counters
        FILE: Number of bytes read=254
        FILE: Number of bytes written=653891
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=366
        HDFS: Number of bytes written=141
        HDFS: Number of read operations=13
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=4
    Job Counters
        Launched map tasks=1
        Launched reduce tasks=2
        Rack-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=4437
        Total time spent by all reduces in occupied slots (ms)=29074
        Total time spent by all map tasks (ms)=4437
        Total time spent by all reduce tasks (ms)=29074
        Total vcore-milliseconds taken by all map tasks=4437
        Total vcore-milliseconds taken by all reduce tasks=29074
        Total megabyte-milliseconds taken by all map tasks=4543488
        Total megabyte-milliseconds taken by all reduce tasks=29771776
    Map-Reduce Framework
        Map input records=11
        Map output records=11
        Map output bytes=220
        Map output materialized bytes=254
        Input split bytes=102
        Combine input records=0
        Combine output records=0
        Reduce input groups=6
        Reduce shuffle bytes=254
        Reduce input records=11
        Reduce output records=9
        Spilled Records=22
        Shuffled Maps =2
        Failed Shuffles=0
        Merged Map outputs=2
        GC time elapsed (ms)=351
        CPU time spent (ms)=1640
        Physical memory (bytes) snapshot=419917824
        Virtual memory (bytes) snapshot=8213352448
        Total committed heap usage (bytes)=164515840
        Peak Map Physical memory (bytes)=206139392
        Peak Map Virtual memory (bytes)=2733309952
        Peak Reduce Physical memory (bytes)=108830720
        Peak Reduce Virtual memory (bytes)=2740228096
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=264
    File Output Format Counters
        Bytes Written=141
4.5 Pull the generated output files from HDFS to the local filesystem
[root@node01 ~]# hdfs dfs -get /data/tq/output/* ./test
4.6 Inspect the output files
[root@node01 test]# ls
part-r-00000  part-r-00001  _SUCCESS
[root@node01 test]# cat part-r-00000
[root@node01 test]# cat part-r-00001
1951-7-3:47    47
1951-7-2:46    46
1950-10-2:41    41
1950-10-3:27    27
1951-12-1:23    23
1950-1-1:32    32
1950-10-1:37    37
1949-10-1:38    38
1949-10-2:36    36
Partition 0 is empty while partition 1 holds all of the key/value output produced by the program, so data skew has occurred; the distribution rule defined in the TPartitioner class above is probably not well suited to this data.
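One likely contributing factor, visible in the code above, is that TQ never overrides hashCode(), so key.hashCode() in TPartitioner falls back to Object's identity hash; and because TMapper reuses a single mkey object, every record emitted by one map task produces the same hash value and therefore lands in the same partition. One way around this is to partition on the key fields directly instead of relying on hashCode(). The sketch below is illustrative only and not part of the original project (the class name and the hash formula are assumptions):

package com.hadoop.mr.weather;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical alternative partitioner, for illustration only: it partitions on the
// grouping fields (year, month) so that records for the same month always go to the
// same reducer, and masks off the sign bit so the partition index is never negative.
public class TYearMonthPartitioner extends Partitioner<TQ, IntWritable> {

    @Override
    public int getPartition(TQ key, IntWritable value, int numPartitions) {
        int hash = key.getYear() * 31 + key.getMonth();
        return (hash & Integer.MAX_VALUE) % numPartitions;
    }
}

Whether this balances the load better depends on how the months are distributed in the data, but it at least guarantees that records from the same map task can reach different reducers.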
5. Notes on the Combiner
Because the data volume here is small, this example does not implement a Combiner class.
Each map task may produce a large amount of local output. The Combiner's role is to merge the map-side output first, reducing the volume of data transferred between the map and reduce nodes and improving network I/O performance; it is one of MapReduce's optimization techniques. Its specific roles are described below.
(1) At its most basic, the Combiner performs local aggregation by key: the map output keys are sorted and the values are iterated, as shown below:
map: (K1, V1) → list(K2, V2)
combine: (K2, list(V2)) → list(K2, V2)
reduce: (K2, list(V2)) → list(K3, V3)
(2) The Combiner can also act as a local reduce (it is essentially a reduce). For example, in Hadoop's built-in wordcount example, and in programs that find the maximum value, the combiner is identical to the reducer, as shown below:
map: (K1, V1) → list(K2, V2)
combine: (K2, list(V2)) → list(K3, V3)
reduce: (K3, list(V3)) → list(K4, V4)
If wordcount did not use a combiner, all aggregation would be left to the reducers and efficiency would suffer. With a combiner, maps that finish early aggregate their output locally, which speeds things up. In Hadoop's built-in wordcount example the value is simply a count to be summed, so summing can start as soon as each map finishes instead of waiting for all maps to complete before the reducers add up the values; a sketch of such a combiner/reducer follows.
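Below is a minimal sketch of a wordcount-style sum reducer that can be registered as both the combiner and the reducer, along the lines of the IntSumReducer bundled with Hadoop's wordcount example. The package and class names are illustrative, not part of this project:

package com.hadoop.mr.wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sums the counts for one word. It can serve as both combiner and reducer
// because its input and output key/value types are identical (Text, IntWritable),
// and summing partial sums gives the same result as summing the raw counts.
public class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}

In the driver, the same class would be registered twice: job.setCombinerClass(IntSumReducer.class) and job.setReducerClass(IntSumReducer.class).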