Learning Hadoop: My First MapReduce Program

Expectations

Use this MapReduce program to understand the flow of a MapReduce job's execution, focusing on distilling useful information from the log output printed while the program runs.

Before Execution

Program Code

The code is lifted almost verbatim from *Hadoop: The Definitive Guide*. Its goal is to find the highest air temperature recorded in each year. The relevant code follows:

import org.apache.commons.lang3.StringUtils; // or org.apache.commons.lang.StringUtils, depending on the commons-lang version on the classpath

public class NcdcWeather {
    private String USAF_station_id;
    private String WBAN_station_id;
    private String date;
    private String time;
    private String latitude;
    private String longitude;
    /** Elevation */
    private String elevation;
    /** Wind direction */
    private String wind_direction;
    private String wind_direction_quality_code;
    private String sky_ceiling_height;
    private String sky_ceiling_height_quality_code;
    private String visibility_distance;
    private String visibility_distance_quality_code;
    private String air_temperature;
    private String air_temperature_quality_code;
    private String dew_point_temperature;
    private String dew_point_temperature_quality_code;
    private String atmospheric_pressure;
    private String atmospheric_pressure_quality_code;

    public NcdcWeather(String rowData) {
        if (StringUtils.isEmpty(rowData) || rowData.length() < 105) {
            return;
        }

        USAF_station_id = rowData.substring(4, 10);
        WBAN_station_id = rowData.substring(10, 15);
        date = rowData.substring(15, 23);
        time = rowData.substring(23, 27);
        latitude = rowData.substring(28, 34);
        longitude = rowData.substring(34, 41);
        elevation = rowData.substring(46, 51);
        wind_direction = rowData.substring(60, 63);
        wind_direction_quality_code = rowData.substring(63, 64);
        sky_ceiling_height = rowData.substring(70, 75);
        sky_ceiling_height_quality_code = rowData.substring(75, 76);
        visibility_distance = rowData.substring(78, 84);
        visibility_distance_quality_code = rowData.substring(84, 85);
        air_temperature = rowData.substring(87, 92);
        air_temperature_quality_code = rowData.substring(92, 93);
        dew_point_temperature = rowData.substring(93, 98);
        dew_point_temperature_quality_code = rowData.substring(98, 99);
        atmospheric_pressure = rowData.substring(99, 104);
        atmospheric_pressure_quality_code = rowData.substring(104, 105);
    }

    public String getUSAF_station_id() {
        return USAF_station_id;
    }

    public void setUSAF_station_id(String USAF_station_id) {
        this.USAF_station_id = USAF_station_id;
    }

    public String getWBAN_station_id() {
        return WBAN_station_id;
    }

    public void setWBAN_station_id(String WBAN_station_id) {
        this.WBAN_station_id = WBAN_station_id;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getLatitude() {
        return latitude;
    }

    public void setLatitude(String latitude) {
        this.latitude = latitude;
    }

    public String getLongitude() {
        return longitude;
    }

    public void setLongitude(String longitude) {
        this.longitude = longitude;
    }

    public String getElevation() {
        return elevation;
    }

    public void setElevation(String elevation) {
        this.elevation = elevation;
    }

    public String getWind_direction() {
        return wind_direction;
    }

    public void setWind_direction(String wind_direction) {
        this.wind_direction = wind_direction;
    }

    public String getWind_direction_quality_code() {
        return wind_direction_quality_code;
    }

    public void setWind_direction_quality_code(String wind_direction_quality_code) {
        this.wind_direction_quality_code = wind_direction_quality_code;
    }

    public String getSky_ceiling_height() {
        return sky_ceiling_height;
    }

    public void setSky_ceiling_height(String sky_ceiling_height) {
        this.sky_ceiling_height = sky_ceiling_height;
    }

    public String getSky_ceiling_height_quality_code() {
        return sky_ceiling_height_quality_code;
    }

    public void setSky_ceiling_height_quality_code(String sky_ceiling_height_quality_code) {
        this.sky_ceiling_height_quality_code = sky_ceiling_height_quality_code;
    }

    public String getVisibility_distance() {
        return visibility_distance;
    }

    public void setVisibility_distance(String visibility_distance) {
        this.visibility_distance = visibility_distance;
    }

    public String getVisibility_distance_quality_code() {
        return visibility_distance_quality_code;
    }

    public void setVisibility_distance_quality_code(String visibility_distance_quality_code) {
        this.visibility_distance_quality_code = visibility_distance_quality_code;
    }

    public String getAir_temperature() {
        return air_temperature;
    }

    public void setAir_temperature(String air_temperature) {
        this.air_temperature = air_temperature;
    }

    public String getAir_temperature_quality_code() {
        return air_temperature_quality_code;
    }

    public void setAir_temperature_quality_code(String air_temperature_quality_code) {
        this.air_temperature_quality_code = air_temperature_quality_code;
    }

    public String getDew_point_temperature() {
        return dew_point_temperature;
    }

    public void setDew_point_temperature(String dew_point_temperature) {
        this.dew_point_temperature = dew_point_temperature;
    }

    public String getDew_point_temperature_quality_code() {
        return dew_point_temperature_quality_code;
    }

    public void setDew_point_temperature_quality_code(String dew_point_temperature_quality_code) {
        this.dew_point_temperature_quality_code = dew_point_temperature_quality_code;
    }

    public String getAtmospheric_pressure() {
        return atmospheric_pressure;
    }

    public void setAtmospheric_pressure(String atmospheric_pressure) {
        this.atmospheric_pressure = atmospheric_pressure;
    }

    public String getAtmospheric_pressure_quality_code() {
        return atmospheric_pressure_quality_code;
    }

    public void setAtmospheric_pressure_quality_code(String atmospheric_pressure_quality_code) {
        this.atmospheric_pressure_quality_code = atmospheric_pressure_quality_code;
    }

    @Override
    public String toString() {
        return "NcdcWeather{" +
                "USAF_station_id='" + USAF_station_id + '\'' +
                ", WBAN_station_id='" + WBAN_station_id + '\'' +
                ", date='" + date + '\'' +
                ", time='" + time + '\'' +
                ", latitude='" + latitude + '\'' +
                ", longitude='" + longitude + '\'' +
                ", elevation='" + elevation + '\'' +
                ", wind_direction='" + wind_direction + '\'' +
                ", wind_direction_quality_code='" + wind_direction_quality_code + '\'' +
                ", sky_ceiling_height='" + sky_ceiling_height + '\'' +
                ", sky_ceiling_height_quality_code='" + sky_ceiling_height_quality_code + '\'' +
                ", visibility_distance='" + visibility_distance + '\'' +
                ", visibility_distance_quality_code='" + visibility_distance_quality_code + '\'' +
                ", air_temperature='" + air_temperature + '\'' +
                ", air_temperature_quality_code='" + air_temperature_quality_code + '\'' +
                ", dew_point_temperature='" + dew_point_temperature + '\'' +
                ", dew_point_temperature_quality_code='" + dew_point_temperature_quality_code + '\'' +
                ", atmospheric_pressure='" + atmospheric_pressure + '\'' +
                ", atmospheric_pressure_quality_code='" + atmospheric_pressure_quality_code + '\'' +
                '}';
    }
}
Weather Bean
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MaxTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final int MISS_CODE = 9999;

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        NcdcWeather ncdcWeather = new NcdcWeather(line);
        // Rows shorter than 105 characters leave the bean's fields null; skip them
        // instead of letting getDate() throw a NullPointerException.
        if (ncdcWeather.getDate() == null || ncdcWeather.getAir_temperature() == null) {
            return;
        }

        String year = ncdcWeather.getDate().substring(0, 4);
        int temperature;
        if (ncdcWeather.getAir_temperature().startsWith("+")) {
            temperature = Integer.parseInt(ncdcWeather.getAir_temperature().substring(1));
        } else {
            temperature = Integer.parseInt(ncdcWeather.getAir_temperature());
        }

        if (temperature != MISS_CODE && ncdcWeather.getAir_temperature_quality_code().matches("[01459]")) {
            context.write(new Text(year), new IntWritable(temperature));
        }
    }
}
Mapper
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MaxTemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Emit the largest temperature seen for this year
        int max = Integer.MIN_VALUE;
        for (IntWritable temp : values) {
            max = Math.max(max, temp.get());
        }

        context.write(key, new IntWritable(max));
    }
}
Reducer
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MaxTemperature {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length != 2) {
            System.err.println("Usage: MaxTemperature <input path> <output path>");
            System.exit(-1);
        }

        Job job = Job.getInstance();
        job.setJarByClass(MaxTemperature.class);
        job.setJobName("Max Temperature");

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MaxTemperatureMapper.class);
        job.setReducerClass(MaxTemperatureReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Data Preparation

Weather data for two years, 1901 and 1902, was loaded into HDFS ahead of time.
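For reference, loading the files can be done with ordinary HDFS shell commands along these lines (the /ncdc target directory matches the job invocation below; the local file names are assumptions):

hdfs dfs -mkdir -p /ncdc
hdfs dfs -put ~/1901 ~/1902 /ncdc/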

Verification Run

Package the code above into a jar and push it to our Hadoop VM. To guard against program bugs, and to save execution time, we first verify the code on a small cluster with a small dataset. For that I prepared a pseudo-distributed cluster ahead of time (pseudo-distributed meaning a fully distributed setup with only one node). Note that FileOutputFormat requires the output directory not to exist yet, or the job fails immediately. Now run the program in that environment:

yarn jar ~/max-temperature-1.0-SNAPSHOT-jar-with-dependencies.jar /ncdc/ /max_temperature_out/

Log output from a successful run:

2019-09-10 16:19:00,367 INFO client.RMProxy: Connecting to ResourceManager at localhost/127.0.0.1:8032
2019-09-10 16:19:03,364 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
2019-09-10 16:19:03,463 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoopuser/.staging/job_1568096329203_0001
2019-09-10 16:19:05,748 INFO input.FileInputFormat: Total input files to process : 2
2019-09-10 16:19:07,012 INFO mapreduce.JobSubmitter: number of splits:2
2019-09-10 16:19:07,677 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
2019-09-10 16:19:08,909 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1568096329203_0001
2019-09-10 16:19:08,912 INFO mapreduce.JobSubmitter: Executing with tokens: []
2019-09-10 16:19:09,911 INFO conf.Configuration: resource-types.xml not found
2019-09-10 16:19:09,915 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2019-09-10 16:19:11,331 INFO impl.YarnClientImpl: Submitted application application_1568096329203_0001
2019-09-10 16:19:11,660 INFO mapreduce.Job: The url to track the job: http://slave1:8088/proxy/application_1568096329203_0001/
2019-09-10 16:19:11,661 INFO mapreduce.Job: Running job: job_1568096329203_0001
2019-09-10 16:20:09,966 INFO mapreduce.Job: Job job_1568096329203_0001 running in uber mode : false
2019-09-10 16:20:10,014 INFO mapreduce.Job:  map 0% reduce 0%
2019-09-10 16:21:22,515 INFO mapreduce.Job:  map 100% reduce 0%
2019-09-10 16:21:52,053 INFO mapreduce.Job:  map 100% reduce 100%
2019-09-10 16:21:54,192 INFO mapreduce.Job: Job job_1568096329203_0001 completed successfully
2019-09-10 16:21:55,534 INFO mapreduce.Job: Counters: 54
        File System Counters
                FILE: Number of bytes read=132380
                FILE: Number of bytes written=928635
                FILE: Number of read operations=0
                FILE: Number of large read operations=0
                FILE: Number of write operations=0
                HDFS: Number of bytes read=1628892
                HDFS: Number of bytes written=18
                HDFS: Number of read operations=11
                HDFS: Number of large read operations=0
                HDFS: Number of write operations=2
                HDFS: Number of bytes read erasure-coded=0
        Job Counters 
                Launched map tasks=2
                Launched reduce tasks=1
                Data-local map tasks=2
                Total time spent by all maps in occupied slots (ms)=134255
                Total time spent by all reduces in occupied slots (ms)=20841
                Total time spent by all map tasks (ms)=134255
                Total time spent by all reduce tasks (ms)=20841
                Total vcore-milliseconds taken by all map tasks=134255
                Total vcore-milliseconds taken by all reduce tasks=20841
                Total megabyte-milliseconds taken by all map tasks=137477120
                Total megabyte-milliseconds taken by all reduce tasks=21341184
        Map-Reduce Framework
                Map input records=12035
                Map output records=12034
                Map output bytes=108306
                Map output materialized bytes=132386
                Input split bytes=200
                Combine input records=0
                Combine output records=0
                Reduce input groups=2
                Reduce shuffle bytes=132386
                Reduce input records=12034
                Reduce output records=2
                Spilled Records=24068
                Shuffled Maps =2
                Failed Shuffles=0
                Merged Map outputs=2
                GC time elapsed (ms)=1606
                CPU time spent (ms)=7900
                Physical memory (bytes) snapshot=470282240
                Virtual memory (bytes) snapshot=8201203712
                Total committed heap usage (bytes)=261046272
                Peak Map Physical memory (bytes)=184123392
                Peak Map Virtual memory (bytes)=2731495424
                Peak Reduce Physical memory (bytes)=102227968
                Peak Reduce Virtual memory (bytes)=2738212864
        Shuffle Errors
                BAD_ID=0
                CONNECTION=0
                IO_ERROR=0
                WRONG_LENGTH=0
                WRONG_MAP=0
                WRONG_REDUCE=0
        File Input Format Counters 
                Bytes Read=1628692
        File Output Format Counters 
                Bytes Written=18

Now let's interpret the log line by line:

2019-09-10 16:19:00,367 INFO client.RMProxy: Connecting to ResourceManager at localhost/127.0.0.1:8032
# emm, first it has to find the ResourceManager

2019-09-10 16:19:03,364 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
# oh-oh, the way the job is launched is not the recommended one; the advice is to implement the Tool interface and run the application through ToolRunner
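As a hedged sketch of what that recommendation looks like (standard Tool/ToolRunner usage; the class name MaxTemperatureDriver is my own, not part of the original program):

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MaxTemperatureDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: MaxTemperatureDriver <input path> <output path>");
            return -1;
        }
        // getConf() carries any -D options that ToolRunner parsed off the command line
        Job job = Job.getInstance(getConf(), "Max Temperature");
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(MaxTemperatureMapper.class);
        job.setReducerClass(MaxTemperatureReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner strips generic options (-D, -files, ...) before calling run()
        System.exit(ToolRunner.run(new MaxTemperatureDriver(), args));
    }
}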

2019-09-10 16:19:03,463 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoopuser/.staging/job_1568096329203_0001
2019-09-10 16:19:05,748 INFO input.FileInputFormat: Total input files to process : 2
# 2 input files to process

2019-09-10 16:19:07,012 INFO mapreduce.JobSubmitter: number of splits:2
# 2 splits (strikingly, exactly the same as the number of input blocks~)

2019-09-10 16:19:07,677 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
2019-09-10 16:19:08,909 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1568096329203_0001
# submitting job tokens; we finally see the job naming scheme, roughly job_<cluster timestamp>_<4-digit incrementing sequence>

2019-09-10 16:19:08,912 INFO mapreduce.JobSubmitter: Executing with tokens: []
2019-09-10 16:19:09,911 INFO conf.Configuration: resource-types.xml not found
2019-09-10 16:19:09,915 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2019-09-10 16:19:11,331 INFO impl.YarnClientImpl: Submitted application application_1568096329203_0001
# looks like the application has been submitted to YARN

2019-09-10 16:19:11,660 INFO mapreduce.Job: The url to track the job: http://slave1:8088/proxy/application_1568096329203_0001/
# em, a URL is provided for tracking the job's progress

2019-09-10 16:19:11,661 INFO mapreduce.Job: Running job: job_1568096329203_0001
2019-09-10 16:20:09,966 INFO mapreduce.Job: Job job_1568096329203_0001 running in uber mode : false
# the job does not meet the criteria for an uber job, so it will run in non-uber mode
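For context, an uber job runs all of its tasks inside the MapReduce ApplicationMaster's own JVM, which saves container startup cost for tiny jobs. Whether a job qualifies is governed by the mapreduce.job.ubertask.* properties; a minimal sketch of opting in from the driver (the thresholds shown are the stock defaults, included purely for illustration):

// Inside the driver, before submitting the job: opt in to uber mode.
// The job still has to be small enough to qualify.
Configuration conf = job.getConfiguration();
conf.setBoolean("mapreduce.job.ubertask.enable", true); // default: false
conf.setInt("mapreduce.job.ubertask.maxmaps", 9);       // default: 9
conf.setInt("mapreduce.job.ubertask.maxreduces", 1);    // default: 1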

2019-09-10 16:20:10,014 INFO mapreduce.Job:  map 0% reduce 0%
2019-09-10 16:21:22,515 INFO mapreduce.Job:  map 100% reduce 0%
2019-09-10 16:21:52,053 INFO mapreduce.Job:  map 100% reduce 100%
2019-09-10 16:21:54,192 INFO mapreduce.Job: Job job_1568096329203_0001 completed successfully
# em, the job completed successfully

2019-09-10 16:21:55,534 INFO mapreduce.Job: Counters: 54
# followed by the job's counter data, already listed in full above: File System Counters, Job Counters, Map-Reduce Framework counters, Shuffle Errors, and the File Input/Output Format Counters
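One detail worth noticing in those counters: Combine input records=0, so no combiner ran and all 12,034 map output records crossed the shuffle to the single reducer. Because taking a maximum is commutative and associative, the existing reducer class could double as a combiner; a one-line sketch of the driver change (not part of the original program):

job.setCombinerClass(MaxTemperatureReducer.class); // pre-aggregate map output locally before the shuffle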

Now let's look at what actually landed in the output directory we specified. It contains two files: an empty file named _SUCCESS, which marks successful completion of the job, and part-r-00000, which holds the reduce task's output. Checking the contents of part-r-00000, the maximum temperatures for 1901 and 1902 come out to 317 and 244 respectively (NCDC records air temperature in tenths of a degree Celsius, so 31.7 °C and 24.4 °C).
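For the record, the output can be inspected without the web UI using standard HDFS shell commands (a sketch; the path matches the job invocation above):

hdfs dfs -ls /max_temperature_out
hdfs dfs -cat /max_temperature_out/part-r-00000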
We could also compute the maximum temperatures by non-MapReduce means to verify that this MapReduce job gets them right, but since that is not our focus and the program logic is simple, we will take the results on trust for now.
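If we ever did want that sanity check, here is a minimal local sketch that reuses the NcdcWeather parsing; it assumes the raw files have been copied out of HDFS into a hypothetical local directory ./ncdc:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Stream;

public class LocalMaxTemperature {
    public static void main(String[] args) throws IOException {
        Map<String, Integer> maxByYear = new HashMap<>();
        // ./ncdc is a hypothetical local copy of the HDFS input directory
        try (Stream<Path> files = Files.list(Paths.get("./ncdc"))) {
            for (Path file : (Iterable<Path>) files::iterator) {
                for (String line : Files.readAllLines(file)) {
                    NcdcWeather w = new NcdcWeather(line);
                    if (w.getDate() == null || w.getAir_temperature() == null) {
                        continue; // short or blank row
                    }
                    int t = Integer.parseInt(w.getAir_temperature().replace("+", ""));
                    // Same filter as the mapper: drop missing readings and bad quality codes
                    if (t != 9999 && w.getAir_temperature_quality_code().matches("[01459]")) {
                        maxByYear.merge(w.getDate().substring(0, 4), t, Math::max);
                    }
                }
            }
        }
        maxByYear.forEach((year, max) -> System.out.println(year + "\t" + max));
    }
}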

Hypotheses

Based on the input/output and the log printed during execution, we make the following hypotheses:
Hypothesis 1: the number of splits equals the number of input blocks.
Hypothesis 2: the Bytes Read figure under File Input Format Counters equals the total bytes read from the input files.
Hypothesis 3: the Bytes Written figure under File Output Format Counters equals the total bytes written to the output directory.

Testing the Hypotheses

Run the code above in the fully distributed environment to test these hypotheses. The input prepared in that environment: data files for 44 years, 3,137,231,401 bytes in total, stored as 61 HDFS blocks.

Now submit the job:

yarn jar ~/max-temperature-1.0-SNAPSHOT-jar-with-dependencies.jar /ncdc/raw/ /max_temperature_out/

Submitting and running the job was not all smooth sailing; I hit quite a few problems along the way, the most typical being java.net.NoRouteToHostException. The process of resolving that error is recorded as Problem 1 in "Hadoop Learning Problem Log: Basics".

After resolving those problems, the log from a successful run is as follows:

[hadoop_user@master hadoop-3.2.0]$ yarn jar ~/max-temperature-1.0-SNAPSHOT-jar-with-dependencies.jar /ncdc/raw/ /max_out
2019-09-16 11:48:03,916 INFO client.RMProxy: Connecting to ResourceManager at master/192.168.212.132:8032
2019-09-16 11:48:05,846 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
2019-09-16 11:48:05,937 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop_user/.staging/job_1568605271327_0002
2019-09-16 11:48:07,075 INFO input.FileInputFormat: Total input files to process : 44
2019-09-16 11:48:08,397 INFO mapreduce.JobSubmitter: number of splits:58
2019-09-16 11:48:08,638 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
2019-09-16 11:48:09,646 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1568605271327_0002
2019-09-16 11:48:09,648 INFO mapreduce.JobSubmitter: Executing with tokens: []
2019-09-16 11:48:10,350 INFO conf.Configuration: resource-types.xml not found
2019-09-16 11:48:10,351 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2019-09-16 11:48:11,402 INFO impl.YarnClientImpl: Submitted application application_1568605271327_0002
2019-09-16 11:48:11,595 INFO mapreduce.Job: The url to track the job: http://master:8088/proxy/application_1568605271327_0002/
2019-09-16 11:48:11,596 INFO mapreduce.Job: Running job: job_1568605271327_0002
2019-09-16 11:48:44,279 INFO mapreduce.Job: Job job_1568605271327_0002 running in uber mode : false
2019-09-16 11:48:44,300 INFO mapreduce.Job:  map 0% reduce 0%
2019-09-16 11:55:25,431 INFO mapreduce.Job:  map 1% reduce 0%
2019-09-16 11:55:36,166 INFO mapreduce.Job:  map 2% reduce 0%
2019-09-16 11:55:37,287 INFO mapreduce.Job:  map 3% reduce 0%
2019-09-16 11:55:38,501 INFO mapreduce.Job:  map 4% reduce 0%
2019-09-16 11:55:49,480 INFO mapreduce.Job:  map 5% reduce 0%
2019-09-16 11:55:55,289 INFO mapreduce.Job:  map 6% reduce 0%
2019-09-16 11:56:00,062 INFO mapreduce.Job:  map 7% reduce 0%
2019-09-16 11:56:11,488 INFO mapreduce.Job:  map 8% reduce 0%
2019-09-16 11:56:25,560 INFO mapreduce.Job:  map 9% reduce 0%
2019-09-16 11:56:31,260 INFO mapreduce.Job:  map 10% reduce 0%
2019-09-16 11:56:48,742 INFO mapreduce.Job:  map 11% reduce 0%
2019-09-16 11:56:51,329 INFO mapreduce.Job:  map 12% reduce 0%
2019-09-16 11:57:07,586 INFO mapreduce.Job:  map 13% reduce 0%
2019-09-16 11:57:12,254 INFO mapreduce.Job:  map 14% reduce 0%
2019-09-16 11:57:19,353 INFO mapreduce.Job:  map 15% reduce 0%
2019-09-16 11:57:29,968 INFO mapreduce.Job:  map 16% reduce 0%
2019-09-16 11:57:41,148 INFO mapreduce.Job:  map 17% reduce 0%
2019-09-16 11:57:50,065 INFO mapreduce.Job:  map 18% reduce 0%
2019-09-16 11:58:01,423 INFO mapreduce.Job:  map 19% reduce 0%
2019-09-16 11:58:11,850 INFO mapreduce.Job:  map 20% reduce 0%
2019-09-16 11:58:24,556 INFO mapreduce.Job:  map 21% reduce 0%
2019-09-16 11:58:34,826 INFO mapreduce.Job:  map 22% reduce 0%
2019-09-16 11:58:49,540 INFO mapreduce.Job:  map 23% reduce 0%
2019-09-16 11:58:59,619 INFO mapreduce.Job:  map 24% reduce 0%
2019-09-16 11:59:14,612 INFO mapreduce.Job:  map 25% reduce 0%
2019-09-16 11:59:28,346 INFO mapreduce.Job:  map 26% reduce 0%
2019-09-16 11:59:38,732 INFO mapreduce.Job:  map 27% reduce 0%
2019-09-16 11:59:49,123 INFO mapreduce.Job:  map 28% reduce 0%
2019-09-16 12:00:47,007 INFO mapreduce.Job:  map 30% reduce 0%
2019-09-16 12:00:52,384 INFO mapreduce.Job:  map 33% reduce 0%
2019-09-16 12:00:54,900 INFO mapreduce.Job:  map 34% reduce 0%
2019-09-16 12:01:09,491 INFO mapreduce.Job:  map 35% reduce 0%
2019-09-16 12:01:20,103 INFO mapreduce.Job:  map 36% reduce 0%
2019-09-16 12:01:24,594 INFO mapreduce.Job:  map 38% reduce 0%
2019-09-16 12:01:29,124 INFO mapreduce.Job:  map 41% reduce 0%
2019-09-16 12:01:51,511 INFO mapreduce.Job:  map 42% reduce 0%
2019-09-16 12:02:01,443 INFO mapreduce.Job:  map 43% reduce 0%
2019-09-16 12:02:10,151 INFO mapreduce.Job:  map 43% reduce 5%
2019-09-16 12:02:51,950 INFO mapreduce.Job:  map 44% reduce 5%
2019-09-16 12:02:56,771 INFO mapreduce.Job:  map 44% reduce 6%
2019-09-16 12:03:05,455 INFO mapreduce.Job:  map 45% reduce 6%
2019-09-16 12:03:29,194 INFO mapreduce.Job:  map 45% reduce 7%
2019-09-16 12:04:10,366 INFO mapreduce.Job:  map 47% reduce 7%
2019-09-16 12:04:12,997 INFO mapreduce.Job:  map 47% reduce 8%
2019-09-16 12:04:21,667 INFO mapreduce.Job:  map 47% reduce 9%
2019-09-16 12:05:11,082 INFO mapreduce.Job:  map 47% reduce 10%
2019-09-16 12:06:12,053 INFO mapreduce.Job:  map 48% reduce 10%
2019-09-16 12:06:32,845 INFO mapreduce.Job:  map 50% reduce 10%
2019-09-16 12:07:04,607 INFO mapreduce.Job:  map 50% reduce 11%
2019-09-16 12:07:13,138 INFO mapreduce.Job:  map 52% reduce 11%
2019-09-16 12:07:14,418 INFO mapreduce.Job:  map 56% reduce 11%
2019-09-16 12:07:16,825 INFO mapreduce.Job:  map 58% reduce 11%
2019-09-16 12:07:22,626 INFO mapreduce.Job:  map 59% reduce 11%
2019-09-16 12:07:26,273 INFO mapreduce.Job:  map 60% reduce 11%
2019-09-16 12:07:27,522 INFO mapreduce.Job:  map 62% reduce 11%
2019-09-16 12:07:29,943 INFO mapreduce.Job:  map 63% reduce 11%
2019-09-16 12:07:32,402 INFO mapreduce.Job:  map 64% reduce 11%
2019-09-16 12:07:36,126 INFO mapreduce.Job:  map 65% reduce 11%
2019-09-16 12:07:37,321 INFO mapreduce.Job:  map 66% reduce 11%
2019-09-16 12:07:41,297 INFO mapreduce.Job:  map 67% reduce 11%
2019-09-16 12:07:49,753 INFO mapreduce.Job:  map 67% reduce 12%
2019-09-16 12:07:55,465 INFO mapreduce.Job:  map 68% reduce 12%
2019-09-16 12:07:56,617 INFO mapreduce.Job:  map 69% reduce 12%
2019-09-16 12:08:03,315 INFO mapreduce.Job:  map 70% reduce 12%
2019-09-16 12:08:22,525 INFO mapreduce.Job:  map 70% reduce 13%
2019-09-16 12:08:39,289 INFO mapreduce.Job:  map 71% reduce 13%
2019-09-16 12:08:52,345 INFO mapreduce.Job:  map 72% reduce 13%
2019-09-16 12:08:55,031 INFO mapreduce.Job:  map 73% reduce 13%
2019-09-16 12:09:08,315 INFO mapreduce.Job:  map 74% reduce 13%
2019-09-16 12:09:15,337 INFO mapreduce.Job:  map 76% reduce 13%
2019-09-16 12:09:43,442 INFO mapreduce.Job:  map 77% reduce 14%
2019-09-16 12:09:52,962 INFO mapreduce.Job:  map 78% reduce 14%
2019-09-16 12:09:58,723 INFO mapreduce.Job:  map 79% reduce 14%
2019-09-16 12:10:07,494 INFO mapreduce.Job:  map 80% reduce 14%
2019-09-16 12:10:12,875 INFO mapreduce.Job:  map 80% reduce 16%
2019-09-16 12:10:20,189 INFO mapreduce.Job:  map 81% reduce 16%
2019-09-16 12:10:22,588 INFO mapreduce.Job:  map 81% reduce 17%
2019-09-16 12:10:27,180 INFO mapreduce.Job:  map 82% reduce 17%
2019-09-16 12:10:57,845 INFO mapreduce.Job:  map 83% reduce 17%
2019-09-16 12:11:06,005 INFO mapreduce.Job:  map 83% reduce 18%
2019-09-16 12:11:10,925 INFO mapreduce.Job:  map 84% reduce 18%
2019-09-16 12:11:16,869 INFO mapreduce.Job:  map 86% reduce 18%
2019-09-16 12:11:20,148 INFO mapreduce.Job:  map 86% reduce 22%
2019-09-16 12:11:29,748 INFO mapreduce.Job:  map 86% reduce 24%
2019-09-16 12:12:08,999 INFO mapreduce.Job:  map 86% reduce 26%
2019-09-16 12:12:34,681 INFO mapreduce.Job:  map 88% reduce 26%
2019-09-16 12:12:35,741 INFO mapreduce.Job:  map 89% reduce 26%
2019-09-16 12:12:37,876 INFO mapreduce.Job:  map 91% reduce 26%
2019-09-16 12:12:41,116 INFO mapreduce.Job:  map 92% reduce 27%
2019-09-16 12:12:44,267 INFO mapreduce.Job:  map 93% reduce 27%
2019-09-16 12:12:45,337 INFO mapreduce.Job:  map 94% reduce 27%
2019-09-16 12:12:46,373 INFO mapreduce.Job:  map 96% reduce 27%
2019-09-16 12:12:47,431 INFO mapreduce.Job:  map 96% reduce 28%
2019-09-16 12:12:56,285 INFO mapreduce.Job:  map 97% reduce 28%
2019-09-16 12:12:59,417 INFO mapreduce.Job:  map 97% reduce 30%
2019-09-16 12:13:18,013 INFO mapreduce.Job:  map 97% reduce 31%
2019-09-16 12:13:25,418 INFO mapreduce.Job:  map 98% reduce 31%
2019-09-16 12:13:27,577 INFO mapreduce.Job:  map 100% reduce 31%
2019-09-16 12:13:45,273 INFO mapreduce.Job:  map 100% reduce 39%
2019-09-16 12:13:51,482 INFO mapreduce.Job:  map 100% reduce 55%
2019-09-16 12:13:57,756 INFO mapreduce.Job:  map 100% reduce 71%
2019-09-16 12:14:03,989 INFO mapreduce.Job:  map 100% reduce 91%
2019-09-16 12:14:08,345 INFO mapreduce.Job:  map 100% reduce 100%
2019-09-16 12:14:12,633 INFO mapreduce.Job: Job job_1568605271327_0002 completed successfully
2019-09-16 12:14:13,176 INFO mapreduce.Job: Counters: 56
        File System Counters
                FILE: Number of bytes read=217765235
                FILE: Number of bytes written=448594463
                FILE: Number of read operations=0
                FILE: Number of large read operations=0
                FILE: Number of write operations=0
                HDFS: Number of bytes read=3137294603
                HDFS: Number of bytes written=396
                HDFS: Number of read operations=179
                HDFS: Number of large read operations=0
                HDFS: Number of write operations=2
                HDFS: Number of bytes read erasure-coded=0
        Job Counters 
                Killed map tasks=14
                Launched map tasks=72
                Launched reduce tasks=1
                Data-local map tasks=70
                Rack-local map tasks=2
                Total time spent by all maps in occupied slots (ms)=28632455
                Total time spent by all reduces in occupied slots (ms)=1004700
                Total time spent by all map tasks (ms)=28632455
                Total time spent by all reduce tasks (ms)=1004700
                Total vcore-milliseconds taken by all map tasks=28632455
                Total vcore-milliseconds taken by all reduce tasks=1004700
                Total megabyte-milliseconds taken by all map tasks=29319633920
                Total megabyte-milliseconds taken by all reduce tasks=1028812800
        Map-Reduce Framework
                Map input records=20796941
                Map output records=19796839
                Map output bytes=178171551
                Map output materialized bytes=217765577
                Input split bytes=5858
                Combine input records=0
                Combine output records=0
                Reduce input groups=44
                Reduce shuffle bytes=217765577
                Reduce input records=19796839
                Reduce output records=44
                Spilled Records=39593678
                Shuffled Maps =58
                Failed Shuffles=0
                Merged Map outputs=58
                GC time elapsed (ms)=1507499
                CPU time spent (ms)=1468390
                Physical memory (bytes) snapshot=6290808832
                Virtual memory (bytes) snapshot=161167908864
                Total committed heap usage (bytes)=7517712384
                Peak Map Physical memory (bytes)=200282112
                Peak Map Virtual memory (bytes)=2736029696
                Peak Reduce Physical memory (bytes)=419164160
                Peak Reduce Virtual memory (bytes)=2737905664
        Shuffle Errors
                BAD_ID=0
                CONNECTION=0
                IO_ERROR=0
                WRONG_LENGTH=0
                WRONG_MAP=0
                WRONG_REDUCE=0
        File Input Format Counters 
                Bytes Read=3137288745
        File Output Format Counters 
                Bytes Written=396

From the log we can conclude:
Hypothesis 1 is wrong: the number of splits is 58, which does not equal the 61 input blocks.
Hypothesis 2 is wrong: File Input Format Counters does not always equal the actual input file size; here the job counted 3,137,288,745 bytes read, slightly more than the 3,137,231,401 bytes reported by fsck. (A plausible cause: a map task whose split ends mid-record reads past the split boundary into the next block to finish its last line, so some bytes are read more than once.)
Hypothesis 3 is correct: File Output Format Counters equals the total bytes of the part-r-* files actually written.
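The split shortfall (58 splits from 61 blocks) also has a mechanical explanation in how FileInputFormat carves up each file. A paraphrased sketch of its per-file loop (the 1.1 SPLIT_SLOP factor really is in the Hadoop source; the helper addSplit() and the variable names here are mine):

// splitSize is normally the block size, clamped by the configured min/max
long splitSize = Math.max(minSize, Math.min(maxSize, blockSize));
long bytesRemaining = fileLength;
while (((double) bytesRemaining) / splitSize > 1.1) { // SPLIT_SLOP
    addSplit(fileLength - bytesRemaining, splitSize);
    bytesRemaining -= splitSize;
}
if (bytesRemaining != 0) {
    // The tail (anything up to 1.1 * splitSize) becomes one final split, so a
    // file spanning, say, 2.05 blocks yields 2 splits rather than 3. Across
    // 44 files, that is how 61 blocks can collapse into 58 splits.
    addSplit(fileLength - bytesRemaining, bytesRemaining);
}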

Conclusions

  1. The number of splits does not necessarily equal the number of input blocks.
  2. File Input Format Counters does not always equal the total input size reported by fsck.
  3. File Output Format Counters equals the total size of all part-r-* files actually written.

P.S. 1: the conclusions above were drawn only for text input; whether other types of InputFormat break them is left as an open question for now.

P.S. 2: how the split count is determined deserves a write-up of its own and will not be covered here. See here.

posted @ 2019-09-19 17:44 行走的段子