Implementing a simple WordCount word-frequency job with MapReduce

package com.startbigdata;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountTwo {

    /*
     * KEYIN:    data type of the key in the input key/value pairs
     * VALUEIN:  data type of the value in the input key/value pairs
     * KEYOUT:   data type of the key in the output key/value pairs
     * VALUEOUT: data type of the value in the output key/value pairs
     */
    static class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable outPutValue = new IntWritable(1); // every word is emitted with a count of 1
        private Text outPutKey = new Text(); // reusable output key

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // split the comma-delimited line into an array of words
            String[] strings = value.toString().split(",");

            // emit each word with a count of 1
            for (String str : strings) {
                this.outPutKey.set(str);               // wrap the word in the reusable Text object
                context.write(outPutKey, outPutValue); // emit the (word, 1) pair through the context
            }
        }
    }
    /**
     * Text        : Reducer input key   (matches the Mapper's output key)
     * IntWritable : Reducer input value (matches the Mapper's output value)
     * Text        : Reducer output key
     * IntWritable : Reducer output value
     */
    static class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0; // running total for this word
            for (IntWritable value : values) { // each value is a 1 emitted by the mapper
                sum += value.get();
            }
            context.write(key, new IntWritable(sum)); // emit the word's total count
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Instantiate Configuration to load the cluster configuration;
        // the Configuration is also what shares settings across the mapper and reducer tasks.
        Configuration conf = new Configuration();
        // Create a new job, named after this class
        Job job = Job.getInstance(conf, WordCountTwo.class.getName());
        // Set the driver class; required when running from a packaged jar so Hadoop can locate it
        job.setJarByClass(WordCountTwo.class);
        // Set the Mapper class
        job.setMapperClass(WordCountMap.class);
        // Set the map output key type
        job.setMapOutputKeyClass(Text.class);
        // Set the map output value type
        job.setMapOutputValueClass(IntWritable.class);
        // Set the Reducer class
        job.setReducerClass(WordCountReduce.class);
        // Set the reduce output key type
        job.setOutputKeyClass(Text.class);
        // Set the reduce output value type
        job.setOutputValueClass(IntWritable.class);
        // Set the input path
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // If the output path already exists, delete it (otherwise the job fails)
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        // Set the output path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job and wait for it to finish; waitForCompletion may only be
        // called once per job, so exit directly on its result
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
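One optional improvement, not in the original code but safe here because summing counts is associative and commutative: the same reducer class can double as a combiner, so partial counts are merged on the map side and less data is shuffled to the reducers. Add this line to the job setup:

    job.setCombinerClass(WordCountReduce.class);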
  

 

Package the project into a jar, and remember to set the main class.
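For example, assuming the compiled classes sit in a classes/ directory (a hypothetical layout; adjust to your build), a jar with the main class recorded in its manifest can be built like this:

jar cfe hadoopapi.jar com.startbigdata.WordCountTwo -C classes .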

Run it with:

hadoop jar /opt/modules/hadoop-2.7.3/jars/hadoopapi.jar com.startbigdata.WordCountTwo /tmp1/word.txt /tmp1/out4

 

Input file: (screenshot)

Output file: (screenshot)
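Since the mapper splits each line on commas, a hypothetical input file and the output it would produce look like this (contents assumed for illustration; output keys come out sorted, and the key and count are separated by a tab, the default for TextOutputFormat):

word.txt:
hadoop,spark,hadoop
hive,hadoop

/tmp1/out4/part-r-00000:
hadoop	3
hive	1
spark	1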

 

Has anyone else run into this error?

20/05/21 15:31:13 INFO client.RMProxy: Connecting to ResourceManager at bigdata-hpsk01.huadian.com/192.168.235.222:8032
20/05/21 15:31:14 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
20/05/21 15:31:15 INFO input.FileInputFormat: Total input paths to process : 1
20/05/21 15:31:15 INFO mapreduce.JobSubmitter: number of splits:1
20/05/21 15:31:15 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1590043374409_0002
20/05/21 15:31:15 INFO impl.YarnClientImpl: Submitted application application_1590043374409_0002
20/05/21 15:31:16 INFO mapreduce.Job: The url to track the job: http://bigdata-hpsk01.huadian.com:8088/proxy/application_1590043374409_0002/
20/05/21 15:31:16 INFO mapreduce.Job: Running job: job_1590043374409_0002
20/05/21 15:31:26 INFO mapreduce.Job: Job job_1590043374409_0002 running in uber mode : false
20/05/21 15:31:26 INFO mapreduce.Job: map 0% reduce 0%
20/05/21 15:31:33 INFO mapreduce.Job: map 100% reduce 0%
20/05/21 15:31:33 INFO mapreduce.Job: Task Id : attempt_1590043374409_0002_m_000000_0, Status : FAILED
Error: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1072)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:715)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
at org.apache.hadoop.mapreduce.Mapper.map(Mapper.java:125)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:146)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)

Container killed by the ApplicationMaster.
Container killed on request. Exit code is 143
Container exited with a non-zero exit code 143

20/05/21 15:31:34 INFO mapreduce.Job: map 0% reduce 0%
20/05/21 15:31:38 INFO mapreduce.Job: map 100% reduce 0%
20/05/21 15:31:38 INFO mapreduce.Job: Task Id : attempt_1590043374409_0002_m_000000_1, Status : FAILED
Error: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1072)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:715)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
at org.apache.hadoop.mapreduce.Mapper.map(Mapper.java:125)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:146)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)

Container killed by the ApplicationMaster.
Container killed on request. Exit code is 143
Container exited with a non-zero exit code 143

20/05/21 15:31:39 INFO mapreduce.Job: map 0% reduce 0%
20/05/21 15:31:44 INFO mapreduce.Job: Task Id : attempt_1590043374409_0002_m_000000_2, Status : FAILED
Error: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1072)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:715)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
at org.apache.hadoop.mapreduce.Mapper.map(Mapper.java:125)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:146)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)

20/05/21 15:31:54 INFO mapreduce.Job: map 100% reduce 100%
20/05/21 15:31:56 INFO mapreduce.Job: Job job_1590043374409_0002 failed with state FAILED due to: Task failed task_1590043374409_0002_m_000000
Job failed as tasks failed. failedMaps:1 failedReduces:0

20/05/21 15:31:56 INFO mapreduce.Job: Counters: 13
Job Counters
Failed map tasks=4
Killed reduce tasks=1
Launched map tasks=4
Other local map tasks=3
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=20028
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=20028
Total time spent by all reduce tasks (ms)=0
Total vcore-milliseconds taken by all map tasks=20028
Total vcore-milliseconds taken by all reduce tasks=0
Total megabyte-milliseconds taken by all map tasks=20508672
Total megabyte-milliseconds taken by all reduce tasks=0

 

In my case the error was caused by creating the Job the wrong way. I had written:

Job job = new Job(conf, WordCountTest.class.getName());
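This particular mismatch ("expected Text, received LongWritable") usually means the map stage ran Hadoop's default identity Mapper, which passes the LongWritable byte-offset keys straight through, instead of the custom map class. The working version above creates the job with the factory method rather than the deprecated constructor:

Job job = Job.getInstance(conf, WordCountTwo.class.getName());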

 

 

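Separately, the WARN near the top of the log ("Hadoop command-line option parsing not performed. Implement the Tool interface...") can be resolved the way the message suggests. A minimal sketch, assuming the same job setup as in WordCountTwo (WordCountDriver is a hypothetical name, not the author's code):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // getConf() returns the Configuration that ToolRunner has already
        // populated from generic options such as -D key=value
        Job job = Job.getInstance(getConf(), WordCountDriver.class.getName());
        job.setJarByClass(WordCountDriver.class);
        // ... same mapper/reducer/type/path setup as in WordCountTwo ...
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses the generic options, then calls run() with the remaining args
        System.exit(ToolRunner.run(new Configuration(), new WordCountDriver(), args));
    }
}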