MapReduce自定义Mapper实现(第一步)

package com.imooc.bigdata.hadoop.mapreduce.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/*
 * Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
 * KEYIN: Map任务读数据的key类型,是每行数据其实位置的偏移量,long类型
 * VALUEIN: Map任务读数据的value类型,是一行行的字符串,string类型
 *
 * hello world welcome
 * hello welcome
 *
 * KEYOUT: map方法自定义实现输出的key类型, string
 * VALUEOUT: map方法自定义实现输出的value类型, integer / long
 *
 * 词频统计:相同单词次数  (word, 1)
 *
 * Long,String,String,Integer是Java里面的数据类型
 * Hadoop是有自定义类型:序列化和反序列化:LongWritable, Text, Text, IntWritable
 *
 */

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {


    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // 把value对应的行数据按照指定的分隔符拆开
        String[] words = value.toString().split("\t");


        for (String word : words){
            // (hello, 1)  (world, 1)
            context.write(new Text(word), new IntWritable(1));
        }
    }
}

posted @ 2021-07-08 15:26  酱汁怪兽  阅读(154)  评论(0编辑  收藏  举报