MapReduce WordCount in Practice

I. Prerequisites

1. Create a Maven project

2. Add the dependencies

<dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.8.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.7</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.7</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.7</version>
        </dependency>
</dependencies>
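
Note: the Hadoop artifact versions (hadoop-common, hadoop-client, hadoop-hdfs) should match the Hadoop version running on your cluster; 2.7.7 is used here.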

3. Create log4j.properties under src/main/resources

log4j.rootLogger=INFO, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
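
As written, only the stdout appender is active; the logfile appender is defined but only takes effect if it is also added to log4j.rootLogger (e.g. INFO, stdout, logfile).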

II. Mapper

1. Conventions

a. Extend Mapper
b. Override the map() method; this is where the business logic goes
c. The Mapper's input is a (k, v) key-value pair
d. The Mapper's output is a (k, v) key-value pair
e. map() is called once for each input key-value pair

2. Create the class

package com.wt;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;


public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    Text k = new Text();
    IntWritable v = new IntWritable(1);
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Get the current line of text
        String line = value.toString();
        // 2. Split the line into words
        String[] words = line.split("\\s+");
        // 3. Emit (word, 1) for each word
        for (String word : words) {
            /*
             * Text k and IntWritable v are created once as fields rather than
             * inside map(): map() runs once per input key-value pair, so
             * reusing the objects avoids allocating new ones on every call.
             */
            k.set(word);
            context.write(k, v);
        }
    }
}
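
For example, with the default TextInputFormat the input key is the byte offset of the line in the file and the value is the line itself; for the line "hello world hello", this Mapper emits (hello, 1), (world, 1), (hello, 1).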

III. Reducer

1. Conventions

a. Extend Reducer
b. Override the reduce() method; this is where the business logic goes
c. The Reducer's input key-value types match the Mapper's output types
d. reduce() is called once for each group of values sharing the same key

2. Create the class

package com.wt;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    IntWritable v = new IntWritable(); // reused across calls to avoid allocating a new object each time
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // 1. Sum the counts for this key
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // 2. Emit (word, total)
        v.set(sum);
        context.write(key, v);
    }
}
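
Continuing the example above, the framework sorts and groups the map output by key before the reduce phase, so reduce() is called with ("hello", [1, 1]) and ("world", [1]) and writes out (hello, 2) and (world, 1).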

IV. Driver (mostly boilerplate)

1. Conventions

1) Get the configuration and create the Job
2) Set the jar load path
3) Set the Mapper and Reducer classes
4) Set the Mapper output key/value types
5) Set the final output key/value types
6) Set the input and output paths
7) Submit the job

2. Create the class

package com.wt;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Get the configuration and create the Job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // 2. Set the jar load path
        job.setJarByClass(WordCountDriver.class);
        // 3. Set the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // 4. Set the Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7. Submit the job and wait for completion
        boolean wait = job.waitForCompletion(true);
        System.exit(wait ? 0 : 1);
    }
}
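
For local testing, the Driver can also be run directly from the IDE: with a default Configuration (no cluster configuration files on the classpath) Hadoop falls back to the local job runner, so it is enough to pass a local input directory and a not-yet-existing output directory as the two program arguments.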

V. Running on a Hadoop Cluster

1. Export the jar
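
With the Maven project above, one way to build it is:

mvn clean package

This writes target/<artifactId>-<version>.jar; the command in step 4 assumes the jar was renamed to wc.jar.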

2. Upload the jar to a node in the cluster (server)

3. Create /usr/input on the cluster and upload a text file to it
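
A minimal sketch using the HDFS shell (wordcount.txt is a placeholder for whatever input file you actually have):

hadoop fs -mkdir -p /usr/input
hadoop fs -put wordcount.txt /usr/input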

4. Run the job

hadoop jar wc.jar com.wt.WordCountDriver /usr/input /usr/output

com.wt.WordCountDriver is the fully qualified name of the class containing the main method.
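
Once the job finishes, the result can be inspected with the HDFS shell; with a single reducer (the default) the counts end up in a file named part-r-00000:

hadoop fs -cat /usr/output/part-r-00000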

