Hadoop MapReduce Programming Conventions
A user-written MapReduce program is divided into three parts: the Mapper, the Reducer, and the Driver.
1. Mapper phase
- A user-defined Mapper must extend the framework's Mapper parent class.
- The Mapper's input data comes as key-value (KV) pairs (the KV types can be customized).
- The Mapper's business logic is written in the map() method.
- The Mapper's output data is also KV pairs (the KV types can be customized).
- The map() method (run by the MapTask process) is called once for every <K,V> pair.
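The WordCountMapper below follows these rules: each call receives a line's byte offset as the key and the line text as the value, splits the line on spaces, and writes a (word, 1) pair for every word.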
package cn.coreqi.mapreduce.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * KEYIN    - type of the key input to the map phase: the line's byte offset, LongWritable
 * VALUEIN  - type of the value input to the map phase: the line contents, Text
 * KEYOUT   - type of the key output by the map phase: Text
 * VALUEOUT - type of the value output by the map phase: IntWritable
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private Text outK = new Text();
    private IntWritable outV = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // Convert the line's contents to a String
        String line = value.toString();
        // Split the line into words
        String[] words = line.split(" ");
        // Write out one (word, 1) pair per word
        for (String word : words) {
            outK.set(word);
            context.write(outK, outV);
        }
    }
}
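For example, given the hypothetical input line "hadoop spark hadoop", this Mapper emits (hadoop,1), (spark,1), (hadoop,1).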
2. Reducer phase
- A user-defined Reducer must extend the framework's Reducer parent class.
- The Reducer's input data types correspond to the Mapper's output data types, and are also KV pairs.
- The Reducer's business logic is written in the reduce() method.
- The ReduceTask process calls the reduce() method once for each group of <K,V> pairs sharing the same key.
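The WordCountReducer below sums the 1s collected for each word and writes out (word, total count).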
package cn.coreqi.mapreduce.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * The map output is the reduce input.
 * KEYIN    - type of the key input to the reduce phase: the word, Text
 * VALUEIN  - type of the value input to the reduce phase: IntWritable
 * KEYOUT   - type of the key output by the reduce phase: Text
 * VALUEOUT - type of the value output by the reduce phase: IntWritable
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable outV = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        int sum = 0;
        // Accumulate the counts for this key
        for (IntWritable value : values) {
            sum += value.get();
        }
        outV.set(sum);
        // Write out (word, total count)
        context.write(key, outV);
    }
}
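Continuing the hypothetical example above, the shuffle phase groups the map output by key, so this Reducer receives (hadoop, [1, 1]) and (spark, [1]) and writes (hadoop, 2) and (spark, 1).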
3. Driver phase
The Driver is effectively the client of the YARN cluster: it submits the entire program to YARN as a Job object that encapsulates the MapReduce program's runtime parameters. Its steps are:
- Get the configuration information and a Job instance.
- Set the local path of this program's jar.
- Associate the Mapper/Reducer business classes.
- Set the KV types of the Mapper output.
- Set the KV types of the final output.
- Set the directory containing the job's input files.
- Set the directory for the job's output.
- Submit the job.
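The WordCountDriver below wires these steps together; step 6 shows both the local-mode variant (hard-coded paths, commented out) and the cluster-mode variant (paths read from command-line arguments).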
package cn.coreqi.mapreduce.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // 1. Get the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // 2. Set the jar path (the jar is located via reflection from the given class's location)
        job.setJarByClass(WordCountDriver.class);
        // 3. Associate the mapper and reducer
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // 4. Set the mapper output KV types (must be specified manually because of generic type erasure)
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Set the final (reducer) output KV types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths (local mode)
        // FileInputFormat.setInputPaths(job, new Path("D:\\intput\\inputword"));
        // FileOutputFormat.setOutputPath(job, new Path("D:\\hadoop\\output"));
        // 6. Set the input and output paths (cluster mode: read from the command-line arguments)
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7. Submit the job and wait for completion (true = print progress information)
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
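Whichever variant of step 6 is used, the output directory must not already exist; FileOutputFormat checks this at submission time and the job fails with a FileAlreadyExistsException if it does. To run locally from the IDE, uncomment the hard-coded paths and comment out the args-based lines, or pass the two paths as program arguments.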
4. Packaging
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.coreqi</groupId>
    <artifactId>HDFS_Client</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.3.6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/junit/junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-reload4j -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-reload4j</artifactId>
            <version>2.0.9</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.11.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.6.0</version>
                <configuration>
                    <descriptorRefs>
                        <!-- Bundle all dependencies into the generated jar -->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <!-- Bind to the package lifecycle phase -->
                        <phase>package</phase>
                        <goals>
                            <!-- Run the assembly plugin's "single" goal -->
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
Run Maven's package command.
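For example (assuming Maven is on the PATH):
mvn clean package
With the assembly configuration above, the target/ directory should contain both HDFS_Client-1.0-SNAPSHOT.jar and HDFS_Client-1.0-SNAPSHOT-jar-with-dependencies.jar; the thin jar is enough when the cluster already provides the Hadoop dependencies, as in the command below.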
5. Running
hadoop jar HDFS_Client-1.0-SNAPSHOT.jar cn.coreqi.mapreduce.wordcount.WordCountDriver /input /output
/input and /output are directories in HDFS.
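A minimal sketch of preparing the input and inspecting the result with the HDFS shell (wordcount.txt is a hypothetical local file of space-separated words):
hdfs dfs -mkdir -p /input
hdfs dfs -put wordcount.txt /input
hdfs dfs -cat /output/part-r-00000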
Author: 奇
Source: https://www.cnblogs.com/fanqisoft/p/17898487.html
License: the copyright of this post is shared by the author and 博客园 (cnblogs). Reposting is welcome, provided the original link is given and this notice is retained; otherwise the author reserves the right to pursue legal liability.