MapReduce-day2

Pre-aggregation

Between the map-side merge of map output and the reduce-side fetch, an optional pre-aggregation step can run (a combiner; a map-side join is a related map-side optimization).

Purpose of pre-aggregation: shrink the amount of data the reducers have to fetch, which cuts shuffle traffic and speeds up the job as a whole.

You cannot rely on how many times the combiner function will be called: the framework may invoke it zero, one, or several times per map task, so the result must be correct whether or not it runs.

A combiner is not suitable for aggregations that are not associative and commutative, such as averages, square roots, or powers.
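
A minimal sketch of a combiner, assuming the same Text/LongWritable key-value types used by the WordCount job below; the class name SumCombiner is made up for illustration. It partially sums counts on the map side, so running it any number of times cannot change the final result:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

//a combiner is just a Reducer that runs on the map side; the framework may call it
//zero, one or several times, so it must never change the final result
public class SumCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long sum = 0L;
        for (LongWritable value : values) {
            sum += value.get();
        }
        //emit a partial sum; the real reducer adds these partials together
        context.write(key, new LongWritable(sum));
    }
}

It would be registered in the driver with job.setCombinerClass(SumCombiner.class); for a plain sum like WordCount, the reducer class itself can also be reused as the combiner.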

 

Ways to mitigate data skew:

1. Increase the number of reducers.

2. Add a random salt to the keys that pile up in the same reducer, so their hash values change and the records spread across different reducers (a second aggregation then removes the salt); see the sketch below.
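
A minimal sketch of the salting idea (the mapper class and the 10-way salt are made up for illustration, assuming the Text/LongWritable types used in the examples below): the mapper prefixes each key with a random number so a single hot key spreads over several reducers, and a second aggregation strips the prefix and merges the partial results.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.Random;

//hypothetical mapper: prefix each key with a random salt 0..9 so that one hot key
//is hashed to up to 10 different reducers; a follow-up job removes the salt
//("3_word" -> "word") and sums the partial counts
public class SaltingMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private final Random random = new Random();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String word : value.toString().split("\\s+")) {
            if (word.isEmpty()) {
                continue;
            }
            int salt = random.nextInt(10); //spread the same word over 10 buckets
            context.write(new Text(salt + "_" + word), new LongWritable(1L));
        }
    }
}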

Implementing MapReduce in IDEA

WordCount

package com.shujia;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCount {
    //the map input key type is fixed as LongWritable: the byte offset of the line in the file (starting at 0); the value is the whole line as Text
    static class MyMapper extends Mapper<LongWritable,Text,Text,LongWritable>{
        @Override
        //context is the Hadoop job context; use it to write data out of the map
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            //map processing logic
            //split each line of data
            //convert the Hadoop type to a Java String first
            String row=value.toString();
            String[] words = row.split(" ");
            //iterate over the words
            for (String word : words) {
                //wrap the String in a Text
                Text key2 = new Text(word);
                //emit (word, 1) through the context
                context.write(key2,new LongWritable(1L));
            }
            //(debugging aid, disabled because it would pollute the word counts)
            //context.write(new Text("行号:【" + key + "】,数据:【" + value + "】"), new LongWritable(1L));
        }
    }

    static class MyReducer extends Reducer<Text,LongWritable,Text,LongWritable>{
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            //iterate over the values and sum them
            long sum=0L;
            for (LongWritable value : values) {
                long l=value.get();
                sum=sum+l;
            }
            context.write(key,new LongWritable(sum));

        }
    }

    public static void main(String[] args) throws Exception{
        //get the Hadoop configuration
        Configuration conf=new Configuration();
        //create the job
        Job job = Job.getInstance(conf);
        //give the job a name that will show up in YARN; optional
        job.setJobName("word count");
        //set the number of reduce tasks; optional, defaults to 1
        job.setNumReduceTasks(1);
        //set the main class of this job
        job.setJarByClass(WordCount.class);

        //set the mapper and reducer classes for the job
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        //set the k-v types the map stage emits
        //the Hadoop counterpart of a Java String is called Text
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        //set the k-v types of the final (reduce) output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        //set the HDFS input and output paths; addInputPath can be called more than once to add inputs, setOutputPath takes exactly one output directory
        FileInputFormat.addInputPath(job, new Path(args[0]));
        //note: this sets the output directory, which must not exist yet
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        //submit the MR job and wait for it to finish
        job.waitForCompletion(true);
    }
}

 HarryPotter

Main class

package com.shujia.HarryPotter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class HarryPotterDemo {
    public static void main(String[] args) throws Exception{
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(HarryPotterDemo.class);

        job.setMapperClass(HarryMapper.class);
        job.setReducerClass(HarryReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        job.waitForCompletion(true);
    }
}

 

Map class

package com.shujia.HarryPotter;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class HarryMapper extends Mapper<LongWritable,Text, Text,LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        //clean each line of data
        String info=value.toString();
        //replace commas, periods and apostrophes with spaces
        String s = info.replaceAll("[,.']", " ");
        String s3=s.toLowerCase();
        String[] s1 = s3.split("\\s+");
        for (String s2 : s1) {
            //skip empty strings produced by consecutive separators
            if (s2.isEmpty()) {
                continue;
            }
            context.write(new Text(s2),new LongWritable(1L));
        }

    }
}

 

Reduce class

package com.shujia.HarryPotter;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class HarryReducer extends Reducer<Text,LongWritable,Text,LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {

        //sum the 1s emitted for this word
        long sum=0L;
        for (LongWritable value : values) {
            sum+=value.get();
        }
        context.write(key,new LongWritable(sum));
    }
}

 

 

IK analyzer (Chinese word segmentation)

Add the dependency (in the parent project)

<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
            <dependency>
                <groupId>com.janeluo</groupId>
                <artifactId>ikanalyzer</artifactId>
                <version>2012_u6</version>
            </dependency>

Add the dependency in the child module

        <dependency>
            <groupId>com.janeluo</groupId>
            <artifactId>ikanalyzer</artifactId>
        </dependency>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <configuration>
                    <descriptorRefs>
                        <!-- build a jar that bundles all dependencies (its name gets the jar-with-dependencies suffix) -->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <!-- bind the assembly to the package phase so that mvn package builds it; without this you would have to run mvn assembly:single yourself -->
                <executions>
                    <execution>
                        <id>make-assemble</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

 

Code

package com.shujia.ik;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.StringReader;

/*
IK tokenizer demo
 */
public class IKTest {
    public static void main(String[] args) throws Exception{
        BufferedReader br=new BufferedReader(new FileReader("D:\\soft\\projects\\bigdata19-project\\bigdata19-mapreduce\\data\\dldl.txt"));
        //read the first line of the file (this demo tokenizes a single line)
        String line = br.readLine();
        //wrap the text in a Reader so the IK segmenter can consume it
        StringReader stringReader = new StringReader(line);

        //create the segmenter (true = smart mode) and tokenize
        IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);

        Lexeme lexeme=null;

        while((lexeme=ikSegmenter.next())!=null){
            //String s=lexeme.toString();
            String s=lexeme.getLexemeText();//get the token text
            System.out.println(s);
        }
        br.close();
    }
}

 

Romance of the Three Kingdoms (三国演义) example:

package com.shujia.ik;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;
/*
Count how many times 曹操 (Cao Cao), 董卓 (Dong Zhuo) and 刘备 (Liu Bei) appear
 */
class SgyyMapper extends Mapper<LongWritable,Text, Text,LongWritable>{
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        String line = value.toString();
        StringReader stringReader = new StringReader(line);
        IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);
        Lexeme lexeme=null;
        while((lexeme=ikSegmenter.next())!=null){
            String ciyu=lexeme.getLexemeText();
            if("曹操".equals(ciyu)||"董卓".equals(ciyu)||"刘备".equals(ciyu)){
                context.write(new Text(ciyu),new LongWritable(1L));
            }
        }
    }
}

class SgyyReducer extends Reducer<Text,LongWritable,Text,LongWritable>{
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        long sum=0L;
        for (LongWritable value : values) {
            sum+=value.get();
        }
        context.write(key,new LongWritable(sum));
    }
}

public class SgyyDeno {
    public static void main(String[] args) throws Exception{
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SgyyDeno.class);
        job.setMapperClass(SgyyMapper.class);
        job.setReducerClass(SgyyReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        job.waitForCompletion(true);
    }
}
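
Tying back to the pre-aggregation notes at the top: because this job only sums counts, the existing reducer could also be registered as a map-side combiner to shrink the shuffle. A hedged sketch of the one extra driver line, added in SgyyDeno's main after job.setReducerClass(...):

        //optional: reuse the reducer as a combiner; safe here because the aggregation is a plain sum (associative and commutative)
        job.setCombinerClass(SgyyReducer.class);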

 
