MapReduce-day2

Pre-aggregation

Between the map-side merge of map output and the reduce-side fetch, an optional pre-aggregation step can run (a combiner; a map-side join is a related map-side optimization).

Purpose of pre-aggregation: shrink the amount of data the reducers have to fetch, which cuts shuffle traffic and speeds up the job as a whole.

You cannot rely on how many times the combiner function will be called: the framework may invoke it zero, one, or several times per map task, so the result must be correct whether or not it runs.

A combiner is not suitable for aggregations that are not associative and commutative, such as averages, square roots, or powers.
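
A minimal sketch of a combiner, assuming the same Text/LongWritable key-value types used by the WordCount job below; the class name SumCombiner is made up for illustration. It partially sums counts on the map side, so running it any number of times cannot change the final result:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

//a combiner is just a Reducer that runs on the map side; the framework may call it
//zero, one or several times, so it must never change the final result
public class SumCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long sum = 0L;
        for (LongWritable value : values) {
            sum += value.get();
        }
        //emit a partial sum; the real reducer adds these partials together
        context.write(key, new LongWritable(sum));
    }
}

It would be registered in the driver with job.setCombinerClass(SumCombiner.class); for a plain sum like WordCount, the reducer class itself can also be reused as the combiner.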

 

Ways to mitigate data skew:

1. Increase the number of reducers.

2. Add a random salt to the keys that pile up in the same reducer, so their hash values change and the records spread across different reducers (a second aggregation then removes the salt); see the sketch below.
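
A minimal sketch of the salting idea (the mapper class and the 10-way salt are made up for illustration, assuming the Text/LongWritable types used in the examples below): the mapper prefixes each key with a random number so a single hot key spreads over several reducers, and a second aggregation strips the prefix and merges the partial results.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.Random;

//hypothetical mapper: prefix each key with a random salt 0..9 so that one hot key
//is hashed to up to 10 different reducers; a follow-up job removes the salt
//("3_word" -> "word") and sums the partial counts
public class SaltingMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private final Random random = new Random();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String word : value.toString().split("\\s+")) {
            if (word.isEmpty()) {
                continue;
            }
            int salt = random.nextInt(10); //spread the same word over 10 buckets
            context.write(new Text(salt + "_" + word), new LongWritable(1L));
        }
    }
}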

Implementing MapReduce in IDEA

WordCount

package com.shujia;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCount {
    //the map input key type is fixed as LongWritable: the byte offset of the line in the file (starting at 0); the value is the whole line as Text
    static class MyMapper extends Mapper<LongWritable,Text,Text,LongWritable>{
        @Override
        //context is the Hadoop job context; use it to write data out of the map
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            //map processing logic
            //split each line of data
            //convert the Hadoop type to a Java String first
            String row=value.toString();
            String[] words = row.split(" ");
            //iterate over the words
            for (String word : words) {
                //wrap the String in a Text
                Text key2 = new Text(word);
                //emit (word, 1) through the context
                context.write(key2,new LongWritable(1L));
            }
            //(debugging aid, disabled because it would pollute the word counts)
            //context.write(new Text("行号:【" + key + "】,数据:【" + value + "】"), new LongWritable(1L));
        }
    }

    static class MyReducer extends Reducer<Text,LongWritable,Text,LongWritable>{
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            //iterate over the values and sum them
            long sum=0L;
            for (LongWritable value : values) {
                long l=value.get();
                sum=sum+l;
            }
            context.write(key,new LongWritable(sum));

        }
    }

    public static void main(String[] args) throws Exception{
        //get the Hadoop configuration
        Configuration conf=new Configuration();
        //create the job
        Job job = Job.getInstance(conf);
        //give the job a name that will show up in YARN; optional
        job.setJobName("word count");
        //set the number of reduce tasks; optional, defaults to 1
        job.setNumReduceTasks(1);
        //set the main class of this job
        job.setJarByClass(WordCount.class);

        //set the mapper and reducer classes for the job
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        //set the k-v types the map stage emits
        //the Hadoop counterpart of a Java String is called Text
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        //set the k-v types of the final (reduce) output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        //set the HDFS input and output paths; addInputPath can be called more than once to add inputs, setOutputPath takes exactly one output directory
        FileInputFormat.addInputPath(job, new Path(args[0]));
        //note: this sets the output directory, which must not exist yet
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        //submit the MR job and wait for it to finish
        job.waitForCompletion(true);
    }
}

 HarryPotter

Main class

package com.shujia.HarryPotter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class HarryPotterDemo {
    public static void main(String[] args) throws Exception{
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(HarryPotterDemo.class);

        job.setMapperClass(HarryMapper.class);
        job.setReducerClass(HarryReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        job.waitForCompletion(true);
    }
}

 

Map class

package com.shujia.HarryPotter;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class HarryMapper extends Mapper<LongWritable,Text, Text,LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        //clean each line of data
        String info=value.toString();
        //replace commas, periods and apostrophes with spaces
        String s = info.replaceAll("[,.']", " ");
        String s3=s.toLowerCase();
        String[] s1 = s3.split("\\s+");
        for (String s2 : s1) {
            //skip empty strings produced by consecutive separators
            if (s2.isEmpty()) {
                continue;
            }
            context.write(new Text(s2),new LongWritable(1L));
        }

    }
}

 

Reduce class

package com.shujia.HarryPotter;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class HarryReducer extends Reducer<Text,LongWritable,Text,LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {

        //sum the 1s emitted for this word
        long sum=0L;
        for (LongWritable value : values) {
            sum+=value.get();
        }
        context.write(key,new LongWritable(sum));
    }
}

 

 

IK analyzer (Chinese word segmentation)

Add the dependency (in the parent project)

<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
            <dependency>
                <groupId>com.janeluo</groupId>
                <artifactId>ikanalyzer</artifactId>
                <version>2012_u6</version>
            </dependency>

Add the dependency in the child module

        <dependency>
            <groupId>com.janeluo</groupId>
            <artifactId>ikanalyzer</artifactId>
        </dependency>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <configuration>
                    <descriptorRefs>
                        <!-- build a jar that bundles all dependencies (its name gets the jar-with-dependencies suffix) -->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <!-- bind the assembly to the package phase so that mvn package builds it; without this you would have to run mvn assembly:single yourself -->
                <executions>
                    <execution>
                        <id>make-assemble</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

 

Code

package com.shujia.ik;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.StringReader;

/*
IK tokenizer demo
 */
public class IKTest {
    public static void main(String[] args) throws Exception{
        BufferedReader br=new BufferedReader(new FileReader("D:\\soft\\projects\\bigdata19-project\\bigdata19-mapreduce\\data\\dldl.txt"));
        //read the first line of the file (this demo tokenizes a single line)
        String line = br.readLine();
        //wrap the text in a Reader so the IK segmenter can consume it
        StringReader stringReader = new StringReader(line);

        //create the segmenter (true = smart mode) and tokenize
        IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);

        Lexeme lexeme=null;

        while((lexeme=ikSegmenter.next())!=null){
            //String s=lexeme.toString();
            String s=lexeme.getLexemeText();//get the token text
            System.out.println(s);
        }
        br.close();
    }
}

 

Romance of the Three Kingdoms (三国演义) example:

package com.shujia.ik;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;
/*
Count how many times 曹操 (Cao Cao), 董卓 (Dong Zhuo) and 刘备 (Liu Bei) appear
 */
class SgyyMapper extends Mapper<LongWritable,Text, Text,LongWritable>{
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        String line = value.toString();
        StringReader stringReader = new StringReader(line);
        IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);
        Lexeme lexeme=null;
        while((lexeme=ikSegmenter.next())!=null){
            String ciyu=lexeme.getLexemeText();
            if("曹操".equals(ciyu)||"董卓".equals(ciyu)||"刘备".equals(ciyu)){
                context.write(new Text(ciyu),new LongWritable(1L));
            }
        }
    }
}

class SgyyReducer extends Reducer<Text,LongWritable,Text,LongWritable>{
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        long sum=0L;
        for (LongWritable value : values) {
            sum+=value.get();
        }
        context.write(key,new LongWritable(sum));
    }
}

public class SgyyDeno {
    public static void main(String[] args) throws Exception{
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SgyyDeno.class);
        job.setMapperClass(SgyyMapper.class);
        job.setReducerClass(SgyyReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        job.waitForCompletion(true);
    }
}
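
Tying back to the pre-aggregation notes at the top: because this job only sums counts, the existing reducer could also be registered as a map-side combiner to shrink the shuffle. A hedged sketch of the one extra driver line, added in SgyyDeno's main after job.setReducerClass(...):

        //optional: reuse the reducer as a combiner; safe here because the aggregation is a plain sum (associative and commutative)
        job.setCombinerClass(SgyyReducer.class);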

 
