MapReduce-day2
Pre-aggregation
After the map side merges its output and before the reduce side pulls it, a pre-aggregation step can be applied (a combiner, or a map join).
Purpose of pre-aggregation: cut down the amount of data the reducers pull during the shuffle, which speeds up the job overall.
How many times the combiner function gets called is not guaranteed — the framework may run it zero, one, or several times on a map task's output — so the job must produce the correct result either way.
For that reason a combiner is not suitable for computations that cannot be applied piecewise, such as averages, square roots, or powers.
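A combiner is just a Reducer that runs on each map task's local output. Below is a minimal sketch for a count-style job; the class name and types are illustrative assumptions, not code from these notes.

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Runs on the map side: collapses many (word, 1) records into one (word, partialSum)
// per map task, so far less data is shuffled to the reducers.
class SumCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0L;
        for (LongWritable value : values) {
            sum += value.get();
        }
        context.write(key, new LongWritable(sum));
    }
}

// In the driver: job.setCombinerClass(SumCombiner.class);

Summing is associative and commutative, so running it locally first does not change the final counts.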
Ways to deal with data skew:
1. Increase the number of reduce tasks.
2. Append a random value (a salt) to the skewed keys so their hash changes and the records are spread across different reducers; a second aggregation then strips the salt and merges the partial results (see the sketch after this list).
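A minimal sketch of the salting idea on the map side. The class name, key format, and number of salt buckets are illustrative assumptions, not part of the original notes; a follow-up job would strip the suffix and sum the partial counts.

import java.io.IOException;
import java.util.Random;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper: spreads a hot key over several reducers by appending a random suffix.
class SaltedCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private static final int SALTS = 10;      // number of buckets per key (assumption)
    private final Random random = new Random();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String word : value.toString().split(" ")) {
            // e.g. "hadoop" becomes "hadoop_3", so identical words no longer all hash
            // to the same reducer; a second job removes "_3" and sums again.
            String salted = word + "_" + random.nextInt(SALTS);
            context.write(new Text(salted), new LongWritable(1L));
        }
    }
}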
Implementing MapReduce in IDEA
WordCount
package com.shujia;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCount {
    // With the default TextInputFormat the map input key type is fixed as LongWritable —
    // the byte offset at which the line starts (0 for the first line) — and the value is the line itself, as Text.
    static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        // context is the Hadoop context; it is used to emit the map output.
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            // Map logic: split each line into words.
            // Convert the Hadoop type to a Java String first.
            String row = value.toString();
            String[] words = row.split(" ");
            // Iterate over the words.
            for (String word : words) {
                // Wrap each word in a Text and emit (word, 1) through the context.
                Text key2 = new Text(word);
                context.write(key2, new LongWritable(1L));
            }
            // Debugging only — emitting the offset and the raw line would break the word count:
            // context.write(new Text("offset: [" + key + "], line: [" + value + "]"), new LongWritable(1L));
        }
    }

    static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            // Iterate over the values and sum them.
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            // Emit the total count for this word.
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        // Load the Hadoop configuration.
        Configuration conf = new Configuration();
        // Create the job.
        Job job = Job.getInstance(conf);
        // Give the job a name, visible in YARN (optional).
        job.setJobName("word count");
        // Set the number of reduce tasks (optional; the default is 1).
        job.setNumReduceTasks(1);
        // Set the job's main class.
        job.setJarByClass(WordCount.class);
        // Set the mapper and reducer classes for this job.
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // Set the key/value types of the map output.
        // Hadoop's string type is called Text.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Set the HDFS input and output paths: addInputPath can be called several times
        // to add more than one input path, while setOutputPath takes exactly one output directory.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Note: this is an output directory, and it must not already exist.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Run the MapReduce job.
        job.waitForCompletion(true);
    }
}
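One practical note: FileOutputFormat refuses to start the job if the output directory already exists. A minimal sketch of a guard that deletes it first, placed in main before setOutputPath; the FileSystem calls are standard Hadoop API, but adding this guard is my suggestion, not part of the original notes.

// Optional guard before FileOutputFormat.setOutputPath(...): remove a stale output directory.
org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.get(conf);
Path out = new Path(args[1]);
if (fs.exists(out)) {
    // true = recursive delete; be careful with real data
    fs.delete(out, true);
}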
HarryPotter
Main class
// Same package as the mapper and reducer below, so no extra imports are needed.
package com.shujia.HarryPotter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class HarryPotterDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(HarryPotterDemo.class);
        job.setMapperClass(HarryMapper.class);
        job.setReducerClass(HarryReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
Mapper class
package com.shujia.HarryPotter;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class HarryMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        // Clean each line of text.
        String info = value.toString();
        // Replace commas, periods and apostrophes with spaces.
        // (Inside a character class '|' is literal, so "[,|.|']" would also strip '|' characters.)
        String s = info.replaceAll("[,.']", " ");
        String s3 = s.toLowerCase();
        String[] s1 = s3.split(" ");
        for (String s2 : s1) {
            context.write(new Text(s2), new LongWritable(1L));
        }
    }
}
Reducer class
package com.shujia.HarryPotter;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class HarryReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        long sum = 0L;
        for (LongWritable value : values) {
            sum += value.get();
        }
        context.write(key, new LongWritable(sum));
    }
}
IK Analyzer (Chinese tokenizer)
Add the dependency (parent project)
<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>
Declare the dependency in the child module (no version needed; it is managed by the parent)
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
</dependency>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>3.3.0</version>
            <configuration>
                <descriptorRefs>
                    <!-- suffix of the packaged jar that bundles all dependencies -->
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <!-- The executions below bind the plugin to mvn package;
                 without them you would have to run mvn assembly:single manually. -->
            <executions>
                <execution>
                    <id>make-assemble</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
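With this in place, running mvn package should produce an extra <module>-jar-with-dependencies.jar under target/ next to the normal jar; submitting that jar keeps third-party classes such as the IK Analyzer on the classpath when the job runs on the cluster.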
Code
package com.shujia.ik;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.StringReader;

/*
    IK Analyzer demo
 */
public class IKTest {
    public static void main(String[] args) throws Exception {
        BufferedReader br = new BufferedReader(new FileReader("D:\\soft\\projects\\bigdata19-project\\bigdata19-mapreduce\\data\\dldl.txt"));
        // This demo only reads the first line of the file.
        String line = br.readLine();
        // Wrap the text so the IK segmenter can consume it.
        StringReader stringReader = new StringReader(line);
        // Create the segmenter (true = smart mode) and tokenize.
        IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);
        Lexeme lexeme = null;
        while ((lexeme = ikSegmenter.next()) != null) {
            // String s = lexeme.toString();
            String s = lexeme.getLexemeText(); // get the token text
            System.out.println(s);
        }
    }
}
Romance of the Three Kingdoms example:
package com.shujia.ik;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;

/*
    Count how many times 曹操 (Cao Cao), 董卓 (Dong Zhuo) and 刘备 (Liu Bei) appear.
 */
class SgyyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // Tokenize each line with the IK segmenter and keep only the three names.
        StringReader stringReader = new StringReader(line);
        IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);
        Lexeme lexeme = null;
        while ((lexeme = ikSegmenter.next()) != null) {
            String ciyu = lexeme.getLexemeText();
            if ("曹操".equals(ciyu) || "董卓".equals(ciyu) || "刘备".equals(ciyu)) {
                context.write(new Text(ciyu), new LongWritable(1L));
            }
        }
    }
}

class SgyyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        long sum = 0L;
        for (LongWritable value : values) {
            sum += value.get();
        }
        context.write(key, new LongWritable(sum));
    }
}

public class SgyyDeno {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SgyyDeno.class);
        job.setMapperClass(SgyyMapper.class);
        job.setReducerClass(SgyyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
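This job is a natural candidate for the pre-aggregation discussed at the top of these notes: the per-name sums are associative, so the existing reducer can be reused as the combiner. Adding the line below to the driver is my suggestion, not part of the original code.

// Pre-aggregate (name, 1) pairs on each map task before the shuffle.
job.setCombinerClass(SgyyReducer.class);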