Hadoop Series (3): Top N
I. Process Analysis
1. Introduction to Top N
Top-N queries in relational databases mostly boil down to the following four patterns:
1. Use min or max directly to get the smallest or largest value (top 1).
2. One step up: add a group by and take the maximum or minimum within each group (per-group top 1).
3. Top 10: sort with order by and take the first 10 rows.
4. Per-group top 10: use a window function to generate a rank column, then keep the rows whose rank is < 11.
Correspondingly, we may need to implement the same requirements in MapReduce:
1. Use a constant key and take the maximum or minimum of the values. (A refinement is to pre-aggregate part of the data in the map phase, otherwise data skew is likely; this is essentially what a Combiner does. I haven't done a hello-world with it yet, so I'll try it myself; see the sketch after this list.)
2. Use the group-by column as the key and take the maximum/minimum of the values. (Optimization: compute each group's min/max already in the map phase.)
3. Use a constant key and take the top 10 of the values.
4. Use the group-by column as the key and take the top 10 within each group.
We can implement all four. One thing to keep in mind: map and reduce functions are not called just once — map runs once per input record, and reduce once per key.
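For the Combiner mentioned in item 1, here is a minimal sketch of how it could be wired in, for a job that computes only the maximum (my own illustration, not code from this series; the class name MaxCombiner is made up, and the Hadoop imports from Code 1 below are assumed):

// A Combiner is just a Reducer that runs on each mapper's local output before
// the shuffle, so less data crosses the network and reducer-side skew shrinks.
// It must be commutative and associative (max is), and its input/output types
// must match the map output types.
class MaxCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        for (IntWritable value : values) {
            max = Math.max(max, value.get());
        }
        context.write(key, new IntWritable(max)); // one value per key per mapper
    }
}
// In the driver: job.setCombinerClass(MaxCombiner.class);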
2. Code
Sample data:
2020040112 1
2020040113 3
2020040114 4
2020040115 5
2020040116 6
2020040117 7
2020040118 8
2020040119 9
2020040312 1
2020040313 3
2020040314 4
2020040315 5
2020040316 6
2020040317 7
2020040318 8
2020040319 9
2020040412 1
2020040413 3
2020040414 4
2020040415 5
2020040416 6
2020040417 7
2020040418 8
2020040419 9
Code 1: output the overall maximum and minimum temperatures:
package org.example;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Initialize so that the first record always replaces both values;
    // starting min at 0 would never be beaten by all-positive temperatures.
    private int max = Integer.MIN_VALUE;
    private int min = Integer.MAX_VALUE;

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] line = value.toString().split(" ");
        int temperature = Integer.parseInt(line[1]);
        if (temperature > max) {
            max = temperature;
        }
        if (temperature < min) {
            min = temperature;
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Emit each mapper's local min/max once, after all records are mapped.
        context.write(new Text("min"), new IntWritable(min));
        context.write(new Text("max"), new IntWritable(max));
    }
}
class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private int max = Integer.MIN_VALUE;
    private int min = Integer.MAX_VALUE;

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // reduce() is called once per key ("min" and "max"); the fields
        // accumulate across both calls, yielding the global extremes.
        for (IntWritable value : values) {
            if (value.get() > max) {
                max = value.get();
            }
            if (value.get() < min) {
                min = value.get();
            }
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        context.write(new Text("min"), new IntWritable(min));
        context.write(new Text("max"), new IntWritable(max));
    }
}
public class WordcountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        String outputPath = "/software/java/data/output/";
        // MapReduce refuses to overwrite an existing output directory, so clear it first.
        if (fs.exists(new Path(outputPath))) fs.delete(new Path(outputPath), true);
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordcountDriver.class);
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("/software/java/data/input/"));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        // Submit the job configuration and the jar containing the job's classes to YARN and run it.
        //job.submit();
        boolean res = job.waitForCompletion(true);
    }
}
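With the sample data above, the job outputs min 1 and max 9. Note the initialization: max starts at Integer.MIN_VALUE and min at Integer.MAX_VALUE so that the first record always replaces both. Initializing min to 0 would silently report 0 on this all-positive data, since no temperature could ever be smaller.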
Code 2: output the maximum and minimum temperatures within each group (per date):
package org.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

class WordcountMapper extends Mapper<LongWritable, Text, Text, Text> {
    // Per-mapper map of date -> "max:min", filled while mapping and emitted in cleanup.
    private Map<String, String> minmaxMap = new HashMap<String, String>();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] line = value.toString().split(" ");
        // Drop the trailing two hour digits, e.g. "2020040112" -> "20200401".
        String date = line[0].substring(0, line[0].length() - 2);
        int temperature = Integer.parseInt(line[1]);
        if (minmaxMap.containsKey(date)) {
            int max = Integer.parseInt(minmaxMap.get(date).split(":")[0]);
            int min = Integer.parseInt(minmaxMap.get(date).split(":")[1]);
            if (temperature > max) {
                minmaxMap.put(date, temperature + ":" + min);
            }
            if (temperature < min) {
                minmaxMap.put(date, max + ":" + temperature);
            }
        } else {
            minmaxMap.put(date, temperature + ":" + temperature);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (Map.Entry<String, String> dateTemperature : minmaxMap.entrySet()) {
            System.out.println("map" + dateTemperature.getKey() + "|" + dateTemperature.getValue());
            context.write(new Text(dateTemperature.getKey()), new Text(dateTemperature.getValue()));
        }
    }
}

class WordcountReducer extends Reducer<Text, Text, Text, Text> {
    private Map<String, String> minmaxMap = new HashMap<String, String>();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Merge the per-mapper "max:min" pairs for this date.
        for (Text value : values) {
            String date = key.toString();
            if (minmaxMap.containsKey(date)) {
                int existMax = Integer.parseInt(minmaxMap.get(date).split(":")[0]);
                int existMin = Integer.parseInt(minmaxMap.get(date).split(":")[1]);
                int max = Integer.parseInt(value.toString().split(":")[0]);
                int min = Integer.parseInt(value.toString().split(":")[1]);
                int finalMax = existMax > max ? existMax : max;
                int finalMin = existMin < min ? existMin : min;
                minmaxMap.put(date, finalMax + ":" + finalMin);
            } else {
                minmaxMap.put(date, value.toString());
            }
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (Map.Entry<String, String> dateTemperature : minmaxMap.entrySet()) {
            System.out.println("reduce" + dateTemperature.getKey() + "|" + dateTemperature.getValue());
            context.write(new Text(dateTemperature.getKey()), new Text(dateTemperature.getValue()));
        }
    }
}

public class WordcountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        String outputPath = "/software/java/data/output/";
        if (fs.exists(new Path(outputPath))) fs.delete(new Path(outputPath), true);
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordcountDriver.class);
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("/software/java/data/input/"));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        // Submit the job configuration and the jar containing the job's classes to YARN and run it.
        //job.submit();
        boolean res = job.waitForCompletion(true);
    }
}
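With the sample data, every date carries the same eight readings, so each group reduces to 9:1 (max:min): 20200401 9:1, 20200403 9:1, 20200404 9:1.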
Code 3 and Code 4 are not written out here, since they follow much the same pattern.
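Still, for reference, here is a minimal sketch of what the grouped top-10 reducer (Code 4) could look like. This is my own illustration under the same input format, not the original author's code; the class name TopNReducer and N = 10 are assumptions. The mapper would simply emit (date, temperature), and Code 3 is the same job with a single constant key. The Hadoop imports from the listings above are assumed, plus:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.PriorityQueue;

// Keeps the N largest temperatures per date with a bounded min-heap: the head
// is the smallest of the current top N, so any larger incoming value evicts it.
// Unlike a TreeMap keyed by temperature, duplicate readings are preserved.
class TopNReducer extends Reducer<Text, IntWritable, Text, Text> {
    private static final int N = 10;

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        PriorityQueue<Integer> heap = new PriorityQueue<Integer>();
        for (IntWritable value : values) {
            heap.offer(value.get());
            if (heap.size() > N) {
                heap.poll(); // drop the smallest, keeping the N largest
            }
        }
        // Drain the heap into descending order and emit a comma-separated list.
        List<Integer> top = new ArrayList<Integer>(heap);
        Collections.sort(top, Collections.reverseOrder());
        StringBuilder sb = new StringBuilder();
        for (int t : top) {
            if (sb.length() > 0) sb.append(',');
            sb.append(t);
        }
        context.write(key, new Text(sb.toString()));
    }
}

The same bounded heap also works as an in-mapper pre-aggregation: keep one heap per group and emit each mapper's local top N in cleanup, mirroring the optimization already used in Code 1 and Code 2.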