MR 分组 案例
一、需求分析
1、需求
a、输入文件
0000001	Pdt_01	222.8
0000002	Pdt_05	722.4
0000001	Pdt_02	33.8
0000003	Pdt_06	232.8
0000003	Pdt_02	33.8
0000002	Pdt_03	522.8
0000002	Pdt_04	122.4
b、期望输出文件
1	222.8
2	722.4
3	232.8
注意:输出每个ID的价格最大值
2、分析
a、二次排序,ID 价格
b、排序必须要设置成Key
c、自定义Hadoop序列化
d、使用分组
二、代码
1、自定义Hadoop序列化
package com.group; import org.apache.hadoop.io.WritableComparable; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; public class GroupBean implements WritableComparable<GroupBean> { private int orderID; private double price; public GroupBean() { } public GroupBean(int orderID, double price) { this.orderID = orderID; this.price = price; } // 比较, 二次排序 public int compareTo(GroupBean bean) { int result; if (this.orderID > bean.getOrderID()){ result = 1; }else if (this.orderID < bean.getOrderID()){ result = -1; }else { if (this.price > bean.getPrice()){ result = -1; }else if (this.price < bean.getPrice()){ result = 1; }else { result = 0; } } return result; } // 序列化 public void write(DataOutput out) throws IOException { out.writeInt(orderID); out.writeDouble(price); } // 反序列化 public void readFields(DataInput in) throws IOException { this.orderID = in.readInt(); this.price = in.readDouble(); } public int getOrderID() { return orderID; } public void setOrderID(int orderID) { this.orderID = orderID; } public double getPrice() { return price; } public void setPrice(double price) { this.price = price; } @Override public String toString() { return orderID + "\t" + price; } }
2、Mapper
package com.group; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; public class GroupMapper extends Mapper<LongWritable, Text, GroupBean, NullWritable> { GroupBean k = new GroupBean(); @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { // 1. 读取一行 String line = value.toString(); // 2.切割 String[] fields = line.split("\t"); // 3.设置 key k.setOrderID(Integer.parseInt(fields[0])); k.setPrice(Double.parseDouble(fields[2])); // 4.写入 context.write(k, NullWritable.get()); } }
3、分组
package com.group; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; public class GroupCompare extends WritableComparator { // 1.创建一个构造将比较对象的类传给父类 public GroupCompare() { super(GroupBean.class, true); } // 2.核心逻辑 @Override public int compare(WritableComparable a, WritableComparable b) { int result; GroupBean aBean = (GroupBean) a; GroupBean bBean = (GroupBean) b; if (aBean.getOrderID() > bBean.getOrderID()){ result = 1; }else if (aBean.getOrderID() < bBean.getOrderID()){ result = -1; }else { result = 0; } return result; } }
4、Reducer
package com.group; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Reducer; import java.io.IOException; public class GroupReducer extends Reducer<GroupBean, NullWritable,GroupBean, NullWritable> { @Override protected void reduce(GroupBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException { // 1. 写入 context.write(key, NullWritable.get()); } }
5、Driver
package com.group; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import java.io.IOException; public class GroupDriver { public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { args = new String[]{"E:\\a\\input\\GroupingComparator.txt", "E:\\a\\output"}; // 1.job Configuration conf = new Configuration(); Job job = Job.getInstance(conf); // 2.设置jar job.setJarByClass(GroupDriver.class); // 3.关联mapper和reducer job.setMapperClass(GroupMapper.class); job.setReducerClass(GroupReducer.class); // 4.设置 mapper 输出 的 kv job.setMapOutputKeyClass(GroupBean.class); job.setOutputValueClass(NullWritable.class); // 5.设置结果输出 的 kv job.setOutputKeyClass(GroupBean.class); job.setOutputValueClass(NullWritable.class); // 8.设置Group驱动 job.setGroupingComparatorClass(GroupCompare.class); // 6.设置输入输出路径 FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); // 7.提交任务 boolean wait = job.waitForCompletion(true); System.exit(wait? 0: 1); } }