Map join and reduce join
I. Map join
1. Mapper class
package com.css.mapjoin;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Idea: load the product table into memory, then replace the product id with the
// product name before each record is written out on the map side.
public class CacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    HashMap<String, String> pdMap = new HashMap<>();

    // 1. Load the product table into memory.
    @Override
    protected void setup(Context context) throws IOException {
        // Read the cached file.
        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream("pd.txt"), "UTF-8"));
        String line;
        while (StringUtils.isNotEmpty(line = br.readLine())) {
            // Split the line into fields.
            String[] fields = line.split("\t");
            // Cache: product id -> product name.
            pdMap.put(fields[0], fields[1]);
        }
        br.close();
    }

    // 2. Join on the map side.
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the order record.
        String line = value.toString();
        // Split it into fields.
        String[] fields = line.split("\t");
        // The product id is the second field of the order record.
        String pid = fields[1];
        // Look up the product name by product id.
        String pName = pdMap.get(pid);
        // Append the product name to the order record.
        line = line + "\t" + pName;
        // Emit the joined record.
        context.write(new Text(line), NullWritable.get());
    }
}
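Note: setup() opens pd.txt by its bare file name. That works because the driver below registers the file with job.addCacheFile(), and Hadoop makes the cached file available in the task's working directory. As a sketch only (not part of the original code), the path can instead be resolved from the job context; the direct new File(uri) conversion assumes a local file:// URI like the one used in this example, while on a real cluster the localized file name used above is the usual way in.

    // Sketch: an alternative setup() that resolves the cached product table from the
    // job context instead of relying on the bare file name. Assumes a file:// cache URI.
    @Override
    protected void setup(Context context) throws IOException {
        java.net.URI[] cacheFiles = context.getCacheFiles();
        BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(new java.io.File(cacheFiles[0])), "UTF-8"));
        String line;
        while ((line = br.readLine()) != null && !line.isEmpty()) {
            String[] fields = line.split("\t");
            pdMap.put(fields[0], fields[1]);
        }
        br.close();
    }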
2. Driver class
package com.css.mapjoin;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CacheDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException,
            InterruptedException, URISyntaxException {
        // 1. Get the job instance.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // 2. Set the jar.
        job.setJarByClass(CacheDriver.class);
        // 3. Set the custom mapper class (no reducer is needed for a map join).
        job.setMapperClass(CacheMapper.class);
        // 4. Set the final output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // 5. Set the input path and the result path.
        FileInputFormat.setInputPaths(job, new Path("c:/table1029/in"));
        FileOutputFormat.setOutputPath(job, new Path("c:/table1029/out"));
        // 6. Add the product table to the distributed cache.
        job.addCacheFile(new URI("file:///c:/inputcache/pd.txt"));
        // 7. Run with zero reduce tasks (map-only job).
        job.setNumReduceTasks(0);
        // 8. Submit the job.
        boolean rs = job.waitForCompletion(true);
        System.out.println(rs ? 0 : 1);
    }
}
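The driver hard-codes local Windows paths, which is convenient when running inside an IDE. A minimal sketch of the usual alternative, assuming the paths are passed as command-line arguments (args[0] = input directory, args[1] = output directory, args[2] = URI of pd.txt; all three positions are hypothetical, not from the original code):

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.addCacheFile(new URI(args[2]));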
3. Input files
(1) order.txt
201801 01 1
201802 02 2
201803 03 3
201804 01 4
201805 02 5
201806 03 6
(2) pd.txt
01 苹果
02 华为
03 小米
4. Output file part-m-00000 (map output, since the job runs with zero reduce tasks)
201801 01 1 苹果
201802 02 2 华为
201803 03 3 小米
201804 01 4 苹果
201805 02 5 华为
201806 03 6 小米
II. Reduce join
1. Mapper class
package com.css.reducejoin;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        TableBean v = new TableBean();
        Text k = new Text();

        // Tell the two tables apart by the name of the file the split came from.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String name = inputSplit.getPath().getName();

        // Get the record.
        String line = value.toString();

        if (name.contains("order.txt")) {
            // Order table: split the fields.
            String[] fields = line.split("\t");
            // Fill the bean.
            v.setOrder_id(fields[0]);
            v.setPid(fields[1]);
            v.setAmount(Integer.parseInt(fields[2]));
            v.setpName("");
            v.setFlag("0");
            // Key on the product id.
            k.set(fields[1]);
        } else {
            // Product table: split the fields.
            String[] fields = line.split("\t");
            // Fill the bean.
            v.setOrder_id("");
            v.setPid(fields[0]);
            v.setAmount(0);
            v.setpName(fields[1]);
            v.setFlag("1");
            // Key on the product id.
            k.set(fields[0]);
        }
        context.write(k, v);
    }
}
2. Reducer class
package com.css.reducejoin;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context)
            throws IOException, InterruptedException {
        // Collection holding the order records for this product id.
        ArrayList<TableBean> orderBean = new ArrayList<TableBean>();
        // Holds the single product record for this product id.
        TableBean pdBean = new TableBean();

        // Separate order records from the product record, copying each bean out of the
        // reused iterator object.
        for (TableBean v : values) {
            if ("0".equals(v.getFlag())) {
                // Order record: copy it into a new bean and collect it.
                TableBean tableBean = new TableBean();
                try {
                    BeanUtils.copyProperties(tableBean, v);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
                orderBean.add(tableBean);
            } else {
                // Product record: keep a copy of it.
                try {
                    BeanUtils.copyProperties(pdBean, v);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }

        // Join: fill the product name into every order record and emit it.
        for (TableBean tableBean : orderBean) {
            tableBean.setpName(pdBean.getpName());
            context.write(tableBean, NullWritable.get());
        }
    }
}
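The BeanUtils.copyProperties() calls are not optional: MapReduce reuses the TableBean instance behind the values iterator, so storing the reference directly would leave the list full of copies of the last record. If the reflection-based BeanUtils dependency is unwanted, a hand-written copy helper along these lines could be added to the reducer (a sketch, not part of the original code):

    // Hypothetical helper: copy a TableBean field by field using its own setters.
    private static TableBean copyOf(TableBean src) {
        TableBean dst = new TableBean();
        dst.setOrder_id(src.getOrder_id());
        dst.setPid(src.getPid());
        dst.setAmount(src.getAmount());
        dst.setpName(src.getpName());
        dst.setFlag(src.getFlag());
        return dst;
    }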
3. Bean class
package com.css.reducejoin;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class TableBean implements Writable {
    // Fields covering both tables.
    private String order_id; // order id
    private String pid;      // product id
    private int amount;      // quantity ordered
    private String pName;    // product name
    private String flag;     // "0" = order table record, "1" = product table record

    public TableBean() {
        super();
    }

    public String getOrder_id() {
        return order_id;
    }

    public void setOrder_id(String order_id) {
        this.order_id = order_id;
    }

    public String getPid() {
        return pid;
    }

    public void setPid(String pid) {
        this.pid = pid;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getpName() {
        return pName;
    }

    public void setpName(String pName) {
        this.pName = pName;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(order_id);
        out.writeUTF(pid);
        out.writeInt(amount);
        out.writeUTF(pName);
        out.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        order_id = in.readUTF();
        pid = in.readUTF();
        amount = in.readInt();
        pName = in.readUTF();
        flag = in.readUTF();
    }

    @Override
    public String toString() {
        return order_id + "\t" + pName + "\t" + amount;
    }
}
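write() and readFields() must list the fields in exactly the same order, because the reader has no field names to go by. A quick local round-trip check, assuming nothing beyond the TableBean class above (the class name TableBeanRoundTrip and the sample values are made up for illustration):

package com.css.reducejoin;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Sketch: serialize a TableBean with write() and read it back with readFields()
// to confirm the two methods agree on the field order.
public class TableBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        TableBean in = new TableBean();
        in.setOrder_id("201801");
        in.setPid("01");
        in.setAmount(1);
        in.setpName("苹果");
        in.setFlag("0");

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));

        TableBean out = new TableBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // Should print: 201801	苹果	1
        System.out.println(out);
    }
}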
4. Driver class
package com.css.reducejoin;

import java.io.IOException;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TableDriver {
    public static void main(String[] args) throws IOException, URISyntaxException,
            ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(TableDriver.class);
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("c:/reduce1029/in"));
        FileOutputFormat.setOutputPath(job, new Path("c:/reduce1029/out"));

        boolean rs = job.waitForCompletion(true);
        System.out.println(rs ? 0 : 1);
    }
}
5. Input files
(1) order.txt
201801 01 1
201802 02 2
201803 03 3
201804 01 4
201805 02 5
201806 03 6
(2) pd.txt
01 苹果
02 华为
03 小米
6. Output file part-r-00000
201804 苹果 4
201801 苹果 1
201805 华为 5
201802 华为 2
201806 小米 6
201803 小米 3