Map Join Example
I. Requirement Analysis
1. Requirement
The requirement is the same as in the Reduce Join example: join the order table with the product table so that each order record carries the product name instead of the product id.
2. Analysis
a. In the mapper's setup(), load the cached product file and build a pid -> pname map.
b. In map(), look up pname by pid in that map and emit the joined record.
c. In the driver, set the number of reduce tasks to 0 and register the product file as a cache file (see the data sketch after this list).
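For reference, a sketch of the data involved, reconstructed from the comments in the code below (the actual files are the ones from the Reduce Join example):

    pd.txt (small product table, distributed as a cache file):
        pid    pname
        01     小米

    order.txt (large order table, read as normal map input):
        id     pid    amount
        1001   01     1

    Target output (pid replaced by pname):
        id     pname    amount
        1001   小米     1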
II. Code
1. Driver
package com.wt.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class TableJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        // 0 Adjust these paths for your own machine
        args = new String[]{"E:\\a\\input1\\order.txt", "E:\\a\\output2"};

        // 1 Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Set the jar to ship with the job
        job.setJarByClass(TableJoinDriver.class);

        // 3 Hook up the mapper
        job.setMapperClass(TableJoinMapper.class);

        // 4 Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 5 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6 Register the cached product table
        job.addCacheFile(new URI("file:///E:/a/inputmap/pd.txt"));

        // 7 A map-side join needs no reduce phase, so set the number of reduce tasks to 0
        job.setNumReduceTasks(0);

        // 8 Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
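A note on step 6: a file:/// URI for the cache file only works when the job runs in local mode, because every map task must be able to open that file itself. On a real cluster the product table is normally uploaded to HDFS first and referenced by an HDFS URI. A minimal sketch, assuming a hypothetical path /cache/pd.txt and namenode address:

    // Assumption: pd.txt was uploaded beforehand, e.g. with `hdfs dfs -put pd.txt /cache/pd.txt`
    job.addCacheFile(new URI("hdfs://namenode:9000/cache/pd.txt"));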
2. Mapper
package com.wt.mapjoin;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class TableJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    Map<String, String> pdMap = new HashMap<String, String>();
    Text k = new Text();
    String line;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);

        // 1 Get the cached file
        URI[] cacheFiles = context.getCacheFiles();
        String path = cacheFiles[0].getPath();
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));

        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            // 2 Split the line
            String[] fields = line.split("\t");

            // 3 Cache the record in the map: pid -> pname, e.g. 01 -> 小米
            pdMap.put(fields[0], fields[1]);
        }
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Input:  id  pid  amount   e.g. 1001  01  1
        // Target: id  pname  amount
        String line = value.toString();
        String[] fields = line.split("\t");

        String id = fields[0];
        String pId = fields[1];
        String amount = fields[2];

        // Look up pname by pid in the cached table
        String pName = pdMap.get(pId);

        String newLine = id + "\t" + pName + "\t" + amount;
        k.set(newLine);
        context.write(k, NullWritable.get());
    }
}
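Because the driver sets the number of reduce tasks to 0, the mapper's output is written straight to the output directory as part-m-00000, with no shuffle or sort. With the sample data sketched earlier, the joined line for order 1001 would look like:

    1001    小米    1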