MR案例:内连接代码实现
本文是对Hive中【内连接】的Java-API的实现,具体的HQL语句详见Hive查询Join
package join.map;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VLongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Reduce-side inner join of two tab-separated inputs, "sales" and "things".
 *
 * <p>The mapper tags every row with the name of the file it came from and keys it by the
 * numeric join column; the reducer pairs every sales row with every things row that share
 * the same key. Keys present on only one side produce no output.
 *
 * <p>Usage: {@code JoinOn <input path> <output path>}. The input path is expected to
 * contain files whose names end with {@code sales} and {@code things}; rows from any other
 * file are ignored.
 */
public class JoinOn {

    /**
     * Configures and runs the join job, then exits with 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: JoinOn <input path> <output path>");
            System.exit(2);
        }

        // Temporary workaround: point hadoop.home.dir at a local install when running on Windows.
        System.setProperty("hadoop.home.dir", "D:\\workspace\\hadoop-2.2.0");

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(JoinOn.class);

        job.setMapperClass(JOMapper.class);
        job.setReducerClass(JOReducer.class);

        job.setMapOutputKeyClass(VLongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Emits {@code <join id, fileName + ':' + originalRow>} for every input row.
     *
     * <p>The join id is column 1 of a sales row and column 0 of a things row (columns are
     * tab-separated). Presumably sales rows look like {@code name<TAB>id} and things rows
     * like {@code id<TAB>name}; verify against the actual data.
     */
    public static class JOMapper extends Mapper<LongWritable, Text, VLongWritable, Text> {

        /** Marks an input file that belongs to neither table. */
        private static final int NO_KEY_COLUMN = -1;

        /** Name of the file backing the current split; used as the row tag. */
        private String fileName;

        /** Index of the join column for the current file, or {@link #NO_KEY_COLUMN}. */
        private int keyColumn = NO_KEY_COLUMN;

        // Reused across map() calls; context.write() serializes them immediately.
        private final VLongWritable outKey = new VLongWritable();
        private final Text outValue = new Text();

        /**
         * Resolves the source file name once per split instead of once per record.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
            if (fileName.endsWith("sales")) {
                keyColumn = 1;
            } else if (fileName.endsWith("things")) {
                keyColumn = 0;
            } else {
                keyColumn = NO_KEY_COLUMN;
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (keyColumn == NO_KEY_COLUMN) {
                return; // row from a file that is neither sales nor things
            }

            String row = value.toString();
            String[] fields = row.split("\t");

            // The tag separator ':' must differ from the field separator '\t' so the
            // reducer can tell the tag apart from the row's own columns.
            outKey.set(Long.parseLong(fields[keyColumn]));
            outValue.set(fileName + ":" + row);
            context.write(outKey, outValue);
        }
    }

    /**
     * For each join id, writes the Cartesian product of its sales rows and things rows
     * as {@code <salesRow, thingsRow>} pairs.
     */
    public static class JOReducer extends Reducer<VLongWritable, Text, Text, Text> {

        // Reused across writes; context.write() consumes the current contents immediately.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        protected void reduce(VLongWritable key, Iterable<Text> v2s, Context context)
                throws IOException, InterruptedException {
            // Original rows of each table that share this join id.
            List<String> sales = new ArrayList<String>();
            List<String> things = new ArrayList<String>();

            for (Text tagged : v2s) {
                String record = tagged.toString();

                // Split on the FIRST ':' only: the row itself may contain ':' and must
                // be kept intact.
                int sep = record.indexOf(':');
                if (sep < 0) {
                    continue; // untagged value; cannot be attributed to either table
                }
                String source = record.substring(0, sep);
                String row = record.substring(sep + 1);

                if (source.endsWith("sales")) {
                    sales.add(row);
                } else if (source.endsWith("things")) {
                    things.add(row);
                }
            }

            // Inner join: a key must be present on both sides to produce output.
            if (sales.isEmpty() || things.isEmpty()) {
                return;
            }

            // Cartesian product of the two sides.
            for (String sale : sales) {
                outKey.set(sale);
                for (String thing : things) {
                    outValue.set(thing);
                    context.write(outKey, outValue);
                }
            }
        }
    }
}
总结:
1).程序中获取FileName的代码应放置在setup()方法中,因为setup()在每个Map任务(即每个输入分片)中只执行一次,而map()会对每条记录执行一次
2).Map输出的第二个参数中name和value之间的分隔符不能和value中各字段的分隔符一样