MR案例:单表关联查询
"单表关联"这个实例要求从给出的数据中寻找所关心的数据,它是对原始数据所包含信息的挖掘。
需求:实例中给出 child-parent(孩子—父母)表,要求输出 grandchild-grandparent(孙子—爷奶)表。
package test; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; /** * 输入: * child parent * 张三 张三的爸爸 * 张三的爸爸 张三的爷爷 * * 输出: * grandChiled grandFather * 张三 张三的爷爷 */ public class MySingle { public static void main(String[] args) throws Exception { //配置环境变量 System.setProperty("hadoop.home.dir", "F:\\JAVA\\hadoop-2.2.0"); Configuration conf = new Configuration(); Job job = Job.getInstance(conf); job.setJarByClass(MySingle.class); job.setMapperClass(STMapper.class); job.setReducerClass(STReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 
0 : -1); } public static class STMapper extends Mapper<LongWritable, Text, Text, Text>{ @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String[] splited = value.toString().split(" "); if(splited.length >= 2){ //正向输出,value即 父亲前加符号"-" context.write(new Text(splited[0]), new Text("-"+splited[1])); //反向输出 context.write(new Text(splited[1]), new Text(splited[0])); } } } public static class STReducer extends Reducer<Text, Text, Text, Text>{ @Override protected void reduce(Text key, Iterable<Text> v2s,Context context) throws IOException, InterruptedException { List<String> grandChild=new ArrayList<String>(); List<String> grandParent=new ArrayList<String>(); for(Text text : v2s){ //以"-"开始则是key的父亲 if(text.toString().startsWith("-")){ //将可能成为爷爷的变量存储到grandParent集合中去 grandParent.add(text.toString().substring(1)); }else { grandChild.add(text.toString()); } } /** * 【关键的判断】 * 当前输入的key既有儿子又有父亲 */ if(grandChild.size()!=0 && grandParent.size()!=0){ for(int i=0;i<grandChild.size();i++){ for(int j=0;j<grandParent.size();j++){ //key:孙子 value:爷爷 context.write(new Text(grandChild.get(i)), new Text(grandParent.get(j))); } } } } } }
- 在reduce阶段,将两种Value分别存储到grandchild和grandparent集合中
- 对于reduce阶段的key,只有当它既有儿子又有父亲时,grandchild和grandparent两个集合才会同时不为空;此时对两个集合做笛卡尔积,即可得到全部的"孙子—爷奶"关系