mapreduce实现"浏览该商品的人大多数还浏览了"经典应用

 

输入:

日期    ...cookie id.        ...商品id..

xx            xx                        xx

输出:

商品id         商品id列表(按优先级排序,用逗号分隔)

xx                   xx

比如:

id1              id3,id0,id4,id2

id2             id0,id5

整个计算过程分为4步

1、提取原始日志日期,cookie id,商品id信息,按天计算,最后输出数据格式

商品id-0 商品id-1

xx           xx

这一步做了一次优化:保证商品id-0一定小于商品id-1,每对商品只存储一次以减少存储量;在最后汇总数据时再转置一次,即可得到另一个方向的关联。

reduce做局部排序及排重

 

2、基于上次的结果做汇总,按天计算

商品id-0 商品id-1  关联值(关联值即同时访问这两个商品的用户数)

xx             xx                xx

 

3、汇总最近三个月数据,同时考虑时间衰减,时间越久关联值的贡献越低,最后输出两两商品的关联值(包括转置后)

 

4、行列转换,生成最后要的推荐结果数据,按关联值排序生成

 

第一个MR

[java] view plain copy
 
  1. import java.io.IOException;  
  2. import java.util.ArrayList;  
  3. import org.apache.hadoop.conf.Configuration;  
  4. import org.apache.hadoop.fs.FileSystem;  
  5. import org.apache.hadoop.fs.Path;  
  6. import org.apache.hadoop.io.LongWritable;  
  7. import org.apache.hadoop.io.Text;  
  8. import org.apache.hadoop.io.WritableComparable;  
  9. import org.apache.hadoop.io.WritableComparator;  
  10. import org.apache.hadoop.mapreduce.Job;  
  11. import org.apache.hadoop.mapreduce.Mapper;  
  12. import org.apache.hadoop.mapreduce.Partitioner;  
  13. import org.apache.hadoop.mapreduce.Reducer;  
  14. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
  15. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
  16. import org.apache.hadoop.util.GenericOptionsParser;  
  17. import org.apache.log4j.Logger;  
  18.   
  19.   
  20. /* 
  21.  * 输入:原始数据,会有重复 
  22.  *日期 cookie 楼盘id 
  23.  *  
  24.  * 输出: 
  25.  * 日期 楼盘id1 楼盘id2  //楼盘id1一定小于楼盘id2 ,按日期 cookie进行分组 
  26.  *  
  27.  */  
  28.   
  29. public class HouseMergeAndSplit {  
  30.       
  31.     public static class Partitioner1 extends Partitioner<TextPair, Text> {  
  32.           @Override  
  33.           public int getPartition(TextPair key, Text value, int numParititon) {  
  34.                       return Math.abs((new Text(key.getFirst().toString()+key.getSecond().toString())).hashCode() * 127) % numParititon;  
  35.   
  36.           }  
  37.     }  
  38.           public static class Comp1 extends WritableComparator {  
  39.               public Comp1() {  
  40.                super(TextPair.class, true);  
  41.               }  
  42.               @SuppressWarnings("unchecked")  
  43.               public int compare(WritableComparable a, WritableComparable b) {  
  44.                TextPair t1 = (TextPair) a;  
  45.                TextPair t2 = (TextPair) b;  
  46.                int comp= t1.getFirst().compareTo(t2.getFirst());  
  47.                if (comp!=0)  
  48.                    return comp;  
  49.                return t1.getSecond().compareTo(t2.getSecond());  
  50.               }  
  51.             }  
  52.       public static class TokenizerMapper   
  53.            extends Mapper<LongWritable, Text, TextPair, Text>{  
  54.                   Text val=new Text("test");  
  55.         public void map(LongWritable key, Text value, Context context  
  56.                         ) throws IOException, InterruptedException {  
  57.                          String s[]=value.toString().split("\001");              
  58.              TextPair tp=new TextPair(s[0],s[1],s[4]+s[3]); //thedate cookie city+houseid  
  59.              context.write(tp, val);  
  60.         }  
  61.       }  
  62.         
  63.       public static class IntSumReducer   
  64.            extends Reducer<TextPair,Text,Text,Text> {  
  65.           private static String comparedColumn[] = new String[3];  
  66.           ArrayList<String> houselist= new ArrayList<String>();  
  67.           private static Text keyv = new Text();  
  68.             
  69.           private static Text valuev = new Text();  
  70.           static Logger logger = Logger.getLogger(HouseMergeAndSplit.class.getName());  
  71.             
  72.         public void reduce(TextPair key, Iterable<Text> values,   
  73.                            Context context  
  74.                            ) throws IOException, InterruptedException {  
  75.               
  76.             houselist.clear();  
  77.             String thedate=key.getFirst().toString();  
  78.             String cookie=key.getSecond().toString();    
  79.              
  80.             for (int i=0;i<3;i++)  
  81.                 comparedColumn[i]="";  
  82.               
  83.             //first+second为分组键,每次不同重新调用reduce函数  
  84.             for (Text val:values)  
  85.             {  
  86.           
  87.                 if (thedate.equals(comparedColumn[0]) && cookie.equals(comparedColumn[1])&&  !key.getThree().toString().equals(comparedColumn[2]))  
  88.                  {  
  89.                     // context.write(new Text(key.getFirst()+" "+key.getSecond().toString()), new Text(key.getThree().toString()+" first"+ " "+comparedColumn[0]+" "+comparedColumn[1]+" "+comparedColumn[2]));  
  90.                      houselist.add(key.getThree().toString());  
  91.                        
  92.                      comparedColumn[0]=key.getFirst().toString();  
  93.                        comparedColumn[1]=key.getSecond().toString();  
  94.                        comparedColumn[2]=key.getThree().toString();  
  95.                         
  96.                  }  
  97.                      
  98.                    if (!thedate.equals(comparedColumn[0])||!cookie.equals(comparedColumn[1]))  
  99.                       
  100.                        {  
  101.                        
  102.                      //  context.write(new Text(key.getFirst()+" "+key.getSecond().toString()), new Text(key.getThree().toString()+" second"+ " "+comparedColumn[0]+" "+comparedColumn[1]+" "+comparedColumn[2]));  
  103.                        houselist.add(key.getThree().toString());  
  104.                        comparedColumn[0]=key.getFirst().toString();  
  105.                        comparedColumn[1]=key.getSecond().toString();  
  106.                        comparedColumn[2]=key.getThree().toString();  
  107.                          
  108.                        }  
  109.               
  110.               
  111.                           
  112.             }  
  113.   
  114.   
  115.               
  116.             keyv.set(comparedColumn[0]); //日期  
  117.             //valuev.set(houselist.toString());  
  118.             //logger.info(houselist.toString());  
  119.             //context.write(keyv,valuev);  
  120.               
  121.               
  122.             for (int i=0;i<houselist.size()-1;i++)  
  123.             {  
  124.                 for (int j=i+1;j<houselist.size();j++)  
  125.                 {    valuev.set(houselist.get(i)+"  "+houselist.get(j)); //关联的楼盘  
  126.                     context.write(keyv,valuev);  
  127.                 }  
  128.             }   
  129.               
  130.         }  
  131.       }  
  132.   
  133.       public static void main(String[] args) throws Exception {  
  134.         Configuration conf = new Configuration();  
  135.         String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();  
  136.         if (otherArgs.length != 2) {  
  137.           System.err.println("Usage: wordcount <in> <out>");  
  138.           System.exit(2);  
  139.         }  
  140.           
  141.         FileSystem fstm = FileSystem.get(conf);     
  142.         Path outDir = new Path(otherArgs[1]);     
  143.         fstm.delete(outDir, true);  
  144.           
  145.    conf.set("mapred.textoutputformat.separator", "\t"); //reduce输出时key value中间的分隔符  
  146.         Job job = new Job(conf, "HouseMergeAndSplit");  
  147.         job.setNumReduceTasks(4);  
  148.         job.setJarByClass(HouseMergeAndSplit.class);  
  149.         job.setMapperClass(TokenizerMapper.class);  
  150.           
  151.         job.setMapOutputKeyClass(TextPair.class);  
  152.         job.setMapOutputValueClass(Text.class);  
  153.         // 设置partition  
  154.         job.setPartitionerClass(Partitioner1.class);  
  155.         // 在分区之后按照指定的条件分组  
  156.         job.setGroupingComparatorClass(Comp1.class);  
  157.         // 设置reduce  
  158.         // 设置reduce的输出  
  159.         job.setReducerClass(IntSumReducer.class);  
  160.         job.setOutputKeyClass(Text.class);  
  161.         job.setOutputValueClass(Text.class);  
  162.         //job.setNumReduceTasks(18);  
  163.         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));  
  164.         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));  
  165.         System.exit(job.waitForCompletion(true) ? 0 : 1);  
  166.       }  
  167. }  

TextPair

[java] view plain copy
 
  1. import java.io.DataInput;  
  2. import java.io.DataOutput;  
  3. import java.io.IOException;  
  4.   
  5. import org.apache.hadoop.io.Text;  
  6. import org.apache.hadoop.io.WritableComparable;  
  7.   
  8. public class TextPair implements WritableComparable<TextPair> {  
  9.     private Text first;  
  10.     private Text second;  
  11.     private Text three;  
  12.     public TextPair() {  
  13.       set(new Text(), new Text(),new Text());  
  14.     }  
  15.     public TextPair(String first, String second,String three) {  
  16.       set(new Text(first), new Text(second),new Text(three));  
  17.     }  
  18.     public TextPair(Text first, Text second,Text Three) {  
  19.       set(first, second,three);  
  20.     }  
  21.     public void set(Text first, Text second,Text three) {  
  22.       this.first = first;  
  23.       this.second = second;  
  24.       this.three=three;  
  25.     }  
  26.     public Text getFirst() {  
  27.       return first;  
  28.     }  
  29.     public Text getSecond() {  
  30.       return second;  
  31.     }  
  32.     public Text getThree() {  
  33.           return three;  
  34.         }  
  35.     public void write(DataOutput out) throws IOException {  
  36.       first.write(out);  
  37.       second.write(out);  
  38.       three.write(out);  
  39.     }  
  40.     public void readFields(DataInput in) throws IOException {  
  41.       first.readFields(in);  
  42.       second.readFields(in);  
  43.       three.readFields(in);  
  44.     }  
  45.     public int compareTo(TextPair tp) {  
  46.       int cmp = first.compareTo(tp.first);  
  47.       if (cmp != 0) {  
  48.        return cmp;  
  49.       }  
  50.       cmp= second.compareTo(tp.second);  
  51.       if (cmp != 0) {  
  52.            return cmp;  
  53.           }  
  54.       return three.compareTo(tp.three);  
  55.     }  
  56.     }  


TextPairSecond

[java] view plain copy
 
  1. import java.io.DataInput;  
  2. import java.io.DataOutput;  
  3. import java.io.IOException;  
  4.   
  5. import org.apache.hadoop.io.FloatWritable;  
  6. import org.apache.hadoop.io.Text;  
  7. import org.apache.hadoop.io.WritableComparable;  
  8.   
  9. public class TextPairSecond implements WritableComparable<TextPairSecond> {  
  10.     private Text first;  
  11.     private FloatWritable second;  
  12.     public TextPairSecond() {  
  13.       set(new Text(), new FloatWritable());  
  14.     }  
  15.     public TextPairSecond(String first, float second) {  
  16.       set(new Text(first), new FloatWritable(second));  
  17.     }  
  18.     public TextPairSecond(Text first, FloatWritable second) {  
  19.       set(first, second);  
  20.     }  
  21.     public void set(Text first, FloatWritable second) {  
  22.       this.first = first;  
  23.       this.second = second;  
  24.     }  
  25.     public Text getFirst() {  
  26.       return first;  
  27.     }  
  28.     public FloatWritable getSecond() {  
  29.       return second;  
  30.     }  
  31.     public void write(DataOutput out) throws IOException {  
  32.       first.write(out);  
  33.       second.write(out);  
  34.     }  
  35.     public void readFields(DataInput in) throws IOException {  
  36.       first.readFields(in);  
  37.       second.readFields(in);  
  38.     }  
  39.     public int compareTo(TextPairSecond tp) {  
  40.       int cmp = first.compareTo(tp.first);  
  41.       if (cmp != 0) {  
  42.        return cmp;  
  43.       }  
  44.       return second.compareTo(tp.second);  
  45.     }  
  46.   
  47.     }  

 

第二个MR

[java] view plain copy
 
  1. import java.io.IOException;  
  2. import java.text.SimpleDateFormat;  
  3. import java.util.ArrayList;  
  4. import java.util.Date;  
  5.   
  6. import org.apache.hadoop.conf.Configuration;  
  7. import org.apache.hadoop.fs.FileSystem;  
  8. import org.apache.hadoop.fs.Path;  
  9. import org.apache.hadoop.io.IntWritable;  
  10. import org.apache.hadoop.io.LongWritable;  
  11. import org.apache.hadoop.io.NullWritable;  
  12. import org.apache.hadoop.io.Text;  
  13. import org.apache.hadoop.io.WritableComparable;  
  14. import org.apache.hadoop.io.WritableComparator;  
  15. import org.apache.hadoop.mapred.OutputCollector;  
  16. import org.apache.hadoop.mapreduce.Job;  
  17. import org.apache.hadoop.mapreduce.Mapper;  
  18. import org.apache.hadoop.mapreduce.Partitioner;  
  19. import org.apache.hadoop.mapreduce.Reducer;  
  20.   
  21. import org.apache.hadoop.mapreduce.Mapper.Context;  
  22. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
  23. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
  24. import org.apache.hadoop.util.GenericOptionsParser;  
  25. import org.apache.log4j.Logger;  
  26.   
  27.   
  28. /* 
  29.  *  统计楼盘之间共同出现的次数 
  30.  * 输入: 
  31.  * 日期 楼盘1 楼盘2 
  32.  *  
  33.  * 输出: 
  34.  * 日期 楼盘1 楼盘2 共同出现的次数 
  35.  *  
  36.  */  
  37.   
  38. public class HouseCount {  
  39.       
  40.   
  41.       public static class TokenizerMapper   
  42.            extends Mapper<LongWritable, Text, Text, IntWritable>{  
  43.           
  44.       
  45.     IntWritable iw=new IntWritable(1);  
  46.         public void map(LongWritable key, Text value, Context context  
  47.                         ) throws IOException, InterruptedException {  
  48.               
  49.           
  50.          context.write(value, iw);  
  51.         }  
  52.       }  
  53.         
  54.       public static class IntSumReducer   
  55.            extends Reducer<Text,IntWritable,Text,IntWritable> {  
  56.       
  57.          IntWritable result=new IntWritable();  
  58.         public void reduce(Text key, Iterable<IntWritable> values,   
  59.                            Context context  
  60.                            ) throws IOException, InterruptedException {  
  61.               
  62.              int sum=0;  
  63.              for (IntWritable iw:values)  
  64.              {  
  65.                  sum+=iw.get();  
  66.              }  
  67.              result.set(sum);  
  68.          context.write(key, result) ;  
  69.               
  70.         }  
  71.       }  
  72.   
  73.       public static void main(String[] args) throws Exception {  
  74.         Configuration conf = new Configuration();  
  75.         String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();  
  76.         if (otherArgs.length != 2) {  
  77.           System.err.println("Usage: wordcount <in> <out>");  
  78.           System.exit(2);  
  79.         }  
  80.           
  81.         FileSystem fstm = FileSystem.get(conf);     
  82.         Path outDir = new Path(otherArgs[1]);     
  83.         fstm.delete(outDir, true);  
  84.           
  85.    conf.set("mapred.textoutputformat.separator", "\t"); //reduce输出时key value中间的分隔符  
  86.         Job job = new Job(conf, "HouseCount");  
  87.         job.setNumReduceTasks(2);  
  88.         job.setJarByClass(HouseCount.class);  
  89.         job.setMapperClass(TokenizerMapper.class);  
  90.           
  91.         job.setMapOutputKeyClass(Text.class);  
  92.         job.setMapOutputValueClass(IntWritable.class);  
  93.       
  94.         // 设置reduce  
  95.         // 设置reduce的输出  
  96.         job.setReducerClass(IntSumReducer.class);  
  97.         job.setOutputKeyClass(Text.class);  
  98.         job.setOutputValueClass(IntWritable.class);  
  99.         //job.setNumReduceTasks(18);  
  100.         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));  
  101.         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));  
  102.         System.exit(job.waitForCompletion(true) ? 0 : 1);  
  103.       }  
  104. }  


第三个MR

[java] view plain copy
 
  1. import java.io.IOException;  
  2. import java.text.ParseException;  
  3. import java.text.SimpleDateFormat;  
  4. import java.util.ArrayList;  
  5. import java.util.Calendar;  
  6. import java.util.Date;  
  7.   
  8. import org.apache.hadoop.conf.Configuration;  
  9. import org.apache.hadoop.fs.FileSystem;  
  10. import org.apache.hadoop.fs.Path;  
  11. import org.apache.hadoop.io.FloatWritable;  
  12. import org.apache.hadoop.io.IntWritable;  
  13. import org.apache.hadoop.io.LongWritable;  
  14. import org.apache.hadoop.io.NullWritable;  
  15. import org.apache.hadoop.io.Text;  
  16. import org.apache.hadoop.io.WritableComparable;  
  17. import org.apache.hadoop.io.WritableComparator;  
  18. import org.apache.hadoop.mapred.OutputCollector;  
  19. import org.apache.hadoop.mapreduce.Job;  
  20. import org.apache.hadoop.mapreduce.Mapper;  
  21. import org.apache.hadoop.mapreduce.Partitioner;  
  22. import org.apache.hadoop.mapreduce.Reducer;  
  23.   
  24. import org.apache.hadoop.mapreduce.Mapper.Context;  
  25. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
  26. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
  27. import org.apache.hadoop.util.GenericOptionsParser;  
  28. import org.apache.log4j.Logger;  
  29.   
  30.   
  31. /* 
  32.  * 汇总近三个月统计楼盘之间共同出现的次数,考虑衰减系数, 并最后a b 转成 b a输出一次 
  33.  * 输入: 
  34.  * 日期  楼盘1 楼盘2 共同出现的次数 
  35.  *  
  36.  * 输出 
  37.  * 楼盘1 楼盘2 共同出现的次数(考虑了衰减系数,每天的衰减系数不一样) 
  38.  *  
  39.  */  
  40.   
  41. public class HouseCountHz {  
  42.       
  43.   
  44.       public static class HouseCountHzMapper   
  45.            extends Mapper<LongWritable, Text, Text, FloatWritable>{  
  46.           
  47.     Text keyv=new Text();  
  48.       
  49.     FloatWritable valuev=new FloatWritable();  
  50.         public void map(LongWritable key, Text value, Context context  
  51.                         ) throws IOException, InterruptedException {  
  52.               
  53.         String[] s=value.toString().split("\t");  
  54.         keyv.set(s[1]+" "+s[2]);//楼盘1,楼盘2  
  55.         Calendar date1=Calendar.getInstance();  
  56.           Calendar d2=Calendar.getInstance();  
  57.       
  58.           Date b = null;  
  59.           SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd");  
  60.           try {  
  61.             b=sdf.parse(s[0]);  
  62.           } catch (ParseException e) {  
  63.            e.printStackTrace();  
  64.           }  
  65.           d2.setTime(b);  
  66.           long n=date1.getTimeInMillis();  
  67.           long birth=d2.getTimeInMillis();  
  68.           long sss=n-birth;  
  69.           int day=(int)((sss)/(3600*24*1000)); //该条记录的日期与当前日期的日期差  
  70.           float factor=1/(1+(float)(day-1)/10); //衰减系数  
  71.         valuev.set(Float.parseFloat(s[3])*factor);  
  72.           
  73.          context.write(keyv, valuev);  
  74.         }  
  75.       }  
  76.         
  77.       public static class HouseCountHzReducer   
  78.            extends Reducer<Text,FloatWritable,Text,FloatWritable> {  
  79.       
  80.           FloatWritable result=new FloatWritable();  
  81.           Text keyreverse=new Text();  
  82.         public void reduce(Text key, Iterable<FloatWritable> values,   
  83.                            Context context  
  84.                            ) throws IOException, InterruptedException {  
  85.               
  86.              float sum=0;  
  87.              for (FloatWritable iw:values)  
  88.              {  
  89.                  sum+=iw.get();  
  90.              }  
  91.              result.set(sum);  
  92.              String[] keys=key.toString().split("\t");  
  93.              keyreverse.set(keys[1]+"   "+keys[0]);  
  94.          context.write(key, result) ;  
  95.          context.write(keyreverse, result)  ;  
  96.               
  97.         }  
  98.       }  
  99.   
  100.       public static void main(String[] args) throws Exception {  
  101.         Configuration conf = new Configuration();  
  102.         String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();  
  103.         if (otherArgs.length != 2) {  
  104.           System.err.println("Usage: wordcount <in> <out>");  
  105.           System.exit(2);  
  106.         }  
  107.           
  108.         FileSystem fstm = FileSystem.get(conf);     
  109.         Path outDir = new Path(otherArgs[1]);     
  110.         fstm.delete(outDir, true);  
  111.           
  112.    conf.set("mapred.textoutputformat.separator", "\t"); //reduce输出时key value中间的分隔符  
  113.         Job job = new Job(conf, "HouseCountHz");  
  114.         job.setNumReduceTasks(2);  
  115.         job.setJarByClass(HouseCountHz.class);  
  116.         job.setMapperClass(HouseCountHzMapper.class);  
  117.           
  118.         job.setMapOutputKeyClass(Text.class);  
  119.         job.setMapOutputValueClass(FloatWritable.class);  
  120.       
  121.         // 设置reduce  
  122.         // 设置reduce的输出  
  123.         job.setReducerClass(HouseCountHzReducer.class);  
  124.         job.setOutputKeyClass(Text.class);  
  125.         job.setOutputValueClass(FloatWritable.class);  
  126.         //job.setNumReduceTasks(18);  
  127.         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));  
  128.         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));  
  129.         System.exit(job.waitForCompletion(true) ? 0 : 1);  
  130.       }  
  131. }  


第四个MR

[java] view plain copy
 
  1. import java.io.IOException;  
  2. import java.util.Iterator;  
  3.   
  4.   
  5. import org.apache.hadoop.conf.Configuration;  
  6. import org.apache.hadoop.fs.FileSystem;  
  7. import org.apache.hadoop.fs.Path;  
  8. import org.apache.hadoop.io.FloatWritable;  
  9.   
  10. import org.apache.hadoop.io.LongWritable;  
  11.   
  12. import org.apache.hadoop.io.Text;  
  13. import org.apache.hadoop.io.WritableComparable;  
  14. import org.apache.hadoop.io.WritableComparator;  
  15.   
  16. import org.apache.hadoop.mapreduce.Job;  
  17. import org.apache.hadoop.mapreduce.Mapper;  
  18. import org.apache.hadoop.mapreduce.Partitioner;  
  19. import org.apache.hadoop.mapreduce.Reducer;  
  20.   
  21. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
  22. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
  23. import org.apache.hadoop.util.GenericOptionsParser;  
  24.   
  25.   
  26.   
  27. /* 
  28.  * 输入数据: 
  29.  * 楼盘1 楼盘2 共同出现的次数 
  30.  *  
  31.  * 输出数据 
  32.  *  楼盘1 楼盘2,楼盘3,楼盘4 (按次数排序) 
  33.  */  
  34.   
  35. public class HouseRowToCol {  
  36.       
  37.     public static class Partitioner1 extends Partitioner<TextPairSecond, Text> {  
  38.           @Override  
  39.           //分区  
  40.           public int getPartition(TextPairSecond key, Text value, int numParititon) {  
  41.                       return Math.abs((new Text(key.getFirst().toString()+key.getSecond().toString())).hashCode() * 127) % numParititon;  
  42.   
  43.           }  
  44.     }  
  45.     //分组  
  46.           public static class Comp1 extends WritableComparator {  
  47.               public Comp1() {  
  48.                super(TextPairSecond.class, true);  
  49.               }  
  50.               @SuppressWarnings("unchecked")  
  51.               public int compare(WritableComparable a, WritableComparable b) {  
  52.                   TextPairSecond t1 = (TextPairSecond) a;  
  53.                   TextPairSecond t2 = (TextPairSecond) b;  
  54.                 return t1.getFirst().compareTo(t2.getFirst());  
  55.   
  56.               }  
  57.             }  
  58.             
  59.           //排序  
  60.           public static class KeyComp extends WritableComparator {  
  61.               public KeyComp() {  
  62.                super(TextPairSecond.class, true);  
  63.               }  
  64.               @SuppressWarnings("unchecked")  
  65.               public int compare(WritableComparable a, WritableComparable b) {  
  66.                   TextPairSecond t1 = (TextPairSecond) a;  
  67.                   TextPairSecond t2 = (TextPairSecond) b;  
  68.                int comp= t1.getFirst().compareTo(t2.getFirst());  
  69.                if (comp!=0)  
  70.                    return comp;  
  71.                return -t1.getSecond().compareTo(t2.getSecond());  
  72.               }  
  73.             }   
  74.       public static class HouseRowToColMapper   
  75.            extends Mapper<LongWritable, Text, TextPairSecond, Text>{  
  76.   
  77.           Text houseid1=new Text();  
  78.           Text houseid2=new Text();  
  79.           FloatWritable weight=new FloatWritable();  
  80.         public void map(LongWritable key, Text value, Context context  
  81.                         ) throws IOException, InterruptedException {  
  82.               
  83.          String s[]=value.toString().split("\t");  
  84.        
  85.            weight.set(Float.parseFloat(s[2]));  
  86.            houseid1.set(s[0]);  
  87.            houseid2.set(s[1]);  
  88.          TextPairSecond tp=new TextPairSecond(houseid1,weight);   
  89.          context.write(tp, houseid2);  
  90.         }  
  91.       }  
  92.         
  93.       public static class HouseRowToColReducer   
  94.            extends Reducer<TextPairSecond,Text,Text,Text> {  
  95.             
  96.        Text valuev=new Text();  
  97.         public void reduce(TextPairSecond key, Iterable<Text> values,   
  98.                            Context context  
  99.                            ) throws IOException, InterruptedException {  
  100.             Text keyv=key.getFirst();  
  101.             Iterator<Text> it=values.iterator();  
  102.             StringBuilder sb=new StringBuilder(it.next().toString());  
  103.             while(it.hasNext())  
  104.             {  
  105.                 sb.append(","+it.next().toString());  
  106.             }  
  107.             valuev.set(sb.toString());  
  108.             context.write(keyv, valuev);  
  109.               
  110.               
  111.               
  112.         }  
  113.       }  
  114.   
  115.       public static void main(String[] args) throws Exception {  
  116.         Configuration conf = new Configuration();  
  117.         String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();  
  118.         if (otherArgs.length != 2) {  
  119.           System.err.println("Usage: wordcount <in> <out>");  
  120.           System.exit(2);  
  121.         }  
  122.           
  123.         FileSystem fstm = FileSystem.get(conf);     
  124.         Path outDir = new Path(otherArgs[1]);     
  125.         fstm.delete(outDir, true);  
  126.           
  127.    conf.set("mapred.textoutputformat.separator", "\t"); //reduce输出时key value中间的分隔符  
  128.         Job job = new Job(conf, "HouseRowToCol");  
  129.         job.setNumReduceTasks(4);  
  130.         job.setJarByClass(HouseRowToCol.class);  
  131.         job.setMapperClass(HouseRowToColMapper.class);  
  132.           
  133.         job.setMapOutputKeyClass(TextPairSecond.class);  
  134.         job.setMapOutputValueClass(Text.class);  
  135.         // 设置partition  
  136.         job.setPartitionerClass(Partitioner1.class);  
  137.         // 在分区之后按照指定的条件分组  
  138.         job.setGroupingComparatorClass(Comp1.class);  
  139.         job.setSortComparatorClass(KeyComp.class);  
  140.         // 设置reduce  
  141.         // 设置reduce的输出  
  142.         job.setReducerClass(HouseRowToColReducer.class);  
  143.         job.setOutputKeyClass(Text.class);  
  144.         job.setOutputValueClass(Text.class);  
  145.         //job.setNumReduceTasks(18);  
  146.         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));  
  147.         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));  
  148.         System.exit(job.waitForCompletion(true) ? 0 : 1);  
  149.       }  
  150. }  
转:http://blog.csdn.net/u011750989/article/details/12004065
posted @ 2017-05-25 17:58  thinker1017  阅读(256)  评论(0编辑  收藏  举报