Hadoop Notes

Progress on MapReduce has been slow. Per the boss's request, I'm going to get familiar with Hama and see whether the graph-matching requirement can be implemented with BSP. Since so far I had only tested the environment with the examples that ship with Hadoop and hadn't written any code myself... I gloriously got chewed out. So before looking at Hama, I first need to write PageRank on MapReduce.
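For reference, the code below implements the non-normalized PageRank update with damping factor alpha = 0.85:

    PR(p) = (1 - alpha) + alpha * SUM_{q -> p} PR(q) / L(q)

where the sum runs over every page q that links to p, and L(q) is the number of outgoing links of q. One MapReduce job applies this update once to every page, so the iteration is driven by chaining jobs in main().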

// mymapreduce.java -- PageRank on MapReduce, 2013-03-21
package hadoop.lxd;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class mymapreduce {

    public static final double alpha = 0.85; // damping factor

    static class mymapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Input line format: Fpage \t rank \t Tpage,Tpage,...
            String line = value.toString();
            String[] prt = line.split("\\t"); // prt = { Fpage, rank, "Tpage_2,Tpage_3,..." }
            String page = prt[0];
            String rank = prt[1];
            String[] topagelist = prt[2].split(","); // outgoing links of this page
            int numtopage = topagelist.length;
            // Pass the adjacency list through so the reducer can rebuild the
            // input format for the next iteration; "|" marks a structure record.
            context.write(new Text(page), new Text("|" + prt[2]));
            // Send this page's rank share to every page it links to.
            for (String topage : topagelist) {
                context.write(new Text(topage),
                        new Text(page + "\t" + rank + "\t" + numtopage));
            }
        }
    }

    static class myreducer extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double collectrank = 0.0;
            String topagelist = "";
            for (Text value : values) {
                String pagewithrank = value.toString();
                if (pagewithrank.startsWith("|")) {
                    // Structure record: carry the link list into the output.
                    topagelist = pagewithrank.substring(1);
                } else {
                    // Contribution record: Fpage \t rank \t numtopage
                    String[] split = pagewithrank.split("\\t");
                    double rank = Double.valueOf(split[1]);
                    int numoutpages = Integer.valueOf(split[2]);
                    collectrank += rank / numoutpages;
                }
            }
            double newrank = 1 - alpha + collectrank * alpha;
            // Output has the same format as the input, ready for the next round.
            context.write(key, new Text(newrank + "\t" + topagelist));
        }
    }

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        if (args.length < 2) {
            System.err.println("Usage: mymapreduce <input path prefix> <output path prefix>");
            System.exit(1);
        }
        int iteration = 0;
        int iterationlimit = 3;
        int status = 0;
        while (iteration < iterationlimit) {
            Job job = new Job();
            job.setJarByClass(mymapreduce.class);
            job.setMapperClass(mymapper.class);
            job.setReducerClass(myreducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            // Round i reads the part files written by round i-1 and writes a
            // new numbered directory, which becomes the next round's input.
            FileInputFormat.addInputPath(job,
                    new Path(args[0] + String.valueOf(iteration) + "/p*"));
            FileOutputFormat.setOutputPath(job,
                    new Path(args[1] + String.valueOf(iteration + 1)));
            iteration++;
            status = job.waitForCompletion(true) ? 0 : 1;
        }
        System.exit(status);
    }
}
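To make the shuffle concrete, here is a worked trace (not program output) of what the mapper emits for the sample row "2 <TAB> 1.0 <TAB> 2,3,4,5,6" as (key, value) pairs:

(2, "|2,3,4,5,6")      <- structure record
(2, "2\t1.0\t5")       <- one contribution record per outgoing link
(3, "2\t1.0\t5")
(4, "2\t1.0\t5")
(5, "2\t1.0\t5")
(6, "2\t1.0\t5")

The reducer for key 3 then collects 1.0/5 from page 2, 1.0/3 from page 1, and 1.0/2 from page 6 (see the sample data below), so collectrank ≈ 1.033 and newrank = 0.15 + 0.85 × 1.033 ≈ 1.028.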

Sample data:

raw_data.txt (tab-separated; the header row below is just for reading, the actual input file must contain only the data rows, or Double.valueOf("rank") will throw):
Fpage    rank    Tpagelist

2    1.0    2,3,4,5,6
1    1.0    3,5,6
3    1.0    5,1
4    1.0    2,1,6
5    1.0    2
6    1.0    1,3
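Before the first run, the raw data has to sit in HDFS where iteration 0 looks for it: under <input prefix>0, in a file whose name matches the p* glob. A possible layout matching the run command in step 3 below (the part-00000 file name is just a convention chosen to match the glob):

$ bin/hadoop fs -mkdir /tmp/mymapreduce/mr0
$ bin/hadoop fs -put raw_data.txt /tmp/mymapreduce/mr0/part-00000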

Summary: watch out for the data format and for how split() behaves. As for how a multi-iteration job feeds one round's output into the next round's input: just keep rotating the input path. —_—|||

How to submit the job:

1. Compile the Java file

$ javac -classpath ~/hadoop-1.0.4/hadoop-core-1.0.4.jar -d ./classes mymapreduce.java
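javac refuses to run if the directory given to -d does not exist, so create it first:

$ mkdir classes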

2. Package it into a jar

$ jar -cvf mymapreduce.jar -C ./classes/ .

3. Run the jar on the cluster

$ bin/hadoop jar ~/Downloads/mymapreduce/mymapreduce.jar hadoop.lxd.mymapreduce /tmp/mymapreduce/mr /tmp/mymapreduce/mr
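With both prefixes set to /tmp/mymapreduce/mr, the three iterations chain through HDFS like this:

iteration 0: reads /tmp/mymapreduce/mr0/p*  ->  writes /tmp/mymapreduce/mr1
iteration 1: reads /tmp/mymapreduce/mr1/p*  ->  writes /tmp/mymapreduce/mr2
iteration 2: reads /tmp/mymapreduce/mr2/p*  ->  writes /tmp/mymapreduce/mr3

The final ranks land in /tmp/mymapreduce/mr3 as part-r-* files, which the p* glob would also match if a further round read them.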
