Hadoop Notes

Progress on MapReduce has been slow. Per the boss's request, I'm going to get familiar with Hama and see whether the graph-matching requirement can be implemented with BSP. Since so far I had only tested the environment with the examples that ship with Hadoop and hadn't written any code myself... I gloriously got chewed out. So before looking at Hama, I first need to write PageRank on MapReduce.
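For reference, the code below implements the non-normalized PageRank update with damping factor alpha = 0.85:

    PR(p) = (1 - alpha) + alpha * SUM_{q -> p} PR(q) / L(q)

where the sum runs over every page q that links to p, and L(q) is the number of outgoing links of q. One MapReduce job applies this update once to every page, so the iteration is driven by chaining jobs in main().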

// mymapreduce.java -- PageRank on MapReduce, 2013-03-21
package hadoop.lxd;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class mymapreduce {

    public static final double alpha = 0.85; // damping factor

    static class mymapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Input line format: Fpage \t rank \t Tpage,Tpage,...
            String line = value.toString();
            String[] prt = line.split("\\t"); // prt = { Fpage, rank, "Tpage_2,Tpage_3,..." }
            String page = prt[0];
            String rank = prt[1];
            String[] topagelist = prt[2].split(","); // outgoing links of this page
            int numtopage = topagelist.length;
            // Pass the adjacency list through so the reducer can rebuild the
            // input format for the next iteration; "|" marks a structure record.
            context.write(new Text(page), new Text("|" + prt[2]));
            // Send this page's rank share to every page it links to.
            for (String topage : topagelist) {
                context.write(new Text(topage),
                        new Text(page + "\t" + rank + "\t" + numtopage));
            }
        }
    }

    static class myreducer extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double collectrank = 0.0;
            String topagelist = "";
            for (Text value : values) {
                String pagewithrank = value.toString();
                if (pagewithrank.startsWith("|")) {
                    // Structure record: carry the link list into the output.
                    topagelist = pagewithrank.substring(1);
                } else {
                    // Contribution record: Fpage \t rank \t numtopage
                    String[] split = pagewithrank.split("\\t");
                    double rank = Double.valueOf(split[1]);
                    int numoutpages = Integer.valueOf(split[2]);
                    collectrank += rank / numoutpages;
                }
            }
            double newrank = 1 - alpha + collectrank * alpha;
            // Output has the same format as the input, ready for the next round.
            context.write(key, new Text(newrank + "\t" + topagelist));
        }
    }

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        if (args.length < 2) {
            System.err.println("Usage: mymapreduce <input path prefix> <output path prefix>");
            System.exit(1);
        }
        int iteration = 0;
        int iterationlimit = 3;
        int status = 0;
        while (iteration < iterationlimit) {
            Job job = new Job();
            job.setJarByClass(mymapreduce.class);
            job.setMapperClass(mymapper.class);
            job.setReducerClass(myreducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            // Round i reads the part files written by round i-1 and writes a
            // new numbered directory, which becomes the next round's input.
            FileInputFormat.addInputPath(job,
                    new Path(args[0] + String.valueOf(iteration) + "/p*"));
            FileOutputFormat.setOutputPath(job,
                    new Path(args[1] + String.valueOf(iteration + 1)));
            iteration++;
            status = job.waitForCompletion(true) ? 0 : 1;
        }
        System.exit(status);
    }
}
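To make the shuffle concrete, here is a worked trace (not program output) of what the mapper emits for the sample row "2 <TAB> 1.0 <TAB> 2,3,4,5,6" as (key, value) pairs:

(2, "|2,3,4,5,6")      <- structure record
(2, "2\t1.0\t5")       <- one contribution record per outgoing link
(3, "2\t1.0\t5")
(4, "2\t1.0\t5")
(5, "2\t1.0\t5")
(6, "2\t1.0\t5")

The reducer for key 3 then collects 1.0/5 from page 2, 1.0/3 from page 1, and 1.0/2 from page 6 (see the sample data below), so collectrank ≈ 1.033 and newrank = 0.15 + 0.85 × 1.033 ≈ 1.028.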

Sample data:

raw_data.txt (tab-separated; the header row below is just for reading, the actual input file must contain only the data rows, or Double.valueOf("rank") will throw):
Fpage    rank    Tpagelist

2    1.0    2,3,4,5,6
1    1.0    3,5,6
3    1.0    5,1
4    1.0    2,1,6
5    1.0    2
6    1.0    1,3
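Before the first run, the raw data has to sit in HDFS where iteration 0 looks for it: under <input prefix>0, in a file whose name matches the p* glob. A possible layout matching the run command in step 3 below (the part-00000 file name is just a convention chosen to match the glob):

$ bin/hadoop fs -mkdir /tmp/mymapreduce/mr0
$ bin/hadoop fs -put raw_data.txt /tmp/mymapreduce/mr0/part-00000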

Summary: watch out for the data format and for how split() behaves. As for how a multi-iteration job feeds one round's output into the next round's input: just keep rotating the input path. —_—|||

How to submit the job:

1. Compile the Java file

$ javac -classpath ~/hadoop-1.0.4/hadoop-core-1.0.4.jar -d ./classes mymapreduce.java
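javac refuses to run if the directory given to -d does not exist, so create it first:

$ mkdir classes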

2. Package it into a jar

$ jar -cvf mymapreduce.jar -C ./classes/ .

3. Run the jar on the cluster

$ bin/hadoop jar ~/Downloads/mymapreduce/mymapreduce.jar hadoop.lxd.mymapreduce /tmp/mymapreduce/mr /tmp/mymapreduce/mr
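With both prefixes set to /tmp/mymapreduce/mr, the three iterations chain through HDFS like this:

iteration 0: reads /tmp/mymapreduce/mr0/p*  ->  writes /tmp/mymapreduce/mr1
iteration 1: reads /tmp/mymapreduce/mr1/p*  ->  writes /tmp/mymapreduce/mr2
iteration 2: reads /tmp/mymapreduce/mr2/p*  ->  writes /tmp/mymapreduce/mr3

The final ranks land in /tmp/mymapreduce/mr3 as part-r-* files, which the p* glob would also match if a further round read them.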
