A MapReduce Implementation of the PageRank Algorithm

Suppose you need to compute the rank of just four web pages. Each input line holds a page name, a tab, the page's current PR value, a space, and a comma-separated list of its outlinks. The data looks like this:

baidu	10.00 google,sina,nefu
google	10.00 baidu
sina	10.00 google
nefu	10.00 sina,google

1. baidu has 3 outlinks

2. google has 1 outlink

3. sina has 1 outlink

4. nefu has 2 outlinks

From the data you can see that every other page links to google, so google should end up with the highest PR; and since google links to baidu, baidu's PR should also be quite high. A quick hand computation of the first iteration, shown below, confirms this.
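To check this intuition, apply the update the reducer below implements by hand: PR'(p) = (1 - d) + d * Σ PR(q)/out(q), summed over all pages q that link to p, with damping d = 0.85 and every page starting at PR = 10.00:

google: 10/3 (from baidu) + 10/1 (from sina) + 10/2 (from nefu) ≈ 18.33, so PR' ≈ 0.15 + 0.85 * 18.33 ≈ 15.73
baidu:  10/1 (from google) = 10.00, so PR' = 0.15 + 0.85 * 10.00 = 8.65
sina:   10/3 (from baidu) + 10/2 (from nefu) ≈ 8.33, so PR' ≈ 0.15 + 0.85 * 8.33 ≈ 7.23
nefu:   10/3 (from baidu) ≈ 3.33, so PR' ≈ 0.15 + 0.85 * 3.33 ≈ 2.98

After a single pass google is already well ahead, with baidu in second place.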

The code is as follows:

package PageRank;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class PageRank {
	/**
	 * @author XD
	 */
	// Counters: Count tracks the number of pages seen in a pass; TotalPR
	// accumulates the total PR of all pages, scaled by 1000 (counters are longs).
	static enum PageCount{
		Count, TotalPR
	}
	public static class Map extends Mapper<LongWritable, Text, Text, Text> {
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			context.getCounter(PageCount.Count).increment(1);
			// Each input line: page \t PR outlink1,outlink2,...
			String[] kv = value.toString().split("\t");
			String _key = kv[0];
			String _value = kv[1];
			String[] _PRnLink = _value.split(" ");
			String pr = _PRnLink[0];
			String link = _PRnLink[1];
			// Re-emit the link structure so the reducer can carry it forward.
			context.write(new Text(_key), new Text(link));
			// Distribute this page's PR evenly across its outlinks.
			String[] site = link.split(",");
			float score = Float.valueOf(pr) / site.length;
			for (int i = 0; i < site.length; i++) {
				context.write(new Text(site[i]), new Text(String.valueOf(score)));
			}
		}
	}
	public static class Reduce extends Reducer<Text, Text, Text, Text> {
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
			StringBuilder sb = new StringBuilder();
			float factor = 0.85f;	// damping factor
			float pr = 0f;
			for (Text val : values) {
				String value = val.toString();
				// Values containing a '.' are PR contributions (e.g. "3.33");
				// anything else is the page's outlink list. This assumes page
				// names themselves never contain a dot.
				if (value.indexOf(".") != -1) {
					pr += Float.valueOf(value);
				} else {
					String[] site = value.split(",");
					for (int k = 0; k < site.length; k++) {
						sb.append(site[k]);
						sb.append(",");
					}
				}
			}
			// Simplified PageRank update: PR = (1 - d) + d * (sum of contributions).
			pr = (1 - factor) + (factor * pr);
			// Accumulate total PR scaled by 1000, since counters only hold longs.
			context.getCounter(PageCount.TotalPR).increment((int) (pr * 1000));
			String output = pr + " " + sb.toString();
			context.write(key, new Text(output));
		}
	}
	public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
		String input, output;
		int threshold = 100;	// target average PR, scaled by 1000
		int iteration = 0;
		int iterationLimit = 10;

		boolean status = false;

		while (iteration < iterationLimit) {
			// Iterate repeatedly, alternating the input and output paths:
			// each pass reads the previous pass's output. The seed data must
			// be placed under /output_pr before the first run.
			if ((iteration % 2) == 0) {
				input = "hdfs://localhost:9000/output_pr/p*";
				output = "hdfs://localhost:9000/output_pr2";
			} else {
				input = "hdfs://localhost:9000/output_pr2/p*";
				output = "hdfs://localhost:9000/output_pr";
			}
			Configuration conf = new Configuration();
			final FileSystem filesystem = FileSystem.get(new URI(input), conf);
			final Path outPath = new Path(output);
			if (filesystem.exists(outPath)) {
				filesystem.delete(outPath, true);
			}
			Job job = Job.getInstance(conf, PageRank.class.getSimpleName());

			// 1.1 input location
			FileInputFormat.setInputPaths(job, input);

			// 1.2 the Mapper class and its output key/value types (these could
			// be omitted when they match the job's final output types, but
			// they are set explicitly here)
			job.setMapperClass(Map.class);
			job.setMapOutputKeyClass(Text.class);
			job.setMapOutputValueClass(Text.class);
			job.setJarByClass(PageRank.class);

			// 1.3 partitioning
			job.setPartitionerClass(HashPartitioner.class);

			job.setReducerClass(Reduce.class);
			// the reduce output types
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(Text.class);

			// the output location
			FileOutputFormat.setOutputPath(job, new Path(output));
			status = job.waitForCompletion(true);
			iteration++;
			long count = job.getCounters().findCounter(PageCount.Count).getValue();
			long totalPR = job.getCounters().findCounter(PageCount.TotalPR).getValue();
			System.out.println("PageCount:" + count);
			System.out.println("TotalPR:" + totalPR);
			double per_pr = totalPR / (count * 1.0d);
			System.out.println("Per_PR:" + per_pr);
			// Stop early once the average PR (scaled by 1000) hits the threshold.
			if ((int) per_pr == threshold) {
				System.out.println("Iteration:" + iteration);
				break;
			}
		}
		System.exit(status ? 0 : 1);
	}
}
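The first pass reads from hdfs://localhost:9000/output_pr/p*, so the seed ranking file has to be uploaded to that directory before the driver runs. A minimal sketch of a seeding step follows; the SeedPageRank class name and the part-r-00000 file name are my own choices, and any file whose name matches the p* glob would work:

import java.io.OutputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper: writes the four-page seed data into HDFS so the
// first iteration's input glob output_pr/p* finds it.
public class SeedPageRank {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(new URI("hdfs://localhost:9000"), conf);
		// The file name starts with "p" on purpose, to match the p* glob.
		try (OutputStream out = fs.create(new Path("/output_pr/part-r-00000"))) {
			String seed =
				"baidu\t10.00 google,sina,nefu\n" +
				"google\t10.00 baidu\n" +
				"sina\t10.00 google\n" +
				"nefu\t10.00 sina,google\n";
			out.write(seed.getBytes(StandardCharsets.UTF_8));
		}
	}
}

The same upload can of course be done from the shell with hadoop fs -put against a local copy of the data file.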


The final result, one line per page in the same page / PR / outlinks format as the input, is left under /output_pr or /output_pr2, depending on the parity of the last iteration.