// Spark PageRank: compute PageRank over a URL link graph with Spark GraphX
import java.io.{File, PrintWriter}
import java.util
import java.util.regex.Pattern

import org.apache.spark.graphx.GraphLoader
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable
import scala.io.Source

/**
 * Computes PageRank over a URL link graph with Spark GraphX.
 *
 * Pipeline:
 *   1. Read the raw crawl file (`httpFile.txt`); each line holds a source URL
 *      followed by the URLs it links to (see [[evaluate]] for the line format).
 *   2. Assign every distinct URL a numeric id, then write the vertex file
 *      (`urlPoint.txt`, "id<TAB>url") and the edge file
 *      (`urlFollower.txt`, "srcId<TAB>dstId").
 *   3. Load the edge list as a graph, run PageRank, join the ranks back to
 *      the URL names, and write "url,rank" lines to `PageRankResult.txt`.
 */
object PageRankTest {

  def main(args: Array[String]): Unit = {
    val masterUrl = "local[2]"
    val appName = "PageRank_test"
    val sparkConf = new SparkConf().setMaster(masterUrl).setAppName(appName)
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("ERROR")

    // Raw crawl data.
    val urlSourceFilePath = "Peanut/httpFile.txt"
    // Preprocessed data set 1: edges (srcId \t dstId).
    val urlFollowerFilePath = "Peanut/urlFollower.txt"
    // Preprocessed data set 2: vertices (id \t url).
    val urlPointFilePath = "Peanut/urlPoint.txt"
    // Final result file (url,rank).
    val PageRankResultPath = "Peanut/PageRankResult.txt"

    try {
      // Map every distinct URL to a unique integer id (GraphX vertex ids are numeric).
      val urlToIntMap = mutable.Map[String, Int]()
      var count: Int = 1
      val sourceFile = Source.fromFile(urlSourceFilePath)
      try {
        for (line <- sourceFile.getLines) {
          // foreach, not map: we only mutate the map; the result is discarded.
          evaluate(line).foreach { url =>
            if (!urlToIntMap.contains(url)) {
              urlToIntMap.put(url, count)
              count += 1
            }
          }
        }
      } finally sourceFile.close()

      // Write the vertex data set: "id \t url".
      // (Original code called this writer `writerFollower` even though it writes
      // the point file — the names were swapped; the target paths were correct.)
      val pointWriter = new PrintWriter(new File(urlPointFilePath))
      try {
        urlToIntMap.foreach { case (url, id) =>
          pointWriter.write(id + "\t" + url + "\n")
        }
      } finally pointWriter.close()

      // Write the edge data set: one "srcId \t dstId" line per outgoing link.
      val sourceFile2 = Source.fromFile(urlSourceFilePath)
      val edgeWriter = new PrintWriter(new File(urlFollowerFilePath))
      try {
        for (line <- sourceFile2.getLines) {
          val list = evaluate(line).toList
          // evaluate always yields at least the source URL, so head is safe.
          val firstUrlNum = urlToIntMap(list.head)
          list.tail.foreach { url =>
            edgeWriter.write(firstUrlNum + "\t" + urlToIntMap(url) + "\n")
          }
        }
      } finally {
        edgeWriter.close()
        sourceFile2.close()
      }

      // Build the graph frame from the edge-list file.
      val graph = GraphLoader.edgeListFile(sc, urlFollowerFilePath)
      // Core API: pageRank. 0.0001 is the convergence tolerance between two
      // successive iterations — smaller means higher precision.
      val ranks = graph.pageRank(0.0001).vertices

      // Re-attach URL names to the rank vertices: parse the vertex file back
      // into (id, url) pairs...
      val users = sc.textFile(urlPointFilePath).map { line =>
        val fields = line.split("\t")
        (fields(0).toLong, fields(1))
      }
      // ...then join on vertex id, keeping only (url, rank).
      val ranksByUsername = users.join(ranks).map {
        case (id, (username, rank)) => (username, rank)
      }

      // Collect the result: Array[(String, Double)].
      val pageRankResult = ranksByUsername.collect()
      println(pageRankResult.mkString("\n"))

      val resultWriter = new PrintWriter(new File(PageRankResultPath))
      try {
        pageRankResult.foreach { case (url, rank) =>
          resultWriter.write(url + "," + rank + "\n")
        }
      } finally resultWriter.close()
    } finally {
      // Original never released the SparkContext.
      sc.stop()
    }
  }

  /**
   * Preprocesses one line of the raw crawl file.
   *
   * The title before the first tab is the source URL; every "[...," regex
   * match in the rest of the line contributes one outgoing link. — NOTE(review):
   * the exact bracket format is inferred from the regex; confirm against the
   * raw data file.
   *
   * @param links one raw input line
   * @return array whose first element is the source URL, followed by its links
   */
  def evaluate(links: String): Array[String] = {
    val pattern = "(\\[){1,2}.*?, "
    val r = Pattern.compile(pattern)
    val m = r.matcher(links)
    val linksList = new util.ArrayList[String]
    val title = links.split("\t")(0)
    linksList.add(title)
    while (m.find) {
      val ret = m.group
      // Strip everything up to the innermost '[' and drop the trailing ", ".
      val lastIndex = ret.lastIndexOf("[")
      linksList.add(ret.substring(lastIndex + 1, ret.length - 2))
    }
    linksList.toArray(new Array[String](0))
  }
}