Hadoop and Spark Processing Techniques (4): Recommendation Engine Techniques
Products Frequently Bought Together
Each line of the input file describes one transaction: a transaction id followed by the ids of the products bought together in it. To find products that are frequently bought together, we count, for every unordered pair of products, the number of transactions that contain both. In the spark-shell:

scala> var file=sc.textFile("/user/ghj/togeterBought")
file: org.apache.spark.rdd.RDD[String] = /user/ghj/togeterBought MapPartitionsRDD[28] at textFile at <console>:25

scala> file.collect
res0: Array[String] = Array(t1 p1 p2 p3, t2 p2 p3, t3 p2 p3 p4, t4 p5 p6, t5 p3 p4)

scala> var mapFile=file.map(line=>{
     |   import scala.collection.mutable.ListBuffer;
     |   var listBuff=ListBuffer[(String,String)]();
     |   var list=line.split(" ").toList;
     |   var ll=list.takeRight(list.size-1);   // drop the transaction id, keep the product ids
     |   for(p1<-ll){
     |     for(p2<-ll){
     |       if(ll.indexOf(p1) != ll.indexOf(p2)){
     |         if(p1<p2){
     |           listBuff=listBuff:+((p1,p2));
     |         }else{
     |           listBuff=listBuff:+((p2,p1));
     |         }
     |       }
     |     }
     |   }
     |   listBuff;
     | }).flatMap(x=>x).map(x=>(x,1)).reduceByKey(_+_).map(x=>(x,x._2/2));
mapFile: org.apache.spark.rdd.RDD[(((String, String), Int), Int)] = MapPartitionsRDD[30] at map at <console>:46

scala> mapFile.collect
res4: Array[(((String, String), Int), Int)] = Array((((p5,p6),2),1), (((p1,p3),2),1), (((p2,p4),2),1), (((p3,p4),4),2), (((p2,p3),6),3), (((p1,p2),2),1))
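Because the nested loops emit every unordered pair twice (once for each iteration order), the raw count has to be halved at the end; and since the final map(x=>(x,x._2/2)) keeps the pair together with its raw count as the key, the collected elements have the shape ((pair, rawCount), coOccurrences), e.g. (((p2,p3),6),3) means p2 and p3 were bought together in 3 transactions. A shorter sketch of the same count (assuming the same input path and an existing SparkContext sc) uses combinations(2) so that each pair is generated exactly once and the result is keyed directly as (pair, count):

val pairCounts = sc.textFile("/user/ghj/togeterBought")
  .flatMap { line =>
    // Drop the leading transaction id; distinct guards against a product listed twice in one transaction.
    val products = line.split(" ").toList.drop(1).distinct.sorted
    // Each unordered pair of products, exactly once, already in canonical (sorted) order.
    products.combinations(2).map { case List(a, b) => ((a, b), 1) }
  }
  .reduceByKey(_ + _)

pairCounts.collect().foreach(println)
// On the sample data above (output order may vary):
// ((p2,p3),3), ((p3,p4),2), ((p1,p2),1), ((p1,p3),1), ((p2,p4),1), ((p5,p6),1)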