willian
18702515157@163.com

问题背景:pairRDD

调用spark存入文件的api时,最后结果文件的个数(就是分区的个数)取决于PairRDD中的key的hash值,

解决后可使相同key的数据进入同一个partition中

解决方法:

  1. 自己重新定义一个partitioner

//一般内部通过map来区分最好
/**
 * Custom Spark Partitioner that routes records by host name.
 *
 * Each distinct host gets its own partition, so all records for one host
 * end up in the same output file when the RDD is saved.
 *
 * @param hosts the distinct host names; partition index = position in this array
 */
class HostPartitioner(hosts: Array[String]) extends Partitioner {

  // host -> partition index, built once from the array order.
  // Immutable map replaces the original mutable.HashMap + var counter:
  // same lookup behavior, no mutable state escaping the constructor.
  val hostMap: Map[String, Int] = hosts.zipWithIndex.toMap

  // One partition per known host.
  override def numPartitions: Int = hosts.length

  // Spark passes each record's key here; unknown hosts fall back to
  // partition 0 (they share a file with the first host).
  override def getPartition(key: Any): Int =
    hostMap.getOrElse(key.toString, 0)
}
 

整个代码如下:

package flowanalysis

import java.net.URL

import org.apache.spark.{Partitioner, SparkConf, SparkContext}

import scala.collection.mutable

/**
* Created by willian on 2017/3/18.
* 解决存入hadoop时产生的hash碰撞,导致文件内容没有按key进行分区的问题
*/
object FlowAnalysisPartitioner {
  /**
   * Reads a tab-separated access log, counts hits per URL, re-keys by host,
   * then uses [[HostPartitioner]] so each host's records land in one output
   * partition; within each partition only the top-3 URLs by count are kept.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("flow_analysis").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Field 1 of each tab-separated line is the URL; emit (url, 1) pairs.
      // (The original `f(i = 1)` was an IDE parameter-hint artifact.)
      val urlCounts = sc.textFile("/Users/willian/Desktop/project/spark/wordcount/src/main/resources/itcast.log")
        .map { line =>
          val fields = line.split("\t")
          (fields(1), 1)
        }
        .reduceByKey(_ + _)

      // Re-key by host so the custom partitioner can group URLs per host.
      val byHost = urlCounts.map { case (url, cnt) =>
        val host = new URL(url).getHost
        (host, (url, cnt))
      }

      // Distinct hosts define the partitions (one partition per host).
      // `keys.distinct()` replaces the equivalent `map(_._1).distinct()`.
      val hosts = byHost.keys.distinct().collect()
      val partitioner = new HostPartitioner(hosts)

      // Within each host's partition keep only the top-3 URLs by hit count.
      byHost.partitionBy(partitioner).mapPartitions { it =>
        it.toList.sortBy(_._2._2).reverse.take(3).iterator
      }.saveAsTextFile("/Users/willian/Desktop/project/spark/wordcount/src/main/output")
    } finally {
      // Always release the local SparkContext, even if a stage fails.
      sc.stop()
    }
  }
}

/**
 * Custom Partitioner: assigns every record whose key is a known host name
 * to that host's dedicated partition, so one output file per host results.
 *
 * @param hosts distinct host names; a host's partition id is its array index
 */
class HostPartitioner(hosts:Array[String]) extends Partitioner{

  // Lookup table from host name to its partition number.
  val hostMap = new mutable.HashMap[String,Int]()
  var count = 0
  hosts.foreach { host =>
    hostMap.put(host, count)
    count += 1
  }

  // Exactly one partition for each known host.
  override def numPartitions: Int = hosts.length

  // Spark calls this with each record's key; hosts not seen at construction
  // time fall back to partition 0.
  override def getPartition(key: Any): Int =
    hostMap.getOrElse(key.toString, 0)
}

 

posted on 2017-03-18 23:14  willian_zhang  阅读(1163)  评论(0编辑  收藏  举报