Partitioners: only key-value (pair) RDDs can use a partitioner to define how records are assigned to partitions. Non-pair RDDs are still split into partitions, but their data is simply spread evenly across them and the placement cannot be controlled.
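A quick way to see this distinction is the `partitioner` field on an RDD: it is `None` for an ordinary RDD and only becomes `Some(...)` once a pair RDD has been explicitly partitioned. A minimal sketch, assuming a live `SparkContext` named `sc`:

```scala
// A plain RDD has partitions but no partitioner
val nums = sc.parallelize(1 to 100, 4)
println(nums.partitioner)   // None

// A pair RDD can carry an explicit partitioner
val pairs = nums.map(n => (n, 1)).partitionBy(new org.apache.spark.HashPartitioner(4))
println(pairs.partitioner)  // Some(org.apache.spark.HashPartitioner@...)
```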
1. HashPartitioner
- Definition: HashPartitioner assigns each record to a partition according to the hash code of its key, so equal keys always land in the same partition.
- Drawback: it can cause data skew (the amount of data assigned to each partition may vary widely).
```scala
package wc

import org.apache.spark.rdd.RDD
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WC").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val line: RDD[String] = sc.textFile("hdfs://node1:9000/wc.txt")
    val words = line.flatMap(_.split(" "))
    val map = words.map((_, 1))

    // Hash-partition the pair RDD into 4 partitions by key
    val partition: RDD[(String, Int)] = map.partitionBy(new HashPartitioner(4))

    // Print each partition's contents to inspect the distribution
    val value = partition.mapPartitionsWithIndex((index, tuple) => {
      println(s"Partition: $index , data: ${tuple.mkString(",")}")
      tuple
    })
    val result = value.reduceByKey(_ + _)
    result.foreach(println(_))
    sc.stop()
  }
}
```
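Under the hood, HashPartitioner computes the target partition as the key's hash code modulo the number of partitions, adjusted so the result is never negative. A sketch of the equivalent logic (the `hashPartition` helper is for illustration only, not Spark's actual source, which also routes null keys to partition 0):

```scala
// Equivalent of HashPartitioner.getPartition for non-null keys (sketch)
def hashPartition(key: Any, numPartitions: Int): Int = {
  val rawMod = key.hashCode % numPartitions
  // Java/Scala % can return a negative value for negative hash codes
  if (rawMod < 0) rawMod + numPartitions else rawMod
}
```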
2. RangePartitioner
- Definition: RangePartitioner splits the data into partitions by key ranges, keeping each partition as balanced as possible (the range boundaries are determined by a sampling algorithm).
- Benefit: it alleviates HashPartitioner's data-skew problem; however, you still cannot control which data ends up in which partition.
```scala
package wc

import org.apache.spark.rdd.RDD
import org.apache.spark.{RangePartitioner, SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WC").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val line: RDD[String] = sc.textFile("hdfs://node1:9000/wc.txt")
    val words = line.flatMap(_.split(" "))
    val map = words.map((_, 1))

    // Range-partition into 3 partitions; the partitioner samples `map`
    // to pick balanced key-range boundaries
    val partition: RDD[(String, Int)] = map.partitionBy(new RangePartitioner(3, map))

    // Print each partition's contents to inspect the distribution
    val value = partition.mapPartitionsWithIndex((index, tuple) => {
      println(s"Partition: $index , data: ${tuple.mkString(",")}")
      tuple
    })
    val result = value.reduceByKey(_ + _)
    result.foreach(println(_))
    sc.stop()
  }
}
```
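Note that you rarely construct a RangePartitioner by hand: `sortByKey` builds one internally, so that partition i holds keys strictly less than those in partition i+1. A minimal sketch reusing the `map` pair RDD from the example above:

```scala
// sortByKey samples the keys and range-partitions them internally,
// so the output partitions are both ordered and roughly balanced
val sorted = map.sortByKey(ascending = true, numPartitions = 3)
println(sorted.partitioner)  // Some(org.apache.spark.RangePartitioner@...)
```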
3. Custom partitioner: extend `org.apache.spark.Partitioner` and override `numPartitions` and `getPartition`.
```scala
package wc

import org.apache.spark.Partitioner

// Custom partitioner: words starting with 'h' go to partition 0,
// words starting with 's' to partition 1, everything else to partition 2
class WCPartitioner() extends Partitioner {
  override def numPartitions: Int = 3

  override def getPartition(key: Any): Int = {
    val word = key.toString
    val first: Char = word.charAt(0)
    if (first == 'h') {
      0
    } else if (first == 's') {
      1
    } else {
      2
    }
  }
}
```
```scala
package wc

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WC").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val line: RDD[String] = sc.textFile("hdfs://node1:9000/wc.txt")
    val words = line.flatMap(_.split(" "))
    val map = words.map((_, 1))

    // Partition with the custom WCPartitioner defined above
    val partition = map.partitionBy(new WCPartitioner())

    // Print each partition's contents to inspect the distribution
    val value = partition.mapPartitionsWithIndex((index, tuple) => {
      println(s"Partition: $index , data: ${tuple.mkString(",")}")
      tuple
    })
    val result = value.reduceByKey(_ + _)
    result.foreach(println(_))
    sc.stop()
  }
}
```
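For a more robust custom partitioner, it is good practice to guard against problem keys and to override `equals`/`hashCode`, so Spark can recognize when two RDDs are already co-partitioned and skip an unnecessary shuffle. A hedged sketch; `SafeWCPartitioner` is a hypothetical variant, not part of the original code:

```scala
package wc

import org.apache.spark.Partitioner

// Hypothetical variant of WCPartitioner with null/empty-key handling
// and equals/hashCode, so Spark can detect co-partitioned RDDs
class SafeWCPartitioner extends Partitioner {
  override def numPartitions: Int = 3

  override def getPartition(key: Any): Int = key match {
    case null => 2                       // send null keys to the catch-all partition
    case k =>
      // headOption avoids charAt(0) crashing on an empty string
      k.toString.headOption match {
        case Some('h') => 0
        case Some('s') => 1
        case _         => 2
      }
  }

  // Two partitioners that partition identically should compare equal
  override def equals(other: Any): Boolean = other.isInstanceOf[SafeWCPartitioner]
  override def hashCode(): Int = numPartitions
}
```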