spark04

spark04

join leftOuterJoin rightOuterJoin cogroup

scala> var arr = Array(("zhangsan",200),("lisi",300),("wangwu",350))

arr: Array[(String, Int)] = Array((zhangsan,200), (lisi,300), (wangwu,350))

 

scala> var arr1 = Array(("zhangsan",10),("lisi",15),("zhaosi",20))

arr1: Array[(String, Int)] = Array((zhangsan,10), (lisi,15), (zhaosi,20))

 

scala> sc.makeRDD(arr,3)

res0: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[0] at makeRDD at <console>:27

 

scala> sc.makeRDD(arr1,3)

res1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[1] at makeRDD at <console>:27

 

scala>

 

scala>

 

scala> res0 join res1

res2: org.apache.spark.rdd.RDD[(String, (Int, Int))] = MapPartitionsRDD[4] at join at <console>:33

 

scala> res2.mapValues(t=>t._1*t._2)

res3: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[5] at mapValues at <console>:35

 

scala> res3.collect

res4: Array[(String, Int)] = Array((zhangsan,2000), (lisi,4500))                

 

scala> res0 leftOuterJoin res1

res5: org.apache.spark.rdd.RDD[(String, (Int, Option[Int]))] = MapPartitionsRDD[8] at leftOuterJoin at <console>:33

 

scala> res5.mapValues(t=>t._1*t._2.getOrElse(0))

res6: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[9] at mapValues at <console>:35

 

scala> res6.collect

res7: Array[(String, Int)] = Array((zhangsan,2000), (wangwu,0), (lisi,4500))    

 

scala>

 

scala> res0 rightOuterJoin res1

res8: org.apache.spark.rdd.RDD[(String, (Option[Int], Int))] = MapPartitionsRDD[12] at rightOuterJoin at <console>:33

 

scala> res8.mapValues(t=>t._1.getOrElse(0)*t._2)

res9: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[13] at mapValues at <console>:35

 

scala> res9.collect

res10: Array[(String, Int)] = Array((zhangsan,2000), (lisi,4500), (zhaosi,0))   

 

scala>

 

scala> res0 cogroup res1

res11: org.apache.spark.rdd.RDD[(String, (Iterable[Int], Iterable[Int]))] = MapPartitionsRDD[15] at cogroup at <console>:33

 

scala> res11.mapValues(t=>t._1.sum*t._2.sum)

res12: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[16] at mapValues at <console>:35

 

scala> res12.collect

res13: Array[(String, Int)] = Array((zhangsan,2000), (wangwu,0), (lisi,4500), (zhaosi,0))

 

cartesian笛卡儿积

scala> var arr = Array(1,2,3,4,5,6)

arr: Array[Int] = Array(1, 2, 3, 4, 5, 6)

 

scala> var arr1 = Array("a","b","c")

arr1: Array[String] = Array(a, b, c)

 

scala> sc.makeRDD(arr,3)

res14: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[17] at makeRDD at <console>:27

 

scala> sc.makeRDD(arr1,3)

res15: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[18] at makeRDD at <console>:27

 

scala> res14 cartesian res15

res16: org.apache.spark.rdd.RDD[(Int, String)] = CartesianRDD[19] at cartesian at <console>:33

 

scala> res16.collect

res17: Array[(Int, String)] = Array((1,a), (2,a), (1,b), (2,b), (1,c), (2,c), (3,a), (4,a), (3,b), (4,b), (3,c), (4,c), (5,a), (6,a), (5,b), (6,b), (5,c), (6,c))

 

 

repartition == coalesce

修改分区

scala> var arr = Array(1,2,3,4,5,6,7,8,9)

arr: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9)

 

scala> sc.makeRDD(arr,3)

res18: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[20] at makeRDD at <console>:27

 

scala> res18.partitions.size

res19: Int = 3

 

scala> res18.repartition(4)

res20: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[24] at repartition at <console>:29

 

scala> res20.partitions.size

res21: Int = 4

 

scala> res18.repartition(2)

res22: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[28] at repartition at <console>:29

 

scala> res22.partitions.size

res23: Int = 2

repartition任意的改变分区

 

scala> res18.coalesce(2)

res24: org.apache.spark.rdd.RDD[Int] = CoalescedRDD[29] at coalesce at <console>:29

 

scala> res24.partitions.size

res25: Int = 2

 

scala> res18.coalesce(6)

res26: org.apache.spark.rdd.RDD[Int] = CoalescedRDD[30] at coalesce at <console>:29

 

scala> res26.partitions.size

res27: Int = 3

coalesce默认(shuffle=false)只能缩小分区数量,不能增加分区数量

 

 

 

repartition底层调用的是coalesce,但是coalesce中加入的是shuffle=true

所以repartition含有shuffle流程

 

 

 

repartition存在shuffle

repartition(n) = coalesce(n, shuffle = true)

 

repartition存在shuffle   coalesce默认不存在shuffle

最后一个rdd的分区数量才是这个阶段的task任务的个数

 

scala> var arr = Array(1,2,3,4,5,6)

arr: Array[Int] = Array(1, 2, 3, 4, 5, 6)

 

scala> sc.makeRDD(arr,3)

res35: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[35] at makeRDD at <console>:27

 

scala> res35.repartition(4)

res36: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[39] at repartition at <console>:29

 

scala> res36.collect

res37: Array[Int] = Array(6, 4, 2, 1, 3, 5)

 

 

 

 

DAG有向无环图中,按照stage进行切分(shuffle流程),总共的task数量

stage中的所有的task任务的总和,每个stage中最后一个rdd的分区数量的和

 

aggregate  aggregateByKey

scala> res41.aggregate(10)(_+_,_+_)

res42: Int = 85

 

scala> res41.aggregate(0)(_+_,_+_)

res43: Int = 45

每个分区单独加一次,整体聚合加一次

第二个聚合函数是全体聚合

scala> var arr = Array(("a",1),("a",2),("a",5),("a",6),("b",3),("b",4))

arr: Array[(String, Int)] = Array((a,1), (a,2), (a,5), (a,6), (b,3), (b,4))

 

scala> sc.makeRDD(arr,2)

res45: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[43] at makeRDD at <console>:27

 

scala> res45.aggregateByKey(0)(_+_,_+_)

res46: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[44] at aggregateByKey at <console>:29

 

scala> res46.collect

res47: Array[(String, Int)] = Array((b,7), (a,14))

 

scala> res45.aggregateByKey(10)(_+_,_+_)

res48: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[45] at aggregateByKey at <console>:29

 

scala> res48.collect

res49: Array[(String, Int)] = Array((b,17), (a,34))     

 

aggregateByKey()全局聚合函数,初始化值每个分区加一次,但是全局聚合不加

aggregate是行动类算子 aggregateByKey是转换类算子

 

算子分为两类

行动类和转换类算子

行动类算子立即执行  转换类算子不会执行  转换类算子含有shuffle和没有shuffle

如果是shuffle类的算子会切分阶段,没有shuffle的算子都处于一个阶段

 

 

RDD的五大特性(重点)

 

 

 

五大特性

 

 

 

  1. 每个rdd都是默认被分区的。含有一系列的分区列表
  2. 专门存在一个函数用来处理每一个rdd的分区
  3. 每个rdd之间存在依赖关系
  4. key-value键值对的形式的rdd上面存在可选择的分区器
  5. 优先位置进行计算每一个分区/分片

 

 

 

分区是一个特质,其中存在index下标  hashcode方法 equals方法

分区就是一个rdd上面的分岔路,每个路口流动的数据要交给一个task线程进行处理

每个rdd都会存在一个或者多个分区,读取hdfs文件的时候,每个分区对应的是一个block块,其实分区就是记录了数据的位置在哪里,每个分区应该从哪个block中读取数据。

每个分区交给一个executor的一个线程处理

 

 

compute函数

 

 

 

 

每个rdd上面都会存在一个compute方法专门来计算每一个分区中的数据

所有的算子进行数据处理的时候都会交给compute方法进行统一计算

 

 

 


compute方法可以统一处理每一个算子中的逻辑,用compute使用算子中的函数,将rdd每个分区中的数据进行迭代处理

 

每个rdd之间存在依赖关系

依赖关系存在两种,一对一的(pipeline) 多对多shuffle

什么是依赖关系?算子

 

每个rdd上面都存在一个获取依赖关系的函数getDependencies

 

 

 

根据两个rdd之间调用的算子不一样产生的依赖关系主要分为两种

宽依赖和窄依赖

存在shuffle的就是宽依赖  没有shuffle的,一对一的就是窄依赖

窄依赖存在两种关系  OneToOne  RangeDependency

 

 

 

 

OneToOneDependency一对一的依赖  map mapValues flatMap...

RangeDependency 实际出现在union(UnionRDD)中;sortByKey使用的是RangePartitioner,产生的依赖是ShuffleDependency

 

比如map算子

 

 

 

 

 

 

 

可以看出map算子就是窄依赖

 

宽依赖shuffleDependency

 

 

 

 

 

 

 

 

 

最后可以得到reduceByKey产生的是ShuffleDependency(宽依赖)

 

可选择的分区器partitioner必须在k-v形式的rdd上面才存在,默认的rdd上面不存在分区器

 

 

 

分区器自带的主要分为两种:HashPartitioner和RangePartitioner

reduceByKey  groupByKey  distinct 使用的都是HashPartitioner

sortByKey使用的是RangePartitioner,产生的依赖是ShuffleDependency

 

 

rdd上面自带的属性,分区器partitioner,默认是None

 

scala> sc.makeRDD(Array(1,2,3,4,5,6,7,8,9),3)

res0: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at makeRDD at <console>:25

 

scala> res0.partitioner

res1: Option[org.apache.spark.Partitioner] = None

 

scala>

 

scala> sc.makeRDD(Array(("a",1),("a",1),("b",1),("b",1)))

res2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[1] at makeRDD at <console>:25

 

scala> res2.partitioner

res3: Option[org.apache.spark.Partitioner] = None

 

scala> res2.reduceByKey(_+_)

res5: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[2] at reduceByKey at <console>:27

 

scala> res5.partitioner

res6: Option[org.apache.spark.Partitioner] = Some(org.apache.spark.HashPartitioner@18)

 

scala> res5.sortByKey()

res7: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[5] at sortByKey at <console>:29

 

scala> res7.partitioner

res8: Option[org.apache.spark.Partitioner] = Some(org.apache.spark.RangePartitioner@1f12d)

 

 

reduceByKey使用的是HashPartitioner,所以两个rdd之间的依赖是宽依赖ShuffleDependency

sortByKey使用的是RangePartitioner,产生的依赖同样是ShuffleDependency(宽依赖)

 

scala> var arr = Array(1,2,3,4,5,6,7,8,9)

arr: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9)

 

scala> arr.zipWithIndex

res9: Array[(Int, Int)] = Array((1,0), (2,1), (3,2), (4,3), (5,4), (6,5), (7,6), (8,7), (9,8))

 

scala> sc.makeRDD(res9,3)

res10: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[6] at makeRDD at <console>:29

 

scala> res10.reduceByKey(_+_)

res11: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[7] at reduceByKey at <console>:31

 

scala> res11.mapPartitionsWithIndex((a,b)=>b.map((a,_)))

res12: org.apache.spark.rdd.RDD[(Int, (Int, Int))] = MapPartitionsRDD[8] at mapPartitionsWithIndex at <console>:33

 

scala> res12.collect

res13: Array[(Int, (Int, Int))] = Array((0,(6,5)), (0,(3,2)), (0,(9,8)), (1,(4,3)), (1,(1,0)), (1,(7,6)), (2,(8,7)), (2,(5,4)), (2,(2,1)))

 

reduceByKey使用的是HashPartitioner

 

scala> var arr = Array(1,2,10000,20000,30000,40000,5000,60000,70000,8,9000000)

arr: Array[Int] = Array(1, 2, 10000, 20000, 30000, 40000, 5000, 60000, 70000, 8, 9000000)

 

scala> sc.makeRDD(arr,3)

res17: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[13] at makeRDD at <console>:27

 

scala> res17.zipWithIndex

res18: org.apache.spark.rdd.RDD[(Int, Long)] = ZippedWithIndexRDD[14] at zipWithIndex at <console>:29

 

scala> res18.sortByKey()

res19: org.apache.spark.rdd.RDD[(Int, Long)] = ShuffledRDD[17] at sortByKey at <console>:31

 

scala> res19.mapPartitionsWithIndex((a,b)=>b.map((a,_)))

res20: org.apache.spark.rdd.RDD[(Int, (Int, Long))] = MapPartitionsRDD[18] at mapPartitionsWithIndex at <console>:33

 

scala> res20.collect

res21: Array[(Int, (Int, Long))] = Array((0,(1,0)), (0,(2,1)), (0,(8,9)), (0,(5000,6)), (1,(10000,2)), (1,(20000,3)), (1,(30000,4)), (1,(40000,5)), (2,(60000,7)), (2,(70000,8)), (2,(9000000,10)))

 

RangePartitioner尽量保证了数据在数值的范围和数值的个数两个因素上的平均

scala> res18.sortByKey(true,20)

res23: org.apache.spark.rdd.RDD[(Int, Long)] = ShuffledRDD[21] at sortByKey at <console>:31

 

scala> res23.partitions.size

res24: Int = 12

sortByKey使用的是RangePartitioner,重新分区的时候,如果指定的分区数比元素的个数还要大,那么指定的分区数就不能完全起作用,最少保证一个分区中含有一个数据

 

优先位置

 

rdd === aa.txt= ===hdfs  ===== 2 blk ==== 2 partition ===每个分区得读取文件得位置

找到对应得blk块得位置,在本地进行运算

移动计算比移动数据本身更划算

 

以上为五个特性(重点)

分区列表

一个compute方法用于计算

依赖关系

k-vrdd上存在一个可选择得分区器

优先位置进行计算

 

自定义分区器:

/**
 * Word-count driver that writes its output through a custom partitioner.
 *
 * Pipeline: read "aa.txt" -> split each line on spaces -> pair every word
 * with 1 -> sum counts per word -> repartition with [[MyPartitioner]]
 * -> save the result to the "wcresult" directory.
 */
object WordCountPartition {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("wc")
    val sc = new SparkContext(conf)
    try {
      val rdd1 = sc.textFile("aa.txt")
      val rdd2 = rdd1.flatMap(_.split(" "))
      val rdd3 = rdd2.map((_, 1))
      val rdd4 = rdd3.reduceByKey(_ + _)
      // partitionBy shuffles the data so that MyPartitioner decides
      // which partition (and hence which output file) each key lands in.
      val rdd5 = rdd4.partitionBy(new MyPartitioner)
      rdd5.saveAsTextFile("wcresult")
    } finally {
      sc.stop() // release the SparkContext even if the job fails
    }
  }
}

/**
 * Custom partitioner: routes the key "hello" to partition 0 and every
 * other key to partition 1.
 */
class MyPartitioner extends Partitioner {
  // Fixed number of output partitions.
  override def numPartitions: Int = 2

  // Pattern matching replaces the unchecked asInstanceOf[String] cast:
  // it is null-safe (a null key falls into partition 1 instead of
  // throwing a NullPointerException) and tolerates non-String keys.
  override def getPartition(key: Any): Int = key match {
    case "hello" => 0
    case _       => 1
  }
}

 

老师得访问量,专业得topN,将每个老师对应得专业进行分区,每个分区中都是一个专业得全部老师,然后将分区中得数据进行排序,就可以得到topN

自定partitioner 然后mapPartitions每次遍历一个分区中得数据,得到topN

 

作业:

分区器得形式进行教师得专业排名?

spark版本得电影推荐算法,每个人最喜欢得类型?

 

 

wordcount中得rdd个数

通过println(rdd.toDebugString)

 

 

 

本地模式是5rdd

 

 

 

 

wordcount中存在6rdd

 

posted @ 2019-09-16 14:58  lilixia  阅读(200)  评论(0编辑  收藏  举报