Transformation类算子(转换类算子)

1、对RDD的分区重新进行划分:rdd1.coalesce(num,boolean),boolean表示是否进行shuffle(增加分区数时必须为true)

1 val rdd1 = sc.parallelize(Array[String]("love1", "love2", "love3", "love4", "love5", "love6", "love7", "love8", "love9", "love10", "love11", "love12"), 3)
2 val result = rdd1.coalesce(4,true)
3 println(s"result partition length = ${result.getNumPartitions}")
4 result.foreach(println)

2、rdd1.cogroup(rdd2)

 1 val rdd1 = sc.parallelize(Array[(String,Int)](
 2   ("zhangsan",1),
 3   ("lisi",2),
 4   ("zhangsan",3),
 5   ("wangwu",4),
 6   ("wangwu",5)
 7 ))
 8 val rdd2 = sc.parallelize(Array[(String,Int)](
 9   ("zhangsan",10),
10   ("lisi",20),
11   ("zhangsan",30),
12   ("wangwu",40),
13   ("wangwu",50)
14 ))
15 val result: RDD[(String, (Iterable[Int], Iterable[Int]))] = rdd1.cogroup(rdd2)
16 result.foreach(tp=>{
17   val key = tp._1
18   val value: Iterable[Int] = tp._2._1
19   value.foreach(println)
20   tp._2._2.foreach(println)
21 })
22 result.foreach(println)
23 /**
24  * (zhangsan,(CompactBuffer(1, 3),CompactBuffer(10, 30)))
25  * (wangwu,(CompactBuffer(4, 5),CompactBuffer(40, 50)))
26  * (lisi,(CompactBuffer(2),CompactBuffer(20)))
27  */

3、RDD中数据去重:rdd1.distinct()

1 val rdd1 = sc.parallelize(Array[String]("a","b","c","d","d","a","b","a","c","d"))
2 val result: RDD[String] = rdd1.distinct()
3 result.foreach(println)

4、RDD中数据进行过滤:lines.filter

1 val lines: RDD[String] = sc.textFile("./data/words")
2 val result: RDD[String] = lines.filter(line => {!"hello spark".equals(line)})
3 result.foreach(println)

5、RDD中数据按Key分组:rdd.groupByKey()

 1 val rdd = sc.parallelize(Array[(String, Int)](
 2   ("zhangsan", 10),
 3   ("zhangsan", 20),
 4   ("lisi", 30),
 5   ("wangwu", 40),
 6   ("wangwu", 50)
 7 ))
 8 val result: RDD[(String, Iterable[Int])] = rdd.groupByKey()
 9 result.foreach(println)
10 /**
11  * (zhangsan,CompactBuffer(10, 20))
12  * (wangwu,CompactBuffer(40, 50))
13  * (lisi,CompactBuffer(30))
14  */

6、rdd1.intersection(rdd2)

1 val rdd1 = sc.parallelize(Array[String]("a","b","c","d"))
2 val rdd2 = sc.parallelize(Array[String]("a","b","e","f"))
3 val result = rdd1.intersection(rdd2)
4 result.foreach(println)     
5 /**
6  * a
7  * b
8  */

7、rdd1.join(rdd2)

 1 val nameRDD: RDD[(String, Int)] = sc.parallelize(Array[(String, Int)](
 2   ("zhangsan", 18),
 3   ("lisi", 19),
 4   ("wangwu", 20),
 5   ("maliu", 21)
 6 ),3)
 7 val scoreRDD: RDD[(String, Int)] = sc.parallelize(Array[(String, Int)](
 8   ("zhangsan", 100),
 9   ("lisi", 200),
10   ("wangwu", 300),
11   ("tianqi", 400)
12 ),4)
13 val result: RDD[(String, (Int, Int))] = nameRDD.join(scoreRDD)
14 println(s"nameRDD partition length = ${nameRDD.getNumPartitions}")
15 println(s"scoreRDD partition length = ${scoreRDD.getNumPartitions}")
16 println(s"result partition length = ${result.getNumPartitions}")
17 result.foreach(println)
18 /**
19  * nameRDD partition length = 3
20  * scoreRDD partition length = 4
21  * result partition length = 4
22  * (zhangsan,(18,100))
23  * (wangwu,(20,300))
24  * (lisi,(19,200))
25  */

8、rdd1.fullOuterJoin(rdd2)

 1 val nameRDD: RDD[(String, Int)] = sc.parallelize(Array[(String, Int)](
 2   ("zhangsan", 18),
 3   ("lisi", 19),
 4   ("wangwu", 20),
 5   ("maliu", 21)
 6 ))
 7 val scoreRDD: RDD[(String, Int)] = sc.parallelize(Array[(String, Int)](
 8   ("zhangsan", 100),
 9   ("lisi", 200),
10   ("wangwu", 300),
11   ("tianqi", 400)
12 ))
13 val result: RDD[(String, (Option[Int], Option[Int]))] = nameRDD.fullOuterJoin(scoreRDD)
14 result.foreach(println)
15 /**
16  * (zhangsan,(Some(18),Some(100)))
17  * (wangwu,(Some(20),Some(300)))
18  * (maliu,(Some(21),None))
19  * (tianqi,(None,Some(400)))
20  * (lisi,(Some(19),Some(200)))
21  */

9、rdd1.rightOuterJoin(rdd2)

 1 val nameRDD: RDD[(String, Int)] = sc.parallelize(Array[(String, Int)](
 2   ("zhangsan", 18),
 3   ("lisi", 19),
 4   ("wangwu", 20),
 5   ("maliu", 21)
 6 ))
 7 val scoreRDD: RDD[(String, Int)] = sc.parallelize(Array[(String, Int)](
 8   ("zhangsan", 100),
 9   ("lisi", 200),
10   ("wangwu", 300),
11   ("tianqi", 400)
12 ))
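13 val result: RDD[(String, (Option[Int], Int))] = nameRDD.rightOuterJoin(scoreRDD)
14 result.foreach(println)
15 /**
16  * (zhangsan,(Some(18),100))
17  * (wangwu,(Some(20),300))
18  * (tianqi,(None,400))
19  * (lisi,(Some(19),200))
20  */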

10、rdd1.leftOuterJoin(rdd2)

 1 val nameRDD: RDD[(String, Int)] = sc.parallelize(Array[(String, Int)](
 2   ("zhangsan", 18),
 3   ("lisi", 19),
 4   ("wangwu", 20),
 5   ("maliu", 21)
 6 ))
 7 val scoreRDD: RDD[(String, Int)] = sc.parallelize(Array[(String, Int)](
 8   ("zhangsan", 100),
 9   ("lisi", 200),
10   ("wangwu", 300),
11   ("tianqi", 400)
12 ))
13 val result: RDD[(String, (Int, Option[Int]))] = nameRDD.leftOuterJoin(scoreRDD)
14 result.foreach(println)
15 /**
16  * key = zhangsan ,value1 = 18,value2 = 100
17  * key = wangwu ,value1 = 20,value2 = 300
18  * key = maliu ,value1 = 21,value2 = null
19  * key = lisi ,value1 = 19,value2 = 200
20  * (zhangsan,(18,Some(100)))
21  * (wangwu,(20,Some(300)))
22  * (maliu,(21,None))
23  * (lisi,(19,Some(200)))
24  */

11、rdd.mapPartitions()

 1 val rdd = sc.parallelize(List[String]("a","b","c","d","e"),3)
 2 val result: RDD[String] = rdd.mapPartitions(iter => {
 3   val listBuffer = new ListBuffer[String]()
 4   println("建立数据库连接... ...")
 5   while (iter.hasNext) {
 6     val next = iter.next()
 7     listBuffer.append(next + "#")
 8   }
 9   println("批量插入数据库.. ..." + listBuffer.toString())
10   println("关闭数据库连接... ...")
11   listBuffer.iterator
12 })
13 val strings: Array[String] = result.collect()
14 strings.foreach(println)

12、rdd1.mapPartitionsWithIndex()

 1 val rdd1 = sc.parallelize(Array[String]("a","b","c","d","e","f"),3)
 2 val result = rdd1.mapPartitionsWithIndex((index, iter) => {
 3   val listBuffer = new ListBuffer[String]()
 4   while (iter.hasNext) {
 5     val one = iter.next()
 6     listBuffer.append(s"partition : $index ,value : $one")
 7   }
 8   listBuffer.iterator
 9 })
10 result.foreach(println)

13、rdd1.mapValues()

 1 val rdd1 = sc.parallelize(Array[(String, Int)](
 2   ("zhangsan", 100),
 3   ("lisi", 100),
 4   ("wangwu", 300),
 5   ("maliu", 400),
 6   ("tianqi", 500)
 7 ))
 8 val end: RDD[(String, Int)] = rdd1.mapValues(value =>{value+100})
 9 end.foreach(println)
10 /**
11  * (zhangsan,200)
12  * (lisi,200)
13  * (wangwu,400)
14  * (maliu,500)
15  * (tianqi,600)
16  */

14、rdd.repartition(num)

 1 val rdd1 = sc.parallelize(Array[String](
 2   "love1", "love2", "love3", "love4", "love5", "love6", "love7", "love8", "love9", "love10", "love11", "love12"
 3 ), 3)
 4 val rdd2 = rdd1.mapPartitionsWithIndex((index,iter)=>{
 5   val listBuffer = new ListBuffer[String]()
 6   while(iter.hasNext){
 7     val one = iter.next()
 8     listBuffer.append(s"rdd1 partition index = $index ,value = $one")
 9   }
10   listBuffer.iterator
11 })
12 val repartition = rdd2.repartition(2)
13 val result = repartition.mapPartitionsWithIndex((index,iter)=>{
14   val listBuffer = new ListBuffer[String]()
15   while(iter.hasNext){
16     val one = iter.next()
17     listBuffer.append(s"repartition partition index = $index ,value = $one")
18   }
19   listBuffer.iterator
20 })
21 val arr = result.collect()
22 arr.foreach(println)

15、lines.sample(有无放回抽样,抽样比例,种子)

1 val lines : RDD[String] = sc.textFile("./data/words")
2 val result: RDD[String] = lines.sample(false,0.1,100L)
3 result.foreach(println)

16、rdd.sortByKey(boolean):boolean代表是否升序排序(true为升序,false为降序)

1 val lines: RDD[String] = sc.textFile("./data/words")
2 val words = lines.flatMap(line=>{line.split(" ")})
3 val pairWords = words.map(word=>{new Tuple2(word,1)})
4 val reduceResult : RDD[(String,Int)] = pairWords.reduceByKey((v1,v2)=>{v1+v2})
5 val transRDD1: RDD[(Int, String)] = reduceResult.map(tp=>{tp.swap})
6 val sortedRDD = transRDD1.sortByKey(false)
7 val result: RDD[(String, Int)] = sortedRDD.map(tp=>{tp.swap})
8 result.foreach(println)

17、rdd1.subtract(rdd2)

1 val rdd1 = sc.parallelize(List[Int](1,2,3,4,5,6))
2 val rdd2 = sc.parallelize(List[Int](1,2,3,4,7,8))
3 val result = rdd1.subtract(rdd2)
4 result.foreach(println)
5 /**
6  * 5
7  * 6
8  */

18、rdd1.union(rdd2)

 1 val rdd1 = sc.makeRDD(Array[String]("a","b","c","d"))
 2 val rdd2 = sc.parallelize(Array[String]("a","b","e","f"))
 3 val result: RDD[String] = rdd1.union(rdd2)
 4 result.foreach(println)
 5 /**
 6  * a
 7  * b
 8  * c
 9  * d
10  * a
11  * b
12  * e
13  * f
14  */

19、rdd.zipWithIndex()

 1 val rdd = sc.parallelize(Array[String]("a","b","c","d","e"))
 2 val result: RDD[(String, Long)] = rdd.zipWithIndex()
 3 result.foreach(println)
 4 /**
 5  * (a,0)
 6  * (b,1)
 7  * (c,2)
 8  * (d,3)
 9  * (e,4)
10  */

20、rdd1.zip(rdd2)

 1 val rdd1 = sc.parallelize(Array[String]("a","b","c","d","e"))
 2 val rdd2 = sc.parallelize(Array[Int](1,2,3,4,5))
 3 val result: RDD[(String, Int)] = rdd1.zip(rdd2)
 4 result.foreach(println)
 5 /**
 6  * (a,1)
 7  * (b,2)
 8  * (c,3)
 9  * (d,4)
10  * (e,5)
11  */
posted @ 2021-04-19 11:28  大数据程序员  阅读(201)  评论(0)  编辑  收藏  举报