Transformation类算子(转换类算子)
1、对RDD的分区重新进行划分:rdd1.coalesce(num,boolean)
// Redistribute the 12 elements of rdd1 from 3 partitions into 4 partitions.
val rdd1 = sc.parallelize(
  Array[String](
    "love1", "love2", "love3", "love4", "love5", "love6",
    "love7", "love8", "love9", "love10", "love11", "love12"
  ),
  3
)
// Growing the partition count needs a shuffle, hence shuffle = true; without a
// shuffle, coalesce can only keep or reduce the number of partitions.
val result = rdd1.coalesce(4, shuffle = true)
println(s"result partition length = ${result.getNumPartitions}")
result.foreach(println)
2、rdd1.cogroup(rdd2)
// cogroup: for every key, gather the values from rdd1 and the values from rdd2
// into a pair of Iterables.
val rdd1 = sc.parallelize(Array[(String, Int)](
  ("zhangsan", 1),
  ("lisi", 2),
  ("zhangsan", 3),
  ("wangwu", 4),
  ("wangwu", 5)
))
val rdd2 = sc.parallelize(Array[(String, Int)](
  ("zhangsan", 10),
  ("lisi", 20),
  ("zhangsan", 30),
  ("wangwu", 40),
  ("wangwu", 50)
))
val result: RDD[(String, (Iterable[Int], Iterable[Int]))] = rdd1.cogroup(rdd2)
// For each key, print its values from rdd1 first, then its values from rdd2.
result.foreach { case (_, (leftValues, rightValues)) =>
  leftValues.foreach(println)
  rightValues.foreach(println)
}
// Print the grouped tuples themselves.
result.foreach(println)
/**
 * (zhangsan,(CompactBuffer(1, 3),CompactBuffer(10, 30)))
 * (wangwu,(CompactBuffer(4, 5),CompactBuffer(40, 50)))
 * (lisi,(CompactBuffer(2),CompactBuffer(20)))
 */
3、对RDD中数据去重:rdd1.distinct()
// Drop duplicate elements so that each distinct string is kept once.
val rdd1 = sc.parallelize(
  Array("a", "b", "c", "d", "d", "a", "b", "a", "c", "d")
)
val result: RDD[String] = rdd1.distinct()
result.foreach(item => println(item))
4、对RDD中数据进行过滤:lines.filter
// Keep every line of the input file except those equal to "hello spark".
val lines: RDD[String] = sc.textFile("./data/words")
val result: RDD[String] = lines.filter(_ != "hello spark")
result.foreach(line => println(line))
5、对RDD中数据按Key分组:rdd.groupByKey()
// Group the values of a pair RDD by key: each key maps to an Iterable of its values.
val rdd = sc.parallelize(
  Array(
    "zhangsan" -> 10,
    "zhangsan" -> 20,
    "lisi" -> 30,
    "wangwu" -> 40,
    "wangwu" -> 50
  )
)
val result: RDD[(String, Iterable[Int])] = rdd.groupByKey()
result.foreach(group => println(group))
/**
 * (zhangsan,CompactBuffer(10, 20))
 * (wangwu,CompactBuffer(40, 50))
 * (lisi,CompactBuffer(30))
 */
6、rdd1.intersection(rdd2)
// Intersection: keep only the elements that occur in both rdd1 and rdd2.
val rdd1 = sc.parallelize(Array("a", "b", "c", "d"))
val rdd2 = sc.parallelize(Array("a", "b", "e", "f"))
val result = rdd1.intersection(rdd2)
result.foreach(common => println(common))
/**
 * a
 * b
 */
7、rdd1.join(rdd2)
// Inner join on key. In the expected output below only keys found in both RDDs
// appear, and the joined RDD reports 4 partitions (inputs have 3 and 4).
val nameRDD: RDD[(String, Int)] = sc.parallelize(
  Array(
    "zhangsan" -> 18,
    "lisi" -> 19,
    "wangwu" -> 20,
    "maliu" -> 21
  ),
  3
)
val scoreRDD: RDD[(String, Int)] = sc.parallelize(
  Array(
    "zhangsan" -> 100,
    "lisi" -> 200,
    "wangwu" -> 300,
    "tianqi" -> 400
  ),
  4
)
val result: RDD[(String, (Int, Int))] = nameRDD.join(scoreRDD)

// Report the partition counts of both inputs and of the joined RDD.
println(s"nameRDD partition length = ${nameRDD.getNumPartitions}")
println(s"scoreRDD partition length = ${scoreRDD.getNumPartitions}")
println(s"result partition length = ${result.getNumPartitions}")
result.foreach(row => println(row))
/**
 * nameRDD partition length = 3
 * scoreRDD partition length = 4
 * result partition length = 4
 * (zhangsan,(18,100))
 * (wangwu,(20,300))
 * (lisi,(19,200))
 */
8、rdd1.fullOuterJoin(rdd2)
// Full outer join: keys from either side are kept; the side that has no value
// for a key is represented as None.
val nameRDD: RDD[(String, Int)] = sc.parallelize(
  Array(
    "zhangsan" -> 18,
    "lisi" -> 19,
    "wangwu" -> 20,
    "maliu" -> 21
  )
)
val scoreRDD: RDD[(String, Int)] = sc.parallelize(
  Array(
    "zhangsan" -> 100,
    "lisi" -> 200,
    "wangwu" -> 300,
    "tianqi" -> 400
  )
)
val result: RDD[(String, (Option[Int], Option[Int]))] =
  nameRDD.fullOuterJoin(scoreRDD)
result.foreach(row => println(row))
/**
 * (zhangsan,(Some(18),Some(100)))
 * (wangwu,(Some(20),Some(300)))
 * (maliu,(Some(21),None))
 * (tianqi,(None,Some(400)))
 * (lisi,(Some(19),Some(200)))
 */
9、rdd1.rightOuterJoin(rdd2)
// Right outer join: every key of scoreRDD (the right side) is kept; the value
// from nameRDD (the left side) is an Option that is None when the key is missing there.
val nameRDD: RDD[(String, Int)] = sc.parallelize(Array[(String, Int)](
  ("zhangsan", 18),
  ("lisi", 19),
  ("wangwu", 20),
  ("maliu", 21)
))
val scoreRDD: RDD[(String, Int)] = sc.parallelize(Array[(String, Int)](
  ("zhangsan", 100),
  ("lisi", 200),
  ("wangwu", 300),
  ("tianqi", 400)
))
// The join itself and the print were missing from this snippet even though the
// expected output below documents them.
val result: RDD[(String, (Option[Int], Int))] = nameRDD.rightOuterJoin(scoreRDD)
result.foreach(println)
/**
 * (zhangsan,(Some(18),100))
 * (wangwu,(Some(20),300))
 * (tianqi,(None,400))
 * (lisi,(Some(19),200))
 */
10、rdd1.leftOuterJoin(rdd2)
// Left outer join: every key of nameRDD (the left side) is kept; the value
// from scoreRDD (the right side) is an Option that is None when the key is missing there.
val nameRDD: RDD[(String, Int)] = sc.parallelize(Array[(String, Int)](
  ("zhangsan", 18),
  ("lisi", 19),
  ("wangwu", 20),
  ("maliu", 21)
))
val scoreRDD: RDD[(String, Int)] = sc.parallelize(Array[(String, Int)](
  ("zhangsan", 100),
  ("lisi", 200),
  ("wangwu", 300),
  ("tianqi", 400)
))
// leftOuterJoin yields (key, (leftValue, Option[rightValue])); the snippet
// previously called rightOuterJoin, which contradicts the documented output.
val result: RDD[(String, (Int, Option[Int]))] = nameRDD.leftOuterJoin(scoreRDD)
// Formatted view of each row; a missing score is rendered as the text "null",
// matching the first group of lines in the expected output below.
result.foreach { case (key, (age, score)) =>
  println(s"key = $key ,value1 = $age,value2 = ${score.map(_.toString).getOrElse("null")}")
}
result.foreach(println)
/**
 * key = zhangsan ,value1 = 18,value2 = 100
 * key = wangwu ,value1 = 20,value2 = 300
 * key = maliu ,value1 = 21,value2 = null
 * key = lisi ,value1 = 19,value2 = 200
 * (zhangsan,(18,Some(100)))
 * (wangwu,(20,Some(300)))
 * (maliu,(21,None))
 * (lisi,(19,Some(200)))
 */
11、rdd.mapPartitions()
// mapPartitions invokes the function once per partition with an iterator over
// that partition, so the simulated connection open/insert/close messages are
// printed once per partition rather than once per element.
val rdd = sc.parallelize(List("a", "b", "c", "d", "e"), 3)
val result: RDD[String] = rdd.mapPartitions { partition =>
  val batch = new ListBuffer[String]()
  println("建立数据库连接... ...")
  // Buffer the whole partition so it can be reported as one batch below.
  partition.foreach { elem =>
    batch += (elem + "#")
  }
  println("批量插入数据库.. ..." + batch.toString())
  println("关闭数据库连接... ...")
  batch.iterator
}
val strings: Array[String] = result.collect()
strings.foreach(s => println(s))
12、rdd1.mapPartitionsWithIndex()
// mapPartitionsWithIndex passes the partition index along with the partition's
// iterator; here every element is tagged with the index of its partition.
val rdd1 = sc.parallelize(Array[String]("a", "b", "c", "d", "e", "f"), 3)
val result = rdd1.mapPartitionsWithIndex { (index, iter) =>
  // Map the iterator lazily instead of copying the partition into a mutable buffer.
  iter.map(one => s"partition : $index ,value : $one")
}
result.foreach(println)
13、rdd1.mapValues()
// mapValues transforms only the value of each pair and leaves the key untouched;
// here every value is increased by 100.
val rdd1 = sc.parallelize(
  Array(
    "zhangsan" -> 100,
    "lisi" -> 100,
    "wangwu" -> 300,
    "maliu" -> 400,
    "tianqi" -> 500
  )
)
val end: RDD[(String, Int)] = rdd1.mapValues(_ + 100)
end.foreach(pair => println(pair))
/**
 * (zhangsan,200)
 * (lisi,200)
 * (wangwu,400)
 * (maliu,500)
 * (tianqi,600)
 */
14、rdd.repartition(num)
// Show where elements live before and after repartition: each element is first
// tagged with its original partition index (3 partitions), the RDD is then
// repartitioned into 2 partitions, and each element is tagged again with its new index.
val rdd1 = sc.parallelize(Array[String](
  "love1", "love2", "love3", "love4", "love5", "love6",
  "love7", "love8", "love9", "love10", "love11", "love12"
), 3)
// Tag with the original partition index; the iterator is mapped lazily rather
// than being copied into a mutable buffer.
val rdd2 = rdd1.mapPartitionsWithIndex { (index, iter) =>
  iter.map(one => s"rdd1 partition index = $index ,value = $one")
}
val repartition = rdd2.repartition(2)
// Tag again with the partition index after repartitioning.
val result = repartition.mapPartitionsWithIndex { (index, iter) =>
  iter.map(one => s"repartition partition index = $index ,value = $one")
}
val arr = result.collect()
arr.foreach(println)
15、lines.sample(有无放回抽样,抽样比例,种子)
// Draw a sample of the input lines without replacement, with sampling
// fraction 0.1 and a fixed seed of 100.
val lines: RDD[String] = sc.textFile("./data/words")
val result: RDD[String] = lines.sample(
  withReplacement = false,
  fraction = 0.1,
  seed = 100L
)
result.foreach(line => println(line))
16、rdd.sortByKey(boolean):boolean代表是否升序排序(true为升序,false为降序)
// Word count sorted by count in descending order: count the words, swap each
// pair to (count, word) so the count becomes the key, sort by key descending,
// then swap back to (word, count).
val lines: RDD[String] = sc.textFile("./data/words")
val words = lines.flatMap(_.split(" "))
val pairWords = words.map(word => (word, 1))
val reduceResult: RDD[(String, Int)] = pairWords.reduceByKey(_ + _)
val transRDD1: RDD[(Int, String)] = reduceResult.map(_.swap)
val sortedRDD = transRDD1.sortByKey(ascending = false)
val result: RDD[(String, Int)] = sortedRDD.map(_.swap)
result.foreach(entry => println(entry))
17、rdd1.subtract(rdd2)
// Subtract: keep the elements of rdd1 that do not occur in rdd2.
val rdd1 = sc.parallelize(List(1, 2, 3, 4, 5, 6))
val rdd2 = sc.parallelize(List(1, 2, 3, 4, 7, 8))
val result = rdd1.subtract(rdd2)
result.foreach(n => println(n))
/**
 * 5
 * 6
 */
18、rdd1.union(rdd2)
// Union: concatenate both RDDs; as the expected output shows, elements present
// in both inputs appear twice.
val rdd1 = sc.makeRDD(Array("a", "b", "c", "d"))
val rdd2 = sc.parallelize(Array("a", "b", "e", "f"))
val result: RDD[String] = rdd1.union(rdd2)
result.foreach(item => println(item))
/**
 * a
 * b
 * c
 * d
 * a
 * b
 * e
 * f
 */
19、rdd.zipWithIndex()
// zipWithIndex pairs every element with its index (a Long starting at 0).
val rdd = sc.parallelize(Array[String]("a", "b", "c", "d", "e"))
val result: RDD[(String, Long)] = rdd.zipWithIndex()
// The print was missing, so the documented output below was never produced.
result.foreach(println)
/**
 * (a,0)
 * (b,1)
 * (c,2)
 * (d,3)
 * (e,4)
 */
20、rdd1.zip(rdd2)
// zip pairs the elements of rdd1 and rdd2 position by position.
val rdd1 = sc.parallelize(Array("a", "b", "c", "d", "e"))
val rdd2 = sc.parallelize(Array(1, 2, 3, 4, 5))
val result: RDD[(String, Int)] = rdd1.zip(rdd2)
result.foreach(pair => println(pair))
/**
 * (a,1)
 * (b,2)
 * (c,3)
 * (d,4)
 * (e,5)
 */