Spark operators

Classification of Spark operators

Transformation operators

transformations: lazily executed -- operations on an RDD

Action operators

Actions: trigger execution

Summary of common operators

Code examples
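Before the individual demos, a minimal sketch of the lazy/eager split (the object name and sample values are made up for illustration): declaring a map does nothing by itself; a job is only submitted when an action such as count is called.

package com.shujia.spark

import org.apache.spark.{SparkConf, SparkContext}

object Demo0Lazy {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("lazy").setMaster("local"))

    val rdd = sc.parallelize(List(1, 2, 3))

    // transformation: only records the lineage, nothing is computed here
    val doubled = rdd.map(i => i * 2)

    // action: submits a job, which finally runs the map above
    println(doubled.count())
  }
}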

transformations map, mapPartitions, mapPartitionsWithIndex

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo3Map {
  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf()
      .setAppName("map")
      .setMaster("local")

    val sc = new SparkContext(conf)


    /**
      * Ways to build an RDD
      * 1. read a file
      * 2. build it from a Scala collection --- mainly used for testing
      */


    //build the rdd from a collection
    // parallelize accepts a Scala Seq (List, Vector, ...)
    val listRDD: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9), 2)

    println(listRDD.getNumPartitions)

    /**
      * map:
      * passes the records of the rdd to the function one at a time and builds a new rdd
      * from the function's return values.
      * map does not cause a shuffle; the new rdd has the same number of partitions as the input rdd.
      *
      * map is a transformation -- lazily executed, it needs an action to trigger it
      * foreach: an action
      *
      * if an operator returns a new rdd, it is a transformation
      *
      */
    val mapRDD: RDD[Int] = listRDD.map(i => {
      println("map")
      i * 2
    })

    //mapRDD.foreach(println)

    /**
      * mapPartitions: passes the data of a whole partition to the function; the function must
      * return an iterator, which is used to build a new rdd
      *
      * the iterator holds the data of one partition
      *
      * mapPartitions: lazily executed, a transformation
      */

    val mapPartitionsRDD: RDD[Int] = listRDD.mapPartitions((iter: Iterator[Int]) => {

      val iterator: Iterator[Int] = iter.map(i => i * 2)

      //the last expression is the return value
      iterator
    }
    )

    //mapPartitionsRDD.foreach(println)


    /**
      * mapPartitionsWithIndex: processes one partition at a time and additionally provides the partition index
      *
      */

    val mapPartitionsWithIndexRDD: RDD[Int] = listRDD.mapPartitionsWithIndex((index: Int, iter: Iterator[Int]) => {
      println(s"当前分区的编号:$index")
      iter
    })

    mapPartitionsWithIndexRDD.foreach(println)
  }
}
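A common reason to reach for mapPartitions instead of map is per-partition setup cost: anything expensive (a database connection, a formatter, a parser) can be built once per partition instead of once per record. A minimal sketch under that assumption -- the object name is made up and java.text.DecimalFormat merely stands in for an expensive resource:

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo3MapPartitionsSetup {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setAppName("mapPartitionsSetup")
      .setMaster("local")

    val sc = new SparkContext(conf)

    val listRDD: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)

    val formattedRDD: RDD[String] = listRDD.mapPartitions(iter => {
      // built once per partition, not once per record
      val formatter = new java.text.DecimalFormat("#,###")
      iter.map(i => formatter.format(i * 1000))
    })

    formattedRDD.foreach(println)
  }
}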

transformations Filter

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo4Filter {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setAppName("Demo4Filter")
      .setMaster("local")

    val sc = new SparkContext(conf)

    //build the rdd

    val listRDD: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8))

    /**
      * filter: filters the records of the rdd; the function returns true to keep a record
      * and false to drop it
      *
      * filter: a transformation, lazily executed
      */

    val filterRDD: RDD[Int] = listRDD.filter(i => {
      i % 2 == 1
    })

    filterRDD.foreach(println)
  }
}

transformations FlatMap

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object Demo5FlatMap {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setAppName("Demo5FlatMap")
      .setMaster("local")

    val sc = new SparkContext(conf)


    val linesRDD: RDD[String] = sc.parallelize(List("java,spark,java", "spark,scala,hadoop"))

    /**
      * flatMap: passes the records of the rdd to the function one at a time;
      * the function returns a collection, which is flattened to build a new rdd
      *
      */

    val wordsRDD: RDD[String] = linesRDD.flatMap(line => {
      val arr: Array[String] = line.split(",")
      //the return value can be an Array, List, Set, Map, etc. -- it must be a Scala collection
      arr.toList
    })

    wordsRDD.foreach(println)

  }
}

transformations Sample

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo6Sample {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setAppName("Demo5FlatMap")
      .setMaster("local")

    val sc = new SparkContext(conf)

    val studentRDD: RDD[String] = sc.textFile("data/students.txt")

    /**
      * sample: draws a sample, not an exact one
      * withReplacement: whether to sample with replacement
      * fraction: the sampling fraction (approximate)
      *
      */
    val sample: RDD[String] = studentRDD.sample(false, 0.1)
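    // a random seed can also be supplied for a reproducible sample, e.g. studentRDD.sample(false, 0.1, 42)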

    sample.foreach(println)

  }
}

Action operators foreach, saveAsTextFile, count, collect

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo7Action {
  def main(args: Array[String]): Unit = {

    /**
      * the hierarchy of a spark application:
      * application --> job (one per action) --> stage (similar to the map side or the reduce side in MapReduce) --> task (one per partition)
      * if an operator does not return an RDD, it is an action
      */

    val conf: SparkConf = new SparkConf()
      .setAppName("Demo5FlatMap")
      .setMaster("local")

    val sc = new SparkContext(conf)

    /**
      * action operators -- trigger execution; every action submits one job
      * 1. foreach: iterates over the rdd
      * 2. saveAsTextFile: saves the data
      * 3. count: counts the rows of the rdd
      * 4. collect: turns the rdd into a Scala collection
      */


    val studentsRDD: RDD[String] = sc.textFile("data/students.txt")

    /**
      * foreach: processes one record at a time
      * foreachPartition: processes one partition at a time
      *
      */
    studentsRDD.foreach(println)

    studentsRDD.foreachPartition((iter: Iterator[String]) => println(iter.toList))

    /**
      * saveAsTextFile: saves the data to HDFS (or the local file system)
      * 1. the output directory must not already exist
      * 2. each partition of the rdd is written to one file
      *
      */

    //studentsRDD.saveAsTextFile("data/temp2")

    /**
      * count: counts the number of rows
      *
      */

    val count: Long = studentsRDD.count()
    println(s"studentsRDD的行数:$count")

    /**
      * collect: pulls the rdd's data into the driver's memory
      * if the data set is large this can cause an out-of-memory error
      *
      */

    val studentArr: Array[String] = studentsRDD.collect()
    // print the contents rather than the array's default toString
    println(studentArr.toList)

    // keep the application alive so the Spark web UI (http://localhost:4040) can still be inspected
    while (true) {

    }
  }
}

transformations groupBy, groupByKey

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo8GroupByKey {
  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("groupByKey")


    val sc = new SparkContext(conf)

    val linesRDD: RDD[String] = sc.textFile("data/words.txt")

    val wordsRDD: RDD[String] = linesRDD.flatMap(line => line.split(","))

    //turn the rdd into key-value format
    //"Optionally, a Partitioner for key-value RDDs"
    //partitioning operators (all the *ByKey operators) only work on key-value RDDs
    val kvRDD: RDD[(String, Int)] = wordsRDD.map(word => (word, 1))

    /**
      * groupByKey: groups by key; requires a key-value rdd; the values of the same key are put into an iterable
      *
      */

    val groupBykeyRDD: RDD[(String, Iterable[Int])] = kvRDD.groupByKey()

    groupBykeyRDD.foreach(println)

    val countRDD: RDD[(String, Int)] = groupBykeyRDD.map {
      case (word: String, ints: Iterable[Int]) =>
        val count: Int = ints.sum
        (word, count)
    }

    countRDD.foreach(println)

    /**
      * groupBy: you specify the grouping key yourself; the values of the resulting rdd contain the whole records,
      * so more data is transferred during the shuffle than with groupByKey and performance is a bit worse
      *
      */

    val groupByRDD: RDD[(String, Iterable[(String, Int)])] = kvRDD.groupBy(kv => kv._1)

    groupByRDD.foreach(println)
  }
}

transformations reduceByKey

package com.shujia.spark

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Demo9ReduceBykey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("groupByKey")

    val sc = new SparkContext(conf)

    val linesRDD: RDD[String] = sc.textFile("data/words.txt")

    val wordsRDD: RDD[String] = linesRDD.flatMap(line => line.split(","))

    //turn the rdd into key-value format
    val kvRDD: RDD[(String, Int)] = wordsRDD.map(word => (word, 1))

    /**
      * reduceByKey: groups by key and aggregates the values; how they are aggregated is decided by the function you pass in
      * it pre-aggregates on the map side, so reduceByKey performs better than groupByKey
      * it can only do simple aggregations
      * the values are folded in one at a time
      * e.g. (A,1) and (A,1) combine into (A,2), then (A,2) combines with the next (A,1) into (A,3), and so on
      */
    //count the words
    val countRDD: RDD[(String, Int)] = kvRDD.reduceByKey((x: Int, y: Int) => x + y)

    countRDD.foreach(println)
  }
}

How groupByKey and reduceByKey work under the hood (a common interview question). The original diagram is not reproduced here; the key point is that reduceByKey combines values inside each map-side partition before the shuffle, while groupByKey ships every (key, value) pair across the network and only groups them on the reduce side.

transformations union, distinct, intersection

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo10Union {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("Demo10Union")


    val sc = new SparkContext(conf)

    // makeRDD: builds an RDD from a collection
    val rdd1: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4, 5, 6, 7, 8))
    val rdd2: RDD[Int] = sc.makeRDD(List(4, 5, 6, 7, 8, 9, 10))

    /**
      * union: concatenates two rdds; the element types must match; duplicates are not removed
      * union is only a logical (code-level) merge, the data is not merged physically
      * union does not cause a shuffle
      *
      */
    val unionRDD: RDD[Int] = rdd1.union(rdd2)

    unionRDD.foreach(println)

    /**
      * distinct: removes duplicates
      * distinct: causes a shuffle
      * distinct: deduplicates locally on the map side first, then globally on the reduce side
      */

    val distinctRDD: RDD[Int] = unionRDD.distinct()

    distinctRDD.foreach(println)

    /**
      * every operator that causes a shuffle can also take the number of partitions to use after the shuffle
      * distinct can take it as well
      */
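    // for example (a sketch): unionRDD.distinct(4) would leave 4 partitions after the shuffle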

    /**
      * intersection: computes the intersection of two rdds
      * it involves a shuffle, so the number of post-shuffle partitions can also be specified
      */

    val intersectionRDD: RDD[Int] = rdd1.intersection(rdd2)

    intersectionRDD.foreach(println)
  }
}

transformations join, leftOuterJoin, fullOuterJoin

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo11Join {
  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("Demo10Union")

    val sc = new SparkContext(conf)

    val namesRDD: RDD[(String, String)] = sc.makeRDD(List(("001", "张三"), ("002", "李四")))
    val agesRDD: RDD[(String, Int)] = sc.makeRDD(List(("002", 24), ("003", 25)))

    /**
      * inner join: joins the two rdds by key
      *
      */
    val innerJoinRDD: RDD[(String, (String, Int))] = namesRDD.join(agesRDD)

    //reshape the data after the join
    val rdd1: RDD[(String, String, Int)] = innerJoinRDD.map {
      case (id: String, (name: String, age: Int)) =>
        (id, name, age)
    }
    
    //rdd1.foreach(println)

    /**
      * left join: keeps every record of the left side; if the right side has no match, the value is absent
      * Option: an optional value (present: Some, absent: None)
      */

    val leftJoinRDD: RDD[(String, (String, Option[Int]))] = namesRDD.leftOuterJoin(agesRDD)

    //reshape the data
    val rdd2: RDD[(String, String, Int)] = leftJoinRDD.map {
      //records that found a match
      case (id: String, (name: String, Some(age))) =>
        (id, name, age)

      //records that found no match
      case (id: String, (name: String, None)) =>
        //use a default value of 0 when there is no match
        (id, name, 0)
    }

    // rdd2.foreach(println)

    /**
      * full join: either side may fail to find a match; a missing side comes back as None
      *
      */

    val fullJoinRDD: RDD[(String, (Option[String], Option[Int]))] = namesRDD.fullOuterJoin(agesRDD)

    //reshape the data
    val rdd3: RDD[(String, String, Int)] = fullJoinRDD.map {
      case (id: String, (Some(name), Some(age))) =>
        (id, name, age)

      case (id: String, (None, Some(age))) =>
        (id, "default", age)

      case (id: String, (Some(name), None)) =>
        (id, name, 0)
    }

    rdd3.foreach(println)

  }
}

transformations mapValues

package com.shujia.spark

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Demo12MapValues {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("Demo10Union")

    val sc = new SparkContext(conf)

    val agesRDD: RDD[(String, Int)] = sc.makeRDD(List(("002", 24), ("003", 25)))

    // map 
    val mapRDD: RDD[(String, Int)] = agesRDD.map {
      case (name: String, age: Int) =>
        (name, age + 1)
    }

    mapRDD.foreach(println)
    /**
      * mapValues: like map, but it only transforms the value and leaves the key unchanged
      * (it also preserves the rdd's partitioner, which plain map does not)
      */
    val mapValuesRDD: RDD[(String, Int)] = agesRDD.mapValues(v => v + 1)

    mapValuesRDD.foreach(println)
  }
}

transformations sortBy, sortByKey

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo13Sort {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("Demo10Union")

    val sc = new SparkContext(conf)

    val studentRDD: RDD[String] = sc.textFile("data/students.txt")

    /**
      *
      * sortBy: you specify the sort key; ascending order by default
      * the ascending flag controls the sort direction
      */

    // descending order: ascending = false
    val sortByRDD: RDD[String] = studentRDD.sortBy(student => {
      val age: Int = student.split(",")(2).toInt
      age
    }, ascending = false)

    sortByRDD.foreach(println)

    /**
      * sortByKey: sorts by key, ascending by default
      * requires a key-value RDD
      */

    val agesRDD: RDD[(Int, String)] = sc.makeRDD(List((24, "002"), (25, "003")))

    val sortByKeyRDD: RDD[(Int, String)] = agesRDD.sortByKey()
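    // descending order is also possible, e.g. agesRDD.sortByKey(ascending = false)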

    sortByKeyRDD.foreach(println)
  }
}

transformations aggregateByKey

package com.shujia.spark

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Demo14Agg {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("Demo14Agg")

    val sc = new SparkContext(conf)

    val linesRDD: RDD[String] = sc.textFile("data/words.txt")

    val wordsRDD: RDD[String] = linesRDD.flatMap(line => line.split(","))

    //turn the rdd into key-value format
    val kvRDD: RDD[(String, Int)] = wordsRDD.map(word => (word, 1))

    /**
      * reduceByKey: pre-aggregates on the map side; the aggregation function is applied on both
      * the map side and the reduce side (within partitions and across partitions)
      *
      */
    val reduceRDD: RDD[(String, Int)] = kvRDD.reduceByKey((x, y) => x + y)
    reduceRDD.foreach(println)

    /**
      * aggregateByKey: aggregates by key; it takes two aggregation functions,
      * one for the map side and one for the reduce side
      *
      */
    // (0) -- the initial value (used when aggregating on the map side)
    val aggRDD: RDD[(String, Int)] = kvRDD.aggregateByKey(0)(
      (u: Int, i: Int) => u + i, //within-partition aggregation function (map side)
      (u1: Int, u2: Int) => u1 + u2 // between-partition aggregation function (reduce side)
    )

    aggRDD.foreach(println)
  }
}

Computing the average age per class

Implementation with groupByKey

Implementation with aggregateByKey

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo15AggAvgAge {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("Demo14Agg")


    val sc = new SparkContext(conf)

    /**
      * compute the average age per class
      *
      */

    val studentsRDD: RDD[String] = sc.textFile("data/students.txt")

    //first extract the class and the age
    val clazzAndAge: RDD[(String, Double)] = studentsRDD.map(student => {
      val split: Array[String] = student.split(",")
      (split(4), split(2).toDouble)
    })

    /**
      * 1. implementation with groupByKey
      *
      */

    val groupByRDD: RDD[(String, Iterable[Double])] = clazzAndAge.groupByKey()

    //compute the average age
    val avgAgeRDD: RDD[(String, Double)] = groupByRDD.map {
      case (clazz: String, ages: Iterable[Double]) =>
        val avgAge: Double = ages.sum / ages.size
        (clazz, avgAge)
    }

    avgAgeRDD.foreach(println)

    /**
      * In big-data jobs the shuffle is the most expensive step, because shuffled data has to be written to disk.
      *
      * High-performance operator
      * aggregateByKey: pre-aggregates on the map side, which reduces the amount of data shuffled. It takes:
      * 1. an initial value (which can hold several values, e.g. a tuple)
      * 2. the map-side aggregation function
      * 3. the reduce-side aggregation function
      */

    val aggRDD: RDD[(String, (Double, Int))] = clazzAndAge.aggregateByKey((0.0, 0))(
      (u: (Double, Int), age: Double) => (u._1 + age, u._2 + 1), //map-side aggregation function
      (u1: (Double, Int), u2: (Double, Int)) => (u1._1 + u2._1, u1._2 + u2._2) //reduce-side aggregation function
    )

    //compute the average age
    val avgAgeRDD2: RDD[(String, Double)] = aggRDD.map {
      case (clazz: String, (sumAge: Double, num: Int)) =>
        (clazz, sumAge / num)
    }

    avgAgeRDD2.foreach(println)

    // keep the application alive so the Spark web UI (http://localhost:4040) can still be inspected
    while (true) {

    }
  }
}

transformations cartesian

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo16cartesian {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("Demo14Agg")

    val sc = new SparkContext(conf)

    val namesRDD: RDD[(String, String)] = sc.makeRDD(List(("001", "张三"), ("002", "李四")))
    val agesRDD: RDD[(String, Int)] = sc.makeRDD(List(("002", 24), ("003", 25)))

    /**
      * cartesian -- the Cartesian product of two rdds
      * rarely used, performance is very poor
      */

    val cartesianRDD: RDD[((String, String), (String, Int))] = namesRDD.cartesian(agesRDD)

    cartesianRDD.foreach(println)
  }
}

transformations pipe

Calls an external script from Spark code -- rarely used

Mostly used for integrating with third-party tools
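The post gives no code for pipe, so here is a minimal sketch under an assumption: the script path /path/to/to_upper.sh is made up and would have to exist on every worker. pipe writes each record of a partition to the external process's stdin, one per line, and every line the process prints on stdout becomes a record of the new RDD.

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object DemoPipe {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("DemoPipe")

    val sc = new SparkContext(conf)

    val wordsRDD: RDD[String] = sc.makeRDD(List("java", "spark", "scala"))

    // each partition's records are fed to the script's stdin; its stdout lines form the new rdd
    val pipedRDD: RDD[String] = wordsRDD.pipe("/path/to/to_upper.sh")

    pipedRDD.foreach(println)
  }
}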

Action reduce, sum; transformations reduceByKey

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo17Reduce {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("Demo14Agg")

    val sc = new SparkContext(conf)
    val listRDD: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4, 5, 6, 7, 8, 9))

    /**
      * sum: adds up the elements; an action; only available on rdds of numeric types (Int, Long, Double, ...)
      *
      */
    val sum: Double = listRDD.sum()
    println(sum)

    /**
      * reduce: global aggregation, an action
      * reduceByKey: aggregates by key, a transformation
      */
    val reduce: Int = listRDD.reduce((x, y) => x + y)
    println(reduce)
  }
}

Action take, first

package com.shujia.spark

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Demo18Take {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("Demo14Agg")

    val sc = new SparkContext(conf)

    val studentsRDD: RDD[String] = sc.textFile("data/students.txt")

    /**
      * take: returns the first n records; an action
      *
      */

    // take the first 100 records
    val top100: Array[String] = studentsRDD.take(100)
    top100.foreach(println)

    //first -- returns the first record; an action
    val first: String = studentsRDD.first()
    println(first)

  }
}

Exercise

Find the students whose total score is above the grade average

package com.shujia.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo19Student1 {
  def main(args: Array[String]): Unit = {
    /**
      * find the students whose total score is above the grade average
      */

    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("Demo14Agg")

    val sc = new SparkContext(conf)

    val scoreRDD: RDD[String] = sc.textFile("data/score.txt")

    val idAndScoreRDD: RDD[(String, Double)] = scoreRDD.map(s => {
      val split: Array[String] = s.split(",")
      (split(0), split(2).toDouble)
    })

    //1. compute each student's total score
    val stuSumScoreRDD: RDD[(String, Double)] = idAndScoreRDD.reduceByKey((x, y) => x + y)

    //all the total scores
    val scores: RDD[Double] = stuSumScoreRDD.map(kv => kv._2)

    //compute the grade average
    val avgSco: Double = scores.sum() / scores.count()

    println(s"grade average: $avgSco")

    //keep the students whose total score is above the grade average
    val filterRDD: RDD[(String, Double)] = stuSumScoreRDD.filter {
      case (id: String, sco: Double) =>
        sco > avgSco
    }

    filterRDD.foreach(println)
  }
}