Spark Core Programming Model: RDD (Part 2)

  Continuing from the previous post: key-value RDDs.

ShuffledRDD

  As mentioned in the previous post, combineByKey accepts a partitioner argument, and the RDD it returns is a ShuffledRDD. During the shuffle, the map side first combines the values produced by each map task (the mapSideCombine parameter defaults to true, so pre-aggregation happens), then partitions the combined results with the partitioner and writes the intermediate data to disk as shuffle files; the reduce side then fetches its already-partitioned blocks and merges them.

  (I have not fully digested all of these details yet....)
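  A quick way to see part of this in action (a sketch, assuming a live SparkContext sc as in the snippets below): combineByKey with an explicit partitioner returns a ShuffledRDD, and the mapSideCombine flag only controls whether values are pre-merged before the shuffle files are written.

    import org.apache.spark.HashPartitioner

    val kv = sc.parallelize[(Int, Int)](Seq((1, 2), (3, 4), (3, 6), (5, 6)), 2)

    // mapSideCombine defaults to true: every map task pre-merges its values before writing shuffle files
    val combined = kv.combineByKey(
      (v: Int) => v, (c: Int, v: Int) => c + v, (c1: Int, c2: Int) => c1 + c2,
      new HashPartitioner(2))
    println(combined.getClass.getSimpleName) // ShuffledRDD

    // mapSideCombine = false: raw (key, value) records are shuffled and merged only on the reduce side
    val reduceSideOnly = kv.combineByKey(
      (v: Int) => v, (c: Int, v: Int) => c + v, (c1: Int, c2: Int) => c1 + c2,
      new HashPartitioner(2), mapSideCombine = false)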

   

  Keep in mind that aggregateByKey, reduceByKey, foldByKey, and groupByKey are all implemented on top of combineByKey and differ only in the functions they pass in; distinct is implemented with reduceByKey, and groupBy with groupByKey.
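  For instance, distinct is just a reduceByKey in disguise. The sketch below mirrors the shape used inside Spark's RDD.distinct, written here as a standalone helper:

    import org.apache.spark.rdd.RDD
    import scala.reflect.ClassTag

    // turn each element into an (element, null) pair, keep one pair per key, then drop the null
    def myDistinct[T: ClassTag](rdd: RDD[T]): RDD[T] =
      rdd.map(x => (x, null)).reduceByKey((x, _) => x).map(_._1)

    myDistinct(sc.parallelize(Seq(1, 2, 2, 3, 1))).collect() // same result as rdd.distinct()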

  The rest is easiest to see in code:

import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag


    val pairRDD = sc.parallelize[(Int, Int)](Seq((1, 2), (3, 4), (3, 6), (5, 6)), 2)
    // Provide an initial value; accumulate the sum and the count per key (a multi-value aggregation)
    pairRDD.aggregateByKey((0, 0))( // createCombiner = mergeValue((0, 0), v)
      (acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1), //mergeValue
      (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2) // mergeCombiners
    ).collect()
    // The combineByKey call below is equivalent
    def createCombinerAggregate = (value: Int) => mergeValueAggregate((0, 0), value)
    def mergeValueAggregate = (acc: (Int, Int), v: Int) => (acc._1 + v, acc._2 + 1)
    def mergeCombinersAggregate = (acc1: (Int, Int), acc2: (Int, Int)) =>
      (acc1._1 + acc2._1, acc1._2 + acc2._2)
    pairRDD.combineByKey(createCombinerAggregate,
      mergeValueAggregate, mergeCombinersAggregate).collect()


    // createCombiner = (v: V) => v
    // mergeValue = (x, y) => x + y
    // mergeCombiners = (x, y) => x + y
    // Sum the values for each key
    pairRDD.reduceByKey((x, y) => x + y).collect()
    // The combineByKey call below is equivalent
    def createCombinerReduce = (value: Int) => value
    def mergeValueReduce = (v: Int, value: Int) => v + value
    def mergeCombinersReduce = (v: Int, value: Int) => v + value
    pairRDD.combineByKey(createCombinerReduce, mergeValueReduce, mergeCombinersReduce).collect()

    // createCombiner = (v: V) => mergeValue(0, v)
    // mergeValue = (x, y) => x + y
    // mergeCombiners = (x, y) => x + y
    // Much like reduceByKey, except the starting value of the accumulation is given explicitly
    pairRDD.foldByKey(0)((x, y) => x + y).collect()
    // The combineByKey call below is equivalent
    def createCombinerFold = (value: Int) => mergeValueFold(0, value)
    def mergeValueFold = (v: Int, value: Int) => v + value
    def mergeCombinersFold = (v: Int, value: Int) => v + value
    pairRDD.combineByKey(createCombinerFold, mergeValueFold, mergeCombinersFold).collect()

    //createCombiner = (v: V) => CompactBuffer(v)
    //mergeValue = (buf: CompactBuffer[V], v: V) => buf += v
    //mergeCombiners = (c1: CompactBuffer[V], c2: CompactBuffer[V]) => c1 ++= c2
    // Group all values that share the same key
    pairRDD.groupByKey().collect()
    // The combineByKey call below is equivalent
    def createCombinerGroup = (value: Int) => ArrayBuffer(value)
    def mergeValueGroup = (buf: ArrayBuffer[Int], value: Int) => buf += value
    def mergeCombinersGroup = (buf1: ArrayBuffer[Int], buf2: ArrayBuffer[Int]) => buf1 ++= buf2
    pairRDD.combineByKey(createCombinerGroup, mergeValueGroup, mergeCombinersGroup,
      new HashPartitioner(2), false).collect()


    val rdd = sc.parallelize(Seq(1,2,2,3,1))
    val distinctRDD = rdd.distinct()  // deduplicate the RDD
    distinctRDD.collect()

    pairRDD.reduceByKeyLocally((x, y) => x + y)  // like reduceByKey, but the result comes back to the driver as a Map rather than an RDD

   combineByKey exercise: for each key, count how many of its values are positive and how many are negative

package com.twq.spark.rdd.keyvalue

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by tangweiqun on 2017/8/19.
  */
object CombineByKeyPractice {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("word count")

    val sc = new SparkContext(conf)

    val pairStrRDD = sc.parallelize[(String, Double)](Seq(("coffee", 0.6),
      ("coffee", -0.1), ("panda", -0.3), ("coffee", 0.1)), 2)

    def createCombiner = (label: Double) => new BinaryLabelCounter(0L, 0L) += label

    def mergeValue = (c: BinaryLabelCounter, label: Double) => c += label

    def mergeCombiners = (c1: BinaryLabelCounter, c2: BinaryLabelCounter) => c1 += c2

    // For each key, count how many of its values are positive and how many are negative
    // The three required functions:
    //createCombiner: V => C,
    //mergeValue: (C, V) => C,
    //mergeCombiners: (C, C) => C
    val testCombineByKeyRDD =
      pairStrRDD.combineByKey(createCombiner, mergeValue, mergeCombiners)
    testCombineByKeyRDD.collect()
  }

}

class BinaryLabelCounter(var numPositives: Long = 0L,
                         var numNegatives: Long = 0L) extends Serializable {

  def +=(label: Double): BinaryLabelCounter = {
    if (label > 0) numPositives += 1L else numNegatives += 1L
    this
  }

  def +=(other: BinaryLabelCounter): BinaryLabelCounter = {
    numPositives += other.numPositives
    numNegatives += other.numNegatives
    this
  }

  override def toString: String = s"{numPos: $numPositives, numNeg: $numNegatives}"

}

   The difference between reduceByKey and foldByKey

    //1  rdd action api reduce and fold
    val emptyRdd = sc.emptyRDD[Int]
    emptyRdd.reduce(_ + _)  // reduce on an empty RDD throws an exception
    //java.lang.UnsupportedOperationException: empty collection
    //  at org.apache.spark.rdd.RDD$$anonfun$reduce$1$$anonfun$apply$36.apply(RDD.scala:1027)

    emptyRdd.fold(0)(_ + _) // res1: Int = 0; fold on an empty RDD does not fail

    val testRdds = sc.parallelize(Seq(ArrayBuffer(0, 1, 3), ArrayBuffer(2, 4, 5)))
    // Because fold takes an initial value, it fits the mutable-buffer scenario below nicely (++ vs ++=)
    // reduce with ++ creates lots of temporary objects, because ArrayBuffer ++ ArrayBuffer allocates a new ArrayBuffer
    ArrayBuffer(0, 1, 3) ++ ArrayBuffer(0, 1, 3)
    testRdds.reduce(_ ++ _)
    // With ++= each element buffer is appended into the accumulator buffer in place, so far fewer temporary objects are created
    ArrayBuffer(0, 1, 3) ++= ArrayBuffer(0, 1, 3)
    testRdds.fold(ArrayBuffer.empty[Int])((buff, elem) => buff ++= elem)

    //2 key-value rdd transformations api reduceByKey and foldByKey
    // on an empty RDD the two behave the same: no error, just an empty result
    val emptyKeyValueRdd = sc.emptyRDD[(Int, Int)]
    emptyKeyValueRdd.reduceByKey(_ + _).collect  // the key-value variant does not fail on an empty RDD
//    scala> emptyKeyValueRdd.reduceByKey(_+_).collect
//    res2: Array[(Int, Int)] = Array()
    emptyKeyValueRdd.foldByKey(0)(_ + _).collect

    // the mutable-buffer pattern applies here as well
    val testPairRdds = sc.parallelize(Seq(("key1", ArrayBuffer(0, 1, 3)),
      ("key2", ArrayBuffer(2, 4, 5)), ("key1", ArrayBuffer(2, 1, 3))))
    testPairRdds.reduceByKey(_ ++ _).collect()
    testPairRdds.foldByKey(ArrayBuffer.empty[Int])((buff, elem) => buff ++= elem).collect()

   reduceByKey vs. groupByKey:

  reduceByKey combines on the map side by default, which reduces the amount of data shuffled over the network; groupByKey does no map-side combining and ships every map-side record to the reduce side, where all the aggregation happens.

  That makes reduceByKey sound strictly better, so when is groupByKey useful? Because reduceByKey pre-aggregates on the map side, it shines for aggregation-style computations such as sums, counts, and averages. For operations that are not aggregations, groupByKey is the better fit, for example sorting all values that belong to the same key.

  One caveat with groupByKey: it pulls all the values of a key to the reduce side before doing anything, so a very hot key can blow up executor memory. In that case the key needs to be redesigned, for example by salting it.
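  A common way to redesign a hot key is salting: append a random suffix to the key, aggregate the salted keys first so every group stays small, then strip the salt and aggregate again. A minimal sketch, assuming a sum aggregation over (key, numeric value) pairs like the pairRDD used below (SALT_BUCKETS is an arbitrary choice):

    import scala.util.Random

    val SALT_BUCKETS = 10 // spread every hot key over 10 sub-keys

    val salted  = pairRDD.map { case (k, v) => ((k, Random.nextInt(SALT_BUCKETS)), v) }
    val partial = salted.reduceByKey(_ + _)                  // stage 1: small, salted groups
    val summed  = partial.map { case ((k, _), v) => (k, v) } // drop the salt
      .reduceByKey(_ + _)                                    // stage 2: final per-key result

  This two-stage pattern works for aggregations; for genuine groupByKey use cases such as per-key sorting, the salt has to be handled differently.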

package com.twq.spark.rdd.keyvalue

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}


/**
  * Created by tangweiqun on 2017/8/19.
  */
object ReduceAndGroupByKeyCompare {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("word count")

    val sc = new SparkContext(conf)

    val pairRDD = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 1), ("a", 2),
        ("c", 4), ("b", 1), ("a", 1), ("a", 1)), 3)
    // Both produce res1: Array[(String, Int)] = Array((b,3), (a,5), (c,5))
    pairRDD.reduceByKey(new HashPartitioner(2), _ + _).collect() // combine on the map side, shuffle into the new partitions, combine again on the reduce side

    pairRDD.groupByKey(new HashPartitioner(2)).map(t => (t._1, t._2.sum)).collect()  // shuffle everything straight to the new partitions and aggregate only on the reduce side

    // Sort all values under the same key
    pairRDD.groupByKey().map { case (key, iter) =>
      val sortedValues = iter.toArray.sorted
      (key, sortedValues)
    }.collect()

    // If a single key has a huge number of values, groupByKey may OOM; redesigning (e.g. salting) the key avoids this
  }

}

   Summary of the operators built on combineByKey: aggregateByKey, reduceByKey, foldByKey, groupByKey

  • aggregateByKey: compute several statistics per key at once
  • reduceByKey and foldByKey both aggregate values per key; foldByKey takes an explicit initial value, which makes it preferable in some cases, e.g. avoiding the empty-collection error (shown above for the non key-value reduce/fold) and cutting down temporary objects by folding into a mutable accumulator
  • reduceByKey vs. groupByKey: reduceByKey combines on the map side and is meant for aggregations, while groupByKey does not combine on the map side and suits non-aggregation work such as sorting the values of each key

  

  cogroup groups two RDDs together: for each key it collects the values from both RDDs into a tuple of iterables.

  groupWith, join, leftOuterJoin, rightOuterJoin, and fullOuterJoin are all implemented on top of cogroup.

package com.twq.spark.rdd.keyvalue

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by tangweiqun on 2017/8/19.
  */
object CogroupApiTest {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("word count")

    val sc = new SparkContext(conf)

    val pairRDD = sc.parallelize[(Int, Int)](Seq((1, 2), (3, 4), (3, 6), (5, 6)), 4)

    val otherRDD = sc.parallelize(Seq((3, 9), (4, 5)))

    //res0: Array[(Int, (Iterable[Int], Iterable[Int]))]
    // = Array((4,(CompactBuffer(),CompactBuffer(5))), (1,(CompactBuffer(2),CompactBuffer())),
    // (5,(CompactBuffer(6),CompactBuffer())), (3,(CompactBuffer(6, 4),CompactBuffer(9))))
    pairRDD.cogroup(otherRDD).collect() // group by key: for each key, the values from the two RDDs end up in two separate CompactBuffers

    // groupWith is an alias of cogroup and behaves exactly the same
    pairRDD.groupWith(otherRDD).collect()

    // Array[(Int, (Int, Int))] = Array((3,(4,9)), (3,(6,9)))
    pairRDD.join(otherRDD).collect()  // like an SQL inner join: only keys present in both RDDs are joined, producing every pairing of the matching values

    // Array[(Int, (Int, Option[Int]))]
    // = Array((1,(2,None)), (5,(6,None)), (3,(4,Some(9))), (3,(6,Some(9))))
    pairRDD.leftOuterJoin(otherRDD).collect()  // keep every key from the left RDD; keys missing from the right RDD show None

    // Array[(Int, (Option[Int], Int))] = Array((4,(None,5)), (3,(Some(4),9)), (3,(Some(6),9)))
    pairRDD.rightOuterJoin(otherRDD).collect() // keep every key from the right RDD; keys missing from the left RDD show None

    // Array[(Int, (Option[Int], Option[Int]))]
    // = Array((4,(None,Some(5))), (1,(Some(2),None)), (5,(Some(6),None)),
    // (3,(Some(4),Some(9))), (3,(Some(6),Some(9))))
    pairRDD.fullOuterJoin(otherRDD).collect() 

    // Remove keys that also appear in otherRDD; in this example key 3 is removed
    // Array[(Int, Int)] = Array((1,2), (5,6))
    pairRDD.subtractByKey(otherRDD).collect()  // think of it as a set difference on keys: any key present in otherRDD is dropped

  }

}

   How cogroup is implemented:

  Suppose RDD1 has no partitioner and RDD2 is partitioned with HashPartitioner(2).

  For RDD3 = RDD1.cogroup(RDD2), RDD3's partitioner is also HashPartitioner(2). Since RDD1 has no partitioner, its data is shuffled according to HashPartitioner(2), while RDD2 is already laid out that way, so its partitions can be read where they are; this keeps the already-partitioned RDD2 data local and avoids moving it. The partitioner-selection details are in the source below.

  If no partitioner is passed to cogroup, it falls back to defaultPartitioner:

  def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))] = self.withScope {
    cogroup(other, defaultPartitioner(self, other))
  }

   defaultPartitioner ends up taking the larger of the RDDs' partition counts and Spark's default parallelism as the number of partitions, so in the scenario above it reuses RDD2's partitioner:

  def defaultPartitioner(rdd: RDD[_], others: RDD[_]*): Partitioner = {
    val rdds = (Seq(rdd) ++ others)
//    keep the RDDs that already have a partitioner with more than 0 partitions
    val hasPartitioner = rdds.filter(_.partitioner.exists(_.numPartitions > 0))

//    among those, pick the RDD with the most partitions
    val hasMaxPartitioner: Option[RDD[_]] = if (hasPartitioner.nonEmpty) {
      Some(hasPartitioner.maxBy(_.partitions.length))
    } else {
      None
    }
//    if spark.default.parallelism is set, use it; otherwise use the largest partition count among the RDDs
    val defaultNumPartitions = if (rdd.context.conf.contains("spark.default.parallelism")) {
      rdd.context.defaultParallelism
    } else {
      rdds.map(_.partitions.length).max
    }
    
    // If the existing max partitioner is an eligible one, or its partitions number is larger
    // than or equal to the default number of partitions, use the existing partitioner.
    // finally, reuse the existing partitioner if it is eligible or has at least as many partitions as the default
    if (hasMaxPartitioner.nonEmpty && (isEligiblePartitioner(hasMaxPartitioner.get, rdds) ||
        defaultNumPartitions <= hasMaxPartitioner.get.getNumPartitions)) {
      hasMaxPartitioner.get.partitioner.get
    } else {
      new HashPartitioner(defaultNumPartitions)
    }
  }

   Reading further into the cogroup source, the actual work of gathering all the values happens in CoGroupedRDD, a concrete subclass of RDD; let's look at its implementation.

  Start with getDependencies. Because the partitioner is passed in, the dependency type is decided by comparing each input RDD's partitioner with the one given to the CoGroupedRDD: if they are equal, the dependency is narrow (OneToOneDependency); otherwise it is a ShuffleDependency. In our scenario RDD2's partitioner matches, so its dependency is narrow, while RDD1, which does need to be reshuffled, gets a ShuffleDependency; getDependencies returns a Seq holding these two dependencies.
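  A quick way to confirm this (a sketch, assuming sc): build the two RDDs from the scenario, cogroup them with HashPartitioner(2), and walk one level down to the underlying CoGroupedRDD to inspect its dependencies.

    import org.apache.spark.HashPartitioner

    val rdd1 = sc.parallelize(Seq((1, "a"), (2, "b")), 2)                              // no partitioner
    val rdd2 = sc.parallelize(Seq((1, 1), (2, 2))).partitionBy(new HashPartitioner(2))

    // cogroup wraps a CoGroupedRDD in a mapValues step, so look at its parent
    val cogrouped    = rdd1.cogroup(rdd2, new HashPartitioner(2))
    val coGroupedRdd = cogrouped.dependencies.head.rdd
    coGroupedRdd.dependencies.map(_.getClass.getSimpleName).foreach(println)
    // Expected: ShuffleDependency for rdd1 (must be repartitioned),
    //           OneToOneDependency for rdd2 (its partitioner already matches)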

  getPartitions allocates an array whose length is the partitioner's number of partitions and, for every output partition, records per input RDD how to read it: for a shuffle dependency nothing extra is stored (the data arrives via the shuffle), while for a narrow dependency it records the specific parent partition to read (NarrowCoGroupSplitDep wrapping rdd.partitions(i)).

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](part.numPartitions)
    for (i <- 0 until array.length) {
      // Each CoGroupPartition will have a dependency per contributing RDD
      array(i) = new CoGroupPartition(i, rdds.zipWithIndex.map { case (rdd, j) =>
        // Assume each RDD contributed a single dependency, and get it
        dependencies(j) match {
          case s: ShuffleDependency[_, _, _] =>
            None
          case _ =>
            Some(new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i)))
        }
      }.toArray)
    }
    array
  }

   Finally, compute iterates over the per-partition dependencies and reads each one, using a different read path for narrow versus shuffle dependencies. The (key, value) records read back are fed into insertAll on an external map, and createExternalMap is itself built from createCombiner, mergeValue, and mergeCombiners.

    val map = createExternalMap(numRdds)
    for ((it, depNum) <- rddIterators) {
      map.insertAll(it.map(pair => (pair._1, new CoGroupValue(pair._2, depNum))))
    }

   

   mapValues, flatMapValues, sortBy, sortByKey, filterByRange: the two sort operators use RangePartitioner, which guarantees ordering between partitions, so sorting only has to order the data inside each partition. filterByRange relies on the same partitioner: when filtering by a key range it can skip partitions whose range falls outside, which improves performance.
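  A quick check of this behaviour (a sketch, assuming sc): after sortByKey the result carries a RangePartitioner, and every partition holds a contiguous, ordered key range, which is exactly what filterByRange exploits to skip partitions.

    val sorted = sc.parallelize(Seq((5, 2), (7, 4), (3, 3), (2, 4)), 4).sortByKey()

    println(sorted.partitioner) // Some(org.apache.spark.RangePartitioner@...)

    // each partition covers one contiguous key range, so partition i's keys all precede partition i+1's
    sorted.glom().map(_.map(_._1).mkString(",")).collect().foreach(println)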

package com.twq.spark.rdd.keyvalue

import org.apache.spark.{HashPartitioner, RangePartitioner, SparkConf, SparkContext}

/**
  * Created by tangweiqun on 2017/8/19.
  */
object OtherApiTest {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("word count")

    val sc = new SparkContext(conf)

    val rdd = sc.parallelize(Seq(2, 3, 3, 6, 2))
    rdd.distinct().collect()

    val pairRDD =
      sc.parallelize[(Int, Int)](Seq((5, 2), (7, 4), (3, 3), (2, 4)), 4).partitionBy(new HashPartitioner(2))

    val mapValuesRDD = pairRDD.mapValues(x => x + 1)
    mapValuesRDD.collect()

    mapValuesRDD.partitioner // remembers the parent RDD's partitioner

    val flatMapValuesRDD = pairRDD.flatMapValues(x => (x to 5))
    flatMapValuesRDD.collect()

    flatMapValuesRDD.partitioner

    pairRDD.keys.collect()

    pairRDD.values.collect()

    pairRDD.sortByKey().collect()

    pairRDD.sortByKey(false).collect()

    pairRDD.sortBy(_._1).collect()
    pairRDD.sortBy(_._1, false).collect()

    val rangeTestRDD =
      sc.parallelize[(Int, Int)](Seq((5, 2), (7, 4), (3, 6), (2, 6), (3, 6), (2, 6)), 4)
    rangeTestRDD.filterByRange(3, 5).collect()  // keep the pairs whose key falls in [3, 5]
  }

}

   count api

package com.twq.spark.rdd.keyvalue

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by tangweiqun on 2017/8/19.
  */
object CountApiTest {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("word count")

    val sc = new SparkContext(conf)

    //
    val numberRDD = sc.parallelize(1 to 10000 , 200)
    // union the RDD with itself 5 times, giving 50,000 elements in total
    val numbersRDD = numberRDD ++ numberRDD ++ numberRDD ++ numberRDD ++ numberRDD

    numbersRDD.count()

    //first argument: the timeout in milliseconds
    //second argument: the desired confidence of the approximation
    //if you keep calling countApprox with 0.9, about 90% of the returned intervals should contain the true count
    //if the count finishes within the timeout, the exact value is returned instead of an estimate
    //if it does not finish in time, the result is estimated from the tasks that did finish, at the requested confidence
    val resultCount = numbersRDD.countApprox(200, 0.9)
    resultCount.initialValue.mean
    resultCount.initialValue.low
    resultCount.initialValue.high
    resultCount.initialValue.confidence
    resultCount.getFinalValue().mean

    numbersRDD.countByValue()
    val resultCountValue = numbersRDD.countByValueApprox(200, 0.9)
    resultCountValue.initialValue(1).mean

    // roughly 9760; with no argument the relative accuracy defaults to 0.05
    numbersRDD.countApproxDistinct()
    // roughly 9760
    numbersRDD.countApproxDistinct(0.05)
    //8224
    numbersRDD.countApproxDistinct(0.1)
    // 10000; the smaller the parameter, the more precise the estimate
    numbersRDD.countApproxDistinct(0.006)

    val pair = sc.parallelize((1 to 10000).zipWithIndex)

    pair.collect()

    val pairFive = pair ++ pair ++ pair ++ pair ++ pair

    pairFive.countByKey()

    pairFive.countByKeyApprox(10, 0.95)

    //implemented with HyperLogLogPlus
    //and also built on combineByKey
    //val createCombiner = (v: V) => {
    //  val hll = new HyperLogLogPlus(p, sp)
    //  hll.offer(v)
    //  hll
    //}
    //val mergeValue = (hll: HyperLogLogPlus, v: V) => {
    //  hll.offer(v)
    //  hll
    //}
    //val mergeCombiner = (h1: HyperLogLogPlus, h2: HyperLogLogPlus) => {
    //  h1.addAll(h2)
    //  h1
    //}
    pairFive.countApproxDistinctByKey(0.1).collect().size

    pairFive.collectAsMap()  // pulls everything to the driver and can easily blow up its memory; not recommended
    pairFive.lookup(5)
  }

}

   union has the same effect as ++, concatenating two RDDs. There are two partitioning cases: if all parent RDDs share the same partitioner, the result keeps that partitioner and its number of partitions;

  if the parents are partitioned differently, the result's partition count is the sum of all the parents' partition counts.
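  A minimal sketch of the two cases (assuming sc):

    import org.apache.spark.HashPartitioner

    val a = sc.parallelize(Seq((1, 1), (2, 2))).partitionBy(new HashPartitioner(4))
    val b = sc.parallelize(Seq((3, 3), (4, 4))).partitionBy(new HashPartitioner(4))

    // all parents share the same partitioner: Spark builds a PartitionerAwareUnionRDD,
    // which keeps that partitioner and its 4 partitions
    println(a.union(b).getNumPartitions) // 4
    println(a.union(b).partitioner)      // Some(HashPartitioner with 4 partitions)

    // parents partitioned differently: a plain UnionRDD, the partition counts simply add up
    val c = sc.parallelize(Seq((5, 5)), 3)
    println(a.union(c).getNumPartitions) // 4 + 3 = 7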

 

   intersection returns the elements common to both RDDs.

  cartesian produces the Cartesian product, i.e. every pairing of an element from one RDD with an element from the other; the number of partitions is the product of the two RDDs' partition counts.

 

 

package com.twq.spark.rdd

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by tangweiqun on 2017/8/19.
  */
object TwoRDDApiTest {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("word count")

    val sc = new SparkContext(conf)

    val oneRDD = sc.parallelize[Int](Seq(1, 2, 3), 3)

    val otherRDD = sc.parallelize(Seq(3, 4, 5), 3)

    val unionRDD = oneRDD.union(otherRDD)
    unionRDD.collect() // Array[Int] = Array(1, 2, 3, 3, 4, 5)

    val plusPlusRDD = oneRDD ++ otherRDD
    plusPlusRDD.collect() // Array[Int] = Array(1, 2, 3, 3, 4, 5)

    val thirdRDD = sc.parallelize(Seq(5, 5, 5), 3)
    val unionAllRDD = sc.union(Seq(oneRDD, otherRDD, thirdRDD))
    oneRDD.union(otherRDD).union(thirdRDD).collect()
    unionAllRDD.collect()

    val intersectionRDD = oneRDD.intersection(otherRDD)
    intersectionRDD.collect() // Array[Int] = Array(3)

    val subtractRDD = oneRDD.subtract(otherRDD)
    subtractRDD.collect() // Array[Int] = Array(1, 2)

    // Array[(Int, Int)] = Array((1,3), (1,4), (1,5), (2,3), (2,4), (2,5), (3,3), (3,4), (3,5))
    val cartesianRDD = oneRDD.cartesian(otherRDD)
    cartesianRDD.collect()

    // requires the two RDDs to have the same number of partitions and the same number of elements in each partition
    val zipRDD = oneRDD.zip(otherRDD)
    zipRDD.collect() // Array[(Int, Int)] = Array((1,3), (2,4), (3,5))

    // requires the same number of partitions, but the partitions may hold different numbers of elements
    val zipPartitionRDD =
      oneRDD.zipPartitions(otherRDD)((iterator1, iterator2)
      => Iterator(iterator1.sum + iterator2.sum))
    zipPartitionRDD.collect() // Array[Int] = Array(4, 6, 8)


    val zipPartition3RDD =
      oneRDD.zipPartitions(otherRDD, thirdRDD)((iterator1, iterator2, iterator3)
      => Iterator(iterator1.sum + iterator2.sum + iterator3.sum))
    zipPartition3RDD.collect()

  }

}

 

Using zip and how it works

  zip pairs up the elements at matching positions of the partitions with the same index in two RDDs. Note: the two RDDs must have the same number of partitions and the same number of elements in every partition, otherwise the job fails at runtime.

  zipPartitions applies a user function to the iterators of same-index partitions of the two RDDs and returns its results. Note: the two RDDs must have the same number of partitions, otherwise it also fails.

  zip itself is implemented on top of zipPartitions, roughly as sketched below.
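  The sketch below (a standalone helper that mirrors the shape of RDD.zip in the Spark source) walks the two partition iterators in lock step and fails if they run out at different times:

    import org.apache.spark.SparkException
    import org.apache.spark.rdd.RDD
    import scala.reflect.ClassTag

    def myZip[A: ClassTag, B: ClassTag](left: RDD[A], right: RDD[B]): RDD[(A, B)] =
      left.zipPartitions(right, preservesPartitioning = false) { (it1, it2) =>
        new Iterator[(A, B)] {
          def hasNext: Boolean = (it1.hasNext, it2.hasNext) match {
            case (true, true)   => true
            case (false, false) => false
            case _ => throw new SparkException(
              "Can only zip RDDs with same number of elements in each partition")
          }
          def next(): (A, B) = (it1.next(), it2.next())
        }
      }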

 

 

    val oneRDD = sc.parallelize[Int](Seq(1, 2, 3), 3)

    val otherRDD = sc.parallelize(Seq(3, 4, 5), 3)

    val thirdRDD = sc.parallelize(Seq(5, 5, 5), 3)

    // requires the two RDDs to have the same number of partitions and the same number of elements in each partition
    val zipRDD = oneRDD.zip(otherRDD)
    zipRDD.collect() // Array[(Int, Int)] = Array((1,3), (2,4), (3,5))

    // requires the same number of partitions, but the partitions may hold different numbers of elements
    val zipPartitionRDD =
      oneRDD.zipPartitions(otherRDD)((iterator1, iterator2)
      => Iterator(iterator1.sum + iterator2.sum))
    zipPartitionRDD.collect() // Array[Int] = Array(4, 6, 8)


    val zipPartition3RDD =
      oneRDD.zipPartitions(otherRDD, thirdRDD)((iterator1, iterator2, iterator3)
      => Iterator(iterator1.sum + iterator2.sum + iterator3.sum))
    zipPartition3RDD.collect()

   

RDD caching

  RDD caching simply means keeping an RDD's data in memory and/or on disk so that it does not have to be recomputed.

   persist(StorageLevel) takes a StorageLevel that decides where the data is cached. The persist source does little more than assign that level to the RDD's storageLevel field. Why does assigning a field cache anything? The answer is in the read path: when a partition is read (a task, or a child RDD's compute, calls the RDD's iterator method), iterator checks the storage level. If it is not NONE, a level has been set and getOrCompute is called, which first asks the BlockManager for the cached block; on a hit the data is returned directly, and on a miss the partition is computed, stored in the BlockManager, and served from that cache on subsequent reads.

  cache stores the data in memory; it simply calls persist, whose default storage level is MEMORY_ONLY.

  unpersist removes the RDD's data from the cache.
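  A small runnable sketch of that behaviour (assuming sc): persist only records the storage level; the data is materialized the first time an action drives iterator/getOrCompute.

    import org.apache.spark.storage.StorageLevel

    val doubled = sc.parallelize(1 to 100000, 4).map(_ * 2)

    doubled.persist(StorageLevel.MEMORY_ONLY) // only sets the storageLevel field; nothing is computed yet
    println(sc.getPersistentRDDs.size)        // 1: the RDD is registered as persistent, but no blocks exist yet

    doubled.count() // first action: iterator sees a non-NONE level, getOrCompute misses the BlockManager,
                    // computes each partition and stores it as a block
    doubled.count() // second action: getOrCompute finds the cached blocks and skips the recomputation

    doubled.unpersist() // drop the cached blocks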

package com.twq.spark.rdd

import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by tangweiqun on 2017/8/19.
  */
object RDDPersistApiTest {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("word count")

    val sc = new SparkContext(conf)

    val hdfsFileRDD = sc.textFile("hdfs://master:9999/users/hadoop-twq/person.json")

    val mapRDD = hdfsFileRDD.flatMap(str => str.split(" "))
    //Storage levels:
    //MEMORY_ONLY: keep the data in memory only
    //DISK_ONLY: keep the data on disk only
    //MEMORY_AND_DISK: keep in memory first, spilling to disk when memory runs out
    //OFF_HEAP: keep the data in off-heap memory
    hdfsFileRDD.persist(StorageLevel.MEMORY_ONLY)
    hdfsFileRDD.getStorageLevel

    mapRDD.getStorageLevel // StorageLevel.NONE
    mapRDD.cache() // cache in memory only (MEMORY_ONLY)

    mapRDD.count()

    mapRDD.collect()

    mapRDD.unpersist()

    hdfsFileRDD.unpersist()

  }

}

 

  The checkpoint mechanism

 

   Checkpointing, like persist, persists an RDD's data, but unlike persist it also truncates the RDD's lineage. A lineage that grows too long hurts computation and recovery performance, so in those situations checkpointing can improve things.

  Note: the checkpoint is only written when an action is triggered afterwards, and calling checkpoint on an RDD that has already run a job has no effect.

package com.twq.spark.rdd.checkpoint

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by tangweiqun on 2017/8/23.
  */
object  CheckPointTest {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("word count")

    val sc = new SparkContext(conf)

    val pairRDD = sc.parallelize[(Int, Int)](Seq((1, 2), (3, 4), (3, 6)), 2)

    val filterRDD = pairRDD.filter { case (key, value) => key > 2 }

    val mapRDD = filterRDD.map { case (key, value) => (key + 1, value + 1) }

    mapRDD.toDebugString

    mapRDD.localCheckpoint()

    mapRDD.collect()

    mapRDD.toDebugString

    val otherFilterRDD = mapRDD.filter {case (key, value) => key + value > 1}

    val otherMapRDD = otherFilterRDD.map { case (key, value) => (key + 1, value + 1) }

    otherMapRDD.toDebugString

    sc.setCheckpointDir("hdfs://master:9999/users/hadoop-twq/checkpoint")  // checkpointing to HDFS requires setting the checkpoint directory first

    otherMapRDD.checkpoint()
    otherMapRDD.toDebugString

    val someMapRDD = otherMapRDD.map { case (key, value) => (key + 1, value + 1) }

    someMapRDD.toDebugString
    someMapRDD.collect()

    someMapRDD.checkpoint() // has no effect, because this RDD has already executed a job
    someMapRDD.collect()
  }

}

 

 

   

  How checkpoint works

  Looking at the source, the checkpoint API is implemented the same way persist is: it assigns to a member variable, in this case checkpointData. So how is the checkpoint actually triggered, and how is the checkpointed data read back?

  As noted above, the checkpoint only runs when an action such as collect is triggered. Following the action down, it calls the RDD's doCheckpoint method, which checks whether checkpointData is defined; if so, it calls checkpointData.get.checkpoint(), which writes the data to the configured storage (materializing a checkpoint RDD), marks the RDD as checkpointed, and clears its dependencies.

  And reading the data back? Look at iterator again: if a storageLevel is set, the cache path described earlier is taken; otherwise the checkpoint branch runs. There the code checks the checkpointed flag: if the RDD has been checkpointed, the data is read from the checkpoint RDD; if not, the partition is recomputed (and will be checkpointed once the job's doCheckpoint runs).
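  A runnable sketch of that flow (reusing the checkpoint directory from the example above): checkpoint() only marks the RDD, the next action materializes it through doCheckpoint, and afterwards reads come from the checkpointed data while the old lineage is dropped.

    sc.setCheckpointDir("hdfs://master:9999/users/hadoop-twq/checkpoint")

    val marked = sc.parallelize(1 to 10, 2).map(_ + 1).filter(_ % 2 == 0)
    marked.checkpoint()            // only assigns checkpointData; nothing is written yet

    println(marked.isCheckpointed) // false: no action has run
    marked.count()                 // the action triggers doCheckpoint(), which writes the data and clears the dependencies
    println(marked.isCheckpointed) // true
    println(marked.toDebugString)  // the lineage now starts from the checkpoint RDD; the original parents are gone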

 

 

 

The broadcast mechanism and how to use it

  Broadcast, as the name says, broadcasts data. The scenario: the driver holds the Spark code plus some fixed, read-only data (lookup tables, configuration) that the tasks running on the executors need. Shipping that data along with every task performs poorly, so Spark provides a broadcast mechanism that distributes it to each executor once. The precondition is that the data must not change.

 

 

package com.twq.spark.rdd

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by tangweiqun on 2017/9/3.
  */
object BroadcastTest {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test")

    val sc = new SparkContext(conf)

    val lookupTable = Map("plane" -> "sky", "fish" -> "sea", "people" -> "earth")

    val lookupTableB = sc.broadcast(lookupTable)

    val logData = sc.parallelize(Seq("plane", "fish", "duck", "dirty", "people", "plane"), 2)

    logData.foreach(str => {
      val replaceStrOpt = lookupTableB.value.get(str)
      println("element is : " + str)
      if (replaceStrOpt.isDefined) {
        println(s"============ found value [${replaceStrOpt.get}] for key [${str}]")
      }
    })
  }
}

   Summary: broadcasting small, immutable configuration or reference data down to the executors improves performance.

 

  accumulator

  Accumulators let you monitor what the tasks are doing and aggregate per-task information on the driver: the accumulator is defined on the driver, tasks running on the executors add to it, and the driver merges the per-task results and reads the final value.

package com.twq.spark.rdd

import java.util.concurrent.ConcurrentHashMap
import java.util.function.BiConsumer

import org.apache.spark.util.AccumulatorV2
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by tangweiqun on 2017/9/3.
  */
object AccumulatorTest {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test")

    val sc = new SparkContext(conf)

    val longAccumulator = sc.longAccumulator("count mapped data")
    val collectionAccumulator = sc.collectionAccumulator[String]("collect mapped data")
    val mapAccumulator = new CustomAccumulator
    sc.register(mapAccumulator)  // a custom accumulator must be registered

    val logData = sc.parallelize(Seq("plane", "fish", "duck", "dirty", "people", "plane"), 2)
    // the code above runs on the driver
    logData.foreach(str => {  // this closure runs on the executors
      if (str == "plane") {
        longAccumulator.add(1L)
      }
      try {
        // some code
      } catch {
        case e: Exception => {
          collectionAccumulator.add(e.getMessage)   // example: collect the exception messages thrown inside tasks
        }
      }

      mapAccumulator.add(str)
    })

    longAccumulator.sum // 2: two "plane" elements were counted
    collectionAccumulator.value // the exception messages collected from the tasks (empty here, nothing throws)
    mapAccumulator.value // plane -> 2, fish -> 1, duck -> 1, dirty -> 1, people -> 1
  }
}

class CustomAccumulator extends AccumulatorV2[String, ConcurrentHashMap[String, Int]] {

  private val map = new ConcurrentHashMap[String, Int]()

  override def isZero: Boolean = map.isEmpty

  override def copy(): AccumulatorV2[String, ConcurrentHashMap[String, Int]] = {
    val newAcc = new CustomAccumulator()
    newAcc.map.putAll(map)
    newAcc
  }

  override def reset(): Unit = map.clear()
  // called on the executors, inside tasks
  override def add(v: String): Unit = {
    map.synchronized {
      if (map.containsKey(v)) {
        map.put(v, map.get(v) + 1)
      } else map.put(v, 1)
    }
  }
  // called on the driver when task results are merged
  override def merge(other: AccumulatorV2[String, ConcurrentHashMap[String, Int]]): Unit = other match {
    case o: CustomAccumulator => {
      o.map.forEach(new BiConsumer[String, Int] {
        override def accept(key: String, value: Int): Unit = {
          if (map.containsKey(key)) {
            map.put(key, map.get(key) + value)
          } else {
            map.put(key, value)
          }
        }
      })
    }
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def value: ConcurrentHashMap[String, Int] = map
}

   Summary: accumulators collect statistics about task execution on the executors (element counts, error messages, and so on); they are defined on the driver and updated from the tasks.

 

Storage systems Spark can read from and write to

 

 

 

package com.twq.spark.rdd.sources

import java.sql.{DriverManager, ResultSet}

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.hadoop.mapreduce.lib.input.{TextInputFormat => NewTextInputFormat}
import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat}
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by tangweiqun on 2017/8/26.
  */
object FileSystemApiTest {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test")

    val sc = new SparkContext(conf)

    val data = sc.parallelize(Seq("just test", "hello world"), 1)

    //jdbc mysql and oracle
    def createConnection() = {
      Class.forName("com.mysql.jdbc.Driver")
      DriverManager.getConnection("jdbc:mysql://localhost/test?user=hhh")
    }

    def extractValues(r: ResultSet) = {
      (r.getInt(1), r.getString(2))
    }

    val sql = "select * from test where ? <= id and id <= ?"

    val dataJdbc = new JdbcRDD(sc, createConnection,
      sql, lowerBound = 1, upperBound = 3, numPartitions = 2, mapRow = extractValues)
    dataJdbc.collect()

    data.saveAsTextFile("file:///home/hadoop-twq/spark-course/test")

    // read and write files on the local file system
    sc.textFile("file:///home/hadoop-twq/spark-course/echo.sh").collect()

    // read and write files on HDFS
    // use old api
    data.saveAsTextFile("hdfs://master:9999/users/hadoop-twq/test")
    val keyValueRDD = sc.hadoopFile("hdfs://master:9999/users/hadoop-twq/test/part-00000",
      classOf[TextInputFormat], classOf[LongWritable], classOf[Text]).map {case (key, value) =>
      (key.get(), value.toString)
    }
    keyValueRDD.collect()
    // Both the old and the new MapReduce APIs can read and write HDFS, and Spark integrates both
    // The new API uses abstract classes instead of the old interfaces, which makes the API easier to evolve
    // The new API also introduces a Context object that carries the parameters, which helps compatibility: new parameters are added to the context

    val data2 = sc.hadoopFile("hdfs://master:9999/users/hadoop-twq/test/part-00000",
      classOf[TextInputFormat], classOf[LongWritable], classOf[Text])  // note: data read through the Hadoop APIs is always key-value shaped
    // use new api
    data2.saveAsNewAPIHadoopFile[NewTextOutputFormat[LongWritable, Text]](
      "hdfs://master:9999/users/hadoop-twq/test2")
    sc.newAPIHadoopFile("hdfs://master:9999/users/hadoop-twq/test/part-00000",
      classOf[NewTextInputFormat], classOf[LongWritable], classOf[Text]).
      map { case (_, value) => value.toString } collect()

    // the S3 file system
    sc.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", "YOUR_KEY_ID")
    sc.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", "YOUR_SECRET")
    data.saveAsTextFile("s3n://bucket/test")
    val s3FileInput = sc.textFile("s3n://bucket/*.log")
    s3FileInput.collect()

  }

}
package com.twq.spark.rdd.sources

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.SequenceFile.CompressionType
import org.apache.hadoop.io.compress.Lz4Codec
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapred._
import org.apache.hadoop.mapreduce.TaskType

/**
  * Created by tangweiqun on 2017/8/27.
  */
object MapReduceWriterOldApiTest {
  def main(args: Array[String]): Unit = {
    //1 Configure the job: the output key type, the value type, and the output file format
    val conf = new Configuration()
    val jobConf = new JobConf(conf)
    jobConf.setOutputKeyClass(classOf[NullWritable])
    jobConf.setOutputValueClass(classOf[Text])
    jobConf.setOutputFormat(classOf[TextOutputFormat[NullWritable, Text]])

    //2 Compression settings for the output data
    val codec = Some(classOf[Lz4Codec])
    for (c <- codec) {
      jobConf.setCompressMapOutput(true)
      jobConf.set("mapred.output.compress", "true")
      jobConf.setMapOutputCompressorClass(c)
      jobConf.set("mapred.output.compression.codec", c.getCanonicalName)
      jobConf.set("mapred.output.compression.type", CompressionType.BLOCK.toString)
    }

    //3 Set the directory the data is written to
    val path = "hdfs://master:9999/users/hadoop-twq/test"
    val tempPath = new Path(path)
    val tempFs = tempPath.getFileSystem(conf)
    val finalOutputPath = tempPath.makeQualified(tempFs.getUri, tempFs.getWorkingDirectory)
    FileOutputFormat.setOutputPath(jobConf, finalOutputPath)

    //4 Set up the job- and task-related IDs
    val jobId = new JobID("jobtrackerID", 123)
    val jobContext = new JobContextImpl(jobConf, jobId)
    val taId = new TaskAttemptID(new TaskID(jobId, TaskType.MAP, 0), 0)
    conf.set("mapred.tip.id", taId.getTaskID.toString)
    conf.set("mapred.task.id", taId.toString)
    conf.setBoolean("mapred.task.is.map", true)
    conf.setInt("mapred.task.partition", 0)
    conf.set("mapred.job.id", jobId.toString)

    //5 Choose the name of the output file
    val outputName = "part-" + System.currentTimeMillis()
    val outputPath = FileOutputFormat.getOutputPath(jobConf)
    val fs = outputPath.getFileSystem(jobConf)

    //6 Build a record writer and write the data
    val writer = jobConf.getOutputFormat
      .asInstanceOf[OutputFormat[AnyRef, AnyRef]]
      .getRecordWriter(fs, jobConf, outputName, Reporter.NULL)

    val key = null
    val value = "test"
    writer.write(key, value)

    //7 Close the writer
    writer.close(Reporter.NULL)

  }
}
MapReduce old write
package com.twq.spark.rdd.sources

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptID, TaskType}
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

/**
  * Created by tangweiqun on 2017/8/27.
  */
object MapReduceWriterNewApiTest {
  def main(args: Array[String]): Unit = {
    //1 Build and configure a Job instance: the output key type, the value type, and the output file format
    val conf = new Configuration()
    val job = Job.getInstance(conf)
    job.setOutputKeyClass(classOf[NullWritable])
    job.setOutputValueClass(classOf[Text])
    job.setOutputFormatClass(classOf[TextOutputFormat[NullWritable, Text]])
    val jobConfiguration = job.getConfiguration
    jobConfiguration.set("mapred.output.dir", "hdfs://master:9999/users/hadoop-twq/test")

    //2 Build the job- and task-attempt IDs
    val attemptId = new TaskAttemptID("jobtrackerID", 0, TaskType.REDUCE, 0, 0)
    val hadoopContext = new TaskAttemptContextImpl(jobConfiguration, attemptId)

    //3 Instantiate the output format
    val format = job.getOutputFormatClass.newInstance

    //4 Get the output committer
    val committer = format.getOutputCommitter(hadoopContext)
    committer.setupTask(hadoopContext)

    //5 Get the record writer and write the data
    val writer = format.getRecordWriter(hadoopContext).asInstanceOf[RecordWriter[NullWritable, Text]]

    val key = null
    val value = "test"
    writer.write(key, new Text(value))

    //6 Close the writer and commit the task
    writer.close(hadoopContext)

    committer.commitTask(hadoopContext)
  }
}
MapReduce new write
package com.twq.spark.rdd.sources

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred._
import org.apache.hadoop.util.ReflectionUtils

/**
  * Created by tangweiqun on 2017/8/22.
  */
object MapReduceFileReaderOldApi {
  def main(args: Array[String]): Unit = {
    //1 Build the configuration
    val hadoopConf = new Configuration()
    val jobConf = new JobConf(hadoopConf)

    //2 Set the input file
    FileInputFormat.setInputPaths(jobConf, "hdfs://master:9999/users/hadoop-twq/word.txt")

    //3 Instantiate the input format
    val inputFormat = ReflectionUtils.newInstance(classOf[TextInputFormat], jobConf)
      .asInstanceOf[InputFormat[LongWritable, Text]]

    //4 Get the split information of the file to read
    //4.1 How many splits the file has been divided into
    val minSplit = 1
    val inputSplits = inputFormat.getSplits(jobConf, minSplit)
    val firstSplit = inputSplits(0)

    //4.2 Location info of the first split
    val splitInfoReflections = new SplitInfoReflections
    val lsplit = splitInfoReflections.inputSplitWithLocationInfo.cast(firstSplit)
    val preferLocations = splitInfoReflections.getLocationInfo.invoke(lsplit).asInstanceOf[Array[AnyRef]]

    val firstPreferLocation = preferLocations(0)
    val locationStr = splitInfoReflections.getLocation.invoke(firstPreferLocation).asInstanceOf[String]
    val isMemory = splitInfoReflections.isInMemory.invoke(firstPreferLocation).asInstanceOf[Boolean]

    //5 Read the records of the first split
    val reader = inputFormat.getRecordReader(firstSplit, jobConf, Reporter.NULL)
    val key = reader.createKey()
    val value = reader.createValue()

    val finished = !reader.next(key, value)

  }
}

class SplitInfoReflections {
  def classForName(className: String): Class[_] = {
    Class.forName(className, true, Thread.currentThread().getContextClassLoader)
    // scalastyle:on classforname
  }

  val inputSplitWithLocationInfo =
    classForName("org.apache.hadoop.mapred.InputSplitWithLocationInfo")
  val getLocationInfo = inputSplitWithLocationInfo.getMethod("getLocationInfo")
  val newInputSplit = classForName("org.apache.hadoop.mapreduce.InputSplit")
  val newGetLocationInfo = newInputSplit.getMethod("getLocationInfo")
  val splitLocationInfo = classForName("org.apache.hadoop.mapred.SplitLocationInfo")
  val isInMemory = splitLocationInfo.getMethod("isInMemory")
  val getLocation = splitLocationInfo.getMethod("getLocation")
}
MapReduce old read
package com.twq.spark.rdd.sources

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, TextInputFormat}
import org.apache.hadoop.mapreduce.task.{JobContextImpl, TaskAttemptContextImpl}

/**
  * Created by tangweiqun on 2017/8/22.
  */
object MapReduceFileReaderNewApi {
  def main(args: Array[String]): Unit = {
    //1 Build a Job instance
    val hadoopConf = new Configuration()
    val job = Job.getInstance(hadoopConf)

    //2 Set the full path of the input file
    FileInputFormat.setInputPaths(job, "hdfs://master:9999/users/hadoop-twq/word.txt")

    //3 Instantiate the input format
    val inputFormat = classOf[TextInputFormat].newInstance()

    val updateConf = job.getConfiguration

    //4 Get the split information of the file to read
    //4.1 How many splits the file has been divided into
    val minSplit = 1
    val jobId = new JobID("jobTrackerId", 123)
    val jobContext = new JobContextImpl(updateConf, jobId)
    val inputSplits = inputFormat.getSplits(jobContext).toArray
    val firstSplit = inputSplits(0).asInstanceOf[InputSplit]

    //4.2 Location info of the first split
    val splitInfoReflections = new SplitInfoReflections
    val lsplit = splitInfoReflections.inputSplitWithLocationInfo.cast(firstSplit)
    val preferLocations = splitInfoReflections.getLocationInfo.invoke(lsplit).asInstanceOf[Array[AnyRef]]

    val firstPreferLocation = preferLocations(0)
    val locationStr = splitInfoReflections.getLocation.invoke(firstPreferLocation).asInstanceOf[String]
    val isMemory = splitInfoReflections.isInMemory.invoke(firstPreferLocation).asInstanceOf[Boolean]


    //5 Read the records of the first split
    val attemptId = new TaskAttemptID("jobTrackerId", 123, TaskType.MAP, 0, 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(updateConf, attemptId)
    val reader = inputFormat.createRecordReader(firstSplit, hadoopAttemptContext)
    reader.initialize(firstSplit, hadoopAttemptContext)
    val isFirst = reader.nextKeyValue()
    val key = reader.getCurrentKey
    val value = reader.getCurrentValue

  }
}
MapReduce new read

 

  How HadoopRDD works and is implemented

  To be covered in a later update....

 

Common file formats supported by Spark

  SequenceFiles are a good fit for the many-small-files problem: pack the files into one SequenceFile with the file name as the key and the file content as the value.
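  A sketch of that small-files pattern (the HDFS directories are placeholders): wholeTextFiles already yields (fileName, content) pairs, and they can be packed into a single SequenceFile and read back later.

    // pack many small files into one SequenceFile keyed by file name
    val smallFiles = sc.wholeTextFiles("hdfs://master:9999/users/hadoop-twq/small-files/")
    smallFiles.saveAsSequenceFile("hdfs://master:9999/users/hadoop-twq/packed/")

    // reading it back restores the (fileName, fileContent) pairs
    val packed = sc.sequenceFile[String, String]("hdfs://master:9999/users/hadoop-twq/packed/part-00000")
    packed.keys.collect()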

 

 

package com.twq.spark.rdd.sources

import java.io.{StringReader, StringWriter}

import au.com.bytecode.opencsv.{CSVReader, CSVWriter}
import org.apache.hadoop.io.SequenceFile.CompressionType
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.spark.{SparkConf, SparkContext}


/**
  * Created by tangweiqun on 2017/8/24.
  */
object CommonFileFormatApiTest {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test")

    val sc = new SparkContext(conf)

    // text file format
    val data = sc.parallelize(Seq("i am the first test", "what about you", "hello world"), 3)

    data.saveAsTextFile("hdfs://master:9999/users/hadoop-twq/text/")

    val textFileInputFromHdfs = sc.textFile("hdfs://master:9999/users/hadoop-twq/text/part-00001")
    textFileInputFromHdfs.collect()

    // csv file format
    val persons = sc.parallelize(Seq(Person("jeffy", 30), Person("tom", 24)), 1)
    persons.map(person => List(person.name, person.age.toString).toArray).mapPartitions(people => {
      import scala.collection.JavaConversions._
      val stringWriter = new StringWriter()
      val csvWriter = new CSVWriter(stringWriter)
      csvWriter.writeAll(people.toList)
      Iterator(stringWriter.toString)
    }).saveAsTextFile("hdfs://master:9999/users/hadoop-twq/csv/")

    val peopleWithCsv = sc.textFile("hdfs://master:9999/users/hadoop-twq/csv/part-00000").map(line => {
      val reader = new CSVReader(new StringReader(line))
      reader.readNext()
    })
    peopleWithCsv.collect()

    // sequence file format
    val sequenceFileData = sc.parallelize(List(("panda", 3), ("kay", 6), ("snail", 2)))

    sc.hadoopConfiguration.setBoolean(FileOutputFormat.COMPRESS, true)
    sc.hadoopConfiguration.set(FileOutputFormat.COMPRESS_TYPE, CompressionType.NONE.toString)
    //sc.hadoopConfiguration.set(FileOutputFormat.COMPRESS_TYPE, CompressionType.RECORD.toString)
    //sc.hadoopConfiguration.set(FileOutputFormat.COMPRESS_TYPE, CompressionType.BLOCK.toString)
    sequenceFileData.saveAsSequenceFile("hdfs://master:9999/users/hadoop-twq/sequence/")

    val sequenceFileInput = sc.sequenceFile("hdfs://master:9999/users/hadoop-twq/sequence/part-00003",
      classOf[Text], classOf[IntWritable])

    //sequenceFileInput.collect()   // collecting the raw Hadoop Writables directly is not supported

    sequenceFileInput.map { case (x, y) => (x.toString, y.get()) }.collect()




    // object file format
    // an object file is just a sequence file whose key type is org.apache.hadoop.io.NullWritable
    persons.saveAsObjectFile("hdfs://master:9999/users/hadoop-twq/object")

    val objectFileInput = sc.objectFile[Person]("hdfs://master:9999/users/hadoop-twq/object/part-00000")
    objectFileInput.collect()

  }

}

case class Person(name: String, age: Int)

   

  binaryFiles and binaryRecords both read binary files; binaryRecords additionally requires every record to have the same fixed length.

  Because binaryFiles returns streams rather than loading the file contents, it takes almost no memory, so it can be combined with a broadcast to hand scripts to the executors for them to download and run.

package com.twq.spark.rdd.sources

import java.io.FileOutputStream

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by tangweiqun on 2017/9/2.
  */
object BinaryDataFileFormat {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test")

    val sc = new SparkContext(conf)

    val wholeTextFiles = sc.wholeTextFiles("hdfs://master:9999/users/hadoop-twq/text/")
    wholeTextFiles.collect()
    // similar to wholeTextFiles: the file name is the key and the file content is the value
    // wholeTextFiles returns the text content, while binaryFiles returns a binary data stream (convert the stream to a String to inspect the content)
    val binaryFilesRDD = sc.binaryFiles("hdfs://master:9999/users/hadoop-twq/text/")

    binaryFilesRDD.collect()

    binaryFilesRDD.map { case (fileName, stream) =>
      (fileName, new String(stream.toArray()))
    }.collect()


    // can be used to push scripts stored on HDFS to every executor
    val binaryFilesStreams = binaryFilesRDD.collect()
    val binaryFilesStreamsB = sc.broadcast(binaryFilesStreams)

    val data = sc.parallelize(Seq(2, 3, 5, 2, 1), 2)
    data.foreachPartition(iter => {
      val allFileStreams = binaryFilesStreamsB.value
      allFileStreams.foreach { case (fileName, stream) =>
        val inputStream = stream.open()
        val fileOutputStream = new FileOutputStream(s"/local/path/fileName-${fileName}")

        val buf = new Array[Byte](4096)
        var hasData = true

        // copy the stream into a local file until EOF
        while (hasData) {
          val r = inputStream.read(buf)
          if (r == -1) hasData = false
          else fileOutputStream.write(buf, 0, r)
        }
        fileOutputStream.close()
        inputStream.close()
      }
    })


    val binaryFileData = sc.parallelize[Array[Byte]](List(Array[Byte](2, 3),
      Array[Byte](3, 4), Array[Byte](5, 6)))

    binaryFileData.saveAsTextFile("hdfs://master:9999/users/hadoop-twq/binary/")

    val binaryRecordsRDD = sc.binaryRecords("hdfs://master:9999/users/hadoop-twq/binary/part-00002", 2)
    binaryRecordsRDD.collect()
  }
}

   

  Avro files (row-oriented storage) and Parquet files (columnar storage)

        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-column</artifactId>
            <version>1.8.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-hadoop</artifactId>
            <version>1.8.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-avro</artifactId>
            <version>1.8.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro</artifactId>
            <version>1.7.7</version>
        </dependency>
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro-mapred</artifactId>
            <version>1.7.7</version>
        </dependency>

 

package com.twq.spark.rdd.sources

import org.apache.spark.sql.{SaveMode, SparkSession}
import com.databricks.spark.avro._


/**
  * Created by tangweiqun on 2017/8/24.
  *
  */
object ParquetAvroApiTest {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder()
      .appName("Spark SQL basic example")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()

    import spark.implicits._

    val personDf =
      spark.sparkContext.parallelize(Seq(Person("jeffy", 30), Person("tomy", 23)), 1).toDF()

    //avro
    personDf.write.mode(SaveMode.Overwrite).avro("hdfs://master:9999/users/hadoop-twq/avro")
    val avroPersonDf = spark.read.avro("hdfs://master:9999/users/hadoop-twq/avro")
    avroPersonDf.show()

    //parquet
    personDf.write.mode(SaveMode.Overwrite).parquet("hdfs://master:9999/users/hadoop-twq/parquet")
    val parquetPersonDF = spark.read.parquet("hdfs://master:9999/users/hadoop-twq/parquet")
    parquetPersonDF.show()

    //json
    personDf.write.mode(SaveMode.Overwrite).json("hdfs://master:9999/users/hadoop-twq/json")
    val jsonPersonDf = spark.read.json("hdfs://master:9999/users/hadoop-twq/json")
    jsonPersonDf.show()
  }

}

   Open question: what are the characteristics and typical use cases of each file format, and how do they differ from Spark SQL's storage?

  

posted @ 2020-06-10 22:10  财经知识狂魔