Sorting

    val conf = new SparkConf().setMaster("local").setAppName("sort")
    val sc = new SparkContext(conf)
    //Requirement: compute each site's PV and UV from the data, and show only the top 3 // use 3 partitions
    //Data format: 199.111.148.214    重庆    2018-11-12    1542011088714    6755235587059844279    www.taobao.com    Comment
    val fileRDD = sc.textFile("data/bigdata-spark_data_pvuvdata.txt", 3)
    val mapRDD = fileRDD.map(line => (line.split("\t")(5), 1))
    //www.taobao.com 1
    val wordCount = mapRDD.reduceByKey(_ + _)
    //www.taobao.com n
    val fanzhuan = wordCount.map(x => (x._2, x._1))
    //n www.taobao.com
    val sortKey = fanzhuan.sortByKey(false)
    val wordByTop5 = sortKey.map(_.swap) // _.swap is equivalent to x => (x._2, x._1)
    val tuples = wordByTop5.take(3)
    tuples.foreach(println)
    /**
     * Alternative: use top instead of the full sort (see the sketch after this block)
     * (www.taobao.com,18771)
     * (www.mi.com,18728)
     * (www.baidu.com,18636)
     */
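A minimal sketch of the top-based variant mentioned above, reusing mapRDD from this snippet: RDD.top(n) pulls the n largest elements to the driver using the given Ordering, so the sortByKey/swap round-trip is not needed.

    mapRDD.reduceByKey(_ + _)
      .top(3)(Ordering.by[(String, Int), Int](_._2))
      .foreach(println)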





    // Requirement: deduplicate by IP per site (UV)
    val mapList = fileRDD.map(line => {
      val split = line.split("\t")
      (split(5), split(0))
    })

    val disWords = mapList.distinct()
    // www.taobao.com 199.111.148.214
    val words = disWords.map(info => (info._1, 1))
    //www.taobao.com 1
    val wordsCounts = words.reduceByKey(_ + _)
    val uvTuples = wordsCounts.sortBy(_._2, false).take(5)

    /**
     * (www.taobao.com,15791)
     * (www.mi.com,15769)
     * (www.gome.com.cn,15740)
     * (www.dangdang.com,15690)
     * (www.baidu.com,15653)
     */
    uvTuples.foreach(println)
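An alternative UV count, as a sketch only (not from the original): collect the distinct IPs per site into a Set with aggregateByKey instead of calling distinct(); note that the per-key Set can grow large for high-cardinality sites.

    val uvBySet = fileRDD
      .map(line => { val f = line.split("\t"); (f(5), f(0)) })
      .aggregateByKey(Set.empty[String])(_ + _, _ ++ _)   // seqOp adds an IP, combOp merges partition sets
      .mapValues(_.size)
    uvBySet.sortBy(_._2, false).take(5).foreach(println)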
 

 

 

 

 

 

Aggregation

    val conf = new SparkConf().setAppName("aggregation").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")


    val data = sc.parallelize(List(
      ("zhangsan",111),
      ("zhangsan",222),
      ("zhangsan",333),
      ("lisi",444),
      ("lisi",555),
      ("lisi",666),
      ("wangwu",777)
    ))

    val list02 = data.groupByKey()
    list02.foreach(println)
    /**
     * groupByKey is implemented on top of combineByKeyWithClassTag (see the sketch after this block)
     * (zhangsan,CompactBuffer(111, 222, 333))
     * (wangwu,CompactBuffer(777))
     * (lisi,CompactBuffer(444, 555, 666))
     */
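Only as a rough sketch of what that means (not code from the original): the same grouping can be expressed with combineByKey directly, using a list as the combiner.

    val grouped = data.combineByKey(
      (v: Int) => List(v),                       // createCombiner: the first value starts the group
      (acc: List[Int], v: Int) => v :: acc,      // mergeValue: later values join the group
      (a: List[Int], b: List[Int]) => a ::: b)   // mergeCombiners: merge groups from different partitions
    grouped.foreach(println)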


    // Explode the grouped values back into one row per value: one-to-many, so use flatMap
    val list3 = list02.flatMap(x=> x._2.map(e => (x._1, e)).iterator)
    list3.foreach(println)
    /**
     * (zhangsan,111)
     * (zhangsan,222)
     * (zhangsan,333)
     * (wangwu,777)
     * (lisi,444)
     * (lisi,555)
     * (lisi,666)
     */
    val list04 = list02.flatMapValues(e => e.iterator)
    list04.foreach(println)
    /**
     * 1. the key is attached automatically  2. no need to call .iterator
     * (zhangsan,111)
     * (zhangsan,222)
     * (zhangsan,333)
     * (wangwu,777)
     * (lisi,444)
     * (lisi,555)
     * (lisi,666)
     */
    println("- - - - - - - -- - -")
    // take the first two values for each key
    list02.mapValues(e => e.toList.sorted.take(2)).foreach(println)
    /**
     * (zhangsan,List(111, 222))
     * (wangwu,List(777))
     * (lisi,List(444, 555))
     */
    list02.flatMapValues(e=> e.toList.sorted.take(2)).foreach(println)
    /**
     * (zhangsan,111)
     * (zhangsan,222)
     * (wangwu,777)
     * (lisi,444)
     * (lisi,555)
     */

    println("- - - - - - - -")
    val sum = data.reduceByKey(_+_)
    sum.foreach(println)

    /**
     * (zhangsan,666)
     * (wangwu,777)
     * (lisi,1665)
     */
    println("sum- - - - - - - -")
    val max = data.reduceByKey((up, down) => if(up > down) up else down)
    max.foreach(println)

    /**
     * (zhangsan,333)
     * (wangwu,777)
     * (lisi,666)
     */
    println("max- - - - - - - -")
    val min = data.reduceByKey((up, down) => if(up < down) up else down)
    min.foreach(println)

    /**
     * (zhangsan,111)
     * (wangwu,777)
     * (lisi,444)
     */
    println("min- - - - - - - -")
    val count = data.mapValues(e => 1).reduceByKey(_+_)
    count.foreach(println)

    /**
     * (zhangsan,3)
     * (wangwu,1)
     * (lisi,3)
     */
    println("count- - - - - - - -")
    val tmp = sum.join(count)
    tmp.foreach(println)

    /**
     * (zhangsan,(666,3))
     * (wangwu,(777,1))
     * (lisi,(1665,3))
     */
    println("tmp- - - - - - - -")
    val avg = tmp.mapValues(e => e._1 / e._2) // the data is pulled (shuffled) twice and computed more than once
    avg.foreach(println)

    /**
     * (zhangsan,222)
     * (wangwu,777)
     * (lisi,555)
     */

    // Optimization -> pull once, compute once (an aggregateByKey version is sketched after the output below)
    val tmpx = data.combineByKey(
      /**
       * Signature:
       * createCombiner: V => C,
       * mergeValue: (C, V) => C,
       * mergeCombiners: (C, C) => C,
       */
      // createCombiner: how the first value of a key is put into the hash map
      (value:Int) => (value, 1),
      // mergeValue: how the second and later values of a key are folded into the hash map entry
      (oldValue:(Int, Int), newValue:Int) => (oldValue._1 + newValue, oldValue._2 + 1),
      // mergeCombiners: merges spilled / per-partition partial results
      (v1:(Int, Int), v2:(Int,Int)) => (v1._1 + v2._1, v1._2 + v2._2)
    )

    /**
     * (zhangsan,(666,3))
     * (wangwu,(777,1))
     * (lisi,(1665,3))
     */
    tmpx.foreach(println)
    println("- - - - - - -- - -  -- - -")
    tmpx.mapValues(e => e._1 / e._2).foreach(println)

    /**
     * (zhangsan,222)
     * (wangwu,777)
     * (lisi,555)
     */
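For comparison only, a sketch that is not part of the original: aggregateByKey computes the same one-pass (sum, count), with the zero value given explicitly instead of a createCombiner function.

    val avgOnePass = data
      .aggregateByKey((0, 0))(
        (acc, v) => (acc._1 + v, acc._2 + 1),    // fold a value into the partial (sum, count)
        (a, b) => (a._1 + b._1, a._2 + b._2))    // merge partials from different partitions
      .mapValues { case (s, c) => s / c }
    avgOnePass.foreach(println)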

 

 

 

 

 

 

Partitions

    val conf = new SparkConf().setMaster("local").setAppName("partitions")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")

    val data = sc.parallelize(1 to 4, 2)
    val sqlInfo = data.map(value =>{
      println("------conn--mysql----")
      println(s"-----select $value-----")
      println("-----close--mysql------")
      value + "selected"
    })
    sqlInfo.foreach(println)

    /**
     * Problem: a MySQL connection is opened and closed for every single record
     * ------conn--mysql----
     * -----select 1-----
     * -----close--mysql------
     * 1selected
     * ------conn--mysql----
     * -----select 2-----
     * -----close--mysql------
     * 2selected
     * ------conn--mysql----
     * -----select 3-----
     * -----close--mysql------
     * 3selected
     * ------conn--mysql----
     * -----select 4-----
     * -----close--mysql------
     * 4selected
     */



    val sqlInfo2 = data.mapPartitionsWithIndex(
      (p_index, p_iter)=> {
        val lb = new ListBuffer[String] // requires import scala.collection.mutable.ListBuffer
        println(s"--$p_index----conn--mysql----")
        while (p_iter.hasNext){
          val value = p_iter.next()
          println(s"-----select $value-----")
          lb += (value + "select")
        }
        println(s"--$p_index-----close--mysql------")
        lb.iterator
      }
    )
    sqlInfo2.foreach(println)

    /**
     * Problem: the number of MySQL connections is reduced, but val lb = new ListBuffer[String] buffers a whole partition in memory and can cause an OOM
     * ------conn--mysql----
     * -----select 1-----
     * -----select 2-----
     * -----close--mysql------
     * 1select
     * 2select
     * ------conn--mysql----
     * -----select 3-----
     * -----select 4-----
     * -----close--mysql------
     * 3select
     * 4select
     */



    // Fixing the OOM risk of val lb = new ListBuffer[String]
    // 1. Write to a file: that just trades memory for extra IO, which we want to avoid
    // 2. Use the iterator pattern: nest iterators so data is never buffered in between, preventing the OOM
    val sqlInfo3 = data.mapPartitionsWithIndex(
      (p_index, p_iter) => {

        // Follow the map/flatMap idea: map is one-in-one-out, flatMap is one-in-many-out
        // (the parent type is itself an iterator, so we mimic that source code)
        new Iterator[String] {
          println(s"--$p_index----conn--mysql----")
          override def hasNext: Boolean =
            if (p_iter.hasNext) true
            else {
              println(s"--$p_index-----close--mysql------")
              false
            }

          override def next() = {
            val value = p_iter.next()
            println(s"-----select $value-----")
            value + "selected"
          }
        }
      }
    )
    sqlInfo3.foreach(println)

    /**
     * Pipeline mode: each record is processed as it arrives, it is fast, uses no extra memory,
     * and each partition opens MySQL only once (a shorter lazy-map variant is sketched after this block)
     * --0----conn--mysql----
     * -----select 1-----
     * 1selected
     * -----select 2-----
     * 2selected
     * --0-----close--mysql------
     * --1----conn--mysql----
     * -----select 3-----
     * 3selected
     * -----select 4-----
     * 4selected
     * --1-----close--mysql------
     */
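The same pipeline behaviour can also be obtained, as a sketch only, by relying on the laziness of p_iter.map: records stream one at a time just like the hand-written Iterator above. The trade-off is that there is no natural place to close the connection after the last record, which is exactly why the original overrides hasNext.

    val sqlInfoLazy = data.mapPartitionsWithIndex((p_index, p_iter) => {
      println(s"--$p_index----conn--mysql----")
      p_iter.map(value => {          // lazy: evaluated record by record as the iterator is consumed
        println(s"-----select $value-----")
        value + "selected"
      })
    })
    sqlInfoLazy.foreach(println)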

 

 

 

 

 

 

Random sampling and re-hashing (changing the number of partitions)

    val conf = new SparkConf().setAppName("gaoji").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")

    val data = sc.parallelize(1 to 100)
    // sample(withReplacement, fraction, seed): randomly sample elements. withReplacement: whether an element can be drawn more than once; fraction: expected fraction (here ~10%); seed: the same seed yields the same sample (checked in the sketch below)
    data.sample(false, 0.1).foreach(println)
    println("- - -- - - - -")

    data.sample(true, 0.1, 222).foreach(println)
    println("- - -- - - - -")
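A quick check of the seed behaviour described above (a sketch, not in the original): sampling the same RDD twice with the same seed returns the same elements.

    val s1 = data.sample(true, 0.1, 222).collect().toList
    val s2 = data.sample(true, 0.1, 222).collect().toList
    println(s1 == s2) // true: identical seed, identical sample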


    val data1 = sc.parallelize(1 to 100, 5)
    println(s"data:${data1.getNumPartitions}") // 5
    val partitionData = data1.repartition(4) // change 5 partitions to 4
    println(s"data:${partitionData.getNumPartitions}") // 4
    partitionData.foreach(println)




    val data2 = sc.parallelize(1 to 10, 5)
    val info2 = data2.mapPartitionsWithIndex(
      (p_index, p_iter)=>{
        p_iter.map(e => (p_index,e))
      }
    )
    info2.foreach(println)

    /**
     * The data is spread across the five partitions
     * (0,1)
     * (0,2)
     * (1,3)
     * (1,4)
     * (2,5)
     * (2,6)
     * (3,7)
     * (3,8)
     * (4,9)
     * (4,10)
     */





    val data3 = info2.repartition(3)
    // repartition always triggers a shuffle, whether the partition count goes up or down
    // Internally repartition calls coalesce(numPartitions, shuffle = true). Increasing the partition count requires a shuffle (without one the new partitions would stay empty, since no data is recomputed or redistributed); decreasing it can skip the shuffle (the removed partitions are merged into the remaining ones, see the coalesce sketch after the output below)
    val info4 = data3.mapPartitionsWithIndex(
      (p_index, p_iter)=>{
        p_iter.map(e => (p_index,e))
      }
    )
    info4.foreach(println)

    /**
     * The data has been hashed across partitions again
     * (0,(1,4))
     * (0,(3,8))
     * (0,(4,10))
     * (1,(0,1))
     * (1,(2,5))
     * (2,(0,2))
     * (2,(1,3))
     * (2,(2,6))
     * (2,(3,7))
     * (2,(4,9))
     */
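To contrast with repartition, a sketch not in the original: coalesce with shuffle = false shrinks the partition count as a narrow dependency, so existing partitions are merged in place without re-hashing the data.

    val shrunk = info2.coalesce(3, shuffle = false)
    println(s"coalesce partitions: ${shrunk.getNumPartitions}") // 3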

 
