Spark 自定义分区及区内二次排序demo

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.Partitioner
import org.apache.spark.HashPartitioner

object Demo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("app")
    val sc = new SparkContext(conf)
    val data = sc.textFile("F:\\test\\test\\ssort.txt")
    
    //先分区, 再区内排序
    data.map{x=>
      val arr = x.split(" ")
      (arr(0),arr(1).toInt)
    }.partitionBy(new MySparkPartition(2)).mapPartitions{x=>
      //此处的sortBy为scala中list集合的方法, 与Spark中RDD的sortBy方法不一样,注意区分!!!
      x.toList.sortBy{case(x,y)=>
        (x, -y)
      }.toIterator
    }.saveAsTextFile("F:\\test\\test\\output")
    
    //data.flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).top(3)(Ordering.by(_._2)).foreach(println)
    
   /*data.map{x=>
     (new SecondarySortKey(x.split(" ")(0), x.split(" ")(1).toInt))
   }.sortBy(x=>x, true).map{x=>(x.first, x.second)}.foreach(println)*/
    
    /*data.sortBy({x=>
      (new SecondarySortKey(x.split(" ")(0), x.split(" ")(1).toInt))
    }, true).foreach(println)*/
    
    /*val data1 = data.sortBy({x=>
      (new SecondarySortKey(x.split(" ")(0),x.split(" ")(1).toInt))  
    },true).map{x=>
      val arr = x.split(" ")
      (arr(0), arr(1))
    }.partitionBy(new MySparkPartition(2)).saveAsTextFile("F:\\test\\test\\output")*/
    
    /*val l1 = List[(String,Int)](("a",1),("b",2),("d",4),("c",3),("a",2))
    //l1.sortBy(x=>(x._1,x._2))(Ordering.Tuple2(Ordering.String,Ordering.Int.reverse))
    l1.sortBy{case(x,y) =>
      (x, -y)
    }
    .foreach(println)*/
  }
}

class MySparkPartition(numsPartitions:Int) extends Partitioner{
  def numPartitions:Int = numsPartitions
  
  override def getPartition(key:Any):Int={
    if(key == "aa"){
      return 1
    }else{
      return 0
    }
  }
}

class SecondarySortKey(val first:String, val second:Int) extends Ordered[SecondarySortKey] with Serializable{
  def compare(other:SecondarySortKey):Int={
    var comp = this.first.compareTo(other.first)
    if(comp == 0){
      other.second.compareTo(this.second)
    }else{
      comp
    }
  }
}
posted @ 2019-08-08 17:22 virus丶舒阅读(503) 评论(0) 编辑收藏举报
刷新页面返回顶部
Spark 自定义分区及区内二次排序demo

公告