Spark textFile RDD notes
Batch-processing template, with the core processing logic as an inner method
import org.apache.spark.SparkContext
import scala.collection.mutable.ListBuffer

def batchProces(sc: SparkContext, locationFlag: Int, minid: Int, maxid: Int, numPartitions: Int) = {
  // demo RDD; replace with your own source RDD
  val dataRDD = sc.makeRDD(List(1, 2), numPartitions)
  dataRDD.mapPartitions(iterator => {
    val rawData = iterator.toList
    val lstT = new ListBuffer[(Int, Int)]()

    // batch-processing logic as an inner method
    def procesData() = {
      // core processing logic over lstT goes here
      // doProcess
      // important: clear the buffer after each batch
      lstT.clear()
    }

    rawData.foreach(v => {
      lstT.append((v, 1))
      if (lstT.size >= 50) {
        // process once every 50 elements
        procesData()
      }
    })
    // process whatever is left over
    if (lstT.nonEmpty) {
      procesData()
    }
    // pass the original elements downstream so the count/save below still sees them
    rawData.iterator
  }).map((_, 1)).reduceByKey(_ + _).sortByKey().saveAsTextFile("hdfs://hdfscluster/tmp/logs/")
}
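For reference, a minimal driver sketch that would run the template above locally, assuming batchProces is in scope; the app name, master URL and argument values are placeholders, not part of the original notes:

import org.apache.spark.{SparkConf, SparkContext}

object BatchProcesDemo {
  def main(args: Array[String]): Unit = {
    // local[2] and the dummy arguments are only for trying the template out
    val conf = new SparkConf().setAppName("batchProcesDemo").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      batchProces(sc, locationFlag = 0, minid = 0, maxid = 100, numPartitions = 2)
    } finally {
      sc.stop()
    }
  }
}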
Batch-processing template, with the core processing logic as an external method
def process_outer(lst: List[(Int, Int)]) = {
  // external core processing logic, e.g. an HTTP request
  RequestUtil.postJson("http://xxx", "{paraData}", 1000)
}

def batchProces_processOuter(sc: SparkContext, locationFlag: Int, minid: Int, maxid: Int, numPartitions: Int) = {
  val fooCount = sc.longAccumulator("fooCount")
  // demo RDD; replace with your own source RDD
  val dataRDD = sc.makeRDD(List(1, 2), numPartitions)
  dataRDD.foreachPartition(iterator => {
    val rawData = iterator.toList
    val lstT = new ListBuffer[(Int, Int)]()
    rawData.foreach(v => {
      lstT.append((v, 1))
      if (lstT.size >= 50) {
        // process once every 50 elements
        process_outer(lstT.toList)
        fooCount.add(lstT.size)
        lstT.clear()
      }
    })
    // process whatever is left over
    if (lstT.size > 0) {
      process_outer(lstT.toList)
      fooCount.add(lstT.size)
      lstT.clear()
    }
  })
  println("total =>" + fooCount.value)
}
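RequestUtil is the author's own helper. As a rough idea of what a postJson call could look like with only the JDK, here is a hedged sketch; the URL, body and timeout are placeholders:

import java.io.OutputStreamWriter
import java.net.{HttpURLConnection, URL}

// Minimal JDK-only JSON POST; returns the HTTP status code.
def postJsonSketch(url: String, json: String, timeoutMs: Int): Int = {
  val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection]
  conn.setRequestMethod("POST")
  conn.setConnectTimeout(timeoutMs)
  conn.setReadTimeout(timeoutMs)
  conn.setDoOutput(true)
  conn.setRequestProperty("Content-Type", "application/json")
  val writer = new OutputStreamWriter(conn.getOutputStream, "UTF-8")
  try writer.write(json) finally writer.close()
  val code = conn.getResponseCode
  conn.disconnect()
  code
}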
Some processing patterns for text-file RDDs:
// For a single file whose lines are very long: split each line first, then repartition
// so the data is spread across multiple executors
def bigLine(sc: SparkContext, locationFlag: Int, minid: Int, maxid: Int, numPartitions: Int) = {
  val fileRDD = sc.textFile("hdfs://hdfscluster/tmp/logs/abc.txt", numPartitions)
  // for long lines, split first and then repartition to raise cluster utilization
  // and shorten the job's running time
  fileRDD.flatMap(_.split(",")).repartition(24).foreach(println(_))
}

// For scattered paths with no common pattern: use sc inside the loop
def handlerPath_lingsan(sc: SparkContext, locationFlag: Int, minid: Int, maxid: Int, numPartitions: Int, filep: String) = {
  val rawPath: List[String] = List(
    "hdfs://hdfscluster/tmp1/path1",
    "hdfs://hdfscluster/tmp2/path2",
    "hdfs://hdfscluster/tmp3/path3")
  val lsResult = rawPath.flatMap(v => {
    sc.textFile(v).map((_, 1)).collect().toList
  })
  lsResult.foreach(println(_))
}

// For a whole directory
def handlerPath_directroy(sc: SparkContext, locationFlag: Int, minid: Int, maxid: Int, numPartitions: Int, filep: String) = {
  // read every file under the directory line by line; the partition hint is honored
  val txtRDD = sc.textFile("hdfs://hdfscluster/tmp1/*", numPartitions)
  // repartition to 1 so the result is written out as a single file
  txtRDD.map((_, 1)).repartition(1)
    .saveAsTextFile("hdfs://hdfscluster/tmp/logs/ssoHot3")
}

// For a directory containing many small files
def handlerPath_directroy_smallFiles(sc: SparkContext, locationFlag: Int, minid: Int, maxid: Int, numPartitions: Int, filep: String) = {
  // returns key = file path, value = file content;
  // if the contents are large this can easily cause OOM
  val dirRDD = sc.wholeTextFiles("hdfs://hdfscluster/tmp1/*", numPartitions)
  dirRDD.flatMap(v => {
    v._2.split(System.lineSeparator()).map((_, 1))
  }).repartition(1).saveAsTextFile("hdfs://hdfscluster/tmp/logs/ssoHot3")
}
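A possible variation on handlerPath_lingsan: instead of collecting each path to the driver, union the per-path RDDs (or pass textFile a comma-separated path list) so the aggregation stays on the executors. A sketch using the same placeholder paths:

def handlerPath_lingsan_union(sc: SparkContext) = {
  val rawPath: List[String] = List(
    "hdfs://hdfscluster/tmp1/path1",
    "hdfs://hdfscluster/tmp2/path2",
    "hdfs://hdfscluster/tmp3/path3")
  // alternative: sc.textFile(rawPath.mkString(",")) also reads multiple paths
  val unioned = sc.union(rawPath.map(p => sc.textFile(p)))
  unioned.map((_, 1)).reduceByKey(_ + _).collect().foreach(println(_))
}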
// Java/Scala collection conversion
def java_scala_collection_convert = {
  val lstT = new ListBuffer[Int]()
  // note the conversion between Java and Scala collections
  import scala.collection.JavaConverters._
  val lstBack = SensitiveDevice.batchDecrypt(lstT.toList.asJava).asScala
}
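Since SensitiveDevice is an internal class, here is a self-contained round-trip showing the same asJava/asScala conversions against a plain java.util.List (illustrative only):

import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer

def convert_demo() = {
  val buf = ListBuffer(1, 2, 3)
  val asJavaList: java.util.List[Int] = buf.toList.asJava // Scala List -> java.util.List view
  val backToScala: Seq[Int] = asJavaList.asScala          // java.util.List -> Scala Buffer view
  println(backToScala.sum)
}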