Spark in practice @ WordCount: processing multiple files in a directory

import org.apache.hadoop.fs.{Path, FileSystem}
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

class WordCount {

}

/**
 * Process every file under a directory, run a word count on each file,
 * and collect the results into a list.
 */
object WordCount {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount")
    val sc = new SparkContext(conf)
    var resultList = List[(String, Int)]() // accumulated (word, count) pairs

    // List every file under the input directory args(0) on HDFS
    val files = FileSystem.get(new java.net.URI("hdfs://cluster1"), new org.apache.hadoop.conf.Configuration())
      .listStatus(new Path(args(0)))
    for (f <- files) {
      println("YTQ-FilePath => " + f.getPath.toString)
      // Word count for a single file (fields are tab-separated), collected back to the driver
      resultList = resultList ::: sc.textFile(f.getPath.toString)
        .flatMap(_.split("\t")).map((_, 1)).reduceByKey(_ + _).collect.toList
    }

    // Merge the per-file counts and write the final result to args(1)
    sc.parallelize(resultList).reduceByKey(_ + _).saveAsTextFile(args(1))

    sc.stop()
  }

}
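
Since the per-file results are merged with a final reduceByKey anyway, a simpler variant is to let Spark read the whole directory in a single textFile call (textFile accepts a directory path or a glob), which avoids collecting intermediate results to the driver. Below is a minimal sketch under that assumption, reusing the same args(0) input directory, args(1) output path, and tab-separated words; the object name WordCountDir is just illustrative:

import org.apache.spark.{SparkConf, SparkContext}

object WordCountDir {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCountDir")
    val sc = new SparkContext(conf)

    // textFile takes a directory (or glob), so Spark reads every file under it
    // and the word count covers all files without an explicit loop or collect
    sc.textFile(args(0))
      .flatMap(_.split("\t"))
      .map((_, 1))
      .reduceByKey(_ + _)
      .saveAsTextFile(args(1))

    sc.stop()
  }
}

Because the whole computation stays in one RDD pipeline, nothing is pulled back to the driver before saveAsTextFile; the loop-and-collect version above is mainly useful when you need to log or handle each file individually.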
