Getting to Know Flink: Starting with WordCount
Apache Flink is an open-source platform for distributed stream and batch data processing. At Flink's core is a streaming dataflow engine that provides data distribution, communication, and fault tolerance for distributed computations over data streams. Flink builds batch processing on top of this streaming engine, layering native iteration support, managed memory, and program optimization on it.
import org.apache.flink.api.scala.ExecutionEnvironment

object BatchWordCountScala {
  def main(args: Array[String]): Unit = {
    val inputPath = "E:\\data\\file"    // path of the input files
    val outPut = "E:\\data\\result"     // path where the result is written
    val env = ExecutionEnvironment.getExecutionEnvironment
    val text = env.readTextFile(inputPath)
    // bring in the implicit TypeInformation conversions required by the Scala API
    import org.apache.flink.api.scala._
    val counts = text.flatMap(_.toLowerCase.split("\\W+"))  // split each line into lowercase words
      .filter(_.nonEmpty)                                   // drop empty tokens
      .map((_, 1))                                          // pair each word with a count of 1
      .groupBy(0)                                           // group by the word (tuple field 0)
      .sum(1)                                               // sum the counts (tuple field 1)
    // write "word count" pairs to a single CSV file, one pair per line
    counts.writeAsCsv(outPut, "\n", " ").setParallelism(1)
    env.execute("batch word count")                         // sinks are lazy, so trigger the job
  }
}
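Since Flink's core is the streaming engine, essentially the same word count can also be written against the DataStream API. The following is only a minimal sketch, assuming a text socket source on localhost port 9999 (for example one started with nc -lk 9999); the host, port, and the use of a socket source are illustrative assumptions, not part of the batch example above.

import org.apache.flink.streaming.api.scala._

object StreamingWordCountScala {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // assumed socket source: start a text server first, e.g. `nc -lk 9999`
    val text = env.socketTextStream("localhost", 9999)
    val counts = text
      .flatMap(_.toLowerCase.split("\\W+"))  // split each line into lowercase words
      .filter(_.nonEmpty)                    // drop empty tokens
      .map((_, 1))                           // pair each word with a count of 1
      .keyBy(_._1)                           // key the stream by the word
      .sum(1)                                // running count per word
    counts.print()                           // print the continuously updated counts
    env.execute("streaming word count")
  }
}

Unlike the batch job, this job keeps running until it is cancelled and emits an updated count every time a new word arrives; keyBy plays the role that groupBy plays in the DataSet version.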