Flink Source:数据源
Flink 在流处理和批处理上的 source 大概有 4 类:
1、基于本地集合的 source、
2、基于文件的 source、
3、基于网络套接字的 source、
4、自定义的 source。自定义的 source 常见的有 Apache kafka、Amazon Kinesis Streams、RabbitMQ、Twitter Streaming API、Apache NiFi 等,当然你也可以定义自己的 source。
1、基于本地集合的 source
package com.shujia.flink.source
import org.apache.flink.streaming.api.scala._
object Demo1ListSource {
  def main(args: Array[String]): Unit = {
    // Set up the stream execution environment.
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // Build a bounded stream from an in-memory collection.
    val lines = List("java,spark", "java,java", "spark,hadoop")
    val listDS: DataStream[String] = env.fromCollection(lines)

    // Classic word count: split each line on commas, pair every word
    // with 1, group by the word, and sum the counts.
    val counts = listDS
      .flatMap(line => line.split(","))
      .map(word => (word, 1))
      .keyBy(pair => pair._1)
      .sum(1)

    counts.print()

    // A job name is optional here; Flink supplies a default when omitted.
    env.execute()
  }
}
2、基于文件的 source
package com.shujia.flink.source
import org.apache.flink.streaming.api.scala._
// NOTE(review): this object illustrates source type #2 but is named
// Demo1FileSource; kept as-is since renaming would change the public identifier.
object Demo1FileSource {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // readTextFile builds a bounded stream from a file path (an optional
    // charset may also be passed).
    val studentDS: DataStream[String] = env.readTextFile("data/students.txt")

    // Count students per class: the class name is the 5th comma-separated field.
    val perClass = studentDS.map { line =>
      val fields = line.split(",")
      (fields(4), 1)
    }

    perClass
      .keyBy(t => t._1)
      .sum(1)
      .print()

    env.execute()
  }
}
3、基于网络套接字的 source
package com.shujia.flink.source
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
object Demo3SocketSource {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // socketTextStream yields an unbounded stream: one element per line
    // received from host "master" on port 8888.
    val socketLines: DataStream[String] = env.socketTextStream("master", 8888)
    socketLines.print()

    env.execute()
  }
}
4、自定义的 source
package com.shujia.flink.source
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala._
object Demo4SourceFunction {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // Attach the user-defined source (MySource, defined in this file).
    val numbers: DataStream[Int] = env.addSource(new MySource)
    numbers.print()

    env.execute()
  }
}
/**
 * Custom source: implement the SourceFunction interface with the element
 * type as the type parameter (here Int).
 *
 * Fix over the original: the endless `while (true)` loop ignored
 * cancellation, so cancel() had no effect and the task could never shut
 * down cleanly. A volatile running flag is now checked every iteration
 * and cleared by cancel(), the standard Flink SourceFunction pattern.
 */
class MySource extends SourceFunction[Int] {

  // Written by cancel() from another thread, read by run(); hence volatile.
  @volatile private var running = true

  /**
   * run: invoked once to produce data.
   * @param ctx emits elements downstream via collect().
   * Loops until cancel() clears the flag — an unbounded stream while running.
   */
  override def run(ctx: SourceFunction.SourceContext[Int]): Unit = {
    var i = 0
    while (running) {
      i += 1
      // Emit the next element downstream.
      ctx.collect(i)
      // Throttle emission so data is not produced too fast.
      Thread.sleep(100)
    }
  }

  // Invoked when the task is cancelled; stops the run() loop.
  override def cancel(): Unit = {
    running = false
  }
}
自定义 source 读取MySQL中数据
package com.shujia.flink.source
import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, RichSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala._
object Demo5MysqlSource {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(4)

    // Each element is one student row: (id, name, age, gender, clazz).
    val students: DataStream[(String, String, Int, String, String)] =
      env.addSource(new MysqlSource)

    students.print()
    env.execute()
  }
}
/**
 * SourceFunction             : basic source, single parallel instance
 * RichSourceFunction         : adds open/close lifecycle hooks -- single instance
 * ParallelSourceFunction     : source that may run with parallelism > 1
 * RichParallelSourceFunction : parallel source with open/close hooks
 *
 * Fixes over the original: the PreparedStatement and ResultSet were never
 * closed (resource leak), and close() would throw a NullPointerException
 * if open() failed before the connection was established.
 */
class MysqlSource extends RichSourceFunction[(String, String, Int, String, String)] {

  // JDBC connection: created in open(), released in close().
  var con: Connection = _

  /**
   * open: runs before run(); used for initialization such as creating
   * the database connection.
   */
  override def open(parameters: Configuration): Unit = {
    println("创建链接")
    Class.forName("com.mysql.jdbc.Driver")
    // Establish the connection.
    con = DriverManager.getConnection("jdbc:mysql://master:3306/bigdata", "root", "123456")
  }

  /**
   * close: runs after run(); used to release resources.
   */
  override def close(): Unit = {
    println("关闭链接")
    // Guard against open() having failed before the connection was made.
    if (con != null) {
      con.close()
    }
  }

  /**
   * run reads the data: if run returns, the stream is bounded; if it
   * loops forever, the stream is unbounded. This one is bounded — it
   * emits every row of the `student` table once and returns.
   */
  override def run(ctx: SourceFunction.SourceContext[(String, String, Int, String, String)]): Unit = {
    println("run")
    // Read from MySQL via JDBC.
    val stat: PreparedStatement = con.prepareStatement("select * from student")
    try {
      // Execute the query.
      val resultSet: ResultSet = stat.executeQuery()
      try {
        // Emit one tuple per row.
        while (resultSet.next()) {
          val id: String = resultSet.getString("id")
          val name: String = resultSet.getString("name")
          val age: Int = resultSet.getInt("age")
          val gender: String = resultSet.getString("gender")
          val clazz: String = resultSet.getString("clazz")
          // Send the row downstream.
          ctx.collect((id, name, age, gender, clazz))
        }
      } finally {
        resultSet.close()
      }
    } finally {
      stat.close()
    }
  }

  // Invoked when the task is cancelled; nothing to interrupt here since
  // run() is a bounded loop over the result set.
  override def cancel(): Unit = {
  }
}