Flink DataStream Source (Part 2)
Flink Source
Execution environments
import org.apache.flink.api.scala.ExecutionEnvironment
val env = ExecutionEnvironment.getExecutionEnvironment // batch execution environment
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
val streamenv = StreamExecutionEnvironment.getExecutionEnvironment // streaming execution environment
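Note that none of the print() pipelines below actually run until execute() is called on the streaming environment; the examples omit it for brevity. A minimal complete job looks like this (the job name "source-demo" is arbitrary):
import org.apache.flink.streaming.api.scala.createTypeInformation
streamenv.fromElements(1, 2, 3).map(_ * 2).print()
streamenv.execute("source-demo") // triggers the actual execution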
Element, text, and socket sources
import org.apache.flink.streaming.api.scala.createTypeInformation
streamenv.fromElements[String]("1","2","3","4","5").print()   // from literal elements
streamenv.fromCollection(Array("6","7","8","9","10")).print() // from a Scala collection
streamenv.readTextFile("/data/qujian.csv").print()            // from a text file, one record per line
streamenv.socketTextStream("***.***.***.***", 7777).print()   // from a TCP socket
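readTextFile reads a path once and finishes. If the directory should be monitored for new files, readFile with FileProcessingMode.PROCESS_CONTINUOUSLY can be used instead; a minimal sketch, assuming /data is a directory of text files:
import org.apache.flink.api.java.io.TextInputFormat
import org.apache.flink.core.fs.Path
import org.apache.flink.streaming.api.functions.source.FileProcessingMode
val format = new TextInputFormat(new Path("/data"))
streamenv.readFile(format, "/data", FileProcessingMode.PROCESS_CONTINUOUSLY, 10000L).print() // rescan every 10 s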
Reading Parquet
import org.apache.flink.formats.parquet.ParquetRowInputFormat
import org.apache.flink.core.fs.Path
// `long` columns map to Parquet's INT64
// |-- a: string (nullable = true)
// |-- b: long (nullable = true)
// |-- c: string (nullable = true)
// |-- d: string (nullable = true)
// |-- e: long (nullable = true)
// |-- f: long (nullable = true)
// |-- g: string (nullable = true)
// |-- h: long (nullable = true)
// |-- i: long (nullable = true)
// |-- j: integer (nullable = true)
import org.apache.parquet.schema.{MessageType, PrimitiveType}
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
import org.apache.parquet.schema.Type.Repetition
val a = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "a")
val b = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT64, "b")
val c = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "c")
val d = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "d")
val e = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT64, "e")
val f = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT64, "f")
val g = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "g")
val h = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT64, "h")
val i = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT64, "i")
val j = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT32, "j")
val schema = new MessageType("dataschema", a, b, c, d, e, f, g, h, i, j)
streamenv.readFile(new ParquetRowInputFormat(
  new Path("/data/data.parquet"), schema), "/data/data.parquet").print()
// Not a typo: the path really is written twice, once for the input format and once for readFile
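ParquetRowInputFormat emits org.apache.flink.types.Row records, whose fields are accessed by position in the schema order above. A minimal sketch of consuming them (the string formatting is only for illustration; depending on the connector version, BINARY columns may arrive as byte arrays rather than strings):
import org.apache.flink.types.Row
import org.apache.flink.streaming.api.scala.createTypeInformation
streamenv.readFile(new ParquetRowInputFormat(
  new Path("/data/data.parquet"), schema), "/data/data.parquet")
  .map((row: Row) => s"a=${row.getField(0)}, j=${row.getField(9)}")
  .print()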
Reading Kafka: approach 1
import org.apache.flink.connector.kafka.source.KafkaSource
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
import org.apache.flink.api.common.eventtime.WatermarkStrategy
import org.apache.flink.streaming.api.scala.createTypeInformation
val source = KafkaSource.builder[String]() // the explicit type parameter is needed in Scala
  .setBootstrapServers("127.0.0.1:9092")
  .setTopics("events")
  .setGroupId("group")
  .setStartingOffsets(OffsetsInitializer.earliest())
  .setValueOnlyDeserializer(new SimpleStringSchema())
  .build()
streamenv.fromSource(source, WatermarkStrategy.noWatermarks(), "Kafka Source").print()
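A common variant is to resume from the group's committed offsets and fall back to the earliest offset when none exist yet; a sketch, reusing the builder setup above:
import org.apache.kafka.clients.consumer.OffsetResetStrategy
val resumingSource = KafkaSource.builder[String]()
  .setBootstrapServers("127.0.0.1:9092")
  .setTopics("events")
  .setGroupId("group")
  .setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.EARLIEST))
  .setValueOnlyDeserializer(new SimpleStringSchema())
  .build()
streamenv.fromSource(resumingSource, WatermarkStrategy.noWatermarks(), "Kafka Source").print()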
Reading Kafka: approach 2
import org.apache.flink.api.common.serialization.SimpleStringSchema
import java.util.Properties
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
val properties = new Properties()
properties.setProperty("bootstrap.servers", "127.0.0.1:9092")
properties.setProperty("auto.offset.reset", "earliest")
properties.setProperty("group.id", "group")
val kafkaConsumer = new FlinkKafkaConsumer[String]("events", new SimpleStringSchema(), properties)
streamenv.addSource(kafkaConsumer).print()
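FlinkKafkaConsumer has been deprecated since Flink 1.14 in favor of the KafkaSource shown in approach 1. Note that the start position can also be set on the consumer itself, which ignores both committed offsets and auto.offset.reset:
kafkaConsumer.setStartFromEarliest() // always read from the beginning of the topic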
Reading JDBC: approach 1
import org.apache.flink.connector.jdbc.JdbcInputFormat
import org.apache.flink.api.common.typeinfo.BasicTypeInfo
import org.apache.flink.api.java.typeutils.RowTypeInfo
val jdbcInputFormat = JdbcInputFormat.buildJdbcInputFormat()
  .setDrivername("com.mysql.jdbc.Driver")
  .setDBUrl("jdbc:mysql://127.0.0.1:3306/test?characterEncoding=UTF-8")
  .setUsername("root")
  .setPassword("123456")
  .setQuery("SELECT id,name,score FROM student")
  .setRowTypeInfo(new RowTypeInfo(BasicTypeInfo.INT_TYPE_INFO,
    BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO))
  .finish()
streamenv.createInput(jdbcInputFormat).print()
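JdbcInputFormat also emits Row records; a short sketch of converting them to a typed tuple for downstream operators (field order follows the SELECT above):
import org.apache.flink.types.Row
import org.apache.flink.streaming.api.scala.createTypeInformation
streamenv.createInput(jdbcInputFormat)
  .map((r: Row) => (r.getField(0).asInstanceOf[Int], r.getField(1).asInstanceOf[String], r.getField(2).asInstanceOf[Int]))
  .print()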
Reading JDBC: approach 2
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.RichSourceFunction
import org.apache.flink.streaming.api.functions.source.SourceFunction
class MyRichSourcejdbc extends RichSourceFunction[(Int, String, Int)] {
  var conn: Connection = _
  var selectStatement: PreparedStatement = _
  override def open(parameters: Configuration): Unit = {
    conn = DriverManager.getConnection("jdbc:mysql://127.0.0.1:3306/test", "root", "123456")
    selectStatement = conn.prepareStatement("SELECT id,name,score FROM student")
  }
  override def run(ctx: SourceFunction.SourceContext[(Int, String, Int)]): Unit = {
    val resultSet = selectStatement.executeQuery()
    while (resultSet.next()) {
      ctx.collect((resultSet.getInt(1), resultSet.getString(2), resultSet.getInt(3)))
    }
  }
  override def cancel(): Unit = {} // nothing to interrupt: run() emits once and returns
  override def close(): Unit = {   // close() also runs on normal completion, unlike cancel()
    selectStatement.close()
    conn.close()
  }
}
streamenv.addSource(new MyRichSourcejdbc()).print() // custom source that reads from MySQL
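A RichSourceFunction always runs with parallelism 1. When the table should be scanned by several subtasks at once, RichParallelSourceFunction can be extended instead. A sketch under the assumption that sharding rows by MOD(id, subtasks) is acceptable, reusing the JDBC imports above:
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction
class MyParallelJdbcSource extends RichParallelSourceFunction[(Int, String, Int)] {
  var conn: Connection = _
  override def open(parameters: Configuration): Unit = {
    conn = DriverManager.getConnection("jdbc:mysql://127.0.0.1:3306/test", "root", "123456")
  }
  override def run(ctx: SourceFunction.SourceContext[(Int, String, Int)]): Unit = {
    // Each subtask reads only the rows of its own shard.
    val stmt = conn.prepareStatement("SELECT id,name,score FROM student WHERE MOD(id, ?) = ?")
    stmt.setInt(1, getRuntimeContext.getNumberOfParallelSubtasks)
    stmt.setInt(2, getRuntimeContext.getIndexOfThisSubtask)
    val rs = stmt.executeQuery()
    while (rs.next()) ctx.collect((rs.getInt(1), rs.getString(2), rs.getInt(3)))
    stmt.close()
  }
  override def cancel(): Unit = {}
  override def close(): Unit = if (conn != null) conn.close()
}
streamenv.addSource(new MyParallelJdbcSource()).setParallelism(2).print()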
Reading Redis
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction,SourceFunction}
import redis.clients.jedis.{JedisPool,JedisPoolConfig,Protocol}
class MyRedisSource extends RichSourceFunction[(String, String)] {
  var jedisPool: JedisPool = _
  override def open(parameters: Configuration): Unit = {
    jedisPool = new JedisPool(new JedisPoolConfig, "127.0.0.1", 6379, Protocol.DEFAULT_TIMEOUT)
  }
  // run() implements the RichSourceFunction contract: load the source data into the stream
  override def run(ctx: SourceFunction.SourceContext[(String, String)]): Unit = {
    val jedis = jedisPool.getResource()
    ctx.collect(("test", jedis.get("test")))
    jedis.close()
  }
  override def cancel(): Unit = {}               // run() emits a single record and returns
  override def close(): Unit = jedisPool.close() // called on both completion and cancellation
}
import org.apache.flink.streaming.api.scala.createTypeInformation
streamenv.addSource(new MyRedisSource()).print()
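The source above emits a single record and finishes. To keep it alive and poll Redis periodically, the usual pattern is a volatile running flag that cancel() flips; a sketch, reusing the imports above and assuming a 1 s poll interval:
class MyPollingRedisSource extends RichSourceFunction[(String, String)] {
  @volatile var running = true
  var jedisPool: JedisPool = _
  override def open(parameters: Configuration): Unit = {
    jedisPool = new JedisPool(new JedisPoolConfig, "127.0.0.1", 6379, Protocol.DEFAULT_TIMEOUT)
  }
  override def run(ctx: SourceFunction.SourceContext[(String, String)]): Unit = {
    while (running) {
      val jedis = jedisPool.getResource()
      ctx.collect(("test", jedis.get("test")))
      jedis.close()
      Thread.sleep(1000) // poll once per second
    }
  }
  override def cancel(): Unit = running = false
  override def close(): Unit = jedisPool.close()
}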
Reading Redis asynchronously
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.scala.async.{ResultFuture, RichAsyncFunction}
import redis.clients.jedis.{JedisPool,JedisPoolConfig,Protocol}
import org.apache.flink.streaming.api.scala.{AsyncDataStream}
import java.util.concurrent.TimeUnit
import scala.concurrent.{Future,ExecutionContext}
// The lookup logic is the same, but with an asynchronous call you cannot tell when it will actually run; that unpredictability is the biggest drawback,
// since Flink is meant to compute results in real time, and this trait sits uneasily with that goal.
class RedisAsyncFunction extends RichAsyncFunction[String, String] {
  var jedisPool: JedisPool = _
  override def open(parameters: Configuration): Unit = {
    jedisPool = new JedisPool(new JedisPoolConfig, "127.0.0.1", 6379, Protocol.DEFAULT_TIMEOUT)
  }
  override def asyncInvoke(input: String, resultFuture: ResultFuture[String]): Unit = {
    val jedis = jedisPool.getResource()
    val value = jedis.get(input) // read once instead of re-querying for the null check
    if (value != null) {
      resultFuture.complete(Array(value))
    } else {
      resultFuture.complete(Array("-99")) // sentinel for missing keys
    }
    jedis.close()
  }
  override def close(): Unit = {
    jedisPool.close()
  }
}
val stream = streamenv.fromElements[String]("test","test1","test2","test3","test4","test5")
AsyncDataStream.unorderedWait(stream,new RedisAsyncFunction(), 10000, TimeUnit.MILLISECONDS, 100).print()
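Note that asyncInvoke above still blocks on the Jedis call, so nothing actually runs concurrently. A sketch of a genuinely asynchronous variant, wrapping the lookup in a Future on a dedicated thread pool (the Future/ExecutionContext imports are already above; the pool size of 8 is an assumption):
import java.util.concurrent.Executors
import scala.util.{Failure, Success}
class TrulyAsyncRedisFunction extends RichAsyncFunction[String, String] {
  var jedisPool: JedisPool = _
  implicit var ec: ExecutionContext = _
  override def open(parameters: Configuration): Unit = {
    jedisPool = new JedisPool(new JedisPoolConfig, "127.0.0.1", 6379, Protocol.DEFAULT_TIMEOUT)
    ec = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(8))
  }
  override def asyncInvoke(input: String, resultFuture: ResultFuture[String]): Unit = {
    Future {
      val jedis = jedisPool.getResource()
      try Option(jedis.get(input)).getOrElse("-99") finally jedis.close()
    }.onComplete {
      case Success(value) => resultFuture.complete(Array(value))
      case Failure(e)     => resultFuture.completeExceptionally(e)
    }
  }
  override def close(): Unit = jedisPool.close()
}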
After years of moving bricks I still haven't grasped the essentials, so I read the source code in hope of finding the true scripture.