spark streaming 缉查布控
package com.shujia.spark.streaming

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Durations, StreamingContext}

/**
 * Blacklist-interception ("缉查布控") demo.
 *
 * Consumes CSV call records from the Kafka topic `dianxin` in 5-second
 * micro-batches. For every batch it re-reads a MySQL blacklist table,
 * broadcasts it, keeps only records whose MDN (first CSV field) appears in
 * the blacklist, prints the hits, and appends them to a MySQL result table.
 *
 * Re-creating the broadcast inside `foreachRDD` is the "dynamically updated
 * broadcast variable" pattern: blacklist changes in MySQL take effect on the
 * very next batch.
 */
object Demo8BlackFilter {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession
      .builder()
      .appName("black")
      .master("local[2]")
      .getOrCreate()

    // Needed for Dataset conversions (`as[String]`, `toDF`).
    import spark.implicits._

    // 5-second micro-batches on top of the shared SparkContext.
    val ssc = new StreamingContext(spark.sparkContext, Durations.seconds(5))

    val kafkaParams: Map[String, Object] = Map[String, Object](
      "bootstrap.servers" -> "master:9092,node1:9092,node2:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "asdasdasdas",
      // latest: only consume records produced after the job starts
      "auto.offset.reset" -> "latest",
      // NOTE(review): auto-commit is off but offsets are never committed
      // anywhere in this job, so a restart resumes from "latest" and any
      // in-flight batch is lost — confirm this is acceptable.
      "enable.auto.commit" -> "false"
    )

    // Topic list to subscribe to.
    val topics = Array("dianxin")

    val linesDS: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Subscribe[String, String](topics, kafkaParams)
      )

    // Driver side: runs exactly once, at graph-construction time.
    println("foreachRDD外面")

    linesDS.foreachRDD(rdd => {
      // Driver side: runs once per batch, outside any RDD operator.
      println("foreachRDD内部,算子外部")

      // Reload the blacklist each batch so updates apply immediately.
      val blackListDF: DataFrame = spark.read
        .format("jdbc")
        .option("url", "jdbc:mysql://master:3306")
        .option("dbtable", "student.t_blacklist")
        .option("user", "root")
        .option("password", "123456")
        .load()

      // Collect to the driver; assumes the table exposes a single string
      // column (required by `as[String]`).
      val blackList: Array[String] = blackListDF.as[String].collect()

      // Broadcast the fresh blacklist for this batch only.
      val broadCastBlackList: Broadcast[Array[String]] =
        spark.sparkContext.broadcast(blackList)

      val blockRDD: RDD[ConsumerRecord[String, String]] = rdd.filter(record => {
        val value: String = record.value()
        // The MDN (phone number) is the first comma-separated field.
        val mdn: String = value.split(",")(0)
        val blackListValue: Array[String] = broadCastBlackList.value
        blackListValue.contains(mdn)
      })

      // FIX: blockRDD is consumed by two actions below (println + JDBC save).
      // Without caching, each action re-reads the Kafka batch and re-runs the
      // filter; cache once and release after both actions complete.
      blockRDD.cache()

      blockRDD.map(_.value()).foreach(println)

      // Append the intercepted records to MySQL.
      blockRDD
        .map(_.value())
        .toDF("line")
        .write
        .mode(SaveMode.Append)
        .format("jdbc")
        .option("url", "jdbc:mysql://master:3306")
        .option("dbtable", "student.dianxin_black")
        .option("user", "root")
        .option("password", "123456")
        .save()

      blockRDD.unpersist()

      // Release this batch's broadcast so executors don't accumulate copies.
      broadCastBlackList.unpersist()
    })

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}
分类:
spark
标签:
spark streaming
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· C#/.NET/.NET Core优秀项目和框架2025年2月简报
· DeepSeek在M芯片Mac上本地化部署
· 葡萄城 AI 搜索升级:DeepSeek 加持,客户体验更智能