Spark Streaming 缉查布控

复制代码
package com.shujia.spark.streaming

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Durations, StreamingContext}

/**
  * Streaming blacklist filter that demonstrates refreshing a broadcast variable per batch.
  *
  * Every 5-second batch read from the Kafka topic `dianxin` is filtered against a blacklist
  * loaded from the MySQL table `student.t_blacklist`. Records whose first comma-separated
  * field is on the blacklist are printed and appended to `student.dianxin_black`.
  *
  * The blacklist is re-read and re-broadcast inside `foreachRDD`, so changes made to the
  * table are picked up by the next non-empty batch without restarting the job.
  */
object Demo8BlackFilter {
  /**
    * Entry point: builds the streaming pipeline and blocks until it terminates.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("black")
      .master("local[2]")
      .getOrCreate()
    import spark.implicits._

    val ssc = new StreamingContext(spark.sparkContext, Durations.seconds(5))
    val kafkaParams: Map[String, Object] = Map[String, Object](
      "bootstrap.servers" -> "master:9092,node1:9092,node2:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "asdasdasdas",
      "auto.offset.reset" -> "latest", // latest: only read newly arriving data
      "enable.auto.commit" -> "false"
    )

    // Topics to subscribe to.
    val topics = Array("dianxin")

    val linesDS: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Subscribe[String, String](topics, kafkaParams)
      )

    // Driver side: executed exactly once, while the pipeline is being defined.
    println("foreachRDD外面")

    linesDS.foreachRDD(rdd => {
      // Driver side: executed once per batch, outside of any RDD operator.
      println("foreachRDD内部,算子外部")

      // Skip the JDBC round trips and the broadcast when the batch carries no records.
      if (!rdd.isEmpty()) {
        // Reload the blacklist on every batch so edits to the table take effect immediately.
        val blackListDF: DataFrame = spark.read
          .format("jdbc")
          .option("url", "jdbc:mysql://master:3306")
          .option("dbtable", "student.t_blacklist")
          .option("user", "root")
          .option("password", "123456")
          .load()

        // A Set gives constant-time membership checks; the lookup runs once per record.
        val blackList: Set[String] = blackListDF.as[String].collect().toSet

        // Ship the current blacklist to the executors.
        val broadCastBlackList: Broadcast[Set[String]] = spark.sparkContext.broadcast(blackList)

        // Keep only lines whose first comma-separated field (the mdn) is blacklisted.
        // Null Kafka values and lines that split into zero fields (e.g. ",") are dropped
        // instead of failing the task. The result is persisted because two actions
        // consume it below; without it the batch would be read and filtered twice.
        val blockRDD: RDD[String] = rdd
          .map(_.value())
          .filter(line =>
            Option(line)
              .flatMap(_.split(",").headOption)
              .exists(mdn => broadCastBlackList.value.contains(mdn)))
          .persist()

        try {
          blockRDD.foreach(println)

          // Append the matching lines to MySQL.
          blockRDD
            .toDF("line")
            .write
            .mode(SaveMode.Append)
            .format("jdbc")
            .option("url", "jdbc:mysql://master:3306")
            .option("dbtable", "student.dianxin_black")
            .option("user", "root")
            .option("password", "123456")
            .save()
        } finally {
          // Release per-batch resources even when the write fails, so a broken batch
          // does not leak cached blocks or broadcast copies on the executors.
          blockRDD.unpersist()
          broadCastBlackList.unpersist()
        }
      }
    })

    try {
      ssc.start()
      ssc.awaitTermination()
    } finally {
      // awaitTermination rethrows streaming errors; make sure the context is still stopped.
      ssc.stop()
    }

  }
}
复制代码

 

posted @   坤坤无敌  阅读(39)  评论(0编辑  收藏  举报
编辑推荐:
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
阅读排行:
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· C#/.NET/.NET Core优秀项目和框架2025年2月简报
· DeepSeek在M芯片Mac上本地化部署
· 葡萄城 AI 搜索升级:DeepSeek 加持,客户体验更智能
点击右上角即可分享
微信分享提示