Spark Streaming 7: Direct

package com.shujia.spark.streaming

import java.util

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import redis.clients.jedis.Jedis

import scala.collection.mutable

object Demo7Direct {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setAppName("streaming")
      .setMaster("local[4]")


    /**
      * Create the streaming context and specify the batch interval,
      * i.e. how often a batch is computed.
      */
    val ssc = new StreamingContext(conf, Durations.seconds(5))


    /**
      * earliest
      *   If a partition has a committed offset, consume from that offset; otherwise consume from the beginning.
      * latest
      *   If a partition has a committed offset, consume from that offset; otherwise consume only newly produced data.
      * none
      *   If every partition has a committed offset, consume from those offsets; if any partition has no committed offset, throw an exception.
      */

    val groupId = "asdasda"

    val kafkaParams: Map[String, Object] = Map[String, Object](
      "bootstrap.servers" -> "master:9092,node1:9092,node2:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> "false"
    )

    val topics = Array("test_topic2")


    val key: String = groupId + ":" + "test_topic2"

    /**
      * Read the consumer offsets from Redis.
      */

    val redis = new Jedis("master", 6379)

    val map: util.Map[String, String] = redis.hgetAll(key)

    import scala.collection.JavaConverters._

    val scalaMap: mutable.Map[String, String] = map.asScala

    // Consumed offsets: (topic, partition) -> offset
    val partitionOffset: Map[TopicPartition, Long] = scalaMap.map(kv => {
      val partition: String = kv._1
      val offset: String = kv._2

      val tp = new TopicPartition("test_topic2", partition.toInt)

      (tp, offset.toLong)
    }).toMap

    println(partitionOffset)
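
    // Note: on the first run the Redis hash is empty, so partitionOffset is an empty map and
    // the Subscribe strategy below falls back to "auto.offset.reset" ("earliest" here).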


    val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams, partitionOffset) // specify the consumer offsets when subscribing
    )


    stream.foreachRDD(rdd => {


      // user-defined processing logic
      rdd.map(_.value()).foreach(println)

      // get the offset ranges consumed by this batch
      val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges


      /**
        * The offsets can be saved to Redis:
        *   key   : consumer group + topic
        *   field : partition
        *   value : untilOffset
        *
        * Start Redis with:
        *   ./redis-server redis.conf
        */
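
      // Illustrative only (hypothetical offsets): after one batch the hash might look like
      //   HGETALL asdasda:test_topic2  ->  { "0" -> "128", "1" -> "131", "2" -> "127" }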


      // 1. create a Redis connection
      val jedis = new Jedis("master", 6379)

      for (offsetRange <- offsetRanges) {
        val fromOffset: Long = offsetRange.fromOffset

        val partition: Int = offsetRange.partition
        val topic: String = offsetRange.topic
        val untilOffset: Long = offsetRange.untilOffset

        println(topic + "\t" + partition + "\t" + fromOffset + "\t" + untilOffset)



        // save the consumed offset to Redis
        jedis.hset(key, partition.toString, untilOffset.toString)

      }

      // release the Redis connection once this batch's offsets have been saved
      jedis.close()

    })


    ssc.start()
    ssc.awaitTermination()
    ssc.stop()

  }
}
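
The loop above writes each partition's offset with a separate HSET, so a crash in the middle of the loop can leave the hash partially updated. A minimal sketch of a variant that writes all partitions of a batch in one Redis MULTI/EXEC transaction (the helper name saveOffsets is hypothetical; the host, port, and key layout are assumed to match the listing above):

import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.{Jedis, Transaction}

// Hypothetical helper: write every partition's untilOffset for one batch in a single Redis transaction
def saveOffsets(redisHost: String, key: String, offsetRanges: Array[OffsetRange]): Unit = {
  val jedis = new Jedis(redisHost, 6379) // assumed host/port, same as the listing above
  try {
    val tx: Transaction = jedis.multi() // open a MULTI/EXEC transaction
    offsetRanges.foreach(r => tx.hset(key, r.partition.toString, r.untilOffset.toString))
    tx.exec() // all HSET commands are applied together
  } finally {
    jedis.close()
  }
}

Since enable.auto.commit is set to false, another option is to commit the same offsetRanges back to Kafka itself with stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges); Kafka then stores them in its internal __consumer_offsets topic instead of Redis.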