
## Managing Kafka offsets in Spark Streaming with Redis

The approach: when the job starts, it reads the last committed offset for every topic partition from Redis and seeds the direct stream with them; after each micro-batch is processed, it writes the new `untilOffset` values back to Redis inside a pipelined transaction, so a restarted job resumes exactly where it stopped.

### RedisUtils.scala

```scala
import java.io.FileInputStream
import java.util.Properties

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

object RedisUtils {
  private val properties = new Properties()
  // Load jedis.properties from the classpath
  val path: String = Thread.currentThread().getContextClassLoader.getResource("jedis.properties").getPath
  properties.load(new FileInputStream(path))
  val host: String = properties.getProperty("redis.host")
  val auth: String = properties.getProperty("redis.auth")
  val port: Int = properties.getProperty("redis.port").toInt
  val config = new JedisPoolConfig
  config.setMaxTotal(properties.getProperty("redis.maxConn").toInt)
  config.setMaxIdle(properties.getProperty("redis.maxIdle").toInt)
  // Use the overload that takes `auth` if the Redis instance requires a password:
  // val pool: JedisPool = new JedisPool(config, host, port, 10000, auth)
  val pool: JedisPool = new JedisPool(config, host, port, 10000)
  def getConnections(): Jedis = {
    pool.getResource
  }
}
```
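For reference, a minimal `jedis.properties` that satisfies the keys read above; the host and pool sizes are placeholder values, not from the original post:

```properties
# Redis connection (placeholder values)
redis.host=linux01
redis.port=6379
redis.auth=
# Jedis pool sizing
redis.maxConn=20
redis.maxIdle=10
```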

### OffsetKafkaRedis.scala


```scala
import java.util

import org.apache.kafka.clients.consumer.ConsumerRecord

import scala.collection.mutable
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import redis.clients.jedis.{Jedis, Pipeline}
import org.apache.log4j.Logger
import scala.util.Try
import scala.collection.JavaConverters._

object OffsetKafkaRedis {
  private val logger: Logger = Logger.getLogger(this.getClass)
  // Read the last saved offset for every partition of the given topics from Redis.
  // Keys have the form kafka_offset:<groupId>:<topic>:<partition>.
  def getOffset(topics: Array[String], groupId: String): mutable.Map[TopicPartition, Long] = {
    val fromOffset = scala.collection.mutable.Map[TopicPartition, Long]()
    val jedis: Jedis = RedisUtils.getConnections()
    topics.foreach(topic => {
      val keys: util.Set[String] = jedis.keys(s"kafka_offset:${groupId}:${topic}:*")
      if (!keys.isEmpty) {
        keys.asScala.foreach(key => {
          val offset: String = jedis.get(key)
          val partition: String = Try(key.split(s"kafka_offset:${groupId}:${topic}:").apply(1)).getOrElse("0")
          println(s"[INFO] topic: ${topic}, partition: ${partition}, offset: ${offset}")
          fromOffset.put(new TopicPartition(topic, partition.toInt), offset.toLong)
        })
      }
    })
    jedis.close()
    fromOffset
  }
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("redisOffsetDemo").setMaster("local[2]")
    val context = new SparkContext(conf)
    context.setLogLevel("WARN")
    val ssc: StreamingContext = new StreamingContext(context, Seconds(10))
    val topics = Array("offsetDemo")
    val groupId = "g1"
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "linux01:9092,linux02:9092,linux03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // Seed the stream with the offsets recovered from Redis (empty on the first run)
    val offsets: mutable.Map[TopicPartition, Long] = getOffset(topics, groupId)

    val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsets)
    )
    kafkaDStream.foreachRDD(
      rdd => {
        // Grab the offset ranges for this batch
        val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        // Connect to Redis
        val jedis: Jedis = RedisUtils.getConnections()
        // Start a MULTI/EXEC transaction on a pipeline
        val pipeline: Pipeline = jedis.pipelined()
        pipeline.multi()
        // Alternative without a pipeline: val transaction: Transaction = jedis.multi()
        // Application logic
        try {
          val result: RDD[(String, Int)] = rdd.map(_.value()).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
          result.foreach(println)
          offsetRanges.foreach(
            iter => {
              val key: String = s"kafka_offset:${groupId}:${iter.topic}:${iter.partition}"
              val value: Long = iter.untilOffset
              println(s"[INFO] key: ${key}, new offset: ${value}")
              pipeline.set(key, value.toString)
            }
          )
          // Commit the transaction
          pipeline.exec()
          // Flush the pipeline
          pipeline.sync()
        } catch {
          case e: Exception =>
            logger.error("[ERROR]", e)
            pipeline.discard()
        } finally {
          pipeline.close()
          jedis.close()
        }
      }
    )
    ssc.start()
    ssc.awaitTermination()
  }
}
```
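Because the offsets are written only after the batch results have been produced, a failure between processing and `pipeline.exec()` means the batch is replayed on restart: this scheme is at-least-once, not exactly-once. To see what the job has committed, a minimal sketch reusing `RedisUtils` from above (the `OffsetInspector` object is hypothetical, not part of the original post):

```scala
import scala.collection.JavaConverters._

// Hypothetical helper: dump every offset the streaming job has committed to Redis.
object OffsetInspector {
  def main(args: Array[String]): Unit = {
    val jedis = RedisUtils.getConnections()
    try {
      // Same key layout the job writes: kafka_offset:<groupId>:<topic>:<partition>
      jedis.keys("kafka_offset:g1:offsetDemo:*").asScala.toSeq.sorted.foreach { key =>
        println(s"$key -> ${jedis.get(key)}")
      }
    } finally {
      jedis.close()
    }
  }
}
```

On the very first run no such keys exist, so `getOffset` returns an empty map and the consumer falls back to `auto.offset.reset` (`latest` here).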