Kafka + Spark Streaming + ZooKeeper (storing offsets in ZK to work around the checkpoint problem)

Spark's checkpoint directory ties offset recovery to the serialized application code, so a redeployed or modified job cannot resume from it; persisting offsets to ZooKeeper ourselves avoids that limitation.

Create a topic

./kafka-topics.sh --create --zookeeper 192.168.1.244:2181,192.168.1.245:2181,192.168.1.246:2181 --replication-factor 1 --partitions 1 --topic topic_test_zk_minOffset_zkGroup

List the existing topics

./kafka-topics.sh --list --zookeeper 192.168.1.244:2181,192.168.1.245:2181,192.168.1.246:2181

The producer code is as follows:

package com.kafka.test;
 
import java.util.Properties;
 
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
 
/**
 * @author FengZhen
 * @create 2018-08-09
 */
public class Producer_zk {
 
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "192.168.1.244:6667,192.168.1.247:6667");
        //props.put("zookeeper.connect", "192.168.1.244:2181,192.168.1.245:2181,192.168.1.246:2181");
        // wait for the full ISR to acknowledge each record
        props.put("acks", "all");
        props.put("retries", 0);
        props.put("batch.size", 16384);
        props.put("linger.ms", 1);
        props.put("buffer.memory", 33554432);
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        KafkaProducer<String, String> producer = new KafkaProducer<String, String>(props);
        // send ten test records keyed 30..39
        for (int i = 30; i < 40; i++) {
            producer.send(new ProducerRecord<String, String>("topic_test_zk_minOffset_zkGroup", Integer.toString(i), "中文测试-" + Integer.toString(i)));
        }

        producer.close();
    }
     
}

The Spark Streaming code is as follows:

package streaming
 
import kafka.api.{OffsetRequest, PartitionOffsetRequestInfo, TopicMetadataRequest}
import kafka.common.TopicAndPartition
import kafka.consumer.SimpleConsumer
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
 
object KafkaLog_local_zk_minOffset_zkGroup {
 
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("KafkaLog_local_zk_minOffset_zkGroup").setMaster("local[2]")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(5))
 
    val broker_servers = "192.168.1.244:6667,192.168.1.247:6667"
    val zk_host = "192.168.1.244:2181,192.168.1.245:2181,192.168.1.246:2181"
    // name of the topic to consume
    val topic : String = "topic_test_zk_minOffset_zkGroup"
    // set of topic names used when creating the stream
    val topics : Set[String] = Set(topic)
 
    var kafkaParam:Map[String,String] = Map()
    kafkaParam += ("bootstrap.servers" -> broker_servers)
    kafkaParam += ("group.id" -> "test")
    kafkaParam += ("enable.auto.commit" -> "true")
    kafkaParam += ("auto.commit.interval.ms" -> "100")
 
    // create a ZKGroupTopicDirs object for this consumer group and topic;
    // it determines where the offsets are stored in ZooKeeper
    val topicDirs = new ZKGroupTopicDirs("topic_test_zk_minOffset_zkGroup_group", topic)

    // the resulting ZooKeeper path is /consumers/<group>/offsets/<topic>, here:
    // /consumers/topic_test_zk_minOffset_zkGroup_group/offsets/topic_test_zk_minOffset_zkGroup
    val zkTopicPath = s"${topicDirs.consumerOffsetDir}"

    // create a client from the ZooKeeper host:port list
    val zkClient = new ZkClient(zk_host)
    // check whether the path has child nodes (they exist if we previously saved an offset per partition)
    val children = zkClient.countChildren(zkTopicPath)
 
    var kafkaStream : InputDStream[(String, String)] = null
 
    // if offsets were saved in ZooKeeper, use them as the starting position of kafkaStream
    var fromOffsets: Map[TopicAndPartition, Long] = Map()

    // if offsets were saved, they must also be compared with the smallest offset still on Kafka,
    // otherwise an OffsetOutOfRange error is thrown
    if (children > 0) {
      for (i <- 0 until children) {
        val topic2 = List(topic)
        val req = new TopicMetadataRequest(topic2, 0)
        // first argument is a Kafka broker host, second is its port
        val getLeaderConsumer = new SimpleConsumer("192.168.1.244", 6667, 10000, 10000, "OffsetLookup")
        val res = getLeaderConsumer.send(req)
        val topicMetaOption = res.topicsMetadata.headOption
        val partitions = topicMetaOption match {
          // turn the metadata into a partition -> leader-host map
          case Some(tm) =>
            tm.partitionsMetadata.map(pm => (pm.partitionId, pm.leader.get.host)).toMap[Int, String]
          case None =>
            Map[Int, String]()
        }
        // look up the leader host for this partition
        val brokerLeaderHost = partitions(i)

        val partitionOffset = zkClient.readData[String](s"${zkTopicPath}/${i}")
        val tp = TopicAndPartition(topic, i)

        val requestMin = OffsetRequest(Map(tp -> PartitionOffsetRequestInfo(OffsetRequest.EarliestTime, 1)))
        val consumerMin = new SimpleConsumer(brokerLeaderHost, 6667, 10000, 10000, "getMinOffset")
        val curOffsets = consumerMin.getOffsetsBefore(requestMin).partitionErrorAndOffsets(tp).offsets
        var nextOffset = partitionOffset.toLong
        // compare the smallest offset on this Kafka partition with the offset saved in ZK:
        // if the saved offset has already expired from Kafka, start from the earliest available one
        if (curOffsets.length > 0 && nextOffset < curOffsets.head) {
          nextOffset = curOffsets.head
        }
        // record the corrected starting offset (setting nextOffset to a special value such as 0
        // makes the offset-expiry behaviour easy to observe)
        fromOffsets += (tp -> nextOffset)
        println("@@@@@@ topic[" + topic + "] partition[" + i + "] offset[" + partitionOffset + "] @@@@@@")
      }

      // transform each Kafka message so the stream ultimately carries (topic_name, message) tuples
      val messageHandler = (mmd : MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
      kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParam, fromOffsets, messageHandler)
    }
    else {
      // if no offsets were saved, fall back to the newest or oldest offset according to kafkaParam
      kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParam, topics)
    }
 
    var offsetRanges = Array[OffsetRange]()
    // capture the Kafka offset ranges backing each RDD of this stream
    kafkaStream.transform{ rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }.foreachRDD { rdd =>
      // process the batch first ...
      rdd.foreachPartition(
        message => {
          while(message.hasNext) {
            println("@^_^@   [" + message.next() + "] @^_^@")
          }
        }
      )
      // ... then save each partition's untilOffset to ZooKeeper, so a restart resumes
      // after the last fully processed batch (saving fromOffset instead would re-read
      // the previous batch on every restart)
      for (o <- offsetRanges) {
        val zkPath = s"${zkTopicPath}/${o.partition}"
        ZkUtils.updatePersistentPath(zkClient, zkPath, o.untilOffset.toString)
        println(s"@@@@@@ topic ${o.topic}  partition ${o.partition}  fromOffset ${o.fromOffset}  untilOffset ${o.untilOffset} @@@@@@")
      }
    }
    // start the streaming computation
    ssc.start()
    // block until the application terminates
    ssc.awaitTermination()
  }
}
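
The OutOfRange guard above boils down to one rule: keep the ZK-saved offset unless Kafka has already expired that data, in which case fall back to the earliest offset still on the broker. A minimal sketch of that rule as a standalone helper, written against the same 0.8-era SimpleConsumer API used above (the object and method names are our own):

import kafka.api.{OffsetRequest, PartitionOffsetRequestInfo}
import kafka.common.TopicAndPartition
import kafka.consumer.SimpleConsumer

object OffsetClamp {
  // Return the saved offset, unless the broker has already expired that data,
  // in which case return the earliest offset still available on the partition.
  def clampToEarliest(consumer: SimpleConsumer, tp: TopicAndPartition, savedOffset: Long): Long = {
    val request = OffsetRequest(Map(tp -> PartitionOffsetRequestInfo(OffsetRequest.EarliestTime, 1)))
    val offsets = consumer.getOffsetsBefore(request).partitionErrorAndOffsets(tp).offsets
    if (offsets.nonEmpty && savedOffset < offsets.head) offsets.head else savedOffset
  }
}

With this helper, the loop body above reduces to fromOffsets += (tp -> OffsetClamp.clampToEarliest(consumerMin, tp, partitionOffset.toLong)).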

 

Problems encountered

An error was thrown when using SimpleConsumer:

Exception in thread "main" java.nio.channels.ClosedChannelException
    at kafka.network.BlockingChannel.send(BlockingChannel.scala:100)
    at kafka.consumer.SimpleConsumer.liftedTree1$1(SimpleConsumer.scala:78)
    at kafka.consumer.SimpleConsumer.kafka$consumer$SimpleConsumer$$sendRequest(SimpleConsumer.scala:68)
    at kafka.consumer.SimpleConsumer.getOffsetsBefore(SimpleConsumer.scala:127)
    at streaming.KafkaLog_local_zk_minOffset$$anonfun$main$1.apply$mcVI$sp(KafkaLog_local_zk_minOffset.scala:64)
    at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:160)
    at streaming.KafkaLog_local_zk_minOffset$.main(KafkaLog_local_zk_minOffset.scala:44)
    at streaming.KafkaLog_local_zk_minOffset.main(KafkaLog_local_zk_minOffset.scala)
Fix: adjust the following parameters in server.properties under the Kafka config directory:

num.network.threads=3
zookeeper.connection.timeout.ms=6000

Then try again.
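
Independent of the broker settings, note that the job above never calls close() on getLeaderConsumer or consumerMin; closing consumers promptly keeps the underlying blocking channels from lingering. A small loan-pattern sketch (withConsumer is a hypothetical helper name; the timeouts are illustrative):

import kafka.consumer.SimpleConsumer

def withConsumer[T](host: String, port: Int)(f: SimpleConsumer => T): T = {
  val consumer = new SimpleConsumer(host, port, 10000, 10000, "offsetLookup")
  try f(consumer) finally consumer.close()  // always release the blocking channel
}

For example, the min-offset lookup becomes withConsumer(brokerLeaderHost, 6667) { c => c.getOffsetsBefore(requestMin) }.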
