Kafka + Spark Streaming + Zookeeper (storing offsets in ZK to solve the checkpoint problem)

Create a topic
```bash
./kafka-topics.sh --create --zookeeper 192.168.1.244:2181,192.168.1.245:2181,192.168.1.246:2181 --replication-factor 1 --partitions 1 --topic topic_test_zk_minOffset_zkGroup
```
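To check the partition count, replicas, and leader assignment of the new topic, kafka-topics.sh also supports --describe with the same ZooKeeper connect string:

```bash
# Show partitions, replicas and the current leader for the topic
./kafka-topics.sh --describe --zookeeper 192.168.1.244:2181,192.168.1.245:2181,192.168.1.246:2181 --topic topic_test_zk_minOffset_zkGroup
```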
List topics
```bash
./kafka-topics.sh --list --zookeeper 192.168.1.244:2181,192.168.1.245:2181,192.168.1.246:2181
```
The producer code is as follows:
```java
package com.kafka.test;

import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

/**
 * @author: FengZhen
 * @create: 2018-08-09
 */
public class Producer_zk {

    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "192.168.1.244:6667,192.168.1.247:6667");
        //props.put("zookeeper.connect", "192.168.1.244:2181,192.168.1.245:2181,192.168.1.246:2181");
        props.put("acks", "all");
        props.put("retries", 0);
        props.put("batch.size", 16384);
        props.put("linger.ms", 1);
        props.put("buffer.memory", 33554432);
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        KafkaProducer<String, String> producer = new KafkaProducer<String, String>(props);
        for (int i = 30; i < 40; i++)
            producer.send(new ProducerRecord<String, String>("topic_test_zk_minOffset_zkGroup",
                    Integer.toString(i), "中文测试-" + Integer.toString(i)));
        producer.close();
    }
}
```
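To confirm the messages actually landed on the topic, the console consumer can replay them from the beginning (the old ZooKeeper-based consumer is used here to match this setup; on newer Kafka versions you would pass --bootstrap-server 192.168.1.244:6667 instead of --zookeeper):

```bash
# Replay all messages on the topic from offset 0
./kafka-console-consumer.sh --zookeeper 192.168.1.244:2181,192.168.1.245:2181,192.168.1.246:2181 --topic topic_test_zk_minOffset_zkGroup --from-beginning
```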
The Spark Streaming code is as follows:
```scala
package streaming

import kafka.api.{OffsetRequest, PartitionOffsetRequestInfo, TopicMetadataRequest}
import kafka.common.TopicAndPartition
import kafka.consumer.SimpleConsumer
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object KafkaLog_local_zk_minOffset_zkGroup {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("KafkaLog_local_zk_minOffset_zkGroup").setMaster("local[2]")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(5))

    val broker_servers = "192.168.1.244:6667,192.168.1.247:6667"
    val zk_host = "192.168.1.244:2181,192.168.1.245:2181,192.168.1.246:2181"

    // Name of the topic to consume
    val topic: String = "topic_test_zk_minOffset_zkGroup"
    // Set of topic names used when creating the stream
    val topics: Set[String] = Set(topic)

    var kafkaParam: Map[String, String] = Map()
    kafkaParam += ("bootstrap.servers" -> broker_servers)
    kafkaParam += ("group.id" -> "test")
    kafkaParam += ("enable.auto.commit" -> "true")
    kafkaParam += ("auto.commit.interval.ms" -> "100")

    // Create a ZKGroupTopicDirs object for the consumer group and topic whose offsets we save
    val topicDirs = new ZKGroupTopicDirs("topic_test_zk_minOffset_zkGroup_group", topic)
    // Get the ZooKeeper path; this becomes /consumers/<group>/offsets/<topic>, e.g.
    // /consumers/topic_test_zk_minOffset_zkGroup_group/offsets/topic_test_zk_minOffset_zkGroup/0
    val zkTopicPath = s"${topicDirs.consumerOffsetDir}"
    // Create a client against the ZooKeeper ensemble
    val zkClient = new ZkClient(zk_host)
    // Check whether the path already has child nodes (they exist if we previously saved offsets per partition)
    val children = zkClient.countChildren(zkTopicPath)

    var kafkaStream: InputDStream[(String, String)] = null
    // If offsets were saved in ZooKeeper, use them as the starting positions of the kafkaStream
    var fromOffsets: Map[TopicAndPartition, Long] = Map()

    // If offsets were saved, they should also be compared with the smallest offset still on Kafka,
    // otherwise an OffsetOutOfRange error is thrown
    if (children > 0) {
      for (i <- 0 until children) {
        val topic2 = List(topic)
        val req = new TopicMetadataRequest(topic2, 0)
        // First argument is a Kafka broker host, second is its port
        val getLeaderConsumer = new SimpleConsumer("192.168.1.244", 6667, 10000, 10000, "OffsetLookup")
        val res = getLeaderConsumer.send(req)
        val topicMetaOption = res.topicsMetadata.headOption
        val partitions = topicMetaOption match {
          // Turn the metadata into a partition -> leader-host map
          case Some(tm) => tm.partitionsMetadata.map(pm => (pm.partitionId, pm.leader.get.host)).toMap[Int, String]
          case None => Map[Int, String]()
        }
        // Extract the leader host for this partition
        val brokerLeaderHost = partitions.get(i).toString.replace("Some(", "").replace(")", "")

        val partitionOffset = zkClient.readData[String](s"${zkTopicPath}/${i}")
        val tp = TopicAndPartition(topic, i)

        val requestMin = OffsetRequest(Map(tp -> PartitionOffsetRequestInfo(OffsetRequest.EarliestTime, 1)))
        val consumerMin = new SimpleConsumer(brokerLeaderHost, 6667, 10000, 10000, "getMinOffset")
        val curOffsets = consumerMin.getOffsetsBefore(requestMin).partitionErrorAndOffsets(tp).offsets

        var nextOffset = partitionOffset.toLong
        // Compare the smallest offset on Kafka for this partition with the offset saved in ZK and pick the valid one
        if (curOffsets.length > 0 && nextOffset < curOffsets.head) {
          nextOffset = curOffsets.head
        }
        // Use the corrected offset; setting nextOffset to 0 (just a special value) lets you observe what happens when an offset has expired
        fromOffsets += (tp -> nextOffset)
        println("@@@@@@ topic[" + topic + "] partition[" + i + "] offset[" + partitionOffset + "] @@@@@@")
      }
      // Transform the Kafka messages so each record becomes a (topic_name, message) tuple
      val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
      kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParam, fromOffsets, messageHandler)
    } else {
      // If no offsets were saved, start from the latest or earliest offset according to kafkaParam
      kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParam, topics)
    }

    var offsetRanges = Array[OffsetRange]()
    // Capture the Kafka offset ranges for each RDD
    kafkaStream.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }.foreachRDD { rdd => //.map(msg => Utils.msgDecode(msg))
      for (o <- offsetRanges) {
        val zkPath = s"${zkTopicPath}/${o.partition}"
        // Save this partition's offset to ZooKeeper
        ZkUtils.updatePersistentPath(zkClient, zkPath, o.fromOffset.toString)
        println(s"@@@@@@ topic ${o.topic} partition ${o.partition} fromoffset ${o.fromOffset} untiloffset ${o.untilOffset} #######")
      }
      rdd.foreachPartition(
        message => {
          while (message.hasNext) {
            println("@^_^@ [" + message.next() + "] @^_^@")
          }
        }
      )
    }

    // Start the streaming computation
    ssc.start()
    // Blocks until the application terminates
    ssc.awaitTermination()
  }
}
```
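After the job has processed a couple of batches, the offsets written by ZkUtils.updatePersistentPath can be inspected directly with ZooKeeper's CLI (partition 0 assumed; the path is the one shown in the comments above):

```bash
# Connect to one of the ZooKeeper nodes
./zkCli.sh -server 192.168.1.244:2181
# Inside the zkCli shell, read the offset node for partition 0
get /consumers/topic_test_zk_minOffset_zkGroup_group/offsets/topic_test_zk_minOffset_zkGroup/0
```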
Problems encountered

An error is thrown when using SimpleConsumer:
```
Exception in thread "main" java.nio.channels.ClosedChannelException
    at kafka.network.BlockingChannel.send(BlockingChannel.scala:100)
    at kafka.consumer.SimpleConsumer.liftedTree1$1(SimpleConsumer.scala:78)
    at kafka.consumer.SimpleConsumer.kafka$consumer$SimpleConsumer$$sendRequest(SimpleConsumer.scala:68)
    at kafka.consumer.SimpleConsumer.getOffsetsBefore(SimpleConsumer.scala:127)
    at streaming.KafkaLog_local_zk_minOffset$$anonfun$main$1.apply$mcVI$sp(KafkaLog_local_zk_minOffset.scala:64)
    at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:160)
    at streaming.KafkaLog_local_zk_minOffset$.main(KafkaLog_local_zk_minOffset.scala:44)
    at streaming.KafkaLog_local_zk_minOffset.main(KafkaLog_local_zk_minOffset.scala)
```
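Before changing any broker settings, it is worth confirming that the broker port SimpleConsumer connects to (6667 here) is reachable at all from the machine running the job; a quick check, assuming nc is available, is:

```bash
# Should report the connection as succeeded if the broker is listening on 6667
nc -vz 192.168.1.244 6667
```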
Fix: adjust the following parameters in server.properties under Kafka's config directory:
```
num.network.threads=3
zookeeper.connection.timeout.ms=6000
```
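The change only takes effect after the broker is restarted. Assuming the commands are run from Kafka's bin directory with the stock config layout, a minimal restart sequence looks like:

```bash
# Stop the broker, then start it again in the background with the edited config
./kafka-server-stop.sh
./kafka-server-start.sh -daemon ../config/server.properties
```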
Then run the job again.