kafka-clients 0.10.1.0 版本存在缺陷,会导致 Spark 只能获取到 topic 中一个分区的数据
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.10.1.0</version> <!-- this version is problematic: Spark only receives data from one partition -->
</dependency>
解决办法:将 kafka-clients 依赖版本替换为 0.10.0.1,或升级到 0.10.2.0 及更高版本即可。
https://stackoverflow.com/questions/42239005/spark-structured-stream-get-messages-from-only-one-partition-of-kafka
问题现象:某个 topic 有 2 个分区,使用 Spark Streaming 读取时只能获取到其中一个分区的数据,复现代码如下:
/**
 * Entry point: reads `topic` through a Kafka direct stream with a 5-second
 * batch interval. For every batch it prints the offset ranges, prints each
 * record, and then asynchronously commits the offsets back to Kafka.
 *
 * @param args command-line arguments (not used)
 */
def main(args: Array[String]): Unit = {
  // Local-mode Spark configuration; the application is named after this class.
  val sparkConf = new SparkConf().setAppName(getClass.getName).setMaster("local[*]")
  val streamingContext = new StreamingContext(sparkConf, Seconds(5))
  streamingContext.sparkContext.setLogLevel("warn")

  // Kafka consumer settings. Auto-commit is switched off because the offsets
  // are committed explicitly at the end of each batch (see commitAsync below).
  val consumerConfig = Map[String, Object](
    "bootstrap.servers" -> bootstrapServer,
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> groupId,
    "auto.offset.reset" -> "earliest",
    "enable.auto.commit" -> java.lang.Boolean.FALSE
  )
  val subscribedTopics = Array(topic)

  val directStream: InputDStream[ConsumerRecord[String, String]] =
    KafkaUtils.createDirectStream[String, String](
      streamingContext,
      PreferConsistent,
      Subscribe[String, String](subscribedTopics, consumerConfig)
    )

  directStream.foreachRDD { batch =>
    // Capture the offset ranges covered by this batch before doing anything else.
    val ranges: Array[OffsetRange] = batch.asInstanceOf[HasOffsetRanges].offsetRanges
    println(ranges.mkString("Array( ", " , ", " )"))

    // Action: print every record in the batch.
    batch.foreach(println)

    // Commit the offsets only after the action above has run.
    directStream.asInstanceOf[CanCommitOffsets].commitAsync(ranges)
  }

  streamingContext.start()
  streamingContext.awaitTermination()
}