Spark Streaming Reading Data from Kafka [Spark Streaming 2.3.1 + Kafka 0.11 Direct-Mode Integration]

package com.it.baizhan.scalacode.Streaming

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{SparkConf, TaskContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

/**
  * Spark Streaming 2.3.1 + Kafka 0.11 Direct-mode integration.
  */
object SparkStreamingReadKafka {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("SparkStreamingReadKafka")
    val ssc = new StreamingContext(conf, Durations.seconds(5))
//    ssc.sparkContext.setLogLevel("Error")

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "mynode1:9092,mynode2:9092,mynode3:9092", // Kafka broker list
      "key.deserializer" -> classOf[StringDeserializer],   // deserializer for the record key
      "value.deserializer" -> classOf[StringDeserializer], // deserializer for the record value
      "group.id" -> "firstgroup", // consumer group; Kafka stores committed offsets per group
      /**
        * latest   : after connecting to Kafka, only read data produced from that point on.
        * earliest : if Kafka already holds a committed offset for this consumer group, resume
        *            from that position; otherwise start consuming from the earliest available offset.
        */
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean) // disable auto-commit of offsets (default period 5s); offsets are committed manually below
    )

    val topics = Array[String]("streamingtopic")

    val ds: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent, // location strategy: distribute Kafka partitions evenly across the available executors
      Subscribe[String, String](topics, kafkaParams)
    )
    val lines: DStream[String] = ds.map(cr => {
      println(s"message key = ${cr.key()}")
      println(s"message value = ${cr.value()}")
      cr.value()
    })
    val words: DStream[String] = lines.flatMap(line => line.split("\t"))
    val pairWords: DStream[(String, Int)] = words.map(word => (word, 1))
    val result = pairWords.reduceByKey((v1, v2) => v1 + v2)
    result.print()

    // After the batch's business logic has finished, asynchronously commit the batch's offsets
    // back to Kafka. commitAsync must be called on the source DStream, not on a transformed one.
    ds.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // some time later, after outputs have completed
      ds.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges) // asynchronously commit consumer offsets to Kafka
    }
    ssc.start()
    ssc.awaitTermination()
  }

}
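To compile and run the example, the project needs the Spark Streaming core artifact and the Kafka 0.10 integration module (which also works against 0.11 brokers). Below is a minimal build.sbt sketch, assuming sbt and a Scala 2.11 Spark build; the versions are assumptions, adjust them to your environment.

// build.sbt (sketch; Scala and Spark versions are assumptions, adjust to your cluster)
scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  // mark these as "provided" instead when packaging for spark-submit on a cluster
  "org.apache.spark" %% "spark-streaming" % "2.3.1",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.3.1"
)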
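The OffsetRange values returned by HasOffsetRanges are not only useful for committing; they can also be inspected per partition, for example to log progress or to store offsets in an external system. A minimal sketch of that pattern, reusing the names from the listing above (it extends the existing foreachRDD block rather than adding a second one, and it is what the TaskContext and OffsetRange imports are for):

ds.foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  rdd.foreachPartition { _ =>
    // With the direct approach each task handles exactly one Kafka partition,
    // so it can look up its own offset range by partition id.
    val o = offsetRanges(TaskContext.get.partitionId)
    println(s"topic=${o.topic} partition=${o.partition} from=${o.fromOffset} until=${o.untilOffset}")
  }
  ds.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}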
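For a quick local test, something has to produce tab-separated lines into streamingtopic. The broker list and topic name below are taken from the example; everything else (object name, message contents) is a hypothetical sketch, not part of the original post.

package com.it.baizhan.scalacode.Streaming

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

// Hypothetical helper: writes a few tab-separated lines so the word count above has input.
object ProduceTestData {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "mynode1:9092,mynode2:9092,mynode3:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)
    (1 to 100).foreach { i =>
      producer.send(new ProducerRecord[String, String]("streamingtopic", s"key-$i", "hello\tspark\tstreaming"))
    }
    producer.close()
  }
}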