package com.it.baizhan.scalacode.Streaming

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{SparkConf, TaskContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

/**
 * Spark Streaming 2.3.1 + Kafka 0.11 Direct-mode integration.
 */
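// Build note (assumed sbt coordinates matching the versions above):
//   libraryDependencies += "org.apache.spark" %% "spark-streaming" % "2.3.1"
//   libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.3.1"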
object SparkStreamingReadKafka {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("SparkStreamingReadKafka")
    val ssc = new StreamingContext(conf, Durations.seconds(5)) // 5-second batch interval
    // ssc.sparkContext.setLogLevel("ERROR")

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "mynode1:9092,mynode2:9092,mynode3:9092", // Kafka cluster brokers
      "key.deserializer" -> classOf[StringDeserializer],   // deserializer for record keys
      "value.deserializer" -> classOf[StringDeserializer], // deserializer for record values
      "group.id" -> "firstgroup", // consumer group; Kafka stores committed offsets per group
      /**
       * auto.offset.reset takes effect only when Kafka holds no committed offset for this group:
       *   latest   : start from messages produced after the consumer connects
       *   earliest : start from the beginning of the topic
       * If the group already has a committed offset, consumption resumes from it.
       */
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean) // disable the 5s-interval auto-commit; offsets are committed manually below
    )

    val topics = Array[String]("streamingtopic")

    val ds: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent, // location strategy: distribute Kafka partitions evenly across available executors
      Subscribe[String, String](topics, kafkaParams)
    )
    val lines: DStream[String] = ds.map { cr =>
      // Note: these printlns run on the executors (they reach this console only in local mode).
      println(s"message key = ${cr.key()}")
      println(s"message value = ${cr.value()}")
      cr.value()
    }
    val words: DStream[String] = lines.flatMap(line => line.split("\t"))
    val pairWords: DStream[(String, Int)] = words.map(word => (word, 1))
    val result: DStream[(String, Int)] = pairWords.reduceByKey(_ + _)
    result.print() // print the first 10 word counts of each batch

    // After this batch's outputs have completed, asynchronously commit its offsets back to Kafka.
    // commitAsync must be called on the original stream returned by createDirectStream, not on a derived DStream.
    ds.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
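      // Optional sketch (the standard pattern from the Spark + Kafka 0-10 integration guide, assumed
      // applicable here): each RDD partition of a direct stream maps 1:1 to a Kafka topic-partition,
      // so the matching OffsetRange can be looked up by partition id on the executors.
      rdd.foreachPartition { _ =>
        val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
        println(s"${o.topic} ${o.partition} offsets: ${o.fromOffset} -> ${o.untilOffset}")
      }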
      ds.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges) // asynchronously commit consumer offsets to Kafka
    }
    ssc.start()
    ssc.awaitTermination()
  }

}
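
// Smoke test (assumed CLI for a Kafka 0.11 installation): produce tab-separated words into the
// topic and watch the per-batch word counts printed every 5 seconds:
//   kafka-console-producer.sh --broker-list mynode1:9092 --topic streamingtopic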