Spark Streaming integration with Kafka (Spark Streaming acts as a consumer of Kafka)

Add the Maven (POM) dependencies

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>kafkacode</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- kafka-clients is already pulled in transitively by
             spark-streaming-kafka-0-10_2.11, so it is commented out here -->
        <!--<dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.11.0.0</version>
        </dependency>-->

        <!-- use the Scala 2.11 build of Kafka so it matches the spark-streaming_2.11 artifacts below -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>0.11.0.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.3.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>2.3.1</version>
        </dependency>
    </dependencies>
</project>

Spark Streaming consumer code

package sparkstreaming_kafka

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

/**
 * How Spark Streaming integrates with a Kafka data source (integration simply means creating a DStream backed by Kafka)
 * 1. When Spark Streaming integrates with Kafka, Spark Streaming acts as a Kafka consumer
 */
object KafkaAndSparkStreaming {
  def main(args: Array[String]): Unit = {
    // 1. Create a StreamingContext
    val sparkConf = new SparkConf().setAppName("kafka").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf,Seconds(10))

    // 2. Create a DStream from the Kafka data source (direct connection to the Kafka cluster)
    val topics = Array("student")
    val kafkaParam = Map(
      "bootstrap.servers" -> "192.168.200.111:9092,192.168.200.112:9092,192.168.200.113:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "spark",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    /**
     * Three arguments:
     *  1. the StreamingContext
     *  2. the location strategy (here PreferConsistent)
     *  3. the Kafka topics Spark Streaming subscribes to, plus the parameters needed to connect to the Kafka cluster
     *
     *  Each element of the resulting DStream is a ConsumerRecord, i.e. one Kafka message
     */
    val dStream: DStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParam))
    dStream.foreachRDD((rdd:RDD[ConsumerRecord[String, String]]) => {
      rdd.foreach((data:ConsumerRecord[String, String]) => {
        println("sparkstreaming读取处理了一条kafka的数据"+data.key()+data.value())
      })
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
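
Note that enable.auto.commit is set to false above, but the example never commits its offsets, so after a restart the consumer group falls back to auto.offset.reset. Below is a minimal sketch (the object and variable names are my own, not from the original post) of the standard spark-streaming-kafka-0-10 pattern for committing offsets manually after each batch; it keeps the stream typed as InputDStream so it can be cast to CanCommitOffsets.

package sparkstreaming_kafka

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object KafkaManualCommit {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("kafka-manual-commit").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    val kafkaParam = Map(
      "bootstrap.servers" -> "192.168.200.111:9092,192.168.200.112:9092,192.168.200.113:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "spark",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // keep the InputDStream type; a plain DStream cannot be cast to CanCommitOffsets
    val stream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](Array("student"), kafkaParam))

    stream.foreachRDD { rdd =>
      // read the offset ranges from the source RDD before any shuffle happens
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      // process the batch (here: just print every record)
      rdd.foreach(record => println("key=" + record.key() + ", value=" + record.value()))

      // asynchronously commit this batch's offsets back to Kafka for group.id "spark"
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}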

Run it: start the Spark Streaming consumer job above, then produce data into the student topic

# Produce data from the command line
[root@node3 ~]# kafka-console-producer.sh --broker-list node1:9092,node2:9092,node3:9092 --topic student
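
If the student topic does not exist yet, it can be created first. This is an assumed setup step for illustration (the ZooKeeper address and the partition/replication counts are placeholders); with Kafka 0.11 the topic tool still talks to ZooKeeper:

[root@node3 ~]# kafka-topics.sh --create --zookeeper node1:2181 --partitions 3 --replication-factor 2 --topic student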

# Produce data with Java code
package new_callback_pro;

import org.apache.kafka.clients.producer.Callback;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;

import java.util.Properties;

// New producer API: a producer with a send callback
public class NewProducerCallBack {
    public static void main(String[] args) {
        // 1. Define the key-value configuration the producer uses to connect to the Kafka cluster
        Properties prop = new Properties();
        prop.put("bootstrap.servers", "192.168.200.111:9092,192.168.200.112:9092,192.168.200.113:9092");
        // key serializer (String, so that it matches the Spark Streaming consumer's StringDeserializer above)
        prop.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        // value serializer
        prop.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        // 2. Create a producer
        KafkaProducer<String, String> producer = new KafkaProducer<String, String>(prop);

        // 3. Use the producer to send records to a topic
        for (int i = 0; i < 100; i++) {
            // the record to send (the key is the loop index as a String)
            ProducerRecord<String, String> record = new ProducerRecord<String, String>("student", Integer.toString(i), "message" + i);
            producer.send(record, new Callback() {
                /**
                 * Callback invoked once the producer has finished sending this record
                 * @param recordMetadata  contains the partition and offset of the record within the topic
                 * @param e               the exception if the send failed, otherwise null
                 */
                public void onCompletion(RecordMetadata recordMetadata, Exception e) {
                    System.out.println("当前这个数据的分区为:"+recordMetadata.partition() + "---offset:" + recordMetadata.offset());
                    System.out.println("当前的主题为"+recordMetadata.topic());
                    System.out.println("key为:" + recordMetadata.serializedKeySize() + "---value为:" + recordMetadata.serializedValueSize());
                }
            });
        }

        // 4. Flush any buffered records to the topic, then release the producer's resources
        producer.flush();
        producer.close();
    }
}