ZhangZhihui's Blog  

 

 

from delta import configure_spark_with_delta_pip, DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

builder = (SparkSession.builder
           .appName("delta-write-streaming")
           .master("spark://ZZHPC:7077")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder, ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

get_ipython().run_line_magic('load_ext', 'sparksql_magic')
get_ipython().run_line_magic('config', 'SparkSql.limit=20')

 

%%sparksql
CREATE OR REPLACE TABLE default.users (
    id INT,
    name STRING,
    age INT,
    gender STRING,
    country STRING 
) USING DELTA LOCATION '/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/delta-write-streaming/users';

 

df = (spark.readStream.format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "users")
      .option("startingOffsets", "earliest")
      .load())

 

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('gender', StringType(), True),
    StructField('country', StringType(), True)])

df = df.withColumn('value', from_json(col('value').cast("STRING"), schema))

 

df = df.select(
    col('value.id').alias('id'),
    col('value.name').alias('name'),
    col('value.age').alias('age'),
    col('value.gender').alias('gender'),
    col('value.country').alias('country'))

 

query = (df.writeStream.format("delta")
   .outputMode("append")
   .option("checkpointLocation", "/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/delta-write-streaming/users/_checkpoints/")
   .start("/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/delta-write-streaming/users"))

 

%%sparksql
SELECT COUNT(*) FROM delta.`/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/delta-write-streaming/users`;

 

 

%%sparksql
SELECT COUNT(*) FROM delta.`/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/delta-write-streaming/users`;

 

 

query.stop()

spark.stop() 

 

%%sparksql
CREATE OR REPLACE TABLE default.users (
    id INT,
    name STRING,
    age INT,
    gender STRING,
    country STRING 
) USING DELTA LOCATION '/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/idempotent-stream-write-delta/users';

 

df = (spark.readStream.format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "users")
      .option("startingOffsets", "earliest")
      .load())

 

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('gender', StringType(), True),
    StructField('country', StringType(), True)])

df = df.withColumn('value', from_json(col('value').cast("STRING"), schema))

 

df = df.select(
    col('value.id').alias('id'),
    col('value.name').alias('name'),
    col('value.age').alias('age'),
    col('value.gender').alias('gender'),
    col('value.country').alias('country'))

 

query = (df.writeStream
   .format("delta")
   .outputMode("append")
   .option("checkpointLocation", "/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/idempotent-stream-write-delta/users/_checkpoints/")
   .start("/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/idempotent-stream-write-delta/users"))

 

# Define a function writing to two destinations
app_id = 'idempotent-stream-write-delta'
def writeToDeltaLakeTableIdempotent(batch_df, batch_id):
    # location 1
    (batch_df.filter("country IN ('India','China')")
     .write.format("delta")
     .mode("append")
     .option("txnVersion", batch_id)
     .option("txnAppId", app_id)
     .save("/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/idempotent-stream-write-delta/user_asia"))
    # location 2
    (batch_df.filter("country IN ('USA','Canada','Brazil')")
     .write.format("delta")
     .mode("append")
     .option("txnVersion", batch_id)
     .option("txnAppId", app_id)
     .save("/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/idempotent-stream-write-delta/user_americas"))

 

# Apply the function against the micro-batches using 'foreachBatch'
write_query = (df.writeStream.format("delta")
 .queryName("Users By Region")
 .foreachBatch(writeToDeltaLakeTableIdempotent)
 .start())

 

 

%%sparksql
SELECT COUNT(*) FROM delta.`/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/idempotent-stream-write-delta/user_asia`;

 

 

%%sparksql
SELECT COUNT(*) FROM delta.`/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/idempotent-stream-write-delta/user_americas`;

 

 

%%sparksql
SELECT COUNT(*) FROM delta.`/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/idempotent-stream-write-delta/user_asia`;

 

 

%%sparksql
SELECT COUNT(*) FROM delta.`/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/idempotent-stream-write-delta/user_americas`;

 

 

query.stop()
write_query.stop()

spark.stop() 

 

 

%%sparksql
CREATE OR REPLACE TABLE default.users (
    id INT,
    name STRING,
    age INT,
    gender STRING,
    country STRING 
) USING DELTA LOCATION '/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/merge-cdc-streaming/users';

 

df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "users")
      .option("startingOffsets", "earliest")
      .load())

 

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('gender', StringType(), True),
    StructField('country', StringType(), True)])

df = df.withColumn('value', from_json(col('value').cast("STRING"), schema))

 

df = df.select(
    col('value.id').alias('id'),
    col('value.name').alias('name'),
    col('value.age').alias('age'),
    col('value.gender').alias('gender'),
    col('value.country').alias('country'))

 

def upsertToDelta(microBatchDf, batchId):
    deltaTable = DeltaTable.forPath(spark, "/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/merge-cdc-streaming/users")
    (deltaTable.alias("dt")
     .merge(source=microBatchDf.alias("sdf"), condition="sdf.id = dt.id")
     .whenMatchedUpdate(set={
         "id": "sdf.id",
         "name": "sdf.name",
         "age": "sdf.age",
         "gender": "sdf.gender",
         "country": "sdf.country"
     })
     .whenNotMatchedInsert(values={
         "id": "sdf.id",
         "name": "sdf.name",
         "age": "sdf.age",
         "gender": "sdf.gender",
         "country": "sdf.country"
     })
    .execute())

 

query = (df.writeStream.format("delta")
         .foreachBatch(upsertToDelta)
         .outputMode("update")
         .option("checkpointLocation", "/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/merge-cdc-streaming/users/_checkpoints/")
         .start("/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/merge-cdc-streaming/users"))

 

%%sparksql
DESCRIBE HISTORY delta.`/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/merge-cdc-streaming/users`;

 

 

query.stop()

spark.stop() 

 

 

import random
import json
from kafka import KafkaProducer
import time
import datetime

bootstrap_servers = "localhost:9092"
topic = "orders"

producer = KafkaProducer(bootstrap_servers=bootstrap_servers)

# Define the product IDs and the quantities
product_ids = [1001, 1002, 1003, 1004, 1005]
quantities = [1, 2, 3, 4, 5]

# Define a function to generate random event data
def generate_orders():
    current_time = time.time()
    order_id = random.randint(100000, 999999)
    product_id = random.choice(product_ids)
    quantity = random.choice(quantities)
    timestamp = datetime.datetime.fromtimestamp(current_time).strftime("%m/%d/%Y, %H:%M:%S")
    
    # Create an order dictionary
    return {"order_id": order_id, "product_id": product_id, "quantity": quantity,"timestamp": timestamp}

# Loop to generate and send events
while True:
    # Generate a random event
    order = generate_orders()
    # Print the event to the console
    print(order)
    # Send the event to the Kafka topic
    producer.send(topic, value=json.dumps(order).encode('utf-8'))
    time.sleep(30)

 

from delta import configure_spark_with_delta_pip, DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, to_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

builder = (SparkSession.builder
           .appName("joining-stream-static-data")
           .master("spark://ZZHPC:7077")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder, ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

get_ipython().run_line_magic('load_ext', 'sparksql_magic')
get_ipython().run_line_magic('config', 'SparkSql.limit=20')

 

# Define the schema of the streaming data
streaming_schema = StructType([
    StructField("order_id", IntegerType()),
    StructField("product_id", IntegerType()),
    StructField("quantity", IntegerType()),
    StructField("timestamp", IntegerType())
])

streaming_df = (spark.readStream.format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "orders")
      .option("startingOffsets", "earliest")
      .option("failOnDataLoss", "false")
      .load()
      .withColumn('value', from_json(col('value').cast("STRING"), streaming_schema)))

streaming_df = (streaming_df
      .select(
          col('value.order_id').alias('order_id'),
          col('value.product_id').alias('product_id'),
          col('value.quantity').alias('quantity'),
          to_timestamp(col("timestamp"), "MM/dd/yyyy, HH:mm:ss" ).alias('timestamp'))
     )

 

 

# Define a list of tuples
product_details = [
    (1001, "Laptop", 999.99),
    (1002, "Mouse", 19.99),
    (1003, "Keyboard", 29.99),
    (1004, "Monitor", 199.99),
    (1005, "Speaker", 49.99)
]

# Define a list of column names
columns = ["product_id", "name", "price"]

# Create a DataFrame from the list of tuples
static_df = spark.createDataFrame(product_details, columns)

 

# Join the streaming data with the static data
joined_df = (streaming_df.join(static_df, streaming_df.product_id == static_df.product_id, "inner")
             .drop(static_df.product_id)
             .withColumn('invoice_amount', streaming_df.quantity * static_df.price))

 

query = (joined_df.writeStream.format("delta")
   .outputMode("append")
   .option("failOnDataLoss", "true")
   .option("checkpointLocation", "/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/joining-stream-static/orders/_checkpoints/")
   .start("/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/joining-stream-static/orders")
)

 

%%sparksql
SELECT * FROM delta.`/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/joining-stream-static/orders`;

 

 

query.stop()

spark.stop() 

 

 

 

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('gender', StringType(), True),
    StructField('country', StringType(), True),
    StructField('timestamp', StringType(), True)])

users_df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "users")
      .option("startingOffsets", "latest")
      .load()
      .withColumn('value', from_json(col('value').cast("STRING"), schema)))

users_df = users_df.select(
    col('value.id').alias('id'),
    col('value.name').alias('name'),
    col('value.age').alias('age'),
    col('value.gender').alias('gender'),
    col('value.country').alias('country'),
    to_timestamp(col('value.timestamp'), "MM/dd/yyyy, HH:mm:ss").alias('timestamp'))

 

schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('event_type', StringType(), True),
    StructField('event_time', StringType(), True),
    StructField('processing_time', StringType(), True)])

events_df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "events")
      .option("startingOffsets", "latest")
      .load()
      .withColumn('value', from_json(col('value').cast("STRING"), schema)))

events_df = (events_df.select(
          col('value.user_id').alias('user_id'),
          col('value.event_type').alias('event_type'),
          col('value.event_time').alias('event_time'),
          col('value.processing_time').alias('processing_time'))
      .withColumn("event_time", to_timestamp(col("event_time"), "MM/dd/yyyy, HH:mm:ss" ))
      .withColumn("processing_time", to_timestamp(col("processing_time"), "MM/dd/yyyy, HH:mm:ss")))

 

# Join the two streaming DataFrames on user id
join_df = (events_df.join(users_df.withWatermark("timestamp", "10 minutes"), # Define watermark for users stream
                          events_df.user_id == users_df.id, # Join condition
                          "inner") # Join type
                    .withWatermark("event_time", "1 minutes") # Define watermark for events stream
                    .drop(users_df.id))

 

query = (join_df.writeStream.format("delta")
   .outputMode("append")
   .option("failOnDataLoss", "true")
   .option("checkpointLocation", "/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/joining-stream-stream/user_events/_checkpoints/")
   .start("/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/joining-stream-stream/user_events"))

 

%%sparksql
SELECT event_type, gender, country, count(user_id) 
FROM delta.`/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/joining-stream-stream/user_events` 
GROUP BY ALL;

 

 

 

The %%sparksql result is a point-in-time snapshot and does not refresh automatically; to see updated results, run the query again.
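To avoid re-running the cell by hand, a small polling loop can re-issue the same aggregation with spark.sql (a minimal sketch, assuming the same Delta path and that the streaming query above is still running; the three iterations and 10-second interval are illustrative):

import time

# Hypothetical polling helper: re-run the point-in-time aggregation a few times
user_events_path = "/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/joining-stream-stream/user_events"

for _ in range(3):
    spark.sql(f"""
        SELECT event_type, gender, country, count(user_id) AS user_count
        FROM delta.`{user_events_path}`
        GROUP BY event_type, gender, country
    """).show()
    time.sleep(10)  # wait for the streaming query to commit more micro-batches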

 

query.stop()

spark.stop()

 

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('gender', StringType(), True),
    StructField('country', StringType(), True),
    StructField('timestamp', StringType(), True)])

users_df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "users")
      .option("startingOffsets", "latest")
      .load()
      .withColumn('value', from_json(col('value').cast("STRING"), schema)))

users_df = users_df.select(
    col('value.id').alias('id'),
    col('value.name').alias('name'),
    col('value.age').alias('age'),
    col('value.gender').alias('gender'),
    col('value.country').alias('country'),
    to_timestamp(col('value.timestamp'), "MM/dd/yyyy, HH:mm:ss").alias('timestamp'))

 

query = (users_df.writeStream.format("delta")
   .queryName("user-kafka-stream")
   .outputMode("append")
   .option("checkpointLocation", "/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users/_checkpoints/")
   .start("/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users"))

 

query.status

 

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

 

query.status

 

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

 

query.recentProgress

 

[{'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:09:31.533Z',
  'batchId': 0,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'addBatch': 8919,
   'commitOffsets': 31,
   'getBatch': 12,
   'latestOffset': 510,
   'queryPlanning': 279,
   'triggerExecution': 9798,
   'walCommit': 30},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': None,
    'endOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'latestOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:09:51.347Z',
  'batchId': 1,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'latestOffset': 2, 'triggerExecution': 2},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'endOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'latestOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:10:01.349Z',
  'batchId': 1,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'latestOffset': 1, 'triggerExecution': 1},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'endOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'latestOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:10:11.358Z',
  'batchId': 1,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'latestOffset': 1, 'triggerExecution': 1},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'endOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'latestOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:10:21.362Z',
  'batchId': 1,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'latestOffset': 1, 'triggerExecution': 1},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'endOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'latestOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:10:31.370Z',
  'batchId': 1,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'latestOffset': 0, 'triggerExecution': 1},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'endOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'latestOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:10:38.085Z',
  'batchId': 1,
  'numInputRows': 1,
  'inputRowsPerSecond': 76.92307692307692,
  'processedRowsPerSecond': 0.3155569580309246,
  'durationMs': {'addBatch': 3052,
   'commitOffsets': 53,
   'getBatch': 0,
   'latestOffset': 3,
   'queryPlanning': 15,
   'triggerExecution': 3169,
   'walCommit': 45},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 0, '1': 0, '0': 0}},
    'endOffset': {'users': {'2': 1, '1': 0, '0': 0}},
    'latestOffset': {'users': {'2': 1, '1': 0, '0': 0}},
    'numInputRows': 1,
    'inputRowsPerSecond': 76.92307692307692,
    'processedRowsPerSecond': 0.3155569580309246,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:10:51.257Z',
  'batchId': 2,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'latestOffset': 0, 'triggerExecution': 0},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 1, '1': 0, '0': 0}},
    'endOffset': {'users': {'2': 1, '1': 0, '0': 0}},
    'latestOffset': {'users': {'2': 1, '1': 0, '0': 0}},
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:10:58.046Z',
  'batchId': 2,
  'numInputRows': 1,
  'inputRowsPerSecond': 90.90909090909092,
  'processedRowsPerSecond': 0.4564125969876769,
  'durationMs': {'addBatch': 2113,
   'commitOffsets': 45,
   'getBatch': 0,
   'latestOffset': 2,
   'queryPlanning': 9,
   'triggerExecution': 2191,
   'walCommit': 21},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 1, '1': 0, '0': 0}},
    'endOffset': {'users': {'2': 2, '1': 0, '0': 0}},
    'latestOffset': {'users': {'2': 2, '1': 0, '0': 0}},
    'numInputRows': 1,
    'inputRowsPerSecond': 90.90909090909092,
    'processedRowsPerSecond': 0.4564125969876769,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:11:10.238Z',
  'batchId': 3,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'latestOffset': 1, 'triggerExecution': 1},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 2, '1': 0, '0': 0}},
    'endOffset': {'users': {'2': 2, '1': 0, '0': 0}},
    'latestOffset': {'users': {'2': 2, '1': 0, '0': 0}},
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:11:18.055Z',
  'batchId': 3,
  'numInputRows': 1,
  'inputRowsPerSecond': 90.90909090909092,
  'processedRowsPerSecond': 0.3972983710766786,
  'durationMs': {'addBatch': 2445,
   'commitOffsets': 25,
   'getBatch': 0,
   'latestOffset': 1,
   'queryPlanning': 16,
   'triggerExecution': 2517,
   'walCommit': 29},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 2, '1': 0, '0': 0}},
    'endOffset': {'users': {'2': 2, '1': 1, '0': 0}},
    'latestOffset': {'users': {'2': 2, '1': 1, '0': 0}},
    'numInputRows': 1,
    'inputRowsPerSecond': 90.90909090909092,
    'processedRowsPerSecond': 0.3972983710766786,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:11:30.581Z',
  'batchId': 4,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'latestOffset': 1, 'triggerExecution': 1},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 2, '1': 1, '0': 0}},
    'endOffset': {'users': {'2': 2, '1': 1, '0': 0}},
    'latestOffset': {'users': {'2': 2, '1': 1, '0': 0}},
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:11:38.048Z',
  'batchId': 4,
  'numInputRows': 1,
  'inputRowsPerSecond': 90.90909090909092,
  'processedRowsPerSecond': 0.48995590396864286,
  'durationMs': {'addBatch': 1982,
   'commitOffsets': 32,
   'getBatch': 0,
   'latestOffset': 0,
   'queryPlanning': 7,
   'triggerExecution': 2041,
   'walCommit': 19},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 2, '1': 1, '0': 0}},
    'endOffset': {'users': {'2': 2, '1': 1, '0': 1}},
    'latestOffset': {'users': {'2': 2, '1': 1, '0': 1}},
    'numInputRows': 1,
    'inputRowsPerSecond': 90.90909090909092,
    'processedRowsPerSecond': 0.48995590396864286,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:11:50.095Z',
  'batchId': 5,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'latestOffset': 1, 'triggerExecution': 1},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 2, '1': 1, '0': 1}},
    'endOffset': {'users': {'2': 2, '1': 1, '0': 1}},
    'latestOffset': {'users': {'2': 2, '1': 1, '0': 1}},
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:11:58.055Z',
  'batchId': 5,
  'numInputRows': 1,
  'inputRowsPerSecond': 90.90909090909092,
  'processedRowsPerSecond': 0.5817335660267597,
  'durationMs': {'addBatch': 1670,
   'commitOffsets': 22,
   'getBatch': 0,
   'latestOffset': 0,
   'queryPlanning': 8,
   'triggerExecution': 1719,
   'walCommit': 19},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 2, '1': 1, '0': 1}},
    'endOffset': {'users': {'2': 2, '1': 2, '0': 1}},
    'latestOffset': {'users': {'2': 2, '1': 2, '0': 1}},
    'numInputRows': 1,
    'inputRowsPerSecond': 90.90909090909092,
    'processedRowsPerSecond': 0.5817335660267597,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}},
 {'id': '24ff8275-0172-4aae-96a7-aff88a4882e7',
  'runId': '808e2e6e-d777-4809-8f11-1c8a75135883',
  'name': 'user-kafka-stream',
  'timestamp': '2025-02-08T04:12:09.775Z',
  'batchId': 6,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'latestOffset': 2, 'triggerExecution': 2},
  'stateOperators': [],
  'sources': [{'description': 'KafkaV2[Subscribe[users]]',
    'startOffset': {'users': {'2': 2, '1': 2, '0': 1}},
    'endOffset': {'users': {'2': 2, '1': 2, '0': 1}},
    'latestOffset': {'users': {'2': 2, '1': 2, '0': 1}},
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0,
    'metrics': {'avgOffsetsBehindLatest': '0.0',
     'maxOffsetsBehindLatest': '0',
     'minOffsetsBehindLatest': '0'}}],
  'sink': {'description': 'DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]',
   'numOutputRows': -1}}]

 

 

from pyspark.sql.streaming import StreamingQueryListener

# Define a custom listener class
class MyListener(StreamingQueryListener):

    # Override the onQueryStarted method
    def onQueryStarted(self, event):
        # Print the query name and id when it starts
        print(f"'{event.name}' [{event.id}] got started!")

    # Override the onQueryProgress method
    def onQueryProgress(self, event):
        # Print the full progress report each time the query makes progress
        print("Query made progress: " + str(event.progress))

    # Override the onQueryTerminated method
    def onQueryTerminated(self, event):
        # Print the exception message when it terminates
        if event.exception:
            print(f"Query with id {event.id} terminated with exception: {event}")
        else:
            print(f"Query with id {event.id} terminated normally")

# Create an instance of the listener class
listener = MyListener()

# Register the listener with spark.streams
spark.streams.addListener(listener)

 

Query made progress: {
  "id" : "24ff8275-0172-4aae-96a7-aff88a4882e7",
  "runId" : "808e2e6e-d777-4809-8f11-1c8a75135883",
  "name" : "user-kafka-stream",
  "timestamp" : "2025-02-08T04:31:11.695Z",
  "batchId" : 18,
  "numInputRows" : 1,
  "inputRowsPerSecond" : 90.90909090909092,
  "processedRowsPerSecond" : 0.5858230814294083,
  "durationMs" : {
    "addBatch" : 1654,
    "commitOffsets" : 26,
    "getBatch" : 0,
    "latestOffset" : 0,
    "queryPlanning" : 6,
    "triggerExecution" : 1707,
    "walCommit" : 20
  },
  "stateOperators" : [ ],
  "sources" : [ {
    "description" : "KafkaV2[Subscribe[users]]",
    "startOffset" : {
      "users" : {
        "2" : 8,
        "1" : 4,
        "0" : 5
      }
    },
    "endOffset" : {
      "users" : {
        "2" : 8,
        "1" : 5,
        "0" : 5
      }
    },
    "latestOffset" : {
      "users" : {
        "2" : 8,
        "1" : 5,
        "0" : 5
      }
    },
    "numInputRows" : 1,
    "inputRowsPerSecond" : 90.90909090909092,
    "processedRowsPerSecond" : 0.5858230814294083,
    "metrics" : {
      "avgOffsetsBehindLatest" : "0.0",
      "maxOffsetsBehindLatest" : "0",
      "minOffsetsBehindLatest" : "0"
    }
  } ],
  "sink" : {
    "description" : "DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]",
    "numOutputRows" : -1
  }
}
Query made progress: {
  "id" : "24ff8275-0172-4aae-96a7-aff88a4882e7",
  "runId" : "808e2e6e-d777-4809-8f11-1c8a75135883",
  "name" : "user-kafka-stream",
  "timestamp" : "2025-02-08T04:31:21.700Z",
  "batchId" : 19,
  "numInputRows" : 1,
  "inputRowsPerSecond" : 90.90909090909092,
  "processedRowsPerSecond" : 0.6506180871828238,
  "durationMs" : {
    "addBatch" : 1492,
    "commitOffsets" : 22,
    "getBatch" : 0,
    "latestOffset" : 0,
    "queryPlanning" : 4,
    "triggerExecution" : 1537,
    "walCommit" : 18
  },
  "stateOperators" : [ ],
  "sources" : [ {
    "description" : "KafkaV2[Subscribe[users]]",
    "startOffset" : {
      "users" : {
        "2" : 8,
        "1" : 5,
        "0" : 5
      }
    },
    "endOffset" : {
      "users" : {
        "2" : 9,
        "1" : 5,
        "0" : 5
      }
    },
    "latestOffset" : {
      "users" : {
        "2" : 9,
        "1" : 5,
        "0" : 5
      }
    },
    "numInputRows" : 1,
    "inputRowsPerSecond" : 90.90909090909092,
    "processedRowsPerSecond" : 0.6506180871828238,
    "metrics" : {
      "avgOffsetsBehindLatest" : "0.0",
      "maxOffsetsBehindLatest" : "0",
      "minOffsetsBehindLatest" : "0"
    }
  } ],
  "sink" : {
    "description" : "DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]",
    "numOutputRows" : -1
  }
}
Query made progress: {
  "id" : "24ff8275-0172-4aae-96a7-aff88a4882e7",
  "runId" : "808e2e6e-d777-4809-8f11-1c8a75135883",
  "name" : "user-kafka-stream",
  "timestamp" : "2025-02-08T04:31:31.697Z",
  "batchId" : 20,
  "numInputRows" : 1,
  "inputRowsPerSecond" : 83.33333333333333,
  "processedRowsPerSecond" : 0.5162622612287042,
  "durationMs" : {
    "addBatch" : 1895,
    "commitOffsets" : 19,
    "getBatch" : 0,
    "latestOffset" : 0,
    "queryPlanning" : 5,
    "triggerExecution" : 1937,
    "walCommit" : 18
  },
  "stateOperators" : [ ],
  "sources" : [ {
    "description" : "KafkaV2[Subscribe[users]]",
    "startOffset" : {
      "users" : {
        "2" : 9,
        "1" : 5,
        "0" : 5
      }
    },
    "endOffset" : {
      "users" : {
        "2" : 9,
        "1" : 6,
        "0" : 5
      }
    },
    "latestOffset" : {
      "users" : {
        "2" : 9,
        "1" : 6,
        "0" : 5
      }
    },
    "numInputRows" : 1,
    "inputRowsPerSecond" : 83.33333333333333,
    "processedRowsPerSecond" : 0.5162622612287042,
    "metrics" : {
      "avgOffsetsBehindLatest" : "0.0",
      "maxOffsetsBehindLatest" : "0",
      "minOffsetsBehindLatest" : "0"
    }
  } ],
  "sink" : {
    "description" : "DeltaSink[/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/delta_lake/monitor-streams/users]",
    "numOutputRows" : -1
  }
}
......

 

After running query.stop(), the listener output updated:

......
Query with id 24ff8275-0172-4aae-96a7-aff88a4882e7 terminated normally
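
Once monitoring is finished, the listener can also be detached so it stops receiving events (a minimal sketch, assuming the listener instance registered above):

# Deregister the custom listener from the streaming query manager
spark.streams.removeListener(listener)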

 

spark.stop()

 
