ZhangZhihui's Blog  

 

 

import duckdb


records = duckdb.read_csv("data/C11/Pedestrian_Counting_System_Monthly_counts_per_hour_may_2009_to_14_dec_2022.csv")
records.show(max_width=200).limit(10)

 

复制代码
┌─────────┬───────────────────────────────┬───────┬──────────┬───────┬──────────┬───────┬───────────┬───────────────────────────────┬───────────────┐
│   ID    │           Date_Time           │ Year  │  Month   │ Mdate │   Day    │ Time  │ Sensor_ID │          Sensor_Name          │ Hourly_Counts │
│  int64  │            varchar            │ int64 │ varchar  │ int64 │ varchar  │ int64 │   int64   │            varchar            │     int64     │
├─────────┼───────────────────────────────┼───────┼──────────┼───────┼──────────┼───────┼───────────┼───────────────────────────────┼───────────────┤
│ 2887628 │ November 01, 2019 05:00:00 PM │  2019 │ November │     1 │ Friday   │    17 │        34 │ Flinders St-Spark La          │           300 │
│ 2887629 │ November 01, 2019 05:00:00 PM │  2019 │ November │     1 │ Friday   │    17 │        39 │ Alfred Place                  │           604 │
│ 2887630 │ November 01, 2019 05:00:00 PM │  2019 │ November │     1 │ Friday   │    17 │        37 │ Lygon St (East)               │           216 │
│ 2887631 │ November 01, 2019 05:00:00 PM │  2019 │ November │     1 │ Friday   │    17 │        40 │ Lonsdale St-Spring St (West)  │           627 │
│ 2887632 │ November 01, 2019 05:00:00 PM │  2019 │ November │     1 │ Friday   │    17 │        36 │ Queen St (West)               │           774 │
│ 2887633 │ November 01, 2019 05:00:00 PM │  2019 │ November │     1 │ Friday   │    17 │        29 │ St Kilda Rd-Alexandra Gardens │           644 │
│ 2887634 │ November 01, 2019 05:00:00 PM │  2019 │ November │     1 │ Friday   │    17 │        42 │ Grattan St-Swanston St (West) │           453 │
│ 2887635 │ November 01, 2019 05:00:00 PM │  2019 │ November │     1 │ Friday   │    17 │        43 │ Monash Rd-Swanston St (West)  │           387 │
│ 2887636 │ November 01, 2019 05:00:00 PM │  2019 │ November │     1 │ Friday   │    17 │        44 │ Tin Alley-Swanston St (West)  │            27 │
│ 2887637 │ November 01, 2019 05:00:00 PM │  2019 │ November │     1 │ Friday   │    17 │        35 │ Southbank                     │          2691 │
│    ·    │               ·               │    ·  │    ·     │     · │   ·      │     · │         · │     ·                         │            ·  │
│    ·    │               ·               │    ·  │    ·     │     · │   ·      │     · │         · │     ·                         │            ·  │
│    ·    │               ·               │    ·  │    ·     │     · │   ·      │     · │         · │     ·                         │            ·  │
│ 2897597 │ November 09, 2019 10:00:00 AM │  2019 │ November │     9 │ Saturday │    10 │        27 │ QV Market-Peel St             │           371 │
│ 2897598 │ November 09, 2019 10:00:00 AM │  2019 │ November │     9 │ Saturday │    10 │        28 │ The Arts Centre               │          1188 │
│ 2897599 │ November 09, 2019 10:00:00 AM │  2019 │ November │     9 │ Saturday │    10 │        31 │ Lygon St (West)               │           229 │
│ 2897600 │ November 09, 2019 10:00:00 AM │  2019 │ November │     9 │ Saturday │    10 │        30 │ Lonsdale St (South)           │           391 │
│ 2897601 │ November 09, 2019 10:00:00 AM │  2019 │ November │     9 │ Saturday │    10 │        34 │ Flinders St-Spark La          │           111 │
│ 2897602 │ November 09, 2019 10:00:00 AM │  2019 │ November │     9 │ Saturday │    10 │        37 │ Lygon St (East)               │           133 │
│ 2897603 │ November 09, 2019 10:00:00 AM │  2019 │ November │     9 │ Saturday │    10 │        40 │ Lonsdale St-Spring St (West)  │           154 │
│ 2897604 │ November 09, 2019 10:00:00 AM │  2019 │ November │     9 │ Saturday │    10 │        36 │ Queen St (West)               │           249 │
│ 2897605 │ November 09, 2019 10:00:00 AM │  2019 │ November │     9 │ Saturday │    10 │        29 │ St Kilda Rd-Alexandra Gardens │           448 │
│ 2897606 │ November 09, 2019 10:00:00 AM │  2019 │ November │     9 │ Saturday │    10 │        42 │ Grattan St-Swanston St (West) │           288 │
├─────────┴───────────────────────────────┴───────┴──────────┴───────┴──────────┴───────┴───────────┴───────────────────────────────┴───────────────┤
│ ? rows (>9999 rows, 20 shown)                                                                                                          10 columns │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
复制代码

 

records = duckdb.read_csv(
    "data/C11/Pedestrian_Counting_System_Monthly_counts_per_hour_may_2009_to_14_dec_2022.csv",
    dtype={"Date_Time": "TIMESTAMP"},
    timestamp_format="%B %d, %Y %H:%M:%S %p",
)

 

records.limit(5).show(max_width=200)

 

复制代码
┌─────────┬─────────────────────┬───────┬──────────┬───────┬─────────┬───────┬───────────┬──────────────────────────────┬───────────────┐
│   ID    │      Date_Time      │ Year  │  Month   │ Mdate │   Day   │ Time  │ Sensor_ID │         Sensor_Name          │ Hourly_Counts │
│  int64  │      timestamp      │ int64 │ varchar  │ int64 │ varchar │ int64 │   int64   │           varchar            │     int64     │
├─────────┼─────────────────────┼───────┼──────────┼───────┼─────────┼───────┼───────────┼──────────────────────────────┼───────────────┤
│ 2887628 │ 2019-11-01 17:00:00 │  2019 │ November │     1 │ Friday  │    17 │        34 │ Flinders St-Spark La         │           300 │
│ 2887629 │ 2019-11-01 17:00:00 │  2019 │ November │     1 │ Friday  │    17 │        39 │ Alfred Place                 │           604 │
│ 2887630 │ 2019-11-01 17:00:00 │  2019 │ November │     1 │ Friday  │    17 │        37 │ Lygon St (East)              │           216 │
│ 2887631 │ 2019-11-01 17:00:00 │  2019 │ November │     1 │ Friday  │    17 │        40 │ Lonsdale St-Spring St (West) │           627 │
│ 2887632 │ 2019-11-01 17:00:00 │  2019 │ November │     1 │ Friday  │    17 │        36 │ Queen St (West)              │           774 │
└─────────┴─────────────────────┴───────┴──────────┴───────┴─────────┴───────┴───────────┴──────────────────────────────┴───────────────┘
复制代码

 

 

Before loading our dataset into an on-disk database for analysis, we can also consider whether there are any data transformations we may want to apply.

transformed = records.select("* EXCLUDE ID").sort("Date_Time")

 

transformed.limit(5).show(max_width=200)

 

复制代码
┌─────────────────────┬───────┬─────────┬───────┬─────────┬───────┬───────────┬───────────────────────────────────┬───────────────┐
│      Date_Time      │ Year  │  Month  │ Mdate │   Day   │ Time  │ Sensor_ID │            Sensor_Name            │ Hourly_Counts │
│      timestamp      │ int64 │ varchar │ int64 │ varchar │ int64 │   int64   │              varchar              │     int64     │
├─────────────────────┼───────┼─────────┼───────┼─────────┼───────┼───────────┼───────────────────────────────────┼───────────────┤
│ 2009-05-01 00:00:00 │  2009 │ May     │     1 │ Friday  │     0 │         1 │ Bourke Street Mall (North)        │            53 │
│ 2009-05-01 00:00:00 │  2009 │ May     │     1 │ Friday  │     0 │         2 │ Bourke Street Mall (South)        │            52 │
│ 2009-05-01 00:00:00 │  2009 │ May     │     1 │ Friday  │     0 │         4 │ Town Hall (West)                  │           209 │
│ 2009-05-01 00:00:00 │  2009 │ May     │     1 │ Friday  │     0 │         5 │ Princes Bridge                    │           157 │
│ 2009-05-01 00:00:00 │  2009 │ May     │     1 │ Friday  │     0 │         6 │ Flinders Street Station Underpass │           139 │
└─────────────────────┴───────┴─────────┴───────┴─────────┴───────┴───────────┴───────────────────────────────────┴───────────────┘
复制代码

 

复制代码
with duckdb.connect("pedestrian.duckdb") as conn:
    result = (
        conn.read_csv(
            "data/C11/Pedestrian_Counting_System_Monthly_counts_per_hour_may_2009_to_14_dec_2022.csv",
            dtype={"Date_Time": "TIMESTAMP"},
            timestamp_format="%B %d, %Y %H:%M:%S %p",
        )
        .select("* EXCLUDE ID")
        .sort("Date_Time")
    )
    result.to_table("pedestrian_counts")
复制代码

 

conn = duckdb.connect("pedestrian.duckdb")
conn.sql(
    """
    SELECT sum(Hourly_Counts) AS Total_Counts
    FROM pedestrian_counts
    WHERE Year = 2022 AND Sensor_Name = 'Melbourne Central'
    """
)

 

┌──────────────┐
│ Total_Counts │
│    int128    │
├──────────────┤
│      6897406 │
└──────────────┘

 

conn.close()

 

%load_ext sql

 

conn = duckdb.connect()
%sql conn --alias duckdb

 

conn.close()

 

In our context, we want to work with the database we created in the previous section; so, let’s configure JupySQL so that it uses a new connection to our on-disk pedestrian.duckdb database while also giving it an appropriate alias:

conn = duckdb.connect("pedestrian.duckdb")
%sql conn --alias pedestrian.duckdb

 

%config SqlMagic.autopandas = True

 

 

%%sql
SELECT sum(Hourly_Counts) AS Total_Counts
FROM pedestrian_counts
WHERE Year = 2022 AND Sensor_Name = 'Melbourne Central'

 

type(_)

# pandas.core.frame.DataFrame

 

%%sql sensors_2022_df <<
SELECT Sensor_Name, sum(Hourly_Counts)::BIGINT AS Total_Counts
FROM pedestrian_counts
WHERE Year = 2022
GROUP BY Sensor_Name
ORDER BY Total_Counts DESC

 

sensors_2022_df.head(10)

 

 

sensors_2022_df.dtypes

 

Sensor_Name     object
Total_Counts     int64
dtype: object

In addition to confirming that the Total Counts column of our dataframe has an int64 data type, we can also see that the Sensor Name column contains the dtype object. While this can be a little misleading at first, this is pandas’ default way of representing string values and is therefore appropriate for this column.

 

import plotly.express as px

With the Plotly Express module loaded, let’s see it in action by using the px.bar() function to create a bar chart of the top 10 sensors by traffic in 2022 while using the sensors_2022_df dataframe we created in the previous section:

figure = px.bar(
    sensors_2022_df.head(10),
    x="Sensor_Name",
    y="Total_Counts",
    height=500,
    title="Top 10 sensors by traffic for 2022",
)
figure

 

 

Let’s make one more visualization with Plotly Express. This time, we’ll visualize the number of active sensors across each year using a line plot. Plotly provides this functionality through the px.line() function. First, we’ll need to query our database to get the distinct number of sensor names seen across each year, which we can do with the following DuckDB query:

%%sql sensor_years_df <<
SELECT Year, COUNT(DISTINCT Sensor_Name) AS Total_Sensors
FROM pedestrian_counts
GROUP BY Year
ORDER BY Year

 

sensor_years_df.head(5)

 

 

复制代码
figure = px.line(
    sensor_years_df,
    x="Year",
    y="Total_Sensors",
    markers=True,
    height=500,
    title="Total number of active sensors by year"
)
figure
复制代码

 

figure.update_layout(xaxis={"dtick": 1}, title={"x": 0.5})

 

figure.update_layout(xaxis_dtick=1, title_x=0.5)

In this short introduction to making visualizations with Plotly, we’ve focused on two Plotly Express functions, as well as Plotly chart customizations via a handful of figure-layout properties; however, this is by no means a comprehensive overview of using the library. Two resources we’d recommend consulting if you want to further explore using Plotly in Python are as follows:
• The Plotly Express guide: https://plotly.com/python/plotly-express
• The Plotly Python Figure reference: https://plotly.com/python/reference/index

 

%%sql year_counts_df <<
SELECT Year, sum(Hourly_Counts)::BIGINT AS Total_Counts
FROM pedestrian_counts
GROUP BY Year
ORDER BY Year

 

px.line(
    year_counts_df,
    x="Year",
    y="Total_Counts",
    markers=True,
    height=500,
    title="Total pedestrian counts by year",
)

 

 

%sql SELECT count(DISTINCT Year) FROM pedestrian_counts

 

 

%%sql
CREATE OR REPLACE TABLE common_sensors AS
SELECT Sensor_Name
FROM pedestrian_counts
GROUP BY Sensor_Name
HAVING COUNT(DISTINCT Year) = 14;

Ordinarily, when issuing a CREATE TABLE statement in DuckDB, no results are returned. When using JupySQL to submit database-modifying SQL, however, it returns the number of rows that were inserted into the database. This means we’ll see the following dataframe as output:

 

%%sql year_counts_filtered_df <<
SELECT Year, sum(Hourly_Counts)::BIGINT AS Total_Counts
FROM pedestrian_counts
WHERE Sensor_Name IN (FROM common_sensors)
GROUP BY Year
ORDER BY Year

 

px.line(
    year_counts_filtered_df,
    x="Year",
    y="Total_Counts",
    markers=True,
    height=500,
    title="Yearly traffic for sensors active all years",
)

 

 

复制代码
%%sql year_month_counts_df <<
SELECT Year,
    Month,
    month(Date_Time) AS Month_Num,
    sum(Hourly_Counts)::BIGINT AS Total_Counts
FROM pedestrian_counts
WHERE Year IN (2019, 2020, 2021)
  AND Sensor_Name in (FROM common_sensors)
GROUP BY Year, Month, Month_Num
ORDER BY Year, Month_Num
复制代码

 

year_month_counts_df.head(15)

 

 

复制代码
px.line(
    year_month_counts_df,
    x="Month",
    y="Total_Counts",
    color="Year",
    symbol="Year",
    symbol_sequence=["square", "diamond", "circle"],
    markers=True,
    height=500,
    title="Monthly traffic for sensors active 2019-2021",
).update_traces(marker_size=8)
复制代码

 

 

 

%%sql sensor_2020_df <<
SELECT Hourly_Counts, Date_Time
FROM pedestrian_counts
WHERE Sensor_Name = 'Flinders La-Swanston St (West)' AND Year = 2020

Then, we’ll use the px.line() function to create a time series plot of all readings for that year:

 

%%sql multi_sensor_df <<
SELECT Sensor_Name, Hourly_Counts, Date_Time
FROM pedestrian_counts
WHERE Sensor_Name IN (
    'Flinders St-Spark La',
    'Bourke Street Mall (North)',
    'Southern Cross Station'
) AND Year = 2019 AND Month = 'September'

 

复制代码
px.line(
    multi_sensor_df,
    y="Hourly_Counts",
    x="Date_Time",
    facet_col="Sensor_Name",
    facet_col_wrap=1,
    title="Hourly pedestrian traffic for December 2019",
    height=800,
).update_layout(yaxis_fixedrange=True)
复制代码

 

 

%%sql bourke_daily_df <<
SELECT Year,
    Date_Time::DATE AS Date,
    sum(Hourly_Counts)::BIGINT AS Daily_Counts,
 FROM pedestrian_counts
WHERE Sensor_Name = 'Bourke Street Mall (North)'
  AND Year IN (2019, 2020, 2021)
GROUP BY Year, Date

 

bourke_daily_df.head()

 

 

px.box(
    bourke_daily_df,
    x="Year",
    y="Daily_Counts",
    points="all",
    height=600,
    title="Distributions of daily traffic for a sensor",
)

 

 

posted on   ZhangZhihuiAAA  阅读(3)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
历史上的今天:
2024-01-16 Redis - List Use Cases
2024-01-16 Redis - HyperLogLog Use Cases
2024-01-16 Redis - SORT
 
点击右上角即可分享
微信分享提示