ZhangZhihui's Blog  

 

     Data Wrangling

 

CREATE OR REPLACE TABLE web_log_text (raw_text VARCHAR);

 

COPY web_log_text FROM 'access.log' (DELIM '');

 

SELECT regexp_extract(raw_text, '^[0-9\.]*') AS client_ip
FROM web_log_text
LIMIT 3;

 

CREATE OR REPLACE TABLE web_log_split AS
SELECT regexp_extract(raw_text, '^[0-9\.]*') AS client_ip,
    regexp_extract(raw_text, '\[(.*)\]', 1) AS http_date_text,
    regexp_extract(raw_text, '"([A-Z]*) ', 1) AS http_method,
    regexp_extract(raw_text, '([a-zA-Z\-]*)"$', 1) AS http_lang
FROM web_log_text;

 

     Data Enrichment

 

SELECT client_ip,
    strptime(http_date_text, '%d/%b/%Y:%H:%M:%S %z') AS http_date,
    http_method,
    http_lang,
FROM web_log_split;

 

ALTER TABLE web_log_split
ADD COLUMN http_date TIMESTAMP WITH TIME ZONE;

 

UPDATE web_log_split
SET http_date = strptime(http_date_text, '%d/%b/%Y:%H:%M:%S %z');

 

     data enrichment

CREATE OR REPLACE TABLE language_iso (
    lang_iso VARCHAR PRIMARY KEY,
    language_name VARCHAR
);

 

INSERT INTO language_iso
SELECT *
FROM read_csv('language_iso.csv');

 

SELECT wls.http_date,
    wls.http_lang,
    lang.language_name
FROM web_log_split AS wls
LEFT OUTER JOIN language_iso AS lang ON (wls.http_lang = lang.lang_iso);

 

CREATE OR REPLACE VIEW web_log_view AS
SELECT wls.client_ip,
    strptime(wls.http_date_text, '%d/%b/%Y:%H:%M:%S %z') AS http_date,
    wls.http_method,
    wls.http_lang,
    lang.language_name
FROM web_log_split AS wls
LEFT OUTER JOIN language_iso lang ON (wls.http_lang = lang.lang_iso);

 

 

DROP VIEW IF EXISTS web_log_view;
DROP TABLE IF EXISTS language_iso;
DROP TABLE IF EXISTS web_log_split;
DROP TABLE IF EXISTS web_log_text;

 

 

复制代码
SELECT http_date,
    time_bucket(interval '1 day', http_date) AS day
FROM web_log_view;

┌──────────────────────────┬──────────────────────────┐
│ http_date                │ day                      │
│ timestamp with time zone │ timestamp with time zone │
├──────────────────────────┼──────────────────────────┤
│ 2023-01-27 03:58:37+112023-01-26 11:00:00+11   │
│ 2023-01-20 03:58:39+112023-01-19 11:00:00+11   │
│ 2023-01-20 03:58:39+112023-01-19 11:00:00+11
复制代码

 

复制代码
WITH web_cte AS (
    SELECT client_ip,
        time_bucket(interval '1 day', http_date) AS day,
        language_name
    FROM web_log_view
)
SELECT day,
    language_name,
    count(*) AS count
FROM web_cte
GROUP BY day, language_name
ORDER BY day, count(*) DESC;
复制代码

 

WITH web_cte AS (
    SELECT time_bucket(interval '1 day', http_date) AS day,
        language_name
    FROM web_log_view
) 
PIVOT web_cte ON language_name USING count(*);

 

 

https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

 

D CREATE OR REPLACE TABLE trips AS
  SELECT *
  FROM read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet');
100% ▕████████████████████████████████████████████████████████████▏

 

This query will take a few seconds to run, as the 45 MB Parquet file needs to be downloaded and then processed. Once that’s done, have a look at the first few records in the trips table:

复制代码
D SELECT tpep_pickup_datetime,
      trip_distance,
      fare_amount,
      tip_amount,
      PULocationID,
      DOLocationID
  FROM trips
  LIMIT 10;
┌──────────────────────┬───────────────┬─────────────┬────────────┬──────────────┬──────────────┐
│ tpep_pickup_datetime │ trip_distance │ fare_amount │ tip_amount │ PULocationID │ DOLocationID │
│      timestampdoubledoubledouble   │    int64     │    int64     │
├──────────────────────┼───────────────┼─────────────┼────────────┼──────────────┼──────────────┤
│ 2023-01-01 00:32:100.979.30.0161141 │
│ 2023-01-01 00:55:081.17.94.043237 │
│ 2023-01-01 00:25:042.5114.915.048238 │
│ 2023-01-01 00:03:481.912.10.01387 │
│ 2023-01-01 00:10:291.4311.43.2810779 │
│ 2023-01-01 00:50:341.8412.810.0161137 │
│ 2023-01-01 00:09:221.6612.13.42239143 │
│ 2023-01-01 00:27:1211.745.710.74142200 │
│ 2023-01-01 00:21:442.9517.75.68164236 │
│ 2023-01-01 00:39:423.0114.90.0141107 │
├──────────────────────┴───────────────┴─────────────┴────────────┴──────────────┴──────────────┤
│ 10 rows                                                                             6 columns │
└───────────────────────────────────────────────────────────────────────────────────────────────┘
复制代码

 

D CREATE OR REPLACE TABLE locations (
      LocationID int PRIMARY KEY,
      Borough VARCHAR,
      Zone VARCHAR,
      service_zone VARCHAR
  );

 

We can now download the taxi+_zone_lookup.csv file to map a zone ID with a zone name and an associated borough name and insert the data into the locations table with the read_csv DuckDB function:

D INSERT INTO locations(LocationID, Borough, Zone, service_zone)
  SELECT LocationID,
      Borough,
      Zone,
      service_zone
  FROM read_csv('https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv');
HTTP Error: HTTP GET error on 'https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv' (HTTP 403)

The file can be manually downloaded in the browser.

D INSERT INTO locations(LocationID, Borough, Zone, service_zone)
  SELECT LocationID,
      Borough,
      Zone,
      service_zone
  FROM read_csv('taxi+_zone_lookup.csv');

 

复制代码
D SELECT LocationID, Borough, Zone
  FROM locations
  LIMIT 5;
┌────────────┬───────────────┬─────────────────────────┐
│ LocationID │    Borough    │          Zone           │
│   int32    │    varcharvarchar         │
├────────────┼───────────────┼─────────────────────────┤
│          1 │ EWR           │ Newark Airport          │
│          2 │ Queens        │ Jamaica Bay             │
│          3 │ Bronx         │ Allerton/Pelham Gardens │
│          4 │ Manhattan     │ Alphabet City           │
│          5 │ Staten Island │ Arden Heights           │
└────────────┴───────────────┴─────────────────────────┘
复制代码

 

D CREATE OR REPLACE TABLE trips_with_location AS
  SELECT t.*,
      l_pu.zone AS pick_up_zone,
      l_do.zone AS drop_off_zone
  FROM trips AS t
      LEFT JOIN locations AS l_pu ON l_pu.LocationID = t.PULocationID
      LEFT JOIN locations AS l_do ON l_do.LocationID = t.DOLocationID;

 

复制代码
D SELECT tpep_pickup_datetime,
      pick_up_zone,
      drop_off_zone,
      trip_distance
  FROM trips_with_location
  LIMIT 5;
┌──────────────────────┬───────────────────────┬───────────────────────┬───────────────┐
│ tpep_pickup_datetime │     pick_up_zone      │     drop_off_zone     │ trip_distance │
│      timestampvarcharvarchardouble     │
├──────────────────────┼───────────────────────┼───────────────────────┼───────────────┤
│ 2023-01-01 00:55:08  │ Central Park          │ Upper East Side South │           1.1 │
│ 2023-01-01 00:25:04  │ Clinton East          │ Upper West Side North │          2.51 │
│ 2023-01-01 00:03:48  │ LaGuardia Airport     │ Astoria               │           1.9 │
│ 2023-01-01 00:10:29  │ Gramercy              │ East Village          │          1.43 │
│ 2023-01-01 00:09:22Upper West Side South │ Lincoln Square West   │          1.66 │
└──────────────────────┴───────────────────────┴───────────────────────┴───────────────┘
复制代码

 

复制代码
D SELECT time_bucket(interval '1 day', tpep_pickup_datetime) AS day_of,
      count(*) AS num_trips,
      min(fare_amount) AS fare_min,
      max(fare_amount) AS fare_max,
      avg(fare_amount) AS fare_avg,
      avg(tip_amount) AS tip_avg,
      avg(
          CASE
              WHEN Payment_type = 1 THEN tip_amount / fare_amount
          END
      ) * 100 AS cc_tip_avg_pct
  FROM trips_with_location
  WHERE tpep_pickup_datetime BETWEEN '2023-01-20 00:00:00' AND '2023-01-29 23:59:59'
      AND fare_amount > 0
  GROUP BY 1
  ORDER BY 1;
┌─────────────────────┬───────────┬──────────┬──────────┬────────────────────┬────────────────────┬────────────────────┐
│       day_of        │ num_trips │ fare_min │ fare_max │      fare_avg      │      tip_avg       │   cc_tip_avg_pct   │
│      timestamp      │   int64   │  doubledoubledoubledoubledouble       │
├─────────────────────┼───────────┼──────────┼──────────┼────────────────────┼────────────────────┼────────────────────┤
│ 2023-01-20 00:00:001085310.01495.118.2332805373580633.37872727607790125.65019322256883 │
│ 2023-01-21 00:00:001110170.01518.217.2810485781457963.129110406514349624.720251716767162 │
│ 2023-01-22 00:00:00888630.01650.019.2616376894772653.46583403666306624.63108858908728 │
│ 2023-01-23 00:00:00889920.01550.018.5187673049264673.42212333692900725.861163077365763 │
│ 2023-01-24 00:00:001030040.011160.117.79141877985323.35354617296408725.807076648435856 │
│ 2023-01-25 00:00:001083280.01400.017.337500646185983.318054519607050327.878114736536613 │
│ 2023-01-26 00:00:001139790.01600.018.3364766316600483.47903929671256525.71778812195716 │
│ 2023-01-27 00:00:001107200.01500.018.0672556900290253.380060242051744325.68649966172753 │
│ 2023-01-28 00:00:001106530.01400.017.3067356510894833.14689109197216124.52911401808041 │
│ 2023-01-29 00:00:00873730.01598.719.312067457910343.486581667105298724.625264998695993 │
├─────────────────────┴───────────┴──────────┴──────────┴────────────────────┴────────────────────┴────────────────────┤
│ 10 rows                                                                                                    7 columns │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
复制代码

 

 

 

复制代码
D WITH cte AS (
      SELECT twl.*,
          max(fare_amount) OVER (
              PARTITION BY time_bucket(INTERVAL '1 day', tpep_pickup_datetime)
          ) AS max_day_fare_amount
      FROM trips_with_location AS twl
  )
  SELECT tpep_pickup_datetime,
      pick_up_zone,
      drop_off_zone,
      fare_amount
  FROM cte
  WHERE fare_amount = max_day_fare_amount
      AND tpep_pickup_datetime BETWEEN '2023-01-20 00:00:00' AND '2023-01-29 23:59:59'
  ORDER BY tpep_pickup_datetime;
┌──────────────────────┬───────────────────┬───────────────────┬─────────────┐
│ tpep_pickup_datetime │   pick_up_zone    │   drop_off_zone   │ fare_amount │
│      timestampvarcharvarchardouble    │
├──────────────────────┼───────────────────┼───────────────────┼─────────────┤
│ 2023-01-20 15:39:16  │ JFK Airport       │ Outside of NYC    │       495.1 │
│ 2023-01-21 14:44:42  │ JFK Airport       │ Outside of NYC    │       518.2 │
│ 2023-01-22 23:24:55  │ Clinton East      │ Clinton East      │       650.0 │
│ 2023-01-23 23:01:23  │ Murray Hill       │ Outside of NYC    │       550.0 │
│ 2023-01-24 12:43:44  │ JFK Airport       │ Outside of NYC    │      1160.1 │
│ 2023-01-25 10:23:25  │ JFK Airport       │ Outside of NYC    │       400.0 │
│ 2023-01-26 10:28:15  │ LaGuardia Airport │ LaGuardia Airport │       600.0 │
│ 2023-01-27 01:42:45  │ Outside of NYC    │ Outside of NYC    │       500.0 │
│ 2023-01-27 12:07:36  │ Jamaica           │ Jamaica           │       500.0 │
│ 2023-01-28 23:00:50  │ Flatiron          │ Flatiron          │       400.0 │
│ 2023-01-29 14:46:13  │ Clinton West      │ Outside of NYC    │       598.7 │
├──────────────────────┴───────────────────┴───────────────────┴─────────────┤
│ 11 rows                                                          4 columns │
└────────────────────────────────────────────────────────────────────────────┘
复制代码

 

posted on   ZhangZhihuiAAA  阅读(6)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
历史上的今天:
2022-01-09 Matplotlib - Figure Layout
2022-01-09 Matplotlib - Tick labels' rotation, size and alignment
 
点击右上角即可分享
微信分享提示