Data Wrangling
CREATE OR REPLACE TABLE web_log_text (raw_text VARCHAR);
COPY web_log_text FROM 'access.log' (DELIM '');
SELECT regexp_extract(raw_text, '^[0-9\.]*') AS client_ip FROM web_log_text LIMIT 3;
CREATE OR REPLACE TABLE web_log_split AS SELECT regexp_extract(raw_text, '^[0-9\.]*') AS client_ip, regexp_extract(raw_text, '\[(.*)\]', 1) AS http_date_text, regexp_extract(raw_text, '"([A-Z]*) ', 1) AS http_method, regexp_extract(raw_text, '([a-zA-Z\-]*)"$', 1) AS http_lang FROM web_log_text;
Data Enrichment
SELECT client_ip, strptime(http_date_text, '%d/%b/%Y:%H:%M:%S %z') AS http_date, http_method, http_lang, FROM web_log_split;
ALTER TABLE web_log_split ADD COLUMN http_date TIMESTAMP WITH TIME ZONE;
UPDATE web_log_split SET http_date = strptime(http_date_text, '%d/%b/%Y:%H:%M:%S %z');
data enrichment
CREATE OR REPLACE TABLE language_iso ( lang_iso VARCHAR PRIMARY KEY, language_name VARCHAR );
INSERT INTO language_iso SELECT * FROM read_csv('language_iso.csv');
SELECT wls.http_date, wls.http_lang, lang.language_name FROM web_log_split AS wls LEFT OUTER JOIN language_iso AS lang ON (wls.http_lang = lang.lang_iso);
CREATE OR REPLACE VIEW web_log_view AS SELECT wls.client_ip, strptime(wls.http_date_text, '%d/%b/%Y:%H:%M:%S %z') AS http_date, wls.http_method, wls.http_lang, lang.language_name FROM web_log_split AS wls LEFT OUTER JOIN language_iso lang ON (wls.http_lang = lang.lang_iso);
DROP VIEW IF EXISTS web_log_view; DROP TABLE IF EXISTS language_iso; DROP TABLE IF EXISTS web_log_split; DROP TABLE IF EXISTS web_log_text;
SELECT http_date, time_bucket(interval '1 day', http_date) AS day FROM web_log_view; ┌──────────────────────────┬──────────────────────────┐ │ http_date │ day │ │ timestamp with time zone │ timestamp with time zone │ ├──────────────────────────┼──────────────────────────┤ │ 2023-01-27 03:58:37+11 │ 2023-01-26 11:00:00+11 │ │ 2023-01-20 03:58:39+11 │ 2023-01-19 11:00:00+11 │ │ 2023-01-20 03:58:39+11 │ 2023-01-19 11:00:00+11 │
WITH web_cte AS ( SELECT client_ip, time_bucket(interval '1 day', http_date) AS day, language_name FROM web_log_view ) SELECT day, language_name, count(*) AS count FROM web_cte GROUP BY day, language_name ORDER BY day, count(*) DESC;
WITH web_cte AS ( SELECT time_bucket(interval '1 day', http_date) AS day, language_name FROM web_log_view ) PIVOT web_cte ON language_name USING count(*);
https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf
D CREATE OR REPLACE TABLE trips AS SELECT * FROM read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'); 100% ▕████████████████████████████████████████████████████████████▏
This query will take a few seconds to run, as the 45 MB Parquet file needs to be downloaded and then processed. Once that’s done, have a look at the first few records in the trips table:
D SELECT tpep_pickup_datetime, trip_distance, fare_amount, tip_amount, PULocationID, DOLocationID FROM trips LIMIT 10; ┌──────────────────────┬───────────────┬─────────────┬────────────┬──────────────┬──────────────┐ │ tpep_pickup_datetime │ trip_distance │ fare_amount │ tip_amount │ PULocationID │ DOLocationID │ │ timestamp │ double │ double │ double │ int64 │ int64 │ ├──────────────────────┼───────────────┼─────────────┼────────────┼──────────────┼──────────────┤ │ 2023-01-01 00:32:10 │ 0.97 │ 9.3 │ 0.0 │ 161 │ 141 │ │ 2023-01-01 00:55:08 │ 1.1 │ 7.9 │ 4.0 │ 43 │ 237 │ │ 2023-01-01 00:25:04 │ 2.51 │ 14.9 │ 15.0 │ 48 │ 238 │ │ 2023-01-01 00:03:48 │ 1.9 │ 12.1 │ 0.0 │ 138 │ 7 │ │ 2023-01-01 00:10:29 │ 1.43 │ 11.4 │ 3.28 │ 107 │ 79 │ │ 2023-01-01 00:50:34 │ 1.84 │ 12.8 │ 10.0 │ 161 │ 137 │ │ 2023-01-01 00:09:22 │ 1.66 │ 12.1 │ 3.42 │ 239 │ 143 │ │ 2023-01-01 00:27:12 │ 11.7 │ 45.7 │ 10.74 │ 142 │ 200 │ │ 2023-01-01 00:21:44 │ 2.95 │ 17.7 │ 5.68 │ 164 │ 236 │ │ 2023-01-01 00:39:42 │ 3.01 │ 14.9 │ 0.0 │ 141 │ 107 │ ├──────────────────────┴───────────────┴─────────────┴────────────┴──────────────┴──────────────┤ │ 10 rows 6 columns │ └───────────────────────────────────────────────────────────────────────────────────────────────┘
D CREATE OR REPLACE TABLE locations ( LocationID int PRIMARY KEY, Borough VARCHAR, Zone VARCHAR, service_zone VARCHAR );
We can now download the taxi+_zone_lookup.csv file to map a zone ID with a zone name and an associated borough name and insert the data into the locations table with the read_csv DuckDB function:
D INSERT INTO locations(LocationID, Borough, Zone, service_zone) SELECT LocationID, Borough, Zone, service_zone FROM read_csv('https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv'); HTTP Error: HTTP GET error on 'https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv' (HTTP 403)
The file can be manually downloaded in the browser.
D INSERT INTO locations(LocationID, Borough, Zone, service_zone) SELECT LocationID, Borough, Zone, service_zone FROM read_csv('taxi+_zone_lookup.csv');
D SELECT LocationID, Borough, Zone FROM locations LIMIT 5; ┌────────────┬───────────────┬─────────────────────────┐ │ LocationID │ Borough │ Zone │ │ int32 │ varchar │ varchar │ ├────────────┼───────────────┼─────────────────────────┤ │ 1 │ EWR │ Newark Airport │ │ 2 │ Queens │ Jamaica Bay │ │ 3 │ Bronx │ Allerton/Pelham Gardens │ │ 4 │ Manhattan │ Alphabet City │ │ 5 │ Staten Island │ Arden Heights │ └────────────┴───────────────┴─────────────────────────┘
D CREATE OR REPLACE TABLE trips_with_location AS SELECT t.*, l_pu.zone AS pick_up_zone, l_do.zone AS drop_off_zone FROM trips AS t LEFT JOIN locations AS l_pu ON l_pu.LocationID = t.PULocationID LEFT JOIN locations AS l_do ON l_do.LocationID = t.DOLocationID;
D SELECT tpep_pickup_datetime, pick_up_zone, drop_off_zone, trip_distance FROM trips_with_location LIMIT 5; ┌──────────────────────┬───────────────────────┬───────────────────────┬───────────────┐ │ tpep_pickup_datetime │ pick_up_zone │ drop_off_zone │ trip_distance │ │ timestamp │ varchar │ varchar │ double │ ├──────────────────────┼───────────────────────┼───────────────────────┼───────────────┤ │ 2023-01-01 00:55:08 │ Central Park │ Upper East Side South │ 1.1 │ │ 2023-01-01 00:25:04 │ Clinton East │ Upper West Side North │ 2.51 │ │ 2023-01-01 00:03:48 │ LaGuardia Airport │ Astoria │ 1.9 │ │ 2023-01-01 00:10:29 │ Gramercy │ East Village │ 1.43 │ │ 2023-01-01 00:09:22 │ Upper West Side South │ Lincoln Square West │ 1.66 │ └──────────────────────┴───────────────────────┴───────────────────────┴───────────────┘
D SELECT time_bucket(interval '1 day', tpep_pickup_datetime) AS day_of, count(*) AS num_trips, min(fare_amount) AS fare_min, max(fare_amount) AS fare_max, avg(fare_amount) AS fare_avg, avg(tip_amount) AS tip_avg, avg( CASE WHEN Payment_type = 1 THEN tip_amount / fare_amount END ) * 100 AS cc_tip_avg_pct FROM trips_with_location WHERE tpep_pickup_datetime BETWEEN '2023-01-20 00:00:00' AND '2023-01-29 23:59:59' AND fare_amount > 0 GROUP BY 1 ORDER BY 1; ┌─────────────────────┬───────────┬──────────┬──────────┬────────────────────┬────────────────────┬────────────────────┐ │ day_of │ num_trips │ fare_min │ fare_max │ fare_avg │ tip_avg │ cc_tip_avg_pct │ │ timestamp │ int64 │ double │ double │ double │ double │ double │ ├─────────────────────┼───────────┼──────────┼──────────┼────────────────────┼────────────────────┼────────────────────┤ │ 2023-01-20 00:00:00 │ 108531 │ 0.01 │ 495.1 │ 18.233280537358063 │ 3.378727276077901 │ 25.65019322256883 │ │ 2023-01-21 00:00:00 │ 111017 │ 0.01 │ 518.2 │ 17.281048578145796 │ 3.1291104065143496 │ 24.720251716767162 │ │ 2023-01-22 00:00:00 │ 88863 │ 0.01 │ 650.0 │ 19.261637689477265 │ 3.465834036663066 │ 24.63108858908728 │ │ 2023-01-23 00:00:00 │ 88992 │ 0.01 │ 550.0 │ 18.518767304926467 │ 3.422123336929007 │ 25.861163077365763 │ │ 2023-01-24 00:00:00 │ 103004 │ 0.01 │ 1160.1 │ 17.7914187798532 │ 3.353546172964087 │ 25.807076648435856 │ │ 2023-01-25 00:00:00 │ 108328 │ 0.01 │ 400.0 │ 17.33750064618598 │ 3.3180545196070503 │ 27.878114736536613 │ │ 2023-01-26 00:00:00 │ 113979 │ 0.01 │ 600.0 │ 18.336476631660048 │ 3.479039296712565 │ 25.71778812195716 │ │ 2023-01-27 00:00:00 │ 110720 │ 0.01 │ 500.0 │ 18.067255690029025 │ 3.3800602420517443 │ 25.68649966172753 │ │ 2023-01-28 00:00:00 │ 110653 │ 0.01 │ 400.0 │ 17.306735651089483 │ 3.146891091972161 │ 24.52911401808041 │ │ 2023-01-29 00:00:00 │ 87373 │ 0.01 │ 598.7 │ 19.31206745791034 │ 3.4865816671052987 │ 24.625264998695993 │ ├─────────────────────┴───────────┴──────────┴──────────┴────────────────────┴────────────────────┴────────────────────┤ │ 10 rows 7 columns │ └──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
D WITH cte AS ( SELECT twl.*, max(fare_amount) OVER ( PARTITION BY time_bucket(INTERVAL '1 day', tpep_pickup_datetime) ) AS max_day_fare_amount FROM trips_with_location AS twl ) SELECT tpep_pickup_datetime, pick_up_zone, drop_off_zone, fare_amount FROM cte WHERE fare_amount = max_day_fare_amount AND tpep_pickup_datetime BETWEEN '2023-01-20 00:00:00' AND '2023-01-29 23:59:59' ORDER BY tpep_pickup_datetime; ┌──────────────────────┬───────────────────┬───────────────────┬─────────────┐ │ tpep_pickup_datetime │ pick_up_zone │ drop_off_zone │ fare_amount │ │ timestamp │ varchar │ varchar │ double │ ├──────────────────────┼───────────────────┼───────────────────┼─────────────┤ │ 2023-01-20 15:39:16 │ JFK Airport │ Outside of NYC │ 495.1 │ │ 2023-01-21 14:44:42 │ JFK Airport │ Outside of NYC │ 518.2 │ │ 2023-01-22 23:24:55 │ Clinton East │ Clinton East │ 650.0 │ │ 2023-01-23 23:01:23 │ Murray Hill │ Outside of NYC │ 550.0 │ │ 2023-01-24 12:43:44 │ JFK Airport │ Outside of NYC │ 1160.1 │ │ 2023-01-25 10:23:25 │ JFK Airport │ Outside of NYC │ 400.0 │ │ 2023-01-26 10:28:15 │ LaGuardia Airport │ LaGuardia Airport │ 600.0 │ │ 2023-01-27 01:42:45 │ Outside of NYC │ Outside of NYC │ 500.0 │ │ 2023-01-27 12:07:36 │ Jamaica │ Jamaica │ 500.0 │ │ 2023-01-28 23:00:50 │ Flatiron │ Flatiron │ 400.0 │ │ 2023-01-29 14:46:13 │ Clinton West │ Outside of NYC │ 598.7 │ ├──────────────────────┴───────────────────┴───────────────────┴─────────────┤ │ 11 rows 4 columns │ └────────────────────────────────────────────────────────────────────────────┘
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
2022-01-09 Matplotlib - Figure Layout
2022-01-09 Matplotlib - Tick labels' rotation, size and alignment