说明:
1.本文为个人学习笔记记录;
2.学习视频来源:https://space.bilibili.com/474347248/channel/detail?cid=143235
3.数据来源:唐国梁Tommy,为了方便志同道合的伙伴一起学习,我将数据上传到个人盘分享:
链接:https://pan.baidu.com/s/1beeeBv7eCLL7QjpoXin_AQ
提取码:0rrc
4.本文代码的运行环境为 PyCharm(原代码基于 Jupyter Notebook 实现)。
5.欢迎一起讨论学习:QQ:386825951
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # plotting
import seaborn as sns  # plotting
import sklearn  # machine learning

# Print wide tables on a single line and never hide columns.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

print("###############################step1: 加载数据集###########################################")
# step1: load the data sets (only the first 1,000,000 training rows are read)
train = pd.read_csv("train.csv", nrows=1000000)
test = pd.read_csv("test.csv")

print("##############################step2: 数据分析、清洗###########################################")
# step2: data analysis and cleaning
print(train.shape)
print(test.shape)
print(train.head())
print(test.head(10))
print(train.describe())
print("*" * 100)
print(test.describe())

print("****************************************1. 检查数据中是否有空值************************************************")
print("*" * 100)
print("统计空值的数量,按升序排序")
# Number of missing values per column, ascending.
print(train.isnull().sum().sort_values(ascending=True))
print(test.isnull().sum().sort_values(ascending=True))
# Drop every training row that has at least one missing value.
# (The original `train.isnull().any(1)` passes `axis` positionally, which
# pandas >= 2.0 rejects; `dropna` expresses the same thing directly.)
train.dropna(axis=0, how='any', inplace=True)
print(train.shape)

print("****************************************2. 检查车费这一列数据************************************************")
# A fare can never be negative.
# NOTE: the inspection expressions below were bare notebook cells; as a script
# their results would be silently discarded, so they are printed explicitly.
print(train['fare_amount'].describe())
# How many training rows have a negative fare.
print(Counter(train['fare_amount'] < 0))
# Drop the rows (not columns) with a negative fare.
train.drop(train[train['fare_amount'] < 0].index, axis=0, inplace=True)
# Check again: no negative fares should remain.
print(train['fare_amount'].describe())
# Histogram of fares below 100, split into 100 bins.
train[train.fare_amount < 100].fare_amount.hist(bins=100, figsize=(14, 3))
plt.xlabel("fare_$USD")
plt.title("Histogram")

print("****************************************3. 检查乘客passenger_count 这一列************************************************")
print(train['passenger_count'].describe())
# Rows with more than 6 passengers are treated as outliers and removed.
print(train[train['passenger_count'] > 6])
train.drop(train[train['passenger_count'] > 6].index, axis=0, inplace=True)


def _drop_out_of_range(df, column, low, high):
    """Drop, in place, the rows of ``df`` whose ``column`` is outside [low, high]."""
    invalid = (df[column] < low) | (df[column] > high)
    df.drop(df[invalid].index, axis=0, inplace=True)


print("****************************************4. 检查上车点的经度和纬度************************************************")
# Valid latitude range: -90..90; valid longitude range: -180..180.
print(train['pickup_latitude'].describe())
print(train[train['pickup_latitude'] < -90])
print(train[train['pickup_latitude'] > 90])
_drop_out_of_range(train, 'pickup_latitude', -90, 90)

print(train['pickup_longitude'].describe())
print(train[train['pickup_longitude'] < -180])
print(train[train['pickup_longitude'] > 180])
_drop_out_of_range(train, 'pickup_longitude', -180, 180)

print("****************************************5. 检查下车点的经度和纬度************************************************")
# Same treatment for the drop-off coordinates.
_drop_out_of_range(train, 'dropoff_latitude', -90, 90)
_drop_out_of_range(train, 'dropoff_longitude', -180, 180)

print("****************************************6. 可视化地图,清理一些离异值************************************************")
# Use the test set to get a feel for the area of interest, then remove the
# training points that fall outside a fixed bounding box.
# (1) min / max latitude in the test set
print("纬度最小值,纬度最大值")
print(
    min(test.pickup_latitude.min(), test.dropoff_latitude.min()),
    max(test.pickup_latitude.max(), test.dropoff_latitude.max()),
)
# (2) min / max longitude in the test set
print("经度最小值,经度最大值")
print(
    min(test.pickup_longitude.min(), test.dropoff_longitude.min()),
    max(test.pickup_longitude.max(), test.dropoff_longitude.max()),
)


# (3) keep only the points inside the bounding box
def select_within_boundingbox(df, BB):
    """Return a boolean mask of the rows whose pickup AND drop-off lie in ``BB``.

    ``BB`` is ``(min_longitude, max_longitude, min_latitude, max_latitude)``;
    all four bounds are inclusive.
    """
    min_lon, max_lon, min_lat, max_lat = BB[0], BB[1], BB[2], BB[3]
    return (
        (df.pickup_longitude >= min_lon) & (df.pickup_longitude <= max_lon)
        & (df.pickup_latitude >= min_lat) & (df.pickup_latitude <= max_lat)
        & (df.dropoff_longitude >= min_lon) & (df.dropoff_longitude <= max_lon)
        & (df.dropoff_latitude >= min_lat) & (df.dropoff_latitude <= max_lat)
    )


BB = (-74.5, -72.8, 40.5, 41.8)
# Background map image. Reading it straight from the URL may fail, so the
# image was downloaded in a browser and is read from a local file instead:
#   https://aiblog.nl/download/nyc_-74.5_-72.8_40.5_41.8.png
nyc_map = plt.imread('nyc_map.png')

BB_zoom = (-74.3, -73.7, 40.5, 40.9)  # zoomed-in map
# Zoomed background map, downloaded the same way from:
#   https://aiblog.nl/download/nyc_-74.3_-73.7_40.5_40.9.png
nyc_map_zoom = plt.imread('nyc_map_zoom.png')

# Remove the points outside the bounding box. `.copy()` makes `train` an
# independent frame so later column assignments do not write into a slice.
train = train[select_within_boundingbox(train, BB)].copy()
print(train.shape)


# (4) show the points on the map
def plot_on_map(df, BB, nyc_map, s=10, alpha=0.2):
    """Scatter pickup (left) and drop-off (right) locations over a map image.

    ``BB`` is ``(min_longitude, max_longitude, min_latitude, max_latitude)``
    and is used both as the axis limits and as the extent of ``nyc_map``.
    ``s`` and ``alpha`` are the marker size and transparency.
    """
    fig, axs = plt.subplots(1, 2, figsize=(16, 10))
    panels = (
        (axs[0], df.pickup_longitude, df.pickup_latitude, 'PickUp Locations'),
        (axs[1], df.dropoff_longitude, df.dropoff_latitude, 'Dropoff locations'),
    )
    for ax, longitude, latitude, title in panels:
        ax.scatter(longitude, latitude, alpha=alpha, c='r', s=s)
        ax.set_xlim(BB[0], BB[1])
        ax.set_ylim(BB[2], BB[3])
        ax.set_title(title)
        ax.imshow(nyc_map, extent=BB)


plot_on_map(train, BB, nyc_map, s=1, alpha=0.3)
plot_on_map(train, BB_zoom, nyc_map_zoom, s=1, alpha=0.3)
# The figures created above are not shown here; call plt.show() to display
# them when running outside a notebook.
from sklearn.ensemble import RandomForestRegressor

print("****************************************7. 检查数据类型************************************************")
print(train.dtypes)  # `object` columns hold strings
# Convert the two string date columns, key and pickup_datetime, to datetimes.
for dataset in [train, test]:
    dataset['key'] = pd.to_datetime(dataset['key'])
    dataset['pickup_datetime'] = pd.to_datetime(dataset['pickup_datetime'])

print("****************************************8. 日期数据进行分析************************************************")
# Split the pickup time into five new columns:
# year, month, day, hour, day of week.
for dataset in [train, test]:
    pickup_time = dataset['pickup_datetime'].dt
    dataset['year'] = pickup_time.year
    dataset['month'] = pickup_time.month
    dataset['day'] = pickup_time.day
    dataset['hour'] = pickup_time.hour
    dataset['day of week'] = pickup_time.dayofweek
print(train.head())
print("*" * 100)
print(test.head())

print("****************************************9. 根据经纬度计算距离************************************************")
EARTH_RADIUS_KM = 6371  # Earth radius in kilometres


def _haversine_km(lat1, long1, lat2, long2):
    """Return the haversine distance in kilometres between two points.

    The arguments are latitudes / longitudes in degrees; scalars, NumPy arrays
    and pandas Series all work because only NumPy ufuncs are applied.
    """
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(long2 - long1)
    # a = sin^2((phiB - phiA) / 2) + cos(phiA) * cos(phiB) * sin^2((lambdaB - lambdaA) / 2)
    a = (np.sin(delta_phi / 2.0) ** 2
         + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2)
    # c = 2 * atan2(sqrt(a), sqrt(1 - a))
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    # d = R * c
    return EARTH_RADIUS_KM * c


def distance(lat1, long1, lat2, long2, data=None):
    """Add an ``H_Distance`` column (haversine distance, km) to each data set.

    ``lat1``, ``long1``, ``lat2`` and ``long2`` are the *names* of the columns
    holding the start / end coordinates in degrees. ``data`` is an iterable of
    DataFrames to update in place; when omitted, the module-level ``train``
    and ``test`` frames are used, as in the original implementation.

    Returns the distance Series of the last data set processed, or ``None``
    when ``data`` is empty.
    """
    frames = [train, test] if data is None else data
    d = None
    for frame in frames:
        d = _haversine_km(frame[lat1], frame[long1], frame[lat2], frame[long2])
        frame['H_Distance'] = d
    return d


distance('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')
print(train.head())
print("*" * 100)
print(test.head())

# Rows with zero distance AND zero fare: show them, then remove them.
zero_trip_free = (train['H_Distance'] == 0) & (train['fare_amount'] == 0)
print(train[zero_trip_free])
train.drop(train[zero_trip_free].index, axis=0, inplace=True)

# Rows with zero distance but a non-zero fare. Possible reasons:
#   1. the driver waited a long time, the passenger cancelled in the end but
#      still paid for the waiting time;
#   2. the coordinates were not recorded correctly or are missing.
# Count them, then remove them as well.
zero_trip_paid = (train['H_Distance'] == 0) & (train['fare_amount'] != 0)
print(len(train[zero_trip_paid]))
train.drop(train[zero_trip_paid].index, axis=0, inplace=True)

print("****************************************10. 新的字段:每公里车费:根据距离、车费,计算每公里的车费************************************************")
# H_Distance is in kilometres, so this ratio is the fare per kilometre.
# (The column used to be called `fare_per_mile` and the plot was labelled
# "$USD/mile", which mislabelled the unit.)
train['fare_per_km'] = train.fare_amount / train.H_Distance
print(train.fare_per_km.describe())
print(train.head())

# Mean fare per kilometre for each hour of the day, one line per year.
train.pivot_table(values='fare_per_km', index='hour', columns='year').plot(figsize=(14, 6))
plt.ylabel('Fare $USD/km')
plt.show()

print("##############################step3: 模型训练和数据预测###########################################")
print(train.columns)
# Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
#        'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
#        'passenger_count', 'year', 'month', 'day', 'hour', 'day of week',
#        'H_Distance', 'fare_per_km'],
#       dtype='object')
print(test.columns)
# Index(['key', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
#        'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year',
#        'month', 'day', 'hour', 'day of week', 'H_Distance'],
#       dtype='object')

# Select the features by name instead of by position, so train and test are
# guaranteed to use the same columns in the same order even if a column is
# added or reordered. These are the columns the positional lists
# [3..13] (train) and [2..12] (test) used to pick.
FEATURE_COLUMNS = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'year',
    'month',
    'day',
    'hour',
    'day of week',
    'H_Distance',
]
X_train = train[FEATURE_COLUMNS]
# 1-D target (fare_amount); a single-column 2-D frame makes scikit-learn emit
# a DataConversionWarning.
y_train = train['fare_amount']
print(X_train.shape)
print(y_train.shape)

# Random forest regression
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

rf_predict = rf.predict(test[FEATURE_COLUMNS])

# Build the submission file from the sample submission.
submission = pd.read_csv("sample_submission.csv")
submission['fare_amount'] = rf_predict
submission.to_csv("submission_1.csv", index=False)  # index=False: do not write the row index
print(submission.head())
过程图:
图1:
图2:
图3:
图4:
问题:
1. 代码中被注释掉的地图链接 nyc_map_zoom = plt.imread('https://aiblog.nl/download/nyc_-74.3_-73.7_40.5_40.9.png') 的文件名里包含了经纬度范围(与 BB_zoom 一致),是否可以按同样的方式下载任意地区的地图图片?