练习1-车费预测

image
image

源代码:

# %%
'''
步骤:
1、读入数据集,将车费、经纬度进行清洗
(使用plt画散点图(省略))
2、用sklearn进行预测
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn


# NOTE(review): this path has no "Desktop" component, while the test.csv path
# used further down does -- confirm both files are read from the intended folder.
train = pd.read_csv(r"C:\Users\Administrator\纽约出租车车费预测\train.csv", nrows=1000000)


train.head()


train.describe()  # author's finding: the minimum fare is negative; the maxima of longitude/latitude and passenger count are far too large


train.shape  # size of the raw data set


# Remove every row that contains a NaN.  The previous form,
# `train.isna().any(1)`, passed the axis positionally, which pandas >= 2.0
# rejects with a TypeError; dropna() expresses the same row filter directly.
train.dropna(inplace=True)


train.shape  # size after the NaN rows are gone


# # Clean the passenger count


train["passenger_count"].describe()


train["passenger_count"].value_counts().sort_values(ascending=True)  # how many rows carry each passenger count, to spot abnormal values


# Drop rows reporting more than 6 passengers or exactly 0 passengers.
bad_passengers = (train["passenger_count"] > 6) | (train["passenger_count"] == 0)
train.drop(train[bad_passengers].index, inplace=True, axis=0)

train["passenger_count"].value_counts().sort_values(ascending=True)


# # Clean the coordinates
eps = 1e-7
# Display (without removing) the rows whose pickup and drop-off coordinates
# differ by less than eps in both longitude and latitude; per the author there
# are many such trips.
train[
    ((train["pickup_longitude"] - train["dropoff_longitude"]).abs() < eps)
    & ((train["pickup_latitude"] - train["dropoff_latitude"]).abs() < eps)
]


# Compared with the describe() output, rows far from the column mean must go:
# anything more than 10 away from the mean of a column is dropped.  The mean is
# recomputed for each column on the rows that survived the previous columns.
# NOTE(review): columns[3:7] are presumably the four pickup/drop-off
# longitude/latitude columns -- confirm against the CSV header.
for name in train.columns[3:7]:
    center = train[name].mean()
    far_from_mean = (train[name] < center - 10) | (train[name] > center + 10)
    train.drop(train[far_from_mean].index, axis=0, inplace=True)


train.describe()

# %% [markdown]
# # Clean the fare


train["fare_amount"].value_counts().sort_index(ascending=True)  # value counts ordered by fare


# Drop fares that are negative or (numerically) zero.
train.drop(train[train["fare_amount"] < eps].index, inplace=True, axis=0)


train["fare_amount"].describe()  # any fare greater than 0 is treated as valid


train.describe()  # author's note: apart from the fare, the other columns now have small variance, so most outliers are gone



# # Load the test set and add time-derived columns to both train and test


test = pd.read_csv(r"C:\Users\Administrator\Desktop\纽约出租车车费预测\test.csv")


# Convert the timestamp-like columns of both frames to datetime dtype.
for frame in (train, test):
    for column in ("key", "pickup_datetime"):
        frame[column] = pd.to_datetime(frame[column])


train.dtypes

# New column name -> attribute of the `.dt` accessor it is read from.
time_features = {
    "year": "year",
    "month": "month",
    "day": "day",
    "hour": "hour",
    "day of week": "dayofweek",
}
for frame in (train, test):
    pickup = frame["pickup_datetime"].dt
    for column, attribute in time_features.items():
        frame[column] = getattr(pickup, attribute)


train.dtypes

test.dtypes


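# # Compute the trip distance and the fare per kilometre (the distance code below works in km, not miles); the per-distance fare is not used for prediction, because the goal is to predict the fares of the test set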




def distance(lat1, long1, lat2, long2, frames=None):
    """Add a haversine ``H_Distance`` column (kilometres) to each frame.

    Args:
        lat1: Name of the column holding the start latitude, in degrees.
        long1: Name of the column holding the start longitude, in degrees.
        lat2: Name of the column holding the end latitude, in degrees.
        long2: Name of the column holding the end longitude, in degrees.
        frames: Optional iterable of DataFrame-like objects to process.  When
            omitted, the module-level ``train`` and ``test`` frames are used,
            which is what the function always did before this parameter
            existed.

    Returns:
        The distances computed for the LAST frame processed (``test`` in the
        default case), or ``None`` if ``frames`` is empty.  Every frame is
        modified in place: its ``H_Distance`` column is created or overwritten.
    """
    if frames is None:
        frames = [train, test]

    R = 6371  # Earth radius in kilometres

    d = None
    for frame in frames:
        phi1 = np.radians(frame[lat1])
        phi2 = np.radians(frame[lat2])

        delta_phi = np.radians(frame[lat2] - frame[lat1])
        delta_lambda = np.radians(frame[long2] - frame[long1])

        # a = sin^2((phiB - phiA)/2) + cos(phiA) * cos(phiB) * sin^2((lambdaB - lambdaA)/2)
        a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
        # Rounding can push `a` a hair above 1 for (near-)antipodal points,
        # which would make sqrt(1 - a) NaN; clamp the upper bound only, so
        # NaN inputs still propagate as NaN.
        a = np.minimum(a, 1.0)

        # c = 2 * atan2(sqrt(a), sqrt(1 - a))
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

        # d = R * c, in kilometres
        d = R * c
        frame['H_Distance'] = d
    return d

# Adds the H_Distance column to both train and test; the returned value is
# only displayed.
distance('pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude')

# Remove trips that did not move.  (An earlier variant compared the pickup and
# drop-off coordinates directly; the computed distance is used instead.)
eps = 1e-7
stationary = (train['H_Distance'] < eps) & (train['H_Distance'] > -eps)
train.drop(train.loc[stationary].index, inplace=True, axis=0)

# Fare per unit of distance.  NOTE: H_Distance is in kilometres (distance()
# uses an Earth radius of 6371 km), so despite the column name this is the
# fare per kilometre, not per mile.
train["fare_pre_mile"] = train["fare_amount"] / train["H_Distance"]


train


train["fare_pre_mile"].describe()

# Count the rows lying above mean + offset.  Author's finding: only a small
# share of rows exceeds the mean, so a few extreme values are probably
# inflating it.
mean_rate = train["fare_pre_mile"].mean()
for offset in range(0, 20):
    print(train.loc[train["fare_pre_mile"] > mean_rate + offset, "fare_pre_mile"].count())


# First remove the absurd rates: drop everything above mean + margin, with the
# mean recomputed before every pass and the margin tightened step by step.
for margin in (1000, 100, 100, 50):
    cutoff = train["fare_pre_mile"].mean() + margin
    train.drop(train.loc[train["fare_pre_mile"] > cutoff].index, inplace=True, axis=0)
train["fare_pre_mile"].describe()

# Author's note: the mean is now basically stable and the rate looks plausible.

for threshold in range(0, 20):  # number of rows whose rate exceeds each threshold
    print(threshold," : ",train.loc[train["fare_pre_mile"] > threshold, "fare_pre_mile"].count())

# Cut off the upper tail ...
train.drop(train.loc[train['fare_pre_mile'] > 8].index, inplace=True, axis=0)

# ... and then the rates below 1.
train.drop(train.loc[train['fare_pre_mile'] < 1].index, inplace=True, axis=0)

train['fare_pre_mile'].describe()


# Prediction

from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # standardisation

# Training features: everything except the identifier, the raw timestamp, the
# target itself and the per-distance fare derived from the target.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# pandas >= 2.0 rejects with a TypeError.
x_train = train.drop(columns=["key", "pickup_datetime", "fare_amount", "fare_pre_mile"])
y_train = train["fare_amount"]  # training target
x_test = test.drop(columns=["key", "pickup_datetime"])
# The scaler and the model work on bare arrays, so the test features must have
# exactly the training columns in the training order.  This also keeps a re-run
# working after `fare_amount` has been written into `test` below.
x_test = x_test[x_train.columns]


std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
# Reuse the statistics learned from the training data.  Re-fitting on the test
# set (the former `fit_transform`) would scale it with a different mean and
# variance than the data the model is trained on.
x_test = std_x.transform(x_test)

std_y = StandardScaler()
y_train = std_y.fit_transform(np.array(y_train).reshape(-1, 1))


x_train.shape

y_train.shape

x_test.shape

# Prediction with stochastic gradient descent

sgd = SGDRegressor()

y_train = y_train.ravel()  # SGDRegressor.fit expects a 1-D target
sgd.fit(x_train, y_train)

y_sgd_predict = sgd.predict(x_test)
# StandardScaler works on 2-D input: reshape the 1-D predictions into a single
# column before undoing the target scaling, then flatten back to 1-D.
y_sgd_predict = std_y.inverse_transform(y_sgd_predict.reshape(-1, 1)).ravel()

y_sgd_predict

test["fare_amount"] = y_sgd_predict

train

posted @ 2021-12-14 09:54  Lecoww  阅读(59)  评论(0编辑  收藏  举报