摩拜单车项目-数据清洗
import pandas as pd
import seaborn as sns
import geohash
import matplotlib.pyplot as plt
from math import radians,cos,sin,asin,sqrt
%matplotlib inline
# Load the raw training and test sets; `starttime` is parsed into datetimes
# so the time-based features can be derived from it later.
train = pd.read_csv(
    "D:/lulu working file/train.csv",
    engine="python",
    sep=",",
    parse_dates=["starttime"],
)
test = pd.read_csv(
    "D:/lulu working file/test.csv",
    engine="python",
    sep=",",
    parse_dates=["starttime"],
)
# Quick look at the first rows of the training data.
train.head()
def _processData(df):
    """Add time, geohash, neighbourhood and distance features to ``df``.

    The frame is modified in place and also returned.

    Columns read: ``starttime``, ``geohashed_start_loc``, ``geohashed_end_loc``.

    Columns written: ``weekday``, ``hour``, ``day``, ``start_lat_lng``,
    ``end_lat_lng``, ``start_neighbors``, ``geohashed_start_loc_6``,
    ``geohashed_end_loc_6``, ``start_neighbors_6``, ``inside``, ``inside_6``
    and ``start_end_distance``.

    NOTE(review): assumes ``geohash.decode`` returns a ``(lat, lng)`` pair and
    ``geohash.neighbors`` returns a collection of geohash strings -- confirm
    against the installed geohash package.
    """
    # Time features
    df['weekday'] = df['starttime'].apply(lambda s: s.weekday())  # Monday is 0 and Sunday is 6
    df['hour'] = df['starttime'].apply(lambda s: s.hour)
    df['day'] = df['starttime'].apply(lambda s: str(s)[:10])
    print("Time process successfully!")

    # Geohash features
    df['start_lat_lng'] = df['geohashed_start_loc'].apply(geohash.decode)
    df['end_lat_lng'] = df['geohashed_end_loc'].apply(geohash.decode)
    df['start_neighbors'] = df['geohashed_start_loc'].apply(geohash.neighbors)
    df['geohashed_start_loc_6'] = df['geohashed_start_loc'].apply(lambda s: s[:6])
    df['geohashed_end_loc_6'] = df['geohashed_end_loc'].apply(lambda s: s[:6])
    # Neighbours must be computed from the 6-character prefix; neighbours of
    # the full-precision hash can never equal a 6-character end prefix.
    df['start_neighbors_6'] = df['geohashed_start_loc_6'].apply(geohash.neighbors)
    print("Geohash process successfully!")

    # Is the destination in the start cell or one of its neighbours?
    def inGeohash(start_geohash, end_geohash, names):
        """Return 1 if ``end_geohash`` is the start cell or in ``names``, else 0.

        ``names`` is only read, so the neighbour lists stored in the frame are
        not modified.
        """
        return int(end_geohash == start_geohash or end_geohash in names)

    df['inside'] = [
        inGeohash(start, end, names)
        for start, end, names in zip(
            df['geohashed_start_loc'], df['geohashed_end_loc'], df['start_neighbors']
        )
    ]
    df['inside_6'] = [
        inGeohash(start, end, names)
        for start, end, names in zip(
            df['geohashed_start_loc_6'], df['geohashed_end_loc_6'], df['start_neighbors_6']
        )
    ]
    print("Geohash inside process successfully!")

    # Distance start -> end
    def haversine(lon1, lat1, lon2, lat2):
        """Return the great-circle distance in metres between two points.

        Inputs are longitude/latitude in decimal degrees; they are coerced
        with ``float`` so numeric strings are accepted as well.
        """
        lon1, lat1, lon2, lat2 = (radians(float(v)) for v in (lon1, lat1, lon2, lat2))
        # Haversine formula
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        # Clamp against rounding so asin never sees a value above 1.
        c = 2 * asin(min(1.0, sqrt(a)))
        r = 6371  # Earth radius in kilometres
        return c * r * 1000

    # The decoded pairs are indexed as [0] = latitude, [1] = longitude.
    df['start_end_distance'] = [
        haversine(start[1], start[0], end[1], end[0])
        for start, end in zip(df['start_lat_lng'], df['end_lat_lng'])
    ]
    print("Distance process successfully!")
    return df
# Run the feature engineering on the training set and rebind `train` to the returned frame.
train=_processData(train)
def _timeAnalysis(df):
    """Print ride-count summaries by day, hour and weekday, then plot them.

    Reads the ``day``, ``hour``, ``weekday`` and ``orderid`` columns of ``df``
    and adds an ``isWeekend`` column (1.0 for weekday 5 or 6, otherwise 0.0)
    to ``df`` in place. The final table and bar plot show, per hour, the
    order count divided by the number of distinct weekday days or weekend
    days in the data. Returns ``None``.
    """
    # Days covered by the data set
    print("数据集包含的天数如下:")
    print(df['day'].unique())
    print("*" * 60)

    # Peak hours: order count per hour, busiest first
    hourly = df.groupby("hour")
    print(hourly['orderid'].count().sort_values(ascending=False))
    print("*" * 60)

    # Usage from Monday to Sunday
    by_weekday = df.groupby("weekday")
    print(pd.DataFrame(by_weekday['orderid'].count()))
    print("*" * 60)

    # Usage per hour, split into working days and weekends
    df['isWeekend'] = df['weekday'].isin([5, 6]).astype(float)
    per_hour = df.groupby(["isWeekend", "hour"])

    # Count the distinct weekend days (w) and working days (c)
    day_keys = list(df.groupby(["day", "weekday"]).groups.keys())
    w = sum(1 for _, weekday in day_keys if weekday >= 5)
    c = len(day_keys) - w

    temp_df = per_hour['orderid'].count().reset_index()
    # Turn raw counts into per-day averages: working-day rows are divided by
    # c and weekend rows by w. A single mapped division avoids writing float
    # values into the integer count column piece by piece.
    days_in_group = temp_df['isWeekend'].map({0.0: c, 1.0: w})
    temp_df['orderid'] = temp_df['orderid'] / days_in_group

    print(temp_df.sort_values(["isWeekend", "orderid"], ascending=False))
    sns.barplot(x='hour', y='orderid', hue='isWeekend', data=temp_df)


_timeAnalysis(train)
敲了一遍代码,发现很多细节问题:geohash 包里没有 neighbors 函数,替换 Lib 目录里对应的 py 文件就好了。文件路径尽量不要包含中文,层级要浅;复制路径后要把分隔符改成反方向的(把 \ 改成 /)。