优惠券预测——数据探索2
# Separator between the two parts of a "full:reduction" discount string and
# of the "date:date_received" pair consumed by get_day_gap / get_label.
separator = ':'


def get_discount_rate(s):
    """Turn a raw discount value into one rate, unifying both coupon formats.

    Args:
        s: Raw discount value; converted with ``str`` before parsing.

    Returns:
        -1 for ``'null'``; ``float(s)`` for a plain rate such as ``'0.9'``;
        ``1 - reduction / full`` for a ``'full:reduction'`` string.
    """
    s = str(s)
    # The original author noted that a null discount usually means no coupon
    # was used (rate 1), but -1 is returned so missing values stay filterable.
    if s == 'null':
        return -1
    parts = s.split(separator)
    if len(parts) == 1:
        return float(parts[0])
    return 1.0 - float(parts[1]) / float(parts[0])


def get_if_fd(s):
    """Return 1 if ``s`` is a full-reduction ('full:reduction') value, else 0."""
    return 0 if len(str(s).split(separator)) == 1 else 1


def get_full_value(s):
    """Return the spending threshold of a 'full:reduction' value, or -1."""
    parts = str(s).split(separator)
    if len(parts) == 1:
        return -1
    return int(parts[0])


def get_reduction_value(s):
    """Return the reduction amount of a 'full:reduction' value, or -1."""
    parts = str(s).split(separator)
    if len(parts) == 1:
        return -1
    return int(parts[1])


def get_month(s):
    """Return the month of a 'YYYYMMDD' value, or -1 for ``'null'``."""
    s = str(s)
    # Compare the whole string: the previous ``s[0] == 'null'`` test compared a
    # single character and could never match, so 'null' raised ValueError.
    if s == 'null':
        return -1
    return int(s[4:6])


def get_day(s):
    """Return the day of month of a 'YYYYMMDD' value, or -1 for ``'null'``."""
    s = str(s)
    # Same whole-string comparison as in get_month.
    if s == 'null':
        return -1
    return int(s[6:8])


def _parse_yyyymmdd(text):
    """Build a ``date`` from the first eight characters of ``text``."""
    return date(int(text[0:4]), int(text[4:6]), int(text[6:8]))


def get_day_gap(s):
    """Return the number of days between the two dates in 'date:date_received'.

    Returns:
        ``date - date_received`` in days, or -1 if either part is ``'null'``.
    """
    parts = s.split(separator)
    if parts[0] == 'null':
        return -1
    if parts[1] == 'null':
        return -1
    return (_parse_yyyymmdd(parts[0]) - _parse_yyyymmdd(parts[1])).days


def get_label(s):
    """Return the label for a 'date:date_received' string.

    Returns:
        0 when ``date`` is ``'null'``; -1 when ``date_received`` is ``'null'``;
        1 when ``date - date_received`` is at most 15 days; -1 otherwise.
    """
    parts = s.split(separator)
    if parts[0] == 'null':
        return 0
    if parts[1] == 'null':
        return -1
    gap_days = (_parse_yyyymmdd(parts[0]) - _parse_yyyymmdd(parts[1])).days
    if gap_days <= 15:
        return 1
    return -1
def add_feature(df):
    """Derive coupon features from the ``discount_rate`` and ``distance`` columns.

    Adds ``if_fd``, ``full_value`` and ``reduction_value``, then replaces
    ``discount_rate`` with its parsed rate and ``distance`` with integers
    (``'null'`` becomes -1). ``df`` is modified in place and also returned.
    """
    raw_discount = df['discount_rate']
    # Every column below is parsed from the raw discount values, so
    # ``discount_rate`` itself is overwritten last.
    extractors = (
        ('if_fd', get_if_fd),
        ('full_value', get_full_value),
        ('reduction_value', get_reduction_value),
        ('discount_rate', get_discount_rate),
    )
    for column, extractor in extractors:
        df[column] = raw_discount.apply(extractor)

    distance = df['distance'].replace('null', -1)
    df['distance'] = distance.astype(int)
    # Month features (get_month on date_received / date) are not generated
    # here; the assignments were commented out in the original.
    return df


def add_label(df):
    """Add ``day_gap`` and ``label`` columns computed from the two date columns.

    ``df`` is modified in place and also returned.
    """
    date_pair = df['date'].astype('str') + ':' + df['date_received'].astype('str')
    # Create ``day_gap`` before ``label`` so the column order is unchanged;
    # it holds the joined string only until the numeric gap replaces it.
    df['day_gap'] = date_pair
    df['label'] = date_pair.apply(get_label)
    df['day_gap'] = date_pair.apply(get_day_gap)
    return df
# Work on copies of the loaded frames so the files do not have to be re-read
# while debugging.
# The training set gets both features and labels; the test set only features.
dftrain = add_label(add_feature(off_train.copy()))
dftest = add_feature(off_test.copy())
# Data analysis
dftrain.head()
dftrain.describe()

# Share of each distance value among rows with distance >= 0 (train, then test).
train_distance = dftrain[dftrain.distance >= 0]['distance']
train_distance.value_counts() / train_distance.count()

test_distance = dftest[dftest.distance >= 0]['distance']
test_distance.value_counts() / test_distance.count()

# Same share, restricted to training rows with label >= 0.
labelled_distance = dftrain[(dftrain.label >= 0) & (dftrain.distance >= 0)]['distance']
labelled_distance.value_counts() / labelled_distance.count()

# Share of full-reduction coupons (if_fd) in the offline training set.
print('Offline 训练集满减情况')
train_if_fd = dftrain.if_fd
train_if_fd.value_counts() / train_if_fd.count()

# Share of full-reduction coupons (if_fd) in the test set.
print('测试集满减情况')
test_if_fd = dftest.if_fd
test_if_fd.value_counts() / test_if_fd.count()
# Box plots of the distributions, one figure per feature.
# Only rows with label >= 0 and a non-negative feature value are drawn.
for box_col in ('distance', 'discount_rate'):
    fig = plt.figure(figsize=(4, 6))  # figure width and height
    box_mask = (dftrain.label >= 0) & (dftrain[box_col] >= 0)
    box_values = dftrain[box_mask][box_col]
    sns.boxplot(box_values, orient="v", width=0.5)
# Histogram (with a fitted normal curve) and Q-Q plot, side by side, for each
# feature. Only rows with label >= 0 and a non-negative feature value are used.
for qq_col in ('distance', 'discount_rate'):
    qq_mask = (dftrain.label >= 0) & (dftrain[qq_col] >= 0)
    qq_values = dftrain[qq_mask][qq_col]

    plt.figure(figsize=(10, 5))
    # Left panel: histogram with the normal fit.
    ax = plt.subplot(1, 2, 1)
    sns.distplot(qq_values, fit=stats.norm)
    # Right panel: probability plot, drawn through pyplot.
    ax = plt.subplot(1, 2, 2)
    res = stats.probplot(qq_values, plot=plt)
# Compare the train and test distributions, one figure per feature.
# Negative values are excluded (the feature helpers use -1 for missing values),
# and training rows are further restricted to label >= 0.
for kde_col in ('discount_rate', 'distance', 'full_value', 'reduction_value'):
    # Start a fresh figure for each feature. Without it every kdeplot call
    # draws onto whatever axes is current, so all four features would overlay
    # on one plot when this file runs as a script.
    plt.figure()
    train_values = dftrain[(dftrain.label >= 0) & (dftrain[kde_col] >= 0)][kde_col]
    test_values = dftest[dftest[kde_col] >= 0][kde_col]

    ax = sns.kdeplot(train_values, color="Red", shade=True)
    ax = sns.kdeplot(test_values, color="Blue", shade=True)
    ax.set_xlabel(kde_col)
    ax.set_ylabel("Frequency")
    # ``ax`` stays bound to the axes; it is not rebound to the Legend object.
    ax.legend(["train", "test"])
# Visualise the linear relationship between each feature and the label.
# Left panel: regression plot against the label; right panel: histogram.
fcols = 2
frows = 1
scatter_style = {'marker': '.', 's': 3, 'alpha': 0.3}
line_style = {'color': 'k'}

for reg_col in ('distance', 'discount_rate'):
    plt.figure(figsize=(8, 4))
    # Only rows with label >= 0 and a non-negative feature value are plotted.
    reg_rows = dftrain[(dftrain.label >= 0) & (dftrain[reg_col] >= 0)]

    ax = plt.subplot(frows, fcols, 1)
    sns.regplot(
        x=reg_col,
        y='label',
        data=reg_rows[[reg_col, 'label']],
        ax=ax,
        scatter_kws=scatter_style,
        line_kws=line_style,
    )
    plt.xlabel(reg_col)
    plt.ylabel('label')

    ax = plt.subplot(frows, fcols, 2)
    sns.distplot(reg_rows[reg_col].dropna())
    plt.xlabel(reg_col)
    plt.show()