对离散特征建子模型实验
In [1]:
# coding=utf8
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.linear_model import LogisticRegression as lr
from sklearn.metrics import accuracy_score
try:
    # sklearn.cross_validation was removed in scikit-learn 0.20
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn
    from sklearn.cross_validation import train_test_split
def show_importance_of_sub_model(total_size, factor_size, special_percent):
    """Compare two encodings of a categorical feature for RandomForest.

    Builds a synthetic dataset of `total_size` rows split evenly across
    `factor_size` categorical values. A fraction (`special_percent`) of the
    categories are "special": their label threshold differs, so the category
    carries real signal. Two RandomForest models are then compared over 10
    random train/test splits:
      * "dummy"     — raw features + one-hot dummies of the category
      * "sub_model" — raw features + a single column holding the L1 logistic
                      regression coefficient of the row's category (a
                      sub-model / target-style encoding)

    Args:
        total_size: total number of rows to generate.
        factor_size: number of distinct categorical values.
        special_percent: fraction of categories given the stronger signal.

    Returns:
        [mean dummy accuracy, mean sub-model accuracy] over the 10 splits.
    """
    # Integer division: each category gets an equal whole number of rows
    # (plain `/` yields a float under Python 3 and breaks np.random.random).
    each_dt_size = total_size // factor_size
    dt = pd.DataFrame()
    y = pd.DataFrame()
    special_size = int(factor_size * special_percent)
    special = range(special_size)
    for i in range(factor_size):
        dt_current = pd.DataFrame(np.random.random((each_dt_size, 5)))
        dt_current['type'] = 'type' + str(100 + i)
        y_current = pd.DataFrame(np.ones((each_dt_size,)))
        # Special categories flip the label at a lower threshold (0.6 vs 1.5),
        # so knowing the category is genuinely informative for them.
        if i in special:
            y_current[(dt_current[0] + dt_current[1] + dt_current[2]) > 0.6] = -1
        else:
            y_current[(dt_current[0] + dt_current[1] + dt_current[2]) > 1.5] = -1
        y = pd.concat((y, y_current))
        dt = pd.concat((dt, dt_current))
    # Renamed from `type` — the original shadowed the builtin.
    type_dummies = pd.get_dummies(dt['type'])
    y = np.array(y[0])
    # liblinear is required for L1 penalty in modern scikit-learn.
    lr_clf = lr(penalty='l1', solver='liblinear')
    n_estimator = 100
    dummy_performance = []
    sub_model_performance = []
    for _ in range(10):
        dt_train, dt_test, y_train, y_test, type_train, type_test = train_test_split(
            dt, y, type_dummies, test_size=0.2)
        X_ori_train = pd.concat((dt_train, type_train), axis=1)
        del X_ori_train['type']
        X_ori_test = pd.concat((dt_test, type_test), axis=1)
        del X_ori_test['type']
        # Sub-model: fit logistic regression on the dummies alone, then use
        # each category's learned coefficient as a single numeric feature.
        lr_clf.fit(type_train, y_train)
        coef = lr_clf.coef_[0]
        # Copy before .loc writes — the split frames may be views of `dt`
        # and in-place writes raise SettingWithCopy warnings.
        dt_train = dt_train.copy()
        dt_test = dt_test.copy()
        dt_train['coef'] = 0.0
        dt_test['coef'] = 0.0
        # Distinct index `j`: the original reused the outer loop variable `i`.
        for j, col in enumerate(type_dummies.columns):
            dt_train.loc[dt_train['type'] == col, 'coef'] = coef[j]
            dt_test.loc[dt_test['type'] == col, 'coef'] = coef[j]
        X_sub_train = dt_train.drop(columns='type')
        X_sub_test = dt_test.drop(columns='type')
        clf_ori = rf(n_estimators=n_estimator)
        clf_ori.fit(X_ori_train, y_train)
        dummy_performance.append(accuracy_score(y_test, clf_ori.predict(X_ori_test)))
        clf_sub = rf(n_estimators=n_estimator)
        clf_sub.fit(X_sub_train, y_train)
        sub_model_performance.append(accuracy_score(y_test, clf_sub.predict(X_sub_test)))
    return [np.mean(dummy_performance), np.mean(sub_model_performance)]
def performance_ev(factor_size=100, special_percent=0.2, total_size=10000):
    """Sweep one experiment parameter and plot dummy vs. sub-model accuracy.

    Whichever of `factor_size` / `special_percent` is passed as a list is
    treated as the x-axis; the other keeps its scalar value. A scalar
    `special_percent` in the else-branch is wrapped in a one-element list
    (the original crashed when neither argument was a list).

    Args:
        factor_size: int, or list of ints to sweep (number of categories).
        special_percent: float, or list of floats to sweep.
        total_size: total rows per experiment.

    Returns:
        DataFrame with columns ['dummy', 'sub_model'], one row per swept
        value (also plotted via matplotlib as a side effect).
    """
    outcome = []
    # isinstance, not `type(x) == list`, so list subclasses work too.
    if isinstance(factor_size, list):
        x = factor_size
        for fs in x:
            outcome.append(show_importance_of_sub_model(total_size, fs, special_percent))
    else:
        # Accept a scalar by promoting it to a single-point sweep.
        x = special_percent if isinstance(special_percent, (list, tuple)) else [special_percent]
        for sp in x:
            outcome.append(show_importance_of_sub_model(total_size, factor_size, sp))
    performance = pd.DataFrame(outcome, columns=['dummy', 'sub_model'])
    plt.plot(x, performance)
    # Return the numbers too — previously only the plot survived the call.
    return performance
In [2]:
%matplotlib inline
# Sweep the number of categorical values at fixed special_percent=0.2.
performance_ev(factor_size = [5,10,20,40,80])
In [3]:
%matplotlib inline
# Sweep the fraction of "special" categories at fixed factor_size=100.
performance_ev(special_percent = [0.02,0.04,0.08,0.16,0.32])
对离散特征,使用子模型建模能提升效果
离散特征越重要(重要的值越多,对应special_percent越高),使用子模型带来的提升越大
离散特征的取值越多,RandomForest的预测效果越差