The Effect of Standardization/Normalization on Classic Machine Learning Models
Normalization
Data standardization (normalization) is a basic step in data mining. Different evaluation indicators often have different dimensions and units, and those differences can distort the results of an analysis. To remove the influence of scale and make the indicators comparable, the data need to be standardized. After standardization, all indicators sit on the same order of magnitude, which makes comprehensive comparison and evaluation possible.
Several normalization methods
MinMaxScaler
Also known as min-max (deviation) normalization, this is a linear transformation of the original data that maps values into the range [0, 1]. The transformation function is:

X_scaled = (X - X_min) / (X_max - X_min)
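As a minimal sketch (toy values assumed), sklearn's MinMaxScaler reproduces this formula exactly:

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1.0], [5.0], [10.0]])  # toy single-feature column (assumed values)
print(MinMaxScaler().fit_transform(X).ravel())        # [0.     0.4444 1.    ]
print(((X - X.min()) / (X.max() - X.min())).ravel())  # same result computed by hand
```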
MaxAbsScaler
Similar to the method above, but it scales each feature to [-1, 1] by dividing by the feature's maximum absolute value:

X_scaled = X / max(|X|)

It is intended for data that is already centered at zero, or for sparse data containing many zeros: because it does not shift the data, sparsity is preserved.
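A similar sketch for MaxAbsScaler (toy values assumed):

```python
import numpy as np
from sklearn.preprocessing import MaxAbsScaler

X = np.array([[-2.0], [1.0], [4.0]])  # toy column with a negative value (assumed)
print(MaxAbsScaler().fit_transform(X).ravel())  # [-0.5  0.25 1.  ]: each value / max|x| = 4
```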
StandardScaler
Computes the mean and standard deviation of the training set so that exactly the same transformation can later be applied to the test set:

z = (x - μ) / σ
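A sketch of the train/test contract (toy values assumed): fit learns the statistics, transform reuses them:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

X_train = np.array([[1.0], [2.0], [3.0]])  # toy training column (assumed)
X_test = np.array([[4.0]])
ss = StandardScaler().fit(X_train)  # learns mean = 2.0 and std ≈ 0.816 from the training set
print(ss.transform(X_test))         # [[2.449...]] — the test set reuses the training statistics
```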
Experiment
Method
We compare the mean squared error (mean-square error, MSE) of four classic machine learning models under each of the scaling methods above, and with no scaling at all, to see how normalization, or the lack of it, affects each model.
Code and results
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv('huodian.csv')
data = data.sort_values(by='time',ascending=True)
data.reset_index(inplace=True,drop=True)
target = data['T1AOMW_AV']  # T1AOMW_AV is the target y
del data['T1AOMW_AV']       # drop y from the feature frame
# Find columns that contain missing values
All_NaN = pd.DataFrame(data.isnull().sum()).reset_index()
All_NaN.columns = ['name','times']
All_NaN.describe()
|       | times |
| ----- | ----- |
| count | 170.0 |
| mean  | 0.0   |
| std   | 0.0   |
| min   | 0.0   |
| 25%   | 0.0   |
| 50%   | 0.0   |
| 75%   | 0.0   |
| max   | 0.0   |
# Drop features with little variation (keep only those with std >= 1)
feature_describe_T = data.describe().T
unstd_feature = feature_describe_T[feature_describe_T['std']>=1].index
data = data[unstd_feature]
# Drop the irrelevant time variable
del data['time']
# Slice the data by row index (test_data and the data1/data2 slices are not used in the experiments below)
test_data = data[:5000]
data1 = data[5000:16060]
target1 = target[5000:16060]
data2 = data[16060:]
target2 = target[16060:]
import scipy.stats as stats
dict_corr = {
    'spearman': [],
    'pearson': [],
    'kendall': [],
    'columns': []
}
# Compute each correlation coefficient between every feature column and the target
for i in data.columns:
    corr_pear, pval = stats.pearsonr(data[i], target)
    corr_spear, pval = stats.spearmanr(data[i], target)
    corr_kendall, pval = stats.kendalltau(data[i], target)
    dict_corr['pearson'].append(abs(corr_pear))
    dict_corr['spearman'].append(abs(corr_spear))
    dict_corr['kendall'].append(abs(corr_kendall))
    dict_corr['columns'].append(i)
# Select features whose correlation with the target is moderate (not too weak, not nearly collinear)
dict_corr = pd.DataFrame(dict_corr)
mask = ((dict_corr['pearson'] > 0.1) & (dict_corr['spearman'] > 0.15) & (dict_corr['kendall'] > 0.15) &
        (dict_corr['pearson'] < 0.93) & (dict_corr['spearman'] < 0.93) & (dict_corr['kendall'] < 0.93))
new_fea = list(dict_corr[mask]['columns'].values)
#new_fea = list(dict_corr[(dict_corr['pearson']<0.63) & (dict_corr['spearman']<0.69) & (dict_corr['kendall']<0.63)]['columns'].values)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import MinMaxScaler,StandardScaler,MaxAbsScaler
from sklearn.linear_model import LogisticRegression  # public import path; not used below
from sklearn.metrics import mean_squared_error as mse
from sklearn.svm import SVR
import warnings
warnings.filterwarnings("ignore")
## Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data[new_fea],target,test_size=0.25,random_state=12345)
print('without normalization:')
estimator_lr = Lasso(alpha=0.5).fit(X_train,y_train)
predict_lr = estimator_lr.predict(X_test)
print('Lasso:',mse(predict_lr,y_test))
estimator_rg = Ridge(alpha=0.5).fit(X_train,y_train)
predict_rg = estimator_rg.predict(X_test)
print('Ridge:',mse(predict_rg,y_test))
estimator_svr = SVR(kernel='rbf',C=100,epsilon=0.1).fit(X_train,y_train)
predict_svr = estimator_svr.predict(X_test)
print('SVR:',mse(predict_svr,y_test))
estimator_RF = RandomForestRegressor().fit(X_train,y_train)
predict_RF = estimator_RF.predict(X_test)
print('RF:',mse(predict_RF,y_test))
mm = MinMaxScaler()
mm_x_train = mm.fit_transform(X_train)
mm_x_test = mm.transform(X_test)
print('MinMaxScaler:')
estimator_lr = Lasso(alpha=0.5).fit(mm_x_train,y_train)
predict_lr = estimator_lr.predict(mm_x_test)
print('Lasso:',mse(predict_lr,y_test))
estimator_rg = Ridge(alpha=0.5).fit(mm_x_train,y_train)
predict_rg = estimator_rg.predict(mm_x_test)
print('Ridge:',mse(predict_rg,y_test))
estimator_svr = SVR(kernel='rbf',C=100,epsilon=0.1).fit(mm_x_train,y_train)
predict_svr = estimator_svr.predict(mm_x_test)
print('SVR:',mse(predict_svr,y_test))
estimator_RF = RandomForestRegressor().fit(mm_x_train,y_train)
predict_RF = estimator_RF.predict(mm_x_test)
print('RF:',mse(predict_RF,y_test))
ma = MaxAbsScaler()
ma_x_train = ma.fit_transform(X_train)
ma_x_test = ma.transform(X_test)
print('MaxAbsScaler:')
estimator_lr = Lasso(alpha=0.5).fit(ma_x_train,y_train)
predict_lr = estimator_lr.predict(ma_x_test)
print('Lasso:',mse(predict_lr,y_test))
estimator_rg = Ridge(alpha=0.5).fit(ma_x_train,y_train)
predict_rg = estimator_rg.predict(ma_x_test)
print('Ridge:',mse(predict_rg,y_test))
estimator_svr = SVR(kernel='rbf',C=100,epsilon=0.1).fit(ma_x_train,y_train)
predict_svr = estimator_svr.predict(ma_x_test)
print('SVR:',mse(predict_svr,y_test))
estimator_RF = RandomForestRegressor().fit(ma_x_train,y_train)
predict_RF = estimator_RF.predict(ma_x_test)
print('RF:',mse(predict_RF,y_test))
ss = StandardScaler()
ss_x_train = ss.fit_transform(X_train)
ss_x_test = ss.transform(X_test)
print('StandardScaler:')
estimator_lr = Lasso(alpha=0.5).fit(ss_x_train,y_train)
predict_lr = estimator_lr.predict(ss_x_test)
print('Lasso:',mse(predict_lr,y_test))
estimator_rg = Ridge(alpha=0.5).fit(ss_x_train,y_train)
predict_rg = estimator_rg.predict(ss_x_test)
print('Ridge:',mse(predict_rg,y_test))
estimator_svr = SVR(kernel='rbf',C=100,epsilon=0.1).fit(ss_x_train,y_train)
predict_svr = estimator_svr.predict(ss_x_test)
print('SVR:',mse(predict_svr,y_test))
estimator_RF = RandomForestRegressor().fit(ss_x_train,y_train)
predict_RF = estimator_RF.predict(ss_x_test)
print('RF:',mse(predict_RF,y_test))
without normalization:
Lasso: 64.48569344896079
Ridge: 52.32215979123271
SVR: 2562.6181533319277
RF: 11.342877117923145
MinMaxScaler:
Lasso: 110.64816111661362
Ridge: 55.430338750636416
SVR: 37.81036885831256
RF: 10.204243317509082
MaxAbsScaler:
Lasso: 257.7066786267883
Ridge: 63.91979829622576
SVR: 69.74587878254961
RF: 11.721070230746417
StandardScaler:
Lasso: 81.70216554870805
Ridge: 52.5282264448465
SVR: 7.996381635964344
RF: 9.615276857782204
Analysis of the results
Comparing the numbers, several patterns stand out. For Lasso, the MSE increases noticeably after any of the scalings, most severely with MaxAbsScaler. For Ridge, scaling makes little difference except under MaxAbsScaler. For SVR, the MSE is enormous without scaling, and StandardScaler gives by far the best result. For the random forest, neither the choice of scaler nor the absence of scaling matters much.
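One way to see why Lasso reacts to scaling: the L1 penalty acts on the coefficients at whatever scale the features happen to have, so scaling a feature down inflates its coefficient and makes the same alpha a relatively much stronger penalty. A minimal sketch with assumed toy data:

```python
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler

# Toy data (assumed): one feature with a large raw scale
rng = np.random.RandomState(0)
X = rng.rand(200, 1) * 1000
y = 0.05 * X[:, 0] + rng.randn(200)         # true coefficient is 0.05 on the raw scale

print(Lasso(alpha=0.5).fit(X, y).coef_)     # ~0.05: alpha is negligible next to the raw scale
X_mm = MinMaxScaler().fit_transform(X)      # the feature now lives in [0, 1]
print(Lasso(alpha=0.5).fit(X_mm, y).coef_)  # the rescaled true coefficient is 50, but the same
                                            # alpha now shrinks the fit visibly below it
```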
Why this happens
An SVM essentially chooses the hyperplane that separates the data with the largest margin, and that margin is measured by distances in feature space. With an RBF kernel, SVR predictions likewise depend on the Euclidean distances between samples. Without scaling, the features with the largest numeric ranges dominate those distances, so the kernel effectively ignores the small-scale features, the fitted surface is distorted, and the model performs poorly on the test set, which is why the unscaled SVR's MSE explodes. By the same logic, Lasso and Ridge apply a single penalty strength to all coefficients, so rescaling the features changes the effective penalty (hence Lasso's sensitivity), whereas a random forest splits on one feature at a time at learned thresholds, a procedure invariant to monotonic rescaling, which is why it is barely affected.
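To make the distance argument concrete, a small sketch with assumed toy data: one feature spans [0, 1], the other [0, 1000]. Without scaling, the squared distances inside the RBF kernel are dominated by the large feature and all off-diagonal similarities collapse toward zero:

```python
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import StandardScaler

# Toy data (assumed): two features whose scales differ by a factor of 1000
rng = np.random.RandomState(0)
X = np.column_stack([rng.rand(5), rng.rand(5) * 1000])

print(rbf_kernel(X, gamma=0.1))
# Off-diagonal entries are ~0: exp(-gamma * d^2) with d^2 up to ~10^6
print(rbf_kernel(StandardScaler().fit_transform(X), gamma=0.1))
# After standardization both features contribute comparably to d^2
```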