阿里云工业蒸汽量预测
数据探索
导入工具包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
读取数据
train_data=pd.read_csv('data/zhengqi_train.txt',sep='\t',encoding='utf-8')
test_data=pd.read_csv('data/zhengqi_test.txt',sep='\t',encoding='utf-8')
train_data
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.566 | 0.016 | -0.143 | 0.407 | 0.452 | -0.901 | -1.812 | -2.360 | -0.436 | -2.114 | ... | 0.136 | 0.109 | -0.615 | 0.327 | -4.627 | -4.789 | -5.101 | -2.608 | -3.508 | 0.175 |
1 | 0.968 | 0.437 | 0.066 | 0.566 | 0.194 | -0.893 | -1.566 | -2.360 | 0.332 | -2.114 | ... | -0.128 | 0.124 | 0.032 | 0.600 | -0.843 | 0.160 | 0.364 | -0.335 | -0.730 | 0.676 |
2 | 1.013 | 0.568 | 0.235 | 0.370 | 0.112 | -0.797 | -1.367 | -2.360 | 0.396 | -2.114 | ... | -0.009 | 0.361 | 0.277 | -0.116 | -0.843 | 0.160 | 0.364 | 0.765 | -0.589 | 0.633 |
3 | 0.733 | 0.368 | 0.283 | 0.165 | 0.599 | -0.679 | -1.200 | -2.086 | 0.403 | -2.114 | ... | 0.015 | 0.417 | 0.279 | 0.603 | -0.843 | -0.065 | 0.364 | 0.333 | -0.112 | 0.206 |
4 | 0.684 | 0.638 | 0.260 | 0.209 | 0.337 | -0.454 | -1.073 | -2.086 | 0.314 | -2.114 | ... | 0.183 | 1.078 | 0.328 | 0.418 | -0.843 | -0.215 | 0.364 | -0.280 | -0.028 | 0.384 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2883 | 0.190 | -0.025 | -0.138 | 0.161 | 0.600 | -0.212 | 0.757 | 0.584 | -0.026 | 0.904 | ... | 0.128 | -0.208 | 0.809 | -0.173 | 0.247 | -0.027 | -0.349 | 0.576 | 0.686 | 0.235 |
2884 | 0.507 | 0.557 | 0.296 | 0.183 | 0.530 | -0.237 | 0.749 | 0.584 | 0.537 | 0.904 | ... | 0.291 | -0.287 | 0.465 | -0.310 | 0.763 | 0.498 | -0.349 | -0.615 | -0.380 | 1.042 |
2885 | -0.394 | -0.721 | -0.485 | 0.084 | 0.136 | 0.034 | 0.655 | 0.614 | -0.818 | 0.904 | ... | 0.291 | -0.179 | 0.268 | 0.552 | 0.763 | 0.498 | -0.349 | 0.951 | 0.748 | 0.005 |
2886 | -0.219 | -0.282 | -0.344 | -0.049 | 0.449 | -0.140 | 0.560 | 0.583 | -0.596 | 0.904 | ... | 0.216 | 1.061 | -0.051 | 1.023 | 0.878 | 0.610 | -0.230 | -0.301 | 0.555 | 0.350 |
2887 | 0.368 | 0.380 | -0.225 | -0.049 | 0.379 | 0.092 | 0.550 | 0.551 | 0.244 | 0.904 | ... | 0.047 | 0.057 | -0.042 | 0.847 | 0.534 | -0.009 | -0.190 | -0.567 | 0.388 | 0.417 |
2888 rows × 39 columns
查看数据
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2888 entries, 0 to 2887
Data columns (total 39 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 V0 2888 non-null float64
1 V1 2888 non-null float64
2 V2 2888 non-null float64
3 V3 2888 non-null float64
4 V4 2888 non-null float64
5 V5 2888 non-null float64
6 V6 2888 non-null float64
7 V7 2888 non-null float64
8 V8 2888 non-null float64
9 V9 2888 non-null float64
10 V10 2888 non-null float64
11 V11 2888 non-null float64
12 V12 2888 non-null float64
13 V13 2888 non-null float64
14 V14 2888 non-null float64
15 V15 2888 non-null float64
16 V16 2888 non-null float64
17 V17 2888 non-null float64
18 V18 2888 non-null float64
19 V19 2888 non-null float64
20 V20 2888 non-null float64
21 V21 2888 non-null float64
22 V22 2888 non-null float64
23 V23 2888 non-null float64
24 V24 2888 non-null float64
25 V25 2888 non-null float64
26 V26 2888 non-null float64
27 V27 2888 non-null float64
28 V28 2888 non-null float64
29 V29 2888 non-null float64
30 V30 2888 non-null float64
31 V31 2888 non-null float64
32 V32 2888 non-null float64
33 V33 2888 non-null float64
34 V34 2888 non-null float64
35 V35 2888 non-null float64
36 V36 2888 non-null float64
37 V37 2888 non-null float64
38 target 2888 non-null float64
dtypes: float64(39)
memory usage: 880.1 KB
train_data.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
V0 | 2888.0 | 0.123048 | 0.928031 | -4.335 | -0.29700 | 0.3590 | 0.72600 | 2.121 |
V1 | 2888.0 | 0.056068 | 0.941515 | -5.122 | -0.22625 | 0.2725 | 0.59900 | 1.918 |
V2 | 2888.0 | 0.289720 | 0.911236 | -3.420 | -0.31300 | 0.3860 | 0.91825 | 2.828 |
V3 | 2888.0 | -0.067790 | 0.970298 | -3.956 | -0.65225 | -0.0445 | 0.62400 | 2.457 |
V4 | 2888.0 | 0.012921 | 0.888377 | -4.742 | -0.38500 | 0.1100 | 0.55025 | 2.689 |
V5 | 2888.0 | -0.558565 | 0.517957 | -2.182 | -0.85300 | -0.4660 | -0.15400 | 0.489 |
V6 | 2888.0 | 0.182892 | 0.918054 | -4.576 | -0.31000 | 0.3880 | 0.83125 | 1.895 |
V7 | 2888.0 | 0.116155 | 0.955116 | -5.048 | -0.29500 | 0.3440 | 0.78225 | 1.918 |
V8 | 2888.0 | 0.177856 | 0.895444 | -4.692 | -0.15900 | 0.3620 | 0.72600 | 2.245 |
V9 | 2888.0 | -0.169452 | 0.953813 | -12.891 | -0.39000 | 0.0420 | 0.04200 | 1.335 |
V10 | 2888.0 | 0.034319 | 0.968272 | -2.584 | -0.42050 | 0.1570 | 0.61925 | 4.830 |
V11 | 2888.0 | -0.364465 | 0.858504 | -3.160 | -0.80325 | -0.1120 | 0.24700 | 1.455 |
V12 | 2888.0 | 0.023177 | 0.894092 | -5.165 | -0.41900 | 0.1230 | 0.61600 | 2.657 |
V13 | 2888.0 | 0.195738 | 0.922757 | -3.675 | -0.39800 | 0.2895 | 0.86425 | 2.475 |
V14 | 2888.0 | 0.016081 | 1.015585 | -2.455 | -0.66800 | -0.1610 | 0.82975 | 2.558 |
V15 | 2888.0 | 0.096146 | 1.033048 | -2.903 | -0.66225 | -0.0005 | 0.73000 | 4.314 |
V16 | 2888.0 | 0.113505 | 0.983128 | -5.981 | -0.30000 | 0.3060 | 0.77425 | 2.861 |
V17 | 2888.0 | -0.043458 | 0.655857 | -2.224 | -0.36600 | 0.1650 | 0.43000 | 2.023 |
V18 | 2888.0 | 0.055034 | 0.953466 | -3.582 | -0.36750 | 0.0820 | 0.51325 | 4.441 |
V19 | 2888.0 | -0.114884 | 1.108859 | -3.704 | -0.98750 | -0.0005 | 0.73725 | 3.431 |
V20 | 2888.0 | -0.186226 | 0.788511 | -3.402 | -0.67550 | -0.1565 | 0.30400 | 3.525 |
V21 | 2888.0 | -0.056556 | 0.781471 | -2.643 | -0.51700 | -0.0565 | 0.43150 | 2.259 |
V22 | 2888.0 | 0.302893 | 0.639186 | -1.375 | -0.06300 | 0.2165 | 0.87200 | 2.018 |
V23 | 2888.0 | 0.155978 | 0.978757 | -5.542 | 0.09725 | 0.3380 | 0.36825 | 1.906 |
V24 | 2888.0 | -0.021813 | 1.033403 | -1.344 | -1.19100 | 0.0950 | 0.93125 | 2.423 |
V25 | 2888.0 | -0.051679 | 0.915957 | -3.808 | -0.55725 | -0.0760 | 0.35600 | 7.284 |
V26 | 2888.0 | 0.072092 | 0.889771 | -5.131 | -0.45200 | 0.0750 | 0.64425 | 2.980 |
V27 | 2888.0 | 0.272407 | 0.270374 | -1.164 | 0.15775 | 0.3250 | 0.44200 | 0.925 |
V28 | 2888.0 | 0.137712 | 0.929899 | -2.435 | -0.45500 | -0.4470 | 0.73000 | 4.671 |
V29 | 2888.0 | 0.097648 | 1.061200 | -2.912 | -0.66400 | -0.0230 | 0.74525 | 4.580 |
V30 | 2888.0 | 0.055477 | 0.901934 | -4.507 | -0.28300 | 0.0535 | 0.48800 | 2.689 |
V31 | 2888.0 | 0.127791 | 0.873028 | -5.859 | -0.17025 | 0.2995 | 0.63500 | 2.013 |
V32 | 2888.0 | 0.020806 | 0.902584 | -4.053 | -0.40725 | 0.0390 | 0.55700 | 2.395 |
V33 | 2888.0 | 0.007801 | 1.006995 | -4.627 | -0.49900 | -0.0400 | 0.46200 | 5.465 |
V34 | 2888.0 | 0.006715 | 1.003291 | -4.789 | -0.29000 | 0.1600 | 0.27300 | 5.110 |
V35 | 2888.0 | 0.197764 | 0.985675 | -5.695 | -0.20250 | 0.3640 | 0.60200 | 2.324 |
V36 | 2888.0 | 0.030658 | 0.970812 | -2.608 | -0.41300 | 0.1370 | 0.64425 | 5.238 |
V37 | 2888.0 | -0.130330 | 1.017196 | -3.630 | -0.79825 | -0.1855 | 0.49525 | 3.000 |
target | 2888.0 | 0.126353 | 0.983966 | -3.044 | -0.35025 | 0.3130 | 0.79325 | 2.538 |
可视化数据分布
箱线图
column=train_data.columns.tolist()[:39]
fig=plt.figure(figsize=(80,60),dpi=75)
for i in range(38):
plt.subplot(7,8,i+1)
sns.boxplot(train_data[column[i]],orient="v",width=0.5)
plt.ylabel(column[i],fontsize=36)
plt.show()
获取异常数据并画图
直方图和Q-Q图
cols=6
rows=len(train_data.columns)
plt.figure(figsize=(4*cols,4*rows))
i=0
for col in train_data.columns:
i+=1
ax=plt.subplot(rows,cols,i)
sns.distplot(train_data[col],fit=stats.norm)
i+=1
ax=plt.subplot(rows,cols,i)
res=stats.probplot(train_data[col],plot=plt)
plt.tight_layout()
plt.show()
KDE分布图
cols=6
rows=len(test_data.columns)
plt.figure(figsize=(4*cols,4*rows))
i=1
for col in test_data.columns:
ax=plt.subplot(rows,cols,i)
ax=sns.kdeplot(train_data[col],color='red',shade=True)
ax=sns.kdeplot(test_data[col],color='blue',shade=True)
ax.set_xlabel(col)
ax.set_ylabel("Frequency")
ax=ax.legend(['train','test'])
i+=1
plt.show()
线性回归关系图
cols=6
rows=len(test_data.columns)
plt.figure(figsize=(5*cols,4*rows))
i=0
for col in test_data.columns:
i+=1
ax=plt.subplot(rows,cols,i)
sns.regplot(x=col,y='target',data=train_data,ax=ax,scatter_kws={'marker':'.','s':3,'alpha':0.3},line_kws={'color':'k'})
plt.xlabel(col)
plt.ylabel('target')
i+=1
ax=plt.subplot(rows,cols,i)
sns.distplot(train_data[col].dropna())
plt.xlabel(col)
查看特征变量的相关性
计算相关性系数
pd.set_option('display.max_columns',10)
pd.set_option('display.max_rows',10)
data_train1=train_data.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
train_corr=data_train1.corr()
train_corr
V0 | V1 | V2 | V3 | V4 | ... | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|
V0 | 1.000000 | 0.908607 | 0.463643 | 0.409576 | 0.781212 | ... | -0.019342 | 0.138933 | 0.231417 | -0.494076 | 0.873212 |
V1 | 0.908607 | 1.000000 | 0.506514 | 0.383924 | 0.657790 | ... | -0.029115 | 0.146329 | 0.235299 | -0.494043 | 0.871846 |
V2 | 0.463643 | 0.506514 | 1.000000 | 0.410148 | 0.057697 | ... | -0.025620 | 0.043648 | 0.316462 | -0.734956 | 0.638878 |
V3 | 0.409576 | 0.383924 | 0.410148 | 1.000000 | 0.315046 | ... | -0.031898 | 0.080034 | 0.324475 | -0.229613 | 0.512074 |
V4 | 0.781212 | 0.657790 | 0.057697 | 0.315046 | 1.000000 | ... | 0.028659 | 0.100010 | 0.113609 | -0.031054 | 0.603984 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
V34 | -0.019342 | -0.029115 | -0.025620 | -0.031898 | 0.028659 | ... | 1.000000 | 0.233616 | -0.019032 | -0.006854 | -0.006034 |
V35 | 0.138933 | 0.146329 | 0.043648 | 0.080034 | 0.100010 | ... | 0.233616 | 1.000000 | 0.025401 | -0.077991 | 0.140294 |
V36 | 0.231417 | 0.235299 | 0.316462 | 0.324475 | 0.113609 | ... | -0.019032 | 0.025401 | 1.000000 | -0.039478 | 0.319309 |
V37 | -0.494076 | -0.494043 | -0.734956 | -0.229613 | -0.031054 | ... | -0.006854 | -0.077991 | -0.039478 | 1.000000 | -0.565795 |
target | 0.873212 | 0.871846 | 0.638878 | 0.512074 | 0.603984 | ... | -0.006034 | 0.140294 | 0.319309 | -0.565795 | 1.000000 |
33 rows × 33 columns
相关性热力图
ax=plt.subplots(figsize=(20,16))
ax=sns.heatmap(train_corr,vmax=0.8,square=True,annot=True)
根据相关性系数筛选特征变量
k=10
cols=train_corr.nlargest(k,'target')['target'].index
hm=plt.subplots(figsize=(10,10))
hm=sns.heatmap(train_data[cols].corr(),annot=True,square=True)
plt.show()
Box-Cox变换
def scale_minmax(col):
return (col-col.min())/(col.max()-col.min())
特征工程
数据预处理和特征处理
数据预处理
数据采集
数据清洗
数据采样
特征处理
标准化
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
x=load_iris().data
y=load_iris().target
StandardScaler().fit_transform(x)
array([[-9.00681170e-01, 1.01900435e+00, -1.34022653e+00,
-1.31544430e+00],
[-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
-1.31544430e+00],
[-1.38535265e+00, 3.28414053e-01, -1.39706395e+00,
-1.31544430e+00],
[-1.50652052e+00, 9.82172869e-02, -1.28338910e+00,
-1.31544430e+00],
[-1.02184904e+00, 1.24920112e+00, -1.34022653e+00,
-1.31544430e+00],
[-5.37177559e-01, 1.93979142e+00, -1.16971425e+00,
-1.05217993e+00],
[-1.50652052e+00, 7.88807586e-01, -1.34022653e+00,
-1.18381211e+00],
[-1.02184904e+00, 7.88807586e-01, -1.28338910e+00,
-1.31544430e+00],
[-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
-1.31544430e+00],
[-1.14301691e+00, 9.82172869e-02, -1.28338910e+00,
-1.44707648e+00],
[-5.37177559e-01, 1.47939788e+00, -1.28338910e+00,
-1.31544430e+00],
[-1.26418478e+00, 7.88807586e-01, -1.22655167e+00,
-1.31544430e+00],
[-1.26418478e+00, -1.31979479e-01, -1.34022653e+00,
-1.44707648e+00],
[-1.87002413e+00, -1.31979479e-01, -1.51073881e+00,
-1.44707648e+00],
[-5.25060772e-02, 2.16998818e+00, -1.45390138e+00,
-1.31544430e+00],
[-1.73673948e-01, 3.09077525e+00, -1.28338910e+00,
-1.05217993e+00],
[-5.37177559e-01, 1.93979142e+00, -1.39706395e+00,
-1.05217993e+00],
[-9.00681170e-01, 1.01900435e+00, -1.34022653e+00,
-1.18381211e+00],
[-1.73673948e-01, 1.70959465e+00, -1.16971425e+00,
-1.18381211e+00],
[-9.00681170e-01, 1.70959465e+00, -1.28338910e+00,
-1.18381211e+00],
[-5.37177559e-01, 7.88807586e-01, -1.16971425e+00,
-1.31544430e+00],
[-9.00681170e-01, 1.47939788e+00, -1.28338910e+00,
-1.05217993e+00],
[-1.50652052e+00, 1.24920112e+00, -1.56757623e+00,
-1.31544430e+00],
[-9.00681170e-01, 5.58610819e-01, -1.16971425e+00,
-9.20547742e-01],
[-1.26418478e+00, 7.88807586e-01, -1.05603939e+00,
-1.31544430e+00],
[-1.02184904e+00, -1.31979479e-01, -1.22655167e+00,
-1.31544430e+00],
[-1.02184904e+00, 7.88807586e-01, -1.22655167e+00,
-1.05217993e+00],
[-7.79513300e-01, 1.01900435e+00, -1.28338910e+00,
-1.31544430e+00],
[-7.79513300e-01, 7.88807586e-01, -1.34022653e+00,
-1.31544430e+00],
[-1.38535265e+00, 3.28414053e-01, -1.22655167e+00,
-1.31544430e+00],
[-1.26418478e+00, 9.82172869e-02, -1.22655167e+00,
-1.31544430e+00],
[-5.37177559e-01, 7.88807586e-01, -1.28338910e+00,
-1.05217993e+00],
[-7.79513300e-01, 2.40018495e+00, -1.28338910e+00,
-1.44707648e+00],
[-4.16009689e-01, 2.63038172e+00, -1.34022653e+00,
-1.31544430e+00],
[-1.14301691e+00, 9.82172869e-02, -1.28338910e+00,
-1.31544430e+00],
[-1.02184904e+00, 3.28414053e-01, -1.45390138e+00,
-1.31544430e+00],
[-4.16009689e-01, 1.01900435e+00, -1.39706395e+00,
-1.31544430e+00],
[-1.14301691e+00, 1.24920112e+00, -1.34022653e+00,
-1.44707648e+00],
[-1.74885626e+00, -1.31979479e-01, -1.39706395e+00,
-1.31544430e+00],
[-9.00681170e-01, 7.88807586e-01, -1.28338910e+00,
-1.31544430e+00],
[-1.02184904e+00, 1.01900435e+00, -1.39706395e+00,
-1.18381211e+00],
[-1.62768839e+00, -1.74335684e+00, -1.39706395e+00,
-1.18381211e+00],
[-1.74885626e+00, 3.28414053e-01, -1.39706395e+00,
-1.31544430e+00],
[-1.02184904e+00, 1.01900435e+00, -1.22655167e+00,
-7.88915558e-01],
[-9.00681170e-01, 1.70959465e+00, -1.05603939e+00,
-1.05217993e+00],
[-1.26418478e+00, -1.31979479e-01, -1.34022653e+00,
-1.18381211e+00],
[-9.00681170e-01, 1.70959465e+00, -1.22655167e+00,
-1.31544430e+00],
[-1.50652052e+00, 3.28414053e-01, -1.34022653e+00,
-1.31544430e+00],
[-6.58345429e-01, 1.47939788e+00, -1.28338910e+00,
-1.31544430e+00],
[-1.02184904e+00, 5.58610819e-01, -1.34022653e+00,
-1.31544430e+00],
[ 1.40150837e+00, 3.28414053e-01, 5.35408562e-01,
2.64141916e-01],
[ 6.74501145e-01, 3.28414053e-01, 4.21733708e-01,
3.95774101e-01],
[ 1.28034050e+00, 9.82172869e-02, 6.49083415e-01,
3.95774101e-01],
[-4.16009689e-01, -1.74335684e+00, 1.37546573e-01,
1.32509732e-01],
[ 7.95669016e-01, -5.92373012e-01, 4.78571135e-01,
3.95774101e-01],
[-1.73673948e-01, -5.92373012e-01, 4.21733708e-01,
1.32509732e-01],
[ 5.53333275e-01, 5.58610819e-01, 5.35408562e-01,
5.27406285e-01],
[-1.14301691e+00, -1.51316008e+00, -2.60315415e-01,
-2.62386821e-01],
[ 9.16836886e-01, -3.62176246e-01, 4.78571135e-01,
1.32509732e-01],
[-7.79513300e-01, -8.22569778e-01, 8.07091462e-02,
2.64141916e-01],
[-1.02184904e+00, -2.43394714e+00, -1.46640561e-01,
-2.62386821e-01],
[ 6.86617933e-02, -1.31979479e-01, 2.51221427e-01,
3.95774101e-01],
[ 1.89829664e-01, -1.97355361e+00, 1.37546573e-01,
-2.62386821e-01],
[ 3.10997534e-01, -3.62176246e-01, 5.35408562e-01,
2.64141916e-01],
[-2.94841818e-01, -3.62176246e-01, -8.98031345e-02,
1.32509732e-01],
[ 1.03800476e+00, 9.82172869e-02, 3.64896281e-01,
2.64141916e-01],
[-2.94841818e-01, -1.31979479e-01, 4.21733708e-01,
3.95774101e-01],
[-5.25060772e-02, -8.22569778e-01, 1.94384000e-01,
-2.62386821e-01],
[ 4.32165405e-01, -1.97355361e+00, 4.21733708e-01,
3.95774101e-01],
[-2.94841818e-01, -1.28296331e+00, 8.07091462e-02,
-1.30754636e-01],
[ 6.86617933e-02, 3.28414053e-01, 5.92245988e-01,
7.90670654e-01],
[ 3.10997534e-01, -5.92373012e-01, 1.37546573e-01,
1.32509732e-01],
[ 5.53333275e-01, -1.28296331e+00, 6.49083415e-01,
3.95774101e-01],
[ 3.10997534e-01, -5.92373012e-01, 5.35408562e-01,
8.77547895e-04],
[ 6.74501145e-01, -3.62176246e-01, 3.08058854e-01,
1.32509732e-01],
[ 9.16836886e-01, -1.31979479e-01, 3.64896281e-01,
2.64141916e-01],
[ 1.15917263e+00, -5.92373012e-01, 5.92245988e-01,
2.64141916e-01],
[ 1.03800476e+00, -1.31979479e-01, 7.05920842e-01,
6.59038469e-01],
[ 1.89829664e-01, -3.62176246e-01, 4.21733708e-01,
3.95774101e-01],
[-1.73673948e-01, -1.05276654e+00, -1.46640561e-01,
-2.62386821e-01],
[-4.16009689e-01, -1.51316008e+00, 2.38717193e-02,
-1.30754636e-01],
[-4.16009689e-01, -1.51316008e+00, -3.29657076e-02,
-2.62386821e-01],
[-5.25060772e-02, -8.22569778e-01, 8.07091462e-02,
8.77547895e-04],
[ 1.89829664e-01, -8.22569778e-01, 7.62758269e-01,
5.27406285e-01],
[-5.37177559e-01, -1.31979479e-01, 4.21733708e-01,
3.95774101e-01],
[ 1.89829664e-01, 7.88807586e-01, 4.21733708e-01,
5.27406285e-01],
[ 1.03800476e+00, 9.82172869e-02, 5.35408562e-01,
3.95774101e-01],
[ 5.53333275e-01, -1.74335684e+00, 3.64896281e-01,
1.32509732e-01],
[-2.94841818e-01, -1.31979479e-01, 1.94384000e-01,
1.32509732e-01],
[-4.16009689e-01, -1.28296331e+00, 1.37546573e-01,
1.32509732e-01],
[-4.16009689e-01, -1.05276654e+00, 3.64896281e-01,
8.77547895e-04],
[ 3.10997534e-01, -1.31979479e-01, 4.78571135e-01,
2.64141916e-01],
[-5.25060772e-02, -1.05276654e+00, 1.37546573e-01,
8.77547895e-04],
[-1.02184904e+00, -1.74335684e+00, -2.60315415e-01,
-2.62386821e-01],
[-2.94841818e-01, -8.22569778e-01, 2.51221427e-01,
1.32509732e-01],
[-1.73673948e-01, -1.31979479e-01, 2.51221427e-01,
8.77547895e-04],
[-1.73673948e-01, -3.62176246e-01, 2.51221427e-01,
1.32509732e-01],
[ 4.32165405e-01, -3.62176246e-01, 3.08058854e-01,
1.32509732e-01],
[-9.00681170e-01, -1.28296331e+00, -4.30827696e-01,
-1.30754636e-01],
[-1.73673948e-01, -5.92373012e-01, 1.94384000e-01,
1.32509732e-01],
[ 5.53333275e-01, 5.58610819e-01, 1.27429511e+00,
1.71209594e+00],
[-5.25060772e-02, -8.22569778e-01, 7.62758269e-01,
9.22302838e-01],
[ 1.52267624e+00, -1.31979479e-01, 1.21745768e+00,
1.18556721e+00],
[ 5.53333275e-01, -3.62176246e-01, 1.04694540e+00,
7.90670654e-01],
[ 7.95669016e-01, -1.31979479e-01, 1.16062026e+00,
1.31719939e+00],
[ 2.12851559e+00, -1.31979479e-01, 1.61531967e+00,
1.18556721e+00],
[-1.14301691e+00, -1.28296331e+00, 4.21733708e-01,
6.59038469e-01],
[ 1.76501198e+00, -3.62176246e-01, 1.44480739e+00,
7.90670654e-01],
[ 1.03800476e+00, -1.28296331e+00, 1.16062026e+00,
7.90670654e-01],
[ 1.64384411e+00, 1.24920112e+00, 1.33113254e+00,
1.71209594e+00],
[ 7.95669016e-01, 3.28414053e-01, 7.62758269e-01,
1.05393502e+00],
[ 6.74501145e-01, -8.22569778e-01, 8.76433123e-01,
9.22302838e-01],
[ 1.15917263e+00, -1.31979479e-01, 9.90107977e-01,
1.18556721e+00],
[-1.73673948e-01, -1.28296331e+00, 7.05920842e-01,
1.05393502e+00],
[-5.25060772e-02, -5.92373012e-01, 7.62758269e-01,
1.58046376e+00],
[ 6.74501145e-01, 3.28414053e-01, 8.76433123e-01,
1.44883158e+00],
[ 7.95669016e-01, -1.31979479e-01, 9.90107977e-01,
7.90670654e-01],
[ 2.24968346e+00, 1.70959465e+00, 1.67215710e+00,
1.31719939e+00],
[ 2.24968346e+00, -1.05276654e+00, 1.78583195e+00,
1.44883158e+00],
[ 1.89829664e-01, -1.97355361e+00, 7.05920842e-01,
3.95774101e-01],
[ 1.28034050e+00, 3.28414053e-01, 1.10378283e+00,
1.44883158e+00],
[-2.94841818e-01, -5.92373012e-01, 6.49083415e-01,
1.05393502e+00],
[ 2.24968346e+00, -5.92373012e-01, 1.67215710e+00,
1.05393502e+00],
[ 5.53333275e-01, -8.22569778e-01, 6.49083415e-01,
7.90670654e-01],
[ 1.03800476e+00, 5.58610819e-01, 1.10378283e+00,
1.18556721e+00],
[ 1.64384411e+00, 3.28414053e-01, 1.27429511e+00,
7.90670654e-01],
[ 4.32165405e-01, -5.92373012e-01, 5.92245988e-01,
7.90670654e-01],
[ 3.10997534e-01, -1.31979479e-01, 6.49083415e-01,
7.90670654e-01],
[ 6.74501145e-01, -5.92373012e-01, 1.04694540e+00,
1.18556721e+00],
[ 1.64384411e+00, -1.31979479e-01, 1.16062026e+00,
5.27406285e-01],
[ 1.88617985e+00, -5.92373012e-01, 1.33113254e+00,
9.22302838e-01],
[ 2.49201920e+00, 1.70959465e+00, 1.50164482e+00,
1.05393502e+00],
[ 6.74501145e-01, -5.92373012e-01, 1.04694540e+00,
1.31719939e+00],
[ 5.53333275e-01, -5.92373012e-01, 7.62758269e-01,
3.95774101e-01],
[ 3.10997534e-01, -1.05276654e+00, 1.04694540e+00,
2.64141916e-01],
[ 2.24968346e+00, -1.31979479e-01, 1.33113254e+00,
1.44883158e+00],
[ 5.53333275e-01, 7.88807586e-01, 1.04694540e+00,
1.58046376e+00],
[ 6.74501145e-01, 9.82172869e-02, 9.90107977e-01,
7.90670654e-01],
[ 1.89829664e-01, -1.31979479e-01, 5.92245988e-01,
7.90670654e-01],
[ 1.28034050e+00, 9.82172869e-02, 9.33270550e-01,
1.18556721e+00],
[ 1.03800476e+00, 9.82172869e-02, 1.04694540e+00,
1.58046376e+00],
[ 1.28034050e+00, 9.82172869e-02, 7.62758269e-01,
1.44883158e+00],
[-5.25060772e-02, -8.22569778e-01, 7.62758269e-01,
9.22302838e-01],
[ 1.15917263e+00, 3.28414053e-01, 1.21745768e+00,
1.44883158e+00],
[ 1.03800476e+00, 5.58610819e-01, 1.10378283e+00,
1.71209594e+00],
[ 1.03800476e+00, -1.31979479e-01, 8.19595696e-01,
1.44883158e+00],
[ 5.53333275e-01, -1.28296331e+00, 7.05920842e-01,
9.22302838e-01],
[ 7.95669016e-01, -1.31979479e-01, 8.19595696e-01,
1.05393502e+00],
[ 4.32165405e-01, 7.88807586e-01, 9.33270550e-01,
1.44883158e+00],
[ 6.86617933e-02, -1.31979479e-01, 7.62758269e-01,
7.90670654e-01]])
(x-x.mean(axis=0))/x.std(axis=0)
array([[-9.00681170e-01, 1.01900435e+00, -1.34022653e+00,
-1.31544430e+00],
[-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
-1.31544430e+00],
[-1.38535265e+00, 3.28414053e-01, -1.39706395e+00,
-1.31544430e+00],
[-1.50652052e+00, 9.82172869e-02, -1.28338910e+00,
-1.31544430e+00],
[-1.02184904e+00, 1.24920112e+00, -1.34022653e+00,
-1.31544430e+00],
[-5.37177559e-01, 1.93979142e+00, -1.16971425e+00,
-1.05217993e+00],
[-1.50652052e+00, 7.88807586e-01, -1.34022653e+00,
-1.18381211e+00],
[-1.02184904e+00, 7.88807586e-01, -1.28338910e+00,
-1.31544430e+00],
[-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
-1.31544430e+00],
[-1.14301691e+00, 9.82172869e-02, -1.28338910e+00,
-1.44707648e+00],
[-5.37177559e-01, 1.47939788e+00, -1.28338910e+00,
-1.31544430e+00],
[-1.26418478e+00, 7.88807586e-01, -1.22655167e+00,
-1.31544430e+00],
[-1.26418478e+00, -1.31979479e-01, -1.34022653e+00,
-1.44707648e+00],
[-1.87002413e+00, -1.31979479e-01, -1.51073881e+00,
-1.44707648e+00],
[-5.25060772e-02, 2.16998818e+00, -1.45390138e+00,
-1.31544430e+00],
[-1.73673948e-01, 3.09077525e+00, -1.28338910e+00,
-1.05217993e+00],
[-5.37177559e-01, 1.93979142e+00, -1.39706395e+00,
-1.05217993e+00],
[-9.00681170e-01, 1.01900435e+00, -1.34022653e+00,
-1.18381211e+00],
[-1.73673948e-01, 1.70959465e+00, -1.16971425e+00,
-1.18381211e+00],
[-9.00681170e-01, 1.70959465e+00, -1.28338910e+00,
-1.18381211e+00],
[-5.37177559e-01, 7.88807586e-01, -1.16971425e+00,
-1.31544430e+00],
[-9.00681170e-01, 1.47939788e+00, -1.28338910e+00,
-1.05217993e+00],
[-1.50652052e+00, 1.24920112e+00, -1.56757623e+00,
-1.31544430e+00],
[-9.00681170e-01, 5.58610819e-01, -1.16971425e+00,
-9.20547742e-01],
[-1.26418478e+00, 7.88807586e-01, -1.05603939e+00,
-1.31544430e+00],
[-1.02184904e+00, -1.31979479e-01, -1.22655167e+00,
-1.31544430e+00],
[-1.02184904e+00, 7.88807586e-01, -1.22655167e+00,
-1.05217993e+00],
[-7.79513300e-01, 1.01900435e+00, -1.28338910e+00,
-1.31544430e+00],
[-7.79513300e-01, 7.88807586e-01, -1.34022653e+00,
-1.31544430e+00],
[-1.38535265e+00, 3.28414053e-01, -1.22655167e+00,
-1.31544430e+00],
[-1.26418478e+00, 9.82172869e-02, -1.22655167e+00,
-1.31544430e+00],
[-5.37177559e-01, 7.88807586e-01, -1.28338910e+00,
-1.05217993e+00],
[-7.79513300e-01, 2.40018495e+00, -1.28338910e+00,
-1.44707648e+00],
[-4.16009689e-01, 2.63038172e+00, -1.34022653e+00,
-1.31544430e+00],
[-1.14301691e+00, 9.82172869e-02, -1.28338910e+00,
-1.31544430e+00],
[-1.02184904e+00, 3.28414053e-01, -1.45390138e+00,
-1.31544430e+00],
[-4.16009689e-01, 1.01900435e+00, -1.39706395e+00,
-1.31544430e+00],
[-1.14301691e+00, 1.24920112e+00, -1.34022653e+00,
-1.44707648e+00],
[-1.74885626e+00, -1.31979479e-01, -1.39706395e+00,
-1.31544430e+00],
[-9.00681170e-01, 7.88807586e-01, -1.28338910e+00,
-1.31544430e+00],
[-1.02184904e+00, 1.01900435e+00, -1.39706395e+00,
-1.18381211e+00],
[-1.62768839e+00, -1.74335684e+00, -1.39706395e+00,
-1.18381211e+00],
[-1.74885626e+00, 3.28414053e-01, -1.39706395e+00,
-1.31544430e+00],
[-1.02184904e+00, 1.01900435e+00, -1.22655167e+00,
-7.88915558e-01],
[-9.00681170e-01, 1.70959465e+00, -1.05603939e+00,
-1.05217993e+00],
[-1.26418478e+00, -1.31979479e-01, -1.34022653e+00,
-1.18381211e+00],
[-9.00681170e-01, 1.70959465e+00, -1.22655167e+00,
-1.31544430e+00],
[-1.50652052e+00, 3.28414053e-01, -1.34022653e+00,
-1.31544430e+00],
[-6.58345429e-01, 1.47939788e+00, -1.28338910e+00,
-1.31544430e+00],
[-1.02184904e+00, 5.58610819e-01, -1.34022653e+00,
-1.31544430e+00],
[ 1.40150837e+00, 3.28414053e-01, 5.35408562e-01,
2.64141916e-01],
[ 6.74501145e-01, 3.28414053e-01, 4.21733708e-01,
3.95774101e-01],
[ 1.28034050e+00, 9.82172869e-02, 6.49083415e-01,
3.95774101e-01],
[-4.16009689e-01, -1.74335684e+00, 1.37546573e-01,
1.32509732e-01],
[ 7.95669016e-01, -5.92373012e-01, 4.78571135e-01,
3.95774101e-01],
[-1.73673948e-01, -5.92373012e-01, 4.21733708e-01,
1.32509732e-01],
[ 5.53333275e-01, 5.58610819e-01, 5.35408562e-01,
5.27406285e-01],
[-1.14301691e+00, -1.51316008e+00, -2.60315415e-01,
-2.62386821e-01],
[ 9.16836886e-01, -3.62176246e-01, 4.78571135e-01,
1.32509732e-01],
[-7.79513300e-01, -8.22569778e-01, 8.07091462e-02,
2.64141916e-01],
[-1.02184904e+00, -2.43394714e+00, -1.46640561e-01,
-2.62386821e-01],
[ 6.86617933e-02, -1.31979479e-01, 2.51221427e-01,
3.95774101e-01],
[ 1.89829664e-01, -1.97355361e+00, 1.37546573e-01,
-2.62386821e-01],
[ 3.10997534e-01, -3.62176246e-01, 5.35408562e-01,
2.64141916e-01],
[-2.94841818e-01, -3.62176246e-01, -8.98031345e-02,
1.32509732e-01],
[ 1.03800476e+00, 9.82172869e-02, 3.64896281e-01,
2.64141916e-01],
[-2.94841818e-01, -1.31979479e-01, 4.21733708e-01,
3.95774101e-01],
[-5.25060772e-02, -8.22569778e-01, 1.94384000e-01,
-2.62386821e-01],
[ 4.32165405e-01, -1.97355361e+00, 4.21733708e-01,
3.95774101e-01],
[-2.94841818e-01, -1.28296331e+00, 8.07091462e-02,
-1.30754636e-01],
[ 6.86617933e-02, 3.28414053e-01, 5.92245988e-01,
7.90670654e-01],
[ 3.10997534e-01, -5.92373012e-01, 1.37546573e-01,
1.32509732e-01],
[ 5.53333275e-01, -1.28296331e+00, 6.49083415e-01,
3.95774101e-01],
[ 3.10997534e-01, -5.92373012e-01, 5.35408562e-01,
8.77547895e-04],
[ 6.74501145e-01, -3.62176246e-01, 3.08058854e-01,
1.32509732e-01],
[ 9.16836886e-01, -1.31979479e-01, 3.64896281e-01,
2.64141916e-01],
[ 1.15917263e+00, -5.92373012e-01, 5.92245988e-01,
2.64141916e-01],
[ 1.03800476e+00, -1.31979479e-01, 7.05920842e-01,
6.59038469e-01],
[ 1.89829664e-01, -3.62176246e-01, 4.21733708e-01,
3.95774101e-01],
[-1.73673948e-01, -1.05276654e+00, -1.46640561e-01,
-2.62386821e-01],
[-4.16009689e-01, -1.51316008e+00, 2.38717193e-02,
-1.30754636e-01],
[-4.16009689e-01, -1.51316008e+00, -3.29657076e-02,
-2.62386821e-01],
[-5.25060772e-02, -8.22569778e-01, 8.07091462e-02,
8.77547895e-04],
[ 1.89829664e-01, -8.22569778e-01, 7.62758269e-01,
5.27406285e-01],
[-5.37177559e-01, -1.31979479e-01, 4.21733708e-01,
3.95774101e-01],
[ 1.89829664e-01, 7.88807586e-01, 4.21733708e-01,
5.27406285e-01],
[ 1.03800476e+00, 9.82172869e-02, 5.35408562e-01,
3.95774101e-01],
[ 5.53333275e-01, -1.74335684e+00, 3.64896281e-01,
1.32509732e-01],
[-2.94841818e-01, -1.31979479e-01, 1.94384000e-01,
1.32509732e-01],
[-4.16009689e-01, -1.28296331e+00, 1.37546573e-01,
1.32509732e-01],
[-4.16009689e-01, -1.05276654e+00, 3.64896281e-01,
8.77547895e-04],
[ 3.10997534e-01, -1.31979479e-01, 4.78571135e-01,
2.64141916e-01],
[-5.25060772e-02, -1.05276654e+00, 1.37546573e-01,
8.77547895e-04],
[-1.02184904e+00, -1.74335684e+00, -2.60315415e-01,
-2.62386821e-01],
[-2.94841818e-01, -8.22569778e-01, 2.51221427e-01,
1.32509732e-01],
[-1.73673948e-01, -1.31979479e-01, 2.51221427e-01,
8.77547895e-04],
[-1.73673948e-01, -3.62176246e-01, 2.51221427e-01,
1.32509732e-01],
[ 4.32165405e-01, -3.62176246e-01, 3.08058854e-01,
1.32509732e-01],
[-9.00681170e-01, -1.28296331e+00, -4.30827696e-01,
-1.30754636e-01],
[-1.73673948e-01, -5.92373012e-01, 1.94384000e-01,
1.32509732e-01],
[ 5.53333275e-01, 5.58610819e-01, 1.27429511e+00,
1.71209594e+00],
[-5.25060772e-02, -8.22569778e-01, 7.62758269e-01,
9.22302838e-01],
[ 1.52267624e+00, -1.31979479e-01, 1.21745768e+00,
1.18556721e+00],
[ 5.53333275e-01, -3.62176246e-01, 1.04694540e+00,
7.90670654e-01],
[ 7.95669016e-01, -1.31979479e-01, 1.16062026e+00,
1.31719939e+00],
[ 2.12851559e+00, -1.31979479e-01, 1.61531967e+00,
1.18556721e+00],
[-1.14301691e+00, -1.28296331e+00, 4.21733708e-01,
6.59038469e-01],
[ 1.76501198e+00, -3.62176246e-01, 1.44480739e+00,
7.90670654e-01],
[ 1.03800476e+00, -1.28296331e+00, 1.16062026e+00,
7.90670654e-01],
[ 1.64384411e+00, 1.24920112e+00, 1.33113254e+00,
1.71209594e+00],
[ 7.95669016e-01, 3.28414053e-01, 7.62758269e-01,
1.05393502e+00],
[ 6.74501145e-01, -8.22569778e-01, 8.76433123e-01,
9.22302838e-01],
[ 1.15917263e+00, -1.31979479e-01, 9.90107977e-01,
1.18556721e+00],
[-1.73673948e-01, -1.28296331e+00, 7.05920842e-01,
1.05393502e+00],
[-5.25060772e-02, -5.92373012e-01, 7.62758269e-01,
1.58046376e+00],
[ 6.74501145e-01, 3.28414053e-01, 8.76433123e-01,
1.44883158e+00],
[ 7.95669016e-01, -1.31979479e-01, 9.90107977e-01,
7.90670654e-01],
[ 2.24968346e+00, 1.70959465e+00, 1.67215710e+00,
1.31719939e+00],
[ 2.24968346e+00, -1.05276654e+00, 1.78583195e+00,
1.44883158e+00],
[ 1.89829664e-01, -1.97355361e+00, 7.05920842e-01,
3.95774101e-01],
[ 1.28034050e+00, 3.28414053e-01, 1.10378283e+00,
1.44883158e+00],
[-2.94841818e-01, -5.92373012e-01, 6.49083415e-01,
1.05393502e+00],
[ 2.24968346e+00, -5.92373012e-01, 1.67215710e+00,
1.05393502e+00],
[ 5.53333275e-01, -8.22569778e-01, 6.49083415e-01,
7.90670654e-01],
[ 1.03800476e+00, 5.58610819e-01, 1.10378283e+00,
1.18556721e+00],
[ 1.64384411e+00, 3.28414053e-01, 1.27429511e+00,
7.90670654e-01],
[ 4.32165405e-01, -5.92373012e-01, 5.92245988e-01,
7.90670654e-01],
[ 3.10997534e-01, -1.31979479e-01, 6.49083415e-01,
7.90670654e-01],
[ 6.74501145e-01, -5.92373012e-01, 1.04694540e+00,
1.18556721e+00],
[ 1.64384411e+00, -1.31979479e-01, 1.16062026e+00,
5.27406285e-01],
[ 1.88617985e+00, -5.92373012e-01, 1.33113254e+00,
9.22302838e-01],
[ 2.49201920e+00, 1.70959465e+00, 1.50164482e+00,
1.05393502e+00],
[ 6.74501145e-01, -5.92373012e-01, 1.04694540e+00,
1.31719939e+00],
[ 5.53333275e-01, -5.92373012e-01, 7.62758269e-01,
3.95774101e-01],
[ 3.10997534e-01, -1.05276654e+00, 1.04694540e+00,
2.64141916e-01],
[ 2.24968346e+00, -1.31979479e-01, 1.33113254e+00,
1.44883158e+00],
[ 5.53333275e-01, 7.88807586e-01, 1.04694540e+00,
1.58046376e+00],
[ 6.74501145e-01, 9.82172869e-02, 9.90107977e-01,
7.90670654e-01],
[ 1.89829664e-01, -1.31979479e-01, 5.92245988e-01,
7.90670654e-01],
[ 1.28034050e+00, 9.82172869e-02, 9.33270550e-01,
1.18556721e+00],
[ 1.03800476e+00, 9.82172869e-02, 1.04694540e+00,
1.58046376e+00],
[ 1.28034050e+00, 9.82172869e-02, 7.62758269e-01,
1.44883158e+00],
[-5.25060772e-02, -8.22569778e-01, 7.62758269e-01,
9.22302838e-01],
[ 1.15917263e+00, 3.28414053e-01, 1.21745768e+00,
1.44883158e+00],
[ 1.03800476e+00, 5.58610819e-01, 1.10378283e+00,
1.71209594e+00],
[ 1.03800476e+00, -1.31979479e-01, 8.19595696e-01,
1.44883158e+00],
[ 5.53333275e-01, -1.28296331e+00, 7.05920842e-01,
9.22302838e-01],
[ 7.95669016e-01, -1.31979479e-01, 8.19595696e-01,
1.05393502e+00],
[ 4.32165405e-01, 7.88807586e-01, 9.33270550e-01,
1.44883158e+00],
[ 6.86617933e-02, -1.31979479e-01, 7.62758269e-01,
7.90670654e-01]])
区间缩放法
from sklearn.preprocessing import MinMaxScaler
MinMaxScaler(feature_range=(1,5)).fit_transform(x)
array([[1.88888889, 3.5 , 1.27118644, 1.16666667],
[1.66666667, 2.66666667, 1.27118644, 1.16666667],
[1.44444444, 3. , 1.20338983, 1.16666667],
[1.33333333, 2.83333333, 1.33898305, 1.16666667],
[1.77777778, 3.66666667, 1.27118644, 1.16666667],
[2.22222222, 4.16666667, 1.47457627, 1.5 ],
[1.33333333, 3.33333333, 1.27118644, 1.33333333],
[1.77777778, 3.33333333, 1.33898305, 1.16666667],
[1.11111111, 2.5 , 1.27118644, 1.16666667],
[1.66666667, 2.83333333, 1.33898305, 1. ],
[2.22222222, 3.83333333, 1.33898305, 1.16666667],
[1.55555556, 3.33333333, 1.40677966, 1.16666667],
[1.55555556, 2.66666667, 1.27118644, 1. ],
[1. , 2.66666667, 1.06779661, 1. ],
[2.66666667, 4.33333333, 1.13559322, 1.16666667],
[2.55555556, 5. , 1.33898305, 1.5 ],
[2.22222222, 4.16666667, 1.20338983, 1.5 ],
[1.88888889, 3.5 , 1.27118644, 1.33333333],
[2.55555556, 4. , 1.47457627, 1.33333333],
[1.88888889, 4. , 1.33898305, 1.33333333],
[2.22222222, 3.33333333, 1.47457627, 1.16666667],
[1.88888889, 3.83333333, 1.33898305, 1.5 ],
[1.33333333, 3.66666667, 1. , 1.16666667],
[1.88888889, 3.16666667, 1.47457627, 1.66666667],
[1.55555556, 3.33333333, 1.61016949, 1.16666667],
[1.77777778, 2.66666667, 1.40677966, 1.16666667],
[1.77777778, 3.33333333, 1.40677966, 1.5 ],
[2. , 3.5 , 1.33898305, 1.16666667],
[2. , 3.33333333, 1.27118644, 1.16666667],
[1.44444444, 3. , 1.40677966, 1.16666667],
[1.55555556, 2.83333333, 1.40677966, 1.16666667],
[2.22222222, 3.33333333, 1.33898305, 1.5 ],
[2. , 4.5 , 1.33898305, 1. ],
[2.33333333, 4.66666667, 1.27118644, 1.16666667],
[1.66666667, 2.83333333, 1.33898305, 1.16666667],
[1.77777778, 3. , 1.13559322, 1.16666667],
[2.33333333, 3.5 , 1.20338983, 1.16666667],
[1.66666667, 3.66666667, 1.27118644, 1. ],
[1.11111111, 2.66666667, 1.20338983, 1.16666667],
[1.88888889, 3.33333333, 1.33898305, 1.16666667],
[1.77777778, 3.5 , 1.20338983, 1.33333333],
[1.22222222, 1.5 , 1.20338983, 1.33333333],
[1.11111111, 3. , 1.20338983, 1.16666667],
[1.77777778, 3.5 , 1.40677966, 1.83333333],
[1.88888889, 4. , 1.61016949, 1.5 ],
[1.55555556, 2.66666667, 1.27118644, 1.33333333],
[1.88888889, 4. , 1.40677966, 1.16666667],
[1.33333333, 3. , 1.27118644, 1.16666667],
[2.11111111, 3.83333333, 1.33898305, 1.16666667],
[1.77777778, 3.16666667, 1.27118644, 1.16666667],
[4. , 3. , 3.50847458, 3.16666667],
[3.33333333, 3. , 3.37288136, 3.33333333],
[3.88888889, 2.83333333, 3.6440678 , 3.33333333],
[2.33333333, 1.5 , 3.03389831, 3. ],
[3.44444444, 2.33333333, 3.44067797, 3.33333333],
[2.55555556, 2.33333333, 3.37288136, 3. ],
[3.22222222, 3.16666667, 3.50847458, 3.5 ],
[1.66666667, 1.66666667, 2.55932203, 2.5 ],
[3.55555556, 2.5 , 3.44067797, 3. ],
[2. , 2.16666667, 2.96610169, 3.16666667],
[1.77777778, 1. , 2.69491525, 2.5 ],
[2.77777778, 2.66666667, 3.16949153, 3.33333333],
[2.88888889, 1.33333333, 3.03389831, 2.5 ],
[3. , 2.5 , 3.50847458, 3.16666667],
[2.44444444, 2.5 , 2.76271186, 3. ],
[3.66666667, 2.83333333, 3.30508475, 3.16666667],
[2.44444444, 2.66666667, 3.37288136, 3.33333333],
[2.66666667, 2.16666667, 3.10169492, 2.5 ],
[3.11111111, 1.33333333, 3.37288136, 3.33333333],
[2.44444444, 1.83333333, 2.96610169, 2.66666667],
[2.77777778, 3. , 3.57627119, 3.83333333],
[3. , 2.33333333, 3.03389831, 3. ],
[3.22222222, 1.83333333, 3.6440678 , 3.33333333],
[3. , 2.33333333, 3.50847458, 2.83333333],
[3.33333333, 2.5 , 3.23728814, 3. ],
[3.55555556, 2.66666667, 3.30508475, 3.16666667],
[3.77777778, 2.33333333, 3.57627119, 3.16666667],
[3.66666667, 2.66666667, 3.71186441, 3.66666667],
[2.88888889, 2.5 , 3.37288136, 3.33333333],
[2.55555556, 2. , 2.69491525, 2.5 ],
[2.33333333, 1.66666667, 2.89830508, 2.66666667],
[2.33333333, 1.66666667, 2.83050847, 2.5 ],
[2.66666667, 2.16666667, 2.96610169, 2.83333333],
[2.88888889, 2.16666667, 3.77966102, 3.5 ],
[2.22222222, 2.66666667, 3.37288136, 3.33333333],
[2.88888889, 3.33333333, 3.37288136, 3.5 ],
[3.66666667, 2.83333333, 3.50847458, 3.33333333],
[3.22222222, 1.5 , 3.30508475, 3. ],
[2.44444444, 2.66666667, 3.10169492, 3. ],
[2.33333333, 1.83333333, 3.03389831, 3. ],
[2.33333333, 2. , 3.30508475, 2.83333333],
[3. , 2.66666667, 3.44067797, 3.16666667],
[2.66666667, 2. , 3.03389831, 2.83333333],
[1.77777778, 1.5 , 2.55932203, 2.5 ],
[2.44444444, 2.16666667, 3.16949153, 3. ],
[2.55555556, 2.66666667, 3.16949153, 2.83333333],
[2.55555556, 2.5 , 3.16949153, 3. ],
[3.11111111, 2.5 , 3.23728814, 3. ],
[1.88888889, 1.83333333, 2.3559322 , 2.66666667],
[2.55555556, 2.33333333, 3.10169492, 3. ],
[3.22222222, 3.16666667, 4.38983051, 5. ],
[2.66666667, 2.16666667, 3.77966102, 4. ],
[4.11111111, 2.66666667, 4.3220339 , 4.33333333],
[3.22222222, 2.5 , 4.11864407, 3.83333333],
[3.44444444, 2.66666667, 4.25423729, 4.5 ],
[4.66666667, 2.66666667, 4.79661017, 4.33333333],
[1.66666667, 1.83333333, 3.37288136, 3.66666667],
[4.33333333, 2.5 , 4.59322034, 3.83333333],
[3.66666667, 1.83333333, 4.25423729, 3.83333333],
[4.22222222, 3.66666667, 4.45762712, 5. ],
[3.44444444, 3. , 3.77966102, 4.16666667],
[3.33333333, 2.16666667, 3.91525424, 4. ],
[3.77777778, 2.66666667, 4.05084746, 4.33333333],
[2.55555556, 1.83333333, 3.71186441, 4.16666667],
[2.66666667, 2.33333333, 3.77966102, 4.83333333],
[3.33333333, 3. , 3.91525424, 4.66666667],
[3.44444444, 2.66666667, 4.05084746, 3.83333333],
[4.77777778, 4. , 4.86440678, 4.5 ],
[4.77777778, 2. , 5. , 4.66666667],
[2.88888889, 1.33333333, 3.71186441, 3.33333333],
[3.88888889, 3. , 4.18644068, 4.66666667],
[2.44444444, 2.33333333, 3.6440678 , 4.16666667],
[4.77777778, 2.33333333, 4.86440678, 4.16666667],
[3.22222222, 2.16666667, 3.6440678 , 3.83333333],
[3.66666667, 3.16666667, 4.18644068, 4.33333333],
[4.22222222, 3. , 4.38983051, 3.83333333],
[3.11111111, 2.33333333, 3.57627119, 3.83333333],
[3. , 2.66666667, 3.6440678 , 3.83333333],
[3.33333333, 2.33333333, 4.11864407, 4.33333333],
[4.22222222, 2.66666667, 4.25423729, 3.5 ],
[4.44444444, 2.33333333, 4.45762712, 4. ],
[5. , 4. , 4.66101695, 4.16666667],
[3.33333333, 2.33333333, 4.11864407, 4.5 ],
[3.22222222, 2.33333333, 3.77966102, 3.33333333],
[3. , 2. , 4.11864407, 3.16666667],
[4.77777778, 2.66666667, 4.45762712, 4.66666667],
[3.22222222, 3.33333333, 4.11864407, 4.83333333],
[3.33333333, 2.83333333, 4.05084746, 3.83333333],
[2.88888889, 2.66666667, 3.57627119, 3.83333333],
[3.88888889, 2.83333333, 3.98305085, 4.33333333],
[3.66666667, 2.83333333, 4.11864407, 4.83333333],
[3.88888889, 2.83333333, 3.77966102, 4.66666667],
[2.66666667, 2.16666667, 3.77966102, 4. ],
[3.77777778, 3. , 4.3220339 , 4.66666667],
[3.66666667, 3.16666667, 4.18644068, 5. ],
[3.66666667, 2.66666667, 3.84745763, 4.66666667],
[3.22222222, 1.83333333, 3.71186441, 4. ],
[3.44444444, 2.66666667, 3.84745763, 4.16666667],
[3.11111111, 3.33333333, 3.98305085, 4.66666667],
[2.77777778, 2.66666667, 3.77966102, 3.83333333]])
归一化
from sklearn.preprocessing import Normalizer
Normalizer().fit_transform(x)
array([[0.80377277, 0.55160877, 0.22064351, 0.0315205 ],
[0.82813287, 0.50702013, 0.23660939, 0.03380134],
[0.80533308, 0.54831188, 0.2227517 , 0.03426949],
[0.80003025, 0.53915082, 0.26087943, 0.03478392],
[0.790965 , 0.5694948 , 0.2214702 , 0.0316386 ],
[0.78417499, 0.5663486 , 0.2468699 , 0.05808704],
[0.78010936, 0.57660257, 0.23742459, 0.0508767 ],
[0.80218492, 0.54548574, 0.24065548, 0.0320874 ],
[0.80642366, 0.5315065 , 0.25658935, 0.03665562],
[0.81803119, 0.51752994, 0.25041771, 0.01669451],
[0.80373519, 0.55070744, 0.22325977, 0.02976797],
[0.786991 , 0.55745196, 0.26233033, 0.03279129],
[0.82307218, 0.51442011, 0.24006272, 0.01714734],
[0.8025126 , 0.55989251, 0.20529392, 0.01866308],
[0.81120865, 0.55945424, 0.16783627, 0.02797271],
[0.77381111, 0.59732787, 0.2036345 , 0.05430253],
[0.79428944, 0.57365349, 0.19121783, 0.05883625],
[0.80327412, 0.55126656, 0.22050662, 0.04725142],
[0.8068282 , 0.53788547, 0.24063297, 0.04246464],
[0.77964883, 0.58091482, 0.22930848, 0.0458617 ],
[0.8173379 , 0.51462016, 0.25731008, 0.03027177],
[0.78591858, 0.57017622, 0.23115252, 0.06164067],
[0.77577075, 0.60712493, 0.16864581, 0.03372916],
[0.80597792, 0.52151512, 0.26865931, 0.07901744],
[0.776114 , 0.54974742, 0.30721179, 0.03233808],
[0.82647451, 0.4958847 , 0.26447184, 0.03305898],
[0.79778206, 0.5424918 , 0.25529026, 0.06382256],
[0.80641965, 0.54278246, 0.23262105, 0.03101614],
[0.81609427, 0.5336001 , 0.21971769, 0.03138824],
[0.79524064, 0.54144043, 0.27072022, 0.03384003],
[0.80846584, 0.52213419, 0.26948861, 0.03368608],
[0.82225028, 0.51771314, 0.22840286, 0.06090743],
[0.76578311, 0.60379053, 0.22089897, 0.0147266 ],
[0.77867447, 0.59462414, 0.19820805, 0.02831544],
[0.81768942, 0.51731371, 0.25031309, 0.03337508],
[0.82512295, 0.52807869, 0.19802951, 0.03300492],
[0.82699754, 0.52627116, 0.19547215, 0.03007264],
[0.78523221, 0.5769053 , 0.22435206, 0.01602515],
[0.80212413, 0.54690282, 0.23699122, 0.03646019],
[0.80779568, 0.53853046, 0.23758697, 0.03167826],
[0.80033301, 0.56023311, 0.20808658, 0.04801998],
[0.86093857, 0.44003527, 0.24871559, 0.0573959 ],
[0.78609038, 0.57170209, 0.23225397, 0.03573138],
[0.78889479, 0.55222635, 0.25244633, 0.09466737],
[0.76693897, 0.57144472, 0.28572236, 0.06015208],
[0.82210585, 0.51381615, 0.23978087, 0.05138162],
[0.77729093, 0.57915795, 0.24385598, 0.030482 ],
[0.79594782, 0.55370283, 0.24224499, 0.03460643],
[0.79837025, 0.55735281, 0.22595384, 0.03012718],
[0.81228363, 0.5361072 , 0.22743942, 0.03249135],
[0.76701103, 0.35063361, 0.51499312, 0.15340221],
[0.74549757, 0.37274878, 0.52417798, 0.17472599],
[0.75519285, 0.33928954, 0.53629637, 0.16417236],
[0.75384916, 0.31524601, 0.54825394, 0.17818253],
[0.7581754 , 0.32659863, 0.5365549 , 0.17496355],
[0.72232962, 0.35482858, 0.57026022, 0.16474184],
[0.72634846, 0.38046824, 0.54187901, 0.18446945],
[0.75916547, 0.37183615, 0.51127471, 0.15493173],
[0.76301853, 0.33526572, 0.53180079, 0.15029153],
[0.72460233, 0.37623583, 0.54345175, 0.19508524],
[0.76923077, 0.30769231, 0.53846154, 0.15384615],
[0.73923462, 0.37588201, 0.52623481, 0.187941 ],
[0.78892752, 0.28927343, 0.52595168, 0.13148792],
[0.73081412, 0.34743622, 0.56308629, 0.16772783],
[0.75911707, 0.3931142 , 0.48800383, 0.17622361],
[0.76945444, 0.35601624, 0.50531337, 0.16078153],
[0.70631892, 0.37838513, 0.5675777 , 0.18919257],
[0.75676497, 0.35228714, 0.53495455, 0.13047672],
[0.76444238, 0.27125375, 0.55483721, 0.18494574],
[0.76185188, 0.34011245, 0.53057542, 0.14964948],
[0.6985796 , 0.37889063, 0.56833595, 0.21312598],
[0.77011854, 0.35349703, 0.50499576, 0.16412362],
[0.74143307, 0.29421947, 0.57667016, 0.17653168],
[0.73659895, 0.33811099, 0.56754345, 0.14490471],
[0.76741698, 0.34773582, 0.51560829, 0.15588157],
[0.76785726, 0.34902603, 0.51190484, 0.16287881],
[0.76467269, 0.31486523, 0.53976896, 0.15743261],
[0.74088576, 0.33173989, 0.55289982, 0.18798594],
[0.73350949, 0.35452959, 0.55013212, 0.18337737],
[0.78667474, 0.35883409, 0.48304589, 0.13801311],
[0.76521855, 0.33391355, 0.52869645, 0.15304371],
[0.77242925, 0.33706004, 0.51963422, 0.14044168],
[0.76434981, 0.35581802, 0.51395936, 0.15814134],
[0.70779525, 0.31850786, 0.60162596, 0.1887454 ],
[0.69333409, 0.38518561, 0.57777841, 0.1925928 ],
[0.71524936, 0.40530797, 0.53643702, 0.19073316],
[0.75457341, 0.34913098, 0.52932761, 0.16893434],
[0.77530021, 0.28304611, 0.54147951, 0.15998258],
[0.72992443, 0.39103094, 0.53440896, 0.16944674],
[0.74714194, 0.33960997, 0.54337595, 0.17659719],
[0.72337118, 0.34195729, 0.57869695, 0.15782644],
[0.73260391, 0.36029701, 0.55245541, 0.1681386 ],
[0.76262994, 0.34186859, 0.52595168, 0.1577855 ],
[0.76986879, 0.35413965, 0.5081134 , 0.15397376],
[0.73544284, 0.35458851, 0.55158213, 0.1707278 ],
[0.73239618, 0.38547167, 0.53966034, 0.15418867],
[0.73446047, 0.37367287, 0.5411814 , 0.16750853],
[0.75728103, 0.3542121 , 0.52521104, 0.15878473],
[0.78258054, 0.38361791, 0.4603415 , 0.16879188],
[0.7431482 , 0.36505526, 0.5345452 , 0.16948994],
[0.65387747, 0.34250725, 0.62274045, 0.25947519],
[0.69052512, 0.32145135, 0.60718588, 0.22620651],
[0.71491405, 0.30207636, 0.59408351, 0.21145345],
[0.69276796, 0.31889319, 0.61579374, 0.1979337 ],
[0.68619022, 0.31670318, 0.61229281, 0.232249 ],
[0.70953708, 0.28008043, 0.61617694, 0.1960563 ],
[0.67054118, 0.34211284, 0.61580312, 0.23263673],
[0.71366557, 0.28351098, 0.61590317, 0.17597233],
[0.71414125, 0.26647062, 0.61821183, 0.19185884],
[0.69198788, 0.34599394, 0.58626751, 0.24027357],
[0.71562645, 0.3523084 , 0.56149152, 0.22019275],
[0.71576546, 0.30196356, 0.59274328, 0.21249287],
[0.71718148, 0.31640359, 0.58007326, 0.22148252],
[0.6925518 , 0.30375079, 0.60750157, 0.24300063],
[0.67767924, 0.32715549, 0.59589036, 0.28041899],
[0.69589887, 0.34794944, 0.57629125, 0.25008866],
[0.70610474, 0.3258945 , 0.59747324, 0.1955367 ],
[0.69299099, 0.34199555, 0.60299216, 0.19799743],
[0.70600618, 0.2383917 , 0.63265489, 0.21088496],
[0.72712585, 0.26661281, 0.60593821, 0.18178146],
[0.70558934, 0.32722984, 0.58287815, 0.23519645],
[0.68307923, 0.34153961, 0.59769433, 0.24395687],
[0.71486543, 0.25995106, 0.62202576, 0.18567933],
[0.73122464, 0.31338199, 0.56873028, 0.20892133],
[0.69595601, 0.3427843 , 0.59208198, 0.21813547],
[0.71529453, 0.31790868, 0.59607878, 0.17882363],
[0.72785195, 0.32870733, 0.56349829, 0.21131186],
[0.71171214, 0.35002236, 0.57170319, 0.21001342],
[0.69594002, 0.30447376, 0.60894751, 0.22835532],
[0.73089855, 0.30454106, 0.58877939, 0.1624219 ],
[0.72766159, 0.27533141, 0.59982915, 0.18683203],
[0.71578999, 0.34430405, 0.5798805 , 0.18121266],
[0.69417747, 0.30370264, 0.60740528, 0.2386235 ],
[0.72366005, 0.32162669, 0.58582004, 0.17230001],
[0.69385414, 0.29574111, 0.63698085, 0.15924521],
[0.73154399, 0.28501714, 0.57953485, 0.21851314],
[0.67017484, 0.36168166, 0.59571097, 0.2553047 ],
[0.69804799, 0.338117 , 0.59988499, 0.196326 ],
[0.71066905, 0.35533453, 0.56853524, 0.21320072],
[0.72415258, 0.32534391, 0.56672811, 0.22039426],
[0.69997037, 0.32386689, 0.58504986, 0.25073566],
[0.73337886, 0.32948905, 0.54206264, 0.24445962],
[0.69052512, 0.32145135, 0.60718588, 0.22620651],
[0.69193502, 0.32561648, 0.60035539, 0.23403685],
[0.68914871, 0.33943145, 0.58629069, 0.25714504],
[0.72155725, 0.32308533, 0.56001458, 0.24769876],
[0.72965359, 0.28954508, 0.57909015, 0.22005426],
[0.71653899, 0.3307103 , 0.57323119, 0.22047353],
[0.67467072, 0.36998072, 0.58761643, 0.25028107],
[0.69025916, 0.35097923, 0.5966647 , 0.21058754]])
定量特征二值化
from sklearn.preprocessing import Binarizer
Binarizer(threshold=3).fit_transform(x)
array([[1., 1., 0., 0.],
[1., 0., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 0., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 0., 0., 0.],
[1., 0., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 0., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 0., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 0., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 0., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 1., 0.],
[1., 1., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 0., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 1., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 0., 1., 0.],
[1., 1., 1., 0.],
[1., 0., 1., 0.]])
定性特征哑变量
from sklearn.preprocessing import OneHotEncoder
OneHotEncoder().fit_transform(x).toarray()
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 1., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
缺失值处理
from sklearn.impute import SimpleImputer
SimpleImputer(missing_values=np.nan,strategy='constant',fill_value=0).fit_transform(x)
array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2],
[5.4, 3.9, 1.7, 0.4],
[4.6, 3.4, 1.4, 0.3],
[5. , 3.4, 1.5, 0.2],
[4.4, 2.9, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.1],
[5.4, 3.7, 1.5, 0.2],
[4.8, 3.4, 1.6, 0.2],
[4.8, 3. , 1.4, 0.1],
[4.3, 3. , 1.1, 0.1],
[5.8, 4. , 1.2, 0.2],
[5.7, 4.4, 1.5, 0.4],
[5.4, 3.9, 1.3, 0.4],
[5.1, 3.5, 1.4, 0.3],
[5.7, 3.8, 1.7, 0.3],
[5.1, 3.8, 1.5, 0.3],
[5.4, 3.4, 1.7, 0.2],
[5.1, 3.7, 1.5, 0.4],
[4.6, 3.6, 1. , 0.2],
[5.1, 3.3, 1.7, 0.5],
[4.8, 3.4, 1.9, 0.2],
[5. , 3. , 1.6, 0.2],
[5. , 3.4, 1.6, 0.4],
[5.2, 3.5, 1.5, 0.2],
[5.2, 3.4, 1.4, 0.2],
[4.7, 3.2, 1.6, 0.2],
[4.8, 3.1, 1.6, 0.2],
[5.4, 3.4, 1.5, 0.4],
[5.2, 4.1, 1.5, 0.1],
[5.5, 4.2, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.2],
[5. , 3.2, 1.2, 0.2],
[5.5, 3.5, 1.3, 0.2],
[4.9, 3.6, 1.4, 0.1],
[4.4, 3. , 1.3, 0.2],
[5.1, 3.4, 1.5, 0.2],
[5. , 3.5, 1.3, 0.3],
[4.5, 2.3, 1.3, 0.3],
[4.4, 3.2, 1.3, 0.2],
[5. , 3.5, 1.6, 0.6],
[5.1, 3.8, 1.9, 0.4],
[4.8, 3. , 1.4, 0.3],
[5.1, 3.8, 1.6, 0.2],
[4.6, 3.2, 1.4, 0.2],
[5.3, 3.7, 1.5, 0.2],
[5. , 3.3, 1.4, 0.2],
[7. , 3.2, 4.7, 1.4],
[6.4, 3.2, 4.5, 1.5],
[6.9, 3.1, 4.9, 1.5],
[5.5, 2.3, 4. , 1.3],
[6.5, 2.8, 4.6, 1.5],
[5.7, 2.8, 4.5, 1.3],
[6.3, 3.3, 4.7, 1.6],
[4.9, 2.4, 3.3, 1. ],
[6.6, 2.9, 4.6, 1.3],
[5.2, 2.7, 3.9, 1.4],
[5. , 2. , 3.5, 1. ],
[5.9, 3. , 4.2, 1.5],
[6. , 2.2, 4. , 1. ],
[6.1, 2.9, 4.7, 1.4],
[5.6, 2.9, 3.6, 1.3],
[6.7, 3.1, 4.4, 1.4],
[5.6, 3. , 4.5, 1.5],
[5.8, 2.7, 4.1, 1. ],
[6.2, 2.2, 4.5, 1.5],
[5.6, 2.5, 3.9, 1.1],
[5.9, 3.2, 4.8, 1.8],
[6.1, 2.8, 4. , 1.3],
[6.3, 2.5, 4.9, 1.5],
[6.1, 2.8, 4.7, 1.2],
[6.4, 2.9, 4.3, 1.3],
[6.6, 3. , 4.4, 1.4],
[6.8, 2.8, 4.8, 1.4],
[6.7, 3. , 5. , 1.7],
[6. , 2.9, 4.5, 1.5],
[5.7, 2.6, 3.5, 1. ],
[5.5, 2.4, 3.8, 1.1],
[5.5, 2.4, 3.7, 1. ],
[5.8, 2.7, 3.9, 1.2],
[6. , 2.7, 5.1, 1.6],
[5.4, 3. , 4.5, 1.5],
[6. , 3.4, 4.5, 1.6],
[6.7, 3.1, 4.7, 1.5],
[6.3, 2.3, 4.4, 1.3],
[5.6, 3. , 4.1, 1.3],
[5.5, 2.5, 4. , 1.3],
[5.5, 2.6, 4.4, 1.2],
[6.1, 3. , 4.6, 1.4],
[5.8, 2.6, 4. , 1.2],
[5. , 2.3, 3.3, 1. ],
[5.6, 2.7, 4.2, 1.3],
[5.7, 3. , 4.2, 1.2],
[5.7, 2.9, 4.2, 1.3],
[6.2, 2.9, 4.3, 1.3],
[5.1, 2.5, 3. , 1.1],
[5.7, 2.8, 4.1, 1.3],
[6.3, 3.3, 6. , 2.5],
[5.8, 2.7, 5.1, 1.9],
[7.1, 3. , 5.9, 2.1],
[6.3, 2.9, 5.6, 1.8],
[6.5, 3. , 5.8, 2.2],
[7.6, 3. , 6.6, 2.1],
[4.9, 2.5, 4.5, 1.7],
[7.3, 2.9, 6.3, 1.8],
[6.7, 2.5, 5.8, 1.8],
[7.2, 3.6, 6.1, 2.5],
[6.5, 3.2, 5.1, 2. ],
[6.4, 2.7, 5.3, 1.9],
[6.8, 3. , 5.5, 2.1],
[5.7, 2.5, 5. , 2. ],
[5.8, 2.8, 5.1, 2.4],
[6.4, 3.2, 5.3, 2.3],
[6.5, 3. , 5.5, 1.8],
[7.7, 3.8, 6.7, 2.2],
[7.7, 2.6, 6.9, 2.3],
[6. , 2.2, 5. , 1.5],
[6.9, 3.2, 5.7, 2.3],
[5.6, 2.8, 4.9, 2. ],
[7.7, 2.8, 6.7, 2. ],
[6.3, 2.7, 4.9, 1.8],
[6.7, 3.3, 5.7, 2.1],
[7.2, 3.2, 6. , 1.8],
[6.2, 2.8, 4.8, 1.8],
[6.1, 3. , 4.9, 1.8],
[6.4, 2.8, 5.6, 2.1],
[7.2, 3. , 5.8, 1.6],
[7.4, 2.8, 6.1, 1.9],
[7.9, 3.8, 6.4, 2. ],
[6.4, 2.8, 5.6, 2.2],
[6.3, 2.8, 5.1, 1.5],
[6.1, 2.6, 5.6, 1.4],
[7.7, 3. , 6.1, 2.3],
[6.3, 3.4, 5.6, 2.4],
[6.4, 3.1, 5.5, 1.8],
[6. , 3. , 4.8, 1.8],
[6.9, 3.1, 5.4, 2.1],
[6.7, 3.1, 5.6, 2.4],
[6.9, 3.1, 5.1, 2.3],
[5.8, 2.7, 5.1, 1.9],
[6.8, 3.2, 5.9, 2.3],
[6.7, 3.3, 5.7, 2.5],
[6.7, 3. , 5.2, 2.3],
[6.3, 2.5, 5. , 1.9],
[6.5, 3. , 5.2, 2. ],
[6.2, 3.4, 5.4, 2.3],
[5.9, 3. , 5.1, 1.8]])
数据转换
多项式转换
from sklearn.preprocessing import PolynomialFeatures
PolynomialFeatures().fit_transform(x)
array([[ 1. , 5.1 , 3.5 , ..., 1.96, 0.28, 0.04],
[ 1. , 4.9 , 3. , ..., 1.96, 0.28, 0.04],
[ 1. , 4.7 , 3.2 , ..., 1.69, 0.26, 0.04],
...,
[ 1. , 6.5 , 3. , ..., 27.04, 10.4 , 4. ],
[ 1. , 6.2 , 3.4 , ..., 29.16, 12.42, 5.29],
[ 1. , 5.9 , 3. , ..., 26.01, 9.18, 3.24]])
对数变换
np.log1p(x)
array([[1.80828877, 1.5040774 , 0.87546874, 0.18232156],
[1.77495235, 1.38629436, 0.87546874, 0.18232156],
[1.74046617, 1.43508453, 0.83290912, 0.18232156],
[1.7227666 , 1.41098697, 0.91629073, 0.18232156],
[1.79175947, 1.5260563 , 0.87546874, 0.18232156],
[1.85629799, 1.58923521, 0.99325177, 0.33647224],
[1.7227666 , 1.48160454, 0.87546874, 0.26236426],
[1.79175947, 1.48160454, 0.91629073, 0.18232156],
[1.68639895, 1.36097655, 0.87546874, 0.18232156],
[1.77495235, 1.41098697, 0.91629073, 0.09531018],
[1.85629799, 1.54756251, 0.91629073, 0.18232156],
[1.75785792, 1.48160454, 0.95551145, 0.18232156],
[1.75785792, 1.38629436, 0.87546874, 0.09531018],
[1.66770682, 1.38629436, 0.74193734, 0.09531018],
[1.91692261, 1.60943791, 0.78845736, 0.18232156],
[1.90210753, 1.68639895, 0.91629073, 0.33647224],
[1.85629799, 1.58923521, 0.83290912, 0.33647224],
[1.80828877, 1.5040774 , 0.87546874, 0.26236426],
[1.90210753, 1.56861592, 0.99325177, 0.26236426],
[1.80828877, 1.56861592, 0.91629073, 0.26236426],
[1.85629799, 1.48160454, 0.99325177, 0.18232156],
[1.80828877, 1.54756251, 0.91629073, 0.33647224],
[1.7227666 , 1.5260563 , 0.69314718, 0.18232156],
[1.80828877, 1.45861502, 0.99325177, 0.40546511],
[1.75785792, 1.48160454, 1.06471074, 0.18232156],
[1.79175947, 1.38629436, 0.95551145, 0.18232156],
[1.79175947, 1.48160454, 0.95551145, 0.33647224],
[1.82454929, 1.5040774 , 0.91629073, 0.18232156],
[1.82454929, 1.48160454, 0.87546874, 0.18232156],
[1.74046617, 1.43508453, 0.95551145, 0.18232156],
[1.75785792, 1.41098697, 0.95551145, 0.18232156],
[1.85629799, 1.48160454, 0.91629073, 0.33647224],
[1.82454929, 1.62924054, 0.91629073, 0.09531018],
[1.87180218, 1.64865863, 0.87546874, 0.18232156],
[1.77495235, 1.41098697, 0.91629073, 0.18232156],
[1.79175947, 1.43508453, 0.78845736, 0.18232156],
[1.87180218, 1.5040774 , 0.83290912, 0.18232156],
[1.77495235, 1.5260563 , 0.87546874, 0.09531018],
[1.68639895, 1.38629436, 0.83290912, 0.18232156],
[1.80828877, 1.48160454, 0.91629073, 0.18232156],
[1.79175947, 1.5040774 , 0.83290912, 0.26236426],
[1.70474809, 1.19392247, 0.83290912, 0.26236426],
[1.68639895, 1.43508453, 0.83290912, 0.18232156],
[1.79175947, 1.5040774 , 0.95551145, 0.47000363],
[1.80828877, 1.56861592, 1.06471074, 0.33647224],
[1.75785792, 1.38629436, 0.87546874, 0.26236426],
[1.80828877, 1.56861592, 0.95551145, 0.18232156],
[1.7227666 , 1.43508453, 0.87546874, 0.18232156],
[1.84054963, 1.54756251, 0.91629073, 0.18232156],
[1.79175947, 1.45861502, 0.87546874, 0.18232156],
[2.07944154, 1.43508453, 1.74046617, 0.87546874],
[2.00148 , 1.43508453, 1.70474809, 0.91629073],
[2.06686276, 1.41098697, 1.77495235, 0.91629073],
[1.87180218, 1.19392247, 1.60943791, 0.83290912],
[2.01490302, 1.33500107, 1.7227666 , 0.91629073],
[1.90210753, 1.33500107, 1.70474809, 0.83290912],
[1.98787435, 1.45861502, 1.74046617, 0.95551145],
[1.77495235, 1.22377543, 1.45861502, 0.69314718],
[2.02814825, 1.36097655, 1.7227666 , 0.83290912],
[1.82454929, 1.30833282, 1.58923521, 0.87546874],
[1.79175947, 1.09861229, 1.5040774 , 0.69314718],
[1.93152141, 1.38629436, 1.64865863, 0.91629073],
[1.94591015, 1.16315081, 1.60943791, 0.69314718],
[1.96009478, 1.36097655, 1.74046617, 0.87546874],
[1.88706965, 1.36097655, 1.5260563 , 0.83290912],
[2.04122033, 1.41098697, 1.68639895, 0.87546874],
[1.88706965, 1.38629436, 1.70474809, 0.91629073],
[1.91692261, 1.30833282, 1.62924054, 0.69314718],
[1.97408103, 1.16315081, 1.70474809, 0.91629073],
[1.88706965, 1.25276297, 1.58923521, 0.74193734],
[1.93152141, 1.43508453, 1.75785792, 1.02961942],
[1.96009478, 1.33500107, 1.60943791, 0.83290912],
[1.98787435, 1.25276297, 1.77495235, 0.91629073],
[1.96009478, 1.33500107, 1.74046617, 0.78845736],
[2.00148 , 1.36097655, 1.66770682, 0.83290912],
[2.02814825, 1.38629436, 1.68639895, 0.87546874],
[2.05412373, 1.33500107, 1.75785792, 0.87546874],
[2.04122033, 1.38629436, 1.79175947, 0.99325177],
[1.94591015, 1.36097655, 1.70474809, 0.91629073],
[1.90210753, 1.28093385, 1.5040774 , 0.69314718],
[1.87180218, 1.22377543, 1.56861592, 0.74193734],
[1.87180218, 1.22377543, 1.54756251, 0.69314718],
[1.91692261, 1.30833282, 1.58923521, 0.78845736],
[1.94591015, 1.30833282, 1.80828877, 0.95551145],
[1.85629799, 1.38629436, 1.70474809, 0.91629073],
[1.94591015, 1.48160454, 1.70474809, 0.95551145],
[2.04122033, 1.41098697, 1.74046617, 0.91629073],
[1.98787435, 1.19392247, 1.68639895, 0.83290912],
[1.88706965, 1.38629436, 1.62924054, 0.83290912],
[1.87180218, 1.25276297, 1.60943791, 0.83290912],
[1.87180218, 1.28093385, 1.68639895, 0.78845736],
[1.96009478, 1.38629436, 1.7227666 , 0.87546874],
[1.91692261, 1.28093385, 1.60943791, 0.78845736],
[1.79175947, 1.19392247, 1.45861502, 0.69314718],
[1.88706965, 1.30833282, 1.64865863, 0.83290912],
[1.90210753, 1.38629436, 1.64865863, 0.78845736],
[1.90210753, 1.36097655, 1.64865863, 0.83290912],
[1.97408103, 1.36097655, 1.66770682, 0.83290912],
[1.80828877, 1.25276297, 1.38629436, 0.74193734],
[1.90210753, 1.33500107, 1.62924054, 0.83290912],
[1.98787435, 1.45861502, 1.94591015, 1.25276297],
[1.91692261, 1.30833282, 1.80828877, 1.06471074],
[2.09186406, 1.38629436, 1.93152141, 1.13140211],
[1.98787435, 1.36097655, 1.88706965, 1.02961942],
[2.01490302, 1.38629436, 1.91692261, 1.16315081],
[2.1517622 , 1.38629436, 2.02814825, 1.13140211],
[1.77495235, 1.25276297, 1.70474809, 0.99325177],
[2.11625551, 1.36097655, 1.98787435, 1.02961942],
[2.04122033, 1.25276297, 1.91692261, 1.02961942],
[2.10413415, 1.5260563 , 1.96009478, 1.25276297],
[2.01490302, 1.43508453, 1.80828877, 1.09861229],
[2.00148 , 1.30833282, 1.84054963, 1.06471074],
[2.05412373, 1.38629436, 1.87180218, 1.13140211],
[1.90210753, 1.25276297, 1.79175947, 1.09861229],
[1.91692261, 1.33500107, 1.80828877, 1.22377543],
[2.00148 , 1.43508453, 1.84054963, 1.19392247],
[2.01490302, 1.38629436, 1.87180218, 1.02961942],
[2.16332303, 1.56861592, 2.04122033, 1.16315081],
[2.16332303, 1.28093385, 2.06686276, 1.19392247],
[1.94591015, 1.16315081, 1.79175947, 0.91629073],
[2.06686276, 1.43508453, 1.90210753, 1.19392247],
[1.88706965, 1.33500107, 1.77495235, 1.09861229],
[2.16332303, 1.33500107, 2.04122033, 1.09861229],
[1.98787435, 1.30833282, 1.77495235, 1.02961942],
[2.04122033, 1.45861502, 1.90210753, 1.13140211],
[2.10413415, 1.43508453, 1.94591015, 1.02961942],
[1.97408103, 1.33500107, 1.75785792, 1.02961942],
[1.96009478, 1.38629436, 1.77495235, 1.02961942],
[2.00148 , 1.33500107, 1.88706965, 1.13140211],
[2.10413415, 1.38629436, 1.91692261, 0.95551145],
[2.12823171, 1.33500107, 1.96009478, 1.06471074],
[2.18605128, 1.56861592, 2.00148 , 1.09861229],
[2.00148 , 1.33500107, 1.88706965, 1.16315081],
[1.98787435, 1.33500107, 1.80828877, 0.91629073],
[1.96009478, 1.28093385, 1.88706965, 0.87546874],
[2.16332303, 1.38629436, 1.96009478, 1.19392247],
[1.98787435, 1.48160454, 1.88706965, 1.22377543],
[2.00148 , 1.41098697, 1.87180218, 1.02961942],
[1.94591015, 1.38629436, 1.75785792, 1.02961942],
[2.06686276, 1.41098697, 1.85629799, 1.13140211],
[2.04122033, 1.41098697, 1.88706965, 1.22377543],
[2.06686276, 1.41098697, 1.80828877, 1.19392247],
[1.91692261, 1.30833282, 1.80828877, 1.06471074],
[2.05412373, 1.43508453, 1.93152141, 1.19392247],
[2.04122033, 1.45861502, 1.90210753, 1.25276297],
[2.04122033, 1.38629436, 1.82454929, 1.19392247],
[1.98787435, 1.25276297, 1.79175947, 1.06471074],
[2.01490302, 1.38629436, 1.82454929, 1.09861229],
[1.97408103, 1.48160454, 1.85629799, 1.19392247],
[1.93152141, 1.38629436, 1.80828877, 1.02961942]])
特征降维
VarianceThreshold方差过滤
from sklearn.feature_selection import VarianceThreshold
VarianceThreshold(threshold=3).fit_transform(x)
array([[1.4],
[1.4],
[1.3],
[1.5],
[1.4],
[1.7],
[1.4],
[1.5],
[1.4],
[1.5],
[1.5],
[1.6],
[1.4],
[1.1],
[1.2],
[1.5],
[1.3],
[1.4],
[1.7],
[1.5],
[1.7],
[1.5],
[1. ],
[1.7],
[1.9],
[1.6],
[1.6],
[1.5],
[1.4],
[1.6],
[1.6],
[1.5],
[1.5],
[1.4],
[1.5],
[1.2],
[1.3],
[1.4],
[1.3],
[1.5],
[1.3],
[1.3],
[1.3],
[1.6],
[1.9],
[1.4],
[1.6],
[1.4],
[1.5],
[1.4],
[4.7],
[4.5],
[4.9],
[4. ],
[4.6],
[4.5],
[4.7],
[3.3],
[4.6],
[3.9],
[3.5],
[4.2],
[4. ],
[4.7],
[3.6],
[4.4],
[4.5],
[4.1],
[4.5],
[3.9],
[4.8],
[4. ],
[4.9],
[4.7],
[4.3],
[4.4],
[4.8],
[5. ],
[4.5],
[3.5],
[3.8],
[3.7],
[3.9],
[5.1],
[4.5],
[4.5],
[4.7],
[4.4],
[4.1],
[4. ],
[4.4],
[4.6],
[4. ],
[3.3],
[4.2],
[4.2],
[4.2],
[4.3],
[3. ],
[4.1],
[6. ],
[5.1],
[5.9],
[5.6],
[5.8],
[6.6],
[4.5],
[6.3],
[5.8],
[6.1],
[5.1],
[5.3],
[5.5],
[5. ],
[5.1],
[5.3],
[5.5],
[6.7],
[6.9],
[5. ],
[5.7],
[4.9],
[6.7],
[4.9],
[5.7],
[6. ],
[4.8],
[4.9],
[5.6],
[5.8],
[6.1],
[6.4],
[5.6],
[5.1],
[5.6],
[6.1],
[5.6],
[5.5],
[4.8],
[5.4],
[5.6],
[5.1],
[5.1],
[5.9],
[5.7],
[5.2],
[5. ],
[5.2],
[5.4],
[5.1]])
SelectKBest
卡方检验
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
SelectKBest(chi2,k=2).fit_transform(x,y)
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2],
[1.5, 0.2],
[1.4, 0.2],
[1.7, 0.4],
[1.4, 0.3],
[1.5, 0.2],
[1.4, 0.2],
[1.5, 0.1],
[1.5, 0.2],
[1.6, 0.2],
[1.4, 0.1],
[1.1, 0.1],
[1.2, 0.2],
[1.5, 0.4],
[1.3, 0.4],
[1.4, 0.3],
[1.7, 0.3],
[1.5, 0.3],
[1.7, 0.2],
[1.5, 0.4],
[1. , 0.2],
[1.7, 0.5],
[1.9, 0.2],
[1.6, 0.2],
[1.6, 0.4],
[1.5, 0.2],
[1.4, 0.2],
[1.6, 0.2],
[1.6, 0.2],
[1.5, 0.4],
[1.5, 0.1],
[1.4, 0.2],
[1.5, 0.2],
[1.2, 0.2],
[1.3, 0.2],
[1.4, 0.1],
[1.3, 0.2],
[1.5, 0.2],
[1.3, 0.3],
[1.3, 0.3],
[1.3, 0.2],
[1.6, 0.6],
[1.9, 0.4],
[1.4, 0.3],
[1.6, 0.2],
[1.4, 0.2],
[1.5, 0.2],
[1.4, 0.2],
[4.7, 1.4],
[4.5, 1.5],
[4.9, 1.5],
[4. , 1.3],
[4.6, 1.5],
[4.5, 1.3],
[4.7, 1.6],
[3.3, 1. ],
[4.6, 1.3],
[3.9, 1.4],
[3.5, 1. ],
[4.2, 1.5],
[4. , 1. ],
[4.7, 1.4],
[3.6, 1.3],
[4.4, 1.4],
[4.5, 1.5],
[4.1, 1. ],
[4.5, 1.5],
[3.9, 1.1],
[4.8, 1.8],
[4. , 1.3],
[4.9, 1.5],
[4.7, 1.2],
[4.3, 1.3],
[4.4, 1.4],
[4.8, 1.4],
[5. , 1.7],
[4.5, 1.5],
[3.5, 1. ],
[3.8, 1.1],
[3.7, 1. ],
[3.9, 1.2],
[5.1, 1.6],
[4.5, 1.5],
[4.5, 1.6],
[4.7, 1.5],
[4.4, 1.3],
[4.1, 1.3],
[4. , 1.3],
[4.4, 1.2],
[4.6, 1.4],
[4. , 1.2],
[3.3, 1. ],
[4.2, 1.3],
[4.2, 1.2],
[4.2, 1.3],
[4.3, 1.3],
[3. , 1.1],
[4.1, 1.3],
[6. , 2.5],
[5.1, 1.9],
[5.9, 2.1],
[5.6, 1.8],
[5.8, 2.2],
[6.6, 2.1],
[4.5, 1.7],
[6.3, 1.8],
[5.8, 1.8],
[6.1, 2.5],
[5.1, 2. ],
[5.3, 1.9],
[5.5, 2.1],
[5. , 2. ],
[5.1, 2.4],
[5.3, 2.3],
[5.5, 1.8],
[6.7, 2.2],
[6.9, 2.3],
[5. , 1.5],
[5.7, 2.3],
[4.9, 2. ],
[6.7, 2. ],
[4.9, 1.8],
[5.7, 2.1],
[6. , 1.8],
[4.8, 1.8],
[4.9, 1.8],
[5.6, 2.1],
[5.8, 1.6],
[6.1, 1.9],
[6.4, 2. ],
[5.6, 2.2],
[5.1, 1.5],
[5.6, 1.4],
[6.1, 2.3],
[5.6, 2.4],
[5.5, 1.8],
[4.8, 1.8],
[5.4, 2.1],
[5.6, 2.4],
[5.1, 2.3],
[5.1, 1.9],
[5.9, 2.3],
[5.7, 2.5],
[5.2, 2.3],
[5. , 1.9],
[5.2, 2. ],
[5.4, 2.3],
[5.1, 1.8]])
互信息法
from sklearn.feature_selection import mutual_info_regression
SelectKBest(mutual_info_regression,k=3).fit_transform(x,y)
array([[5.1, 1.4, 0.2],
[4.9, 1.4, 0.2],
[4.7, 1.3, 0.2],
[4.6, 1.5, 0.2],
[5. , 1.4, 0.2],
[5.4, 1.7, 0.4],
[4.6, 1.4, 0.3],
[5. , 1.5, 0.2],
[4.4, 1.4, 0.2],
[4.9, 1.5, 0.1],
[5.4, 1.5, 0.2],
[4.8, 1.6, 0.2],
[4.8, 1.4, 0.1],
[4.3, 1.1, 0.1],
[5.8, 1.2, 0.2],
[5.7, 1.5, 0.4],
[5.4, 1.3, 0.4],
[5.1, 1.4, 0.3],
[5.7, 1.7, 0.3],
[5.1, 1.5, 0.3],
[5.4, 1.7, 0.2],
[5.1, 1.5, 0.4],
[4.6, 1. , 0.2],
[5.1, 1.7, 0.5],
[4.8, 1.9, 0.2],
[5. , 1.6, 0.2],
[5. , 1.6, 0.4],
[5.2, 1.5, 0.2],
[5.2, 1.4, 0.2],
[4.7, 1.6, 0.2],
[4.8, 1.6, 0.2],
[5.4, 1.5, 0.4],
[5.2, 1.5, 0.1],
[5.5, 1.4, 0.2],
[4.9, 1.5, 0.2],
[5. , 1.2, 0.2],
[5.5, 1.3, 0.2],
[4.9, 1.4, 0.1],
[4.4, 1.3, 0.2],
[5.1, 1.5, 0.2],
[5. , 1.3, 0.3],
[4.5, 1.3, 0.3],
[4.4, 1.3, 0.2],
[5. , 1.6, 0.6],
[5.1, 1.9, 0.4],
[4.8, 1.4, 0.3],
[5.1, 1.6, 0.2],
[4.6, 1.4, 0.2],
[5.3, 1.5, 0.2],
[5. , 1.4, 0.2],
[7. , 4.7, 1.4],
[6.4, 4.5, 1.5],
[6.9, 4.9, 1.5],
[5.5, 4. , 1.3],
[6.5, 4.6, 1.5],
[5.7, 4.5, 1.3],
[6.3, 4.7, 1.6],
[4.9, 3.3, 1. ],
[6.6, 4.6, 1.3],
[5.2, 3.9, 1.4],
[5. , 3.5, 1. ],
[5.9, 4.2, 1.5],
[6. , 4. , 1. ],
[6.1, 4.7, 1.4],
[5.6, 3.6, 1.3],
[6.7, 4.4, 1.4],
[5.6, 4.5, 1.5],
[5.8, 4.1, 1. ],
[6.2, 4.5, 1.5],
[5.6, 3.9, 1.1],
[5.9, 4.8, 1.8],
[6.1, 4. , 1.3],
[6.3, 4.9, 1.5],
[6.1, 4.7, 1.2],
[6.4, 4.3, 1.3],
[6.6, 4.4, 1.4],
[6.8, 4.8, 1.4],
[6.7, 5. , 1.7],
[6. , 4.5, 1.5],
[5.7, 3.5, 1. ],
[5.5, 3.8, 1.1],
[5.5, 3.7, 1. ],
[5.8, 3.9, 1.2],
[6. , 5.1, 1.6],
[5.4, 4.5, 1.5],
[6. , 4.5, 1.6],
[6.7, 4.7, 1.5],
[6.3, 4.4, 1.3],
[5.6, 4.1, 1.3],
[5.5, 4. , 1.3],
[5.5, 4.4, 1.2],
[6.1, 4.6, 1.4],
[5.8, 4. , 1.2],
[5. , 3.3, 1. ],
[5.6, 4.2, 1.3],
[5.7, 4.2, 1.2],
[5.7, 4.2, 1.3],
[6.2, 4.3, 1.3],
[5.1, 3. , 1.1],
[5.7, 4.1, 1.3],
[6.3, 6. , 2.5],
[5.8, 5.1, 1.9],
[7.1, 5.9, 2.1],
[6.3, 5.6, 1.8],
[6.5, 5.8, 2.2],
[7.6, 6.6, 2.1],
[4.9, 4.5, 1.7],
[7.3, 6.3, 1.8],
[6.7, 5.8, 1.8],
[7.2, 6.1, 2.5],
[6.5, 5.1, 2. ],
[6.4, 5.3, 1.9],
[6.8, 5.5, 2.1],
[5.7, 5. , 2. ],
[5.8, 5.1, 2.4],
[6.4, 5.3, 2.3],
[6.5, 5.5, 1.8],
[7.7, 6.7, 2.2],
[7.7, 6.9, 2.3],
[6. , 5. , 1.5],
[6.9, 5.7, 2.3],
[5.6, 4.9, 2. ],
[7.7, 6.7, 2. ],
[6.3, 4.9, 1.8],
[6.7, 5.7, 2.1],
[7.2, 6. , 1.8],
[6.2, 4.8, 1.8],
[6.1, 4.9, 1.8],
[6.4, 5.6, 2.1],
[7.2, 5.8, 1.6],
[7.4, 6.1, 1.9],
[7.9, 6.4, 2. ],
[6.4, 5.6, 2.2],
[6.3, 5.1, 1.5],
[6.1, 5.6, 1.4],
[7.7, 6.1, 2.3],
[6.3, 5.6, 2.4],
[6.4, 5.5, 1.8],
[6. , 4.8, 1.8],
[6.9, 5.4, 2.1],
[6.7, 5.6, 2.4],
[6.9, 5.1, 2.3],
[5.8, 5.1, 1.9],
[6.8, 5.9, 2.3],
[6.7, 5.7, 2.5],
[6.7, 5.2, 2.3],
[6.3, 5. , 1.9],
[6.5, 5.2, 2. ],
[6.2, 5.4, 2.3],
[5.9, 5.1, 1.8]])
RFE
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_estimators=100)
RFE(model,n_features_to_select=2).fit_transform(x,y)
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2],
[1.5, 0.2],
[1.4, 0.2],
[1.7, 0.4],
[1.4, 0.3],
[1.5, 0.2],
[1.4, 0.2],
[1.5, 0.1],
[1.5, 0.2],
[1.6, 0.2],
[1.4, 0.1],
[1.1, 0.1],
[1.2, 0.2],
[1.5, 0.4],
[1.3, 0.4],
[1.4, 0.3],
[1.7, 0.3],
[1.5, 0.3],
[1.7, 0.2],
[1.5, 0.4],
[1. , 0.2],
[1.7, 0.5],
[1.9, 0.2],
[1.6, 0.2],
[1.6, 0.4],
[1.5, 0.2],
[1.4, 0.2],
[1.6, 0.2],
[1.6, 0.2],
[1.5, 0.4],
[1.5, 0.1],
[1.4, 0.2],
[1.5, 0.2],
[1.2, 0.2],
[1.3, 0.2],
[1.4, 0.1],
[1.3, 0.2],
[1.5, 0.2],
[1.3, 0.3],
[1.3, 0.3],
[1.3, 0.2],
[1.6, 0.6],
[1.9, 0.4],
[1.4, 0.3],
[1.6, 0.2],
[1.4, 0.2],
[1.5, 0.2],
[1.4, 0.2],
[4.7, 1.4],
[4.5, 1.5],
[4.9, 1.5],
[4. , 1.3],
[4.6, 1.5],
[4.5, 1.3],
[4.7, 1.6],
[3.3, 1. ],
[4.6, 1.3],
[3.9, 1.4],
[3.5, 1. ],
[4.2, 1.5],
[4. , 1. ],
[4.7, 1.4],
[3.6, 1.3],
[4.4, 1.4],
[4.5, 1.5],
[4.1, 1. ],
[4.5, 1.5],
[3.9, 1.1],
[4.8, 1.8],
[4. , 1.3],
[4.9, 1.5],
[4.7, 1.2],
[4.3, 1.3],
[4.4, 1.4],
[4.8, 1.4],
[5. , 1.7],
[4.5, 1.5],
[3.5, 1. ],
[3.8, 1.1],
[3.7, 1. ],
[3.9, 1.2],
[5.1, 1.6],
[4.5, 1.5],
[4.5, 1.6],
[4.7, 1.5],
[4.4, 1.3],
[4.1, 1.3],
[4. , 1.3],
[4.4, 1.2],
[4.6, 1.4],
[4. , 1.2],
[3.3, 1. ],
[4.2, 1.3],
[4.2, 1.2],
[4.2, 1.3],
[4.3, 1.3],
[3. , 1.1],
[4.1, 1.3],
[6. , 2.5],
[5.1, 1.9],
[5.9, 2.1],
[5.6, 1.8],
[5.8, 2.2],
[6.6, 2.1],
[4.5, 1.7],
[6.3, 1.8],
[5.8, 1.8],
[6.1, 2.5],
[5.1, 2. ],
[5.3, 1.9],
[5.5, 2.1],
[5. , 2. ],
[5.1, 2.4],
[5.3, 2.3],
[5.5, 1.8],
[6.7, 2.2],
[6.9, 2.3],
[5. , 1.5],
[5.7, 2.3],
[4.9, 2. ],
[6.7, 2. ],
[4.9, 1.8],
[5.7, 2.1],
[6. , 1.8],
[4.8, 1.8],
[4.9, 1.8],
[5.6, 2.1],
[5.8, 1.6],
[6.1, 1.9],
[6.4, 2. ],
[5.6, 2.2],
[5.1, 1.5],
[5.6, 1.4],
[6.1, 2.3],
[5.6, 2.4],
[5.5, 1.8],
[4.8, 1.8],
[5.4, 2.1],
[5.6, 2.4],
[5.1, 2.3],
[5.1, 1.9],
[5.9, 2.3],
[5.7, 2.5],
[5.2, 2.3],
[5. , 1.9],
[5.2, 2. ],
[5.4, 2.3],
[5.1, 1.8]])
SelectFromModel
from sklearn.feature_selection import SelectFromModel
model=RandomForestClassifier(n_estimators=100)
SelectFromModel(model,threshold=0.1).fit_transform(x,y)
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2],
[1.5, 0.2],
[1.4, 0.2],
[1.7, 0.4],
[1.4, 0.3],
[1.5, 0.2],
[1.4, 0.2],
[1.5, 0.1],
[1.5, 0.2],
[1.6, 0.2],
[1.4, 0.1],
[1.1, 0.1],
[1.2, 0.2],
[1.5, 0.4],
[1.3, 0.4],
[1.4, 0.3],
[1.7, 0.3],
[1.5, 0.3],
[1.7, 0.2],
[1.5, 0.4],
[1. , 0.2],
[1.7, 0.5],
[1.9, 0.2],
[1.6, 0.2],
[1.6, 0.4],
[1.5, 0.2],
[1.4, 0.2],
[1.6, 0.2],
[1.6, 0.2],
[1.5, 0.4],
[1.5, 0.1],
[1.4, 0.2],
[1.5, 0.2],
[1.2, 0.2],
[1.3, 0.2],
[1.4, 0.1],
[1.3, 0.2],
[1.5, 0.2],
[1.3, 0.3],
[1.3, 0.3],
[1.3, 0.2],
[1.6, 0.6],
[1.9, 0.4],
[1.4, 0.3],
[1.6, 0.2],
[1.4, 0.2],
[1.5, 0.2],
[1.4, 0.2],
[4.7, 1.4],
[4.5, 1.5],
[4.9, 1.5],
[4. , 1.3],
[4.6, 1.5],
[4.5, 1.3],
[4.7, 1.6],
[3.3, 1. ],
[4.6, 1.3],
[3.9, 1.4],
[3.5, 1. ],
[4.2, 1.5],
[4. , 1. ],
[4.7, 1.4],
[3.6, 1.3],
[4.4, 1.4],
[4.5, 1.5],
[4.1, 1. ],
[4.5, 1.5],
[3.9, 1.1],
[4.8, 1.8],
[4. , 1.3],
[4.9, 1.5],
[4.7, 1.2],
[4.3, 1.3],
[4.4, 1.4],
[4.8, 1.4],
[5. , 1.7],
[4.5, 1.5],
[3.5, 1. ],
[3.8, 1.1],
[3.7, 1. ],
[3.9, 1.2],
[5.1, 1.6],
[4.5, 1.5],
[4.5, 1.6],
[4.7, 1.5],
[4.4, 1.3],
[4.1, 1.3],
[4. , 1.3],
[4.4, 1.2],
[4.6, 1.4],
[4. , 1.2],
[3.3, 1. ],
[4.2, 1.3],
[4.2, 1.2],
[4.2, 1.3],
[4.3, 1.3],
[3. , 1.1],
[4.1, 1.3],
[6. , 2.5],
[5.1, 1.9],
[5.9, 2.1],
[5.6, 1.8],
[5.8, 2.2],
[6.6, 2.1],
[4.5, 1.7],
[6.3, 1.8],
[5.8, 1.8],
[6.1, 2.5],
[5.1, 2. ],
[5.3, 1.9],
[5.5, 2.1],
[5. , 2. ],
[5.1, 2.4],
[5.3, 2.3],
[5.5, 1.8],
[6.7, 2.2],
[6.9, 2.3],
[5. , 1.5],
[5.7, 2.3],
[4.9, 2. ],
[6.7, 2. ],
[4.9, 1.8],
[5.7, 2.1],
[6. , 1.8],
[4.8, 1.8],
[4.9, 1.8],
[5.6, 2.1],
[5.8, 1.6],
[6.1, 1.9],
[6.4, 2. ],
[5.6, 2.2],
[5.1, 1.5],
[5.6, 1.4],
[6.1, 2.3],
[5.6, 2.4],
[5.5, 1.8],
[4.8, 1.8],
[5.4, 2.1],
[5.6, 2.4],
[5.1, 2.3],
[5.1, 1.9],
[5.9, 2.3],
[5.7, 2.5],
[5.2, 2.3],
[5. , 1.9],
[5.2, 2. ],
[5.4, 2.3],
[5.1, 1.8]])
线性降维
PCA主成分分析法
from sklearn.decomposition import PCA
PCA(n_components=2).fit_transform(x)
array([[-2.68412563, 0.31939725],
[-2.71414169, -0.17700123],
[-2.88899057, -0.14494943],
[-2.74534286, -0.31829898],
[-2.72871654, 0.32675451],
[-2.28085963, 0.74133045],
[-2.82053775, -0.08946138],
[-2.62614497, 0.16338496],
[-2.88638273, -0.57831175],
[-2.6727558 , -0.11377425],
[-2.50694709, 0.6450689 ],
[-2.61275523, 0.01472994],
[-2.78610927, -0.235112 ],
[-3.22380374, -0.51139459],
[-2.64475039, 1.17876464],
[-2.38603903, 1.33806233],
[-2.62352788, 0.81067951],
[-2.64829671, 0.31184914],
[-2.19982032, 0.87283904],
[-2.5879864 , 0.51356031],
[-2.31025622, 0.39134594],
[-2.54370523, 0.43299606],
[-3.21593942, 0.13346807],
[-2.30273318, 0.09870885],
[-2.35575405, -0.03728186],
[-2.50666891, -0.14601688],
[-2.46882007, 0.13095149],
[-2.56231991, 0.36771886],
[-2.63953472, 0.31203998],
[-2.63198939, -0.19696122],
[-2.58739848, -0.20431849],
[-2.4099325 , 0.41092426],
[-2.64886233, 0.81336382],
[-2.59873675, 1.09314576],
[-2.63692688, -0.12132235],
[-2.86624165, 0.06936447],
[-2.62523805, 0.59937002],
[-2.80068412, 0.26864374],
[-2.98050204, -0.48795834],
[-2.59000631, 0.22904384],
[-2.77010243, 0.26352753],
[-2.84936871, -0.94096057],
[-2.99740655, -0.34192606],
[-2.40561449, 0.18887143],
[-2.20948924, 0.43666314],
[-2.71445143, -0.2502082 ],
[-2.53814826, 0.50377114],
[-2.83946217, -0.22794557],
[-2.54308575, 0.57941002],
[-2.70335978, 0.10770608],
[ 1.28482569, 0.68516047],
[ 0.93248853, 0.31833364],
[ 1.46430232, 0.50426282],
[ 0.18331772, -0.82795901],
[ 1.08810326, 0.07459068],
[ 0.64166908, -0.41824687],
[ 1.09506066, 0.28346827],
[-0.74912267, -1.00489096],
[ 1.04413183, 0.2283619 ],
[-0.0087454 , -0.72308191],
[-0.50784088, -1.26597119],
[ 0.51169856, -0.10398124],
[ 0.26497651, -0.55003646],
[ 0.98493451, -0.12481785],
[-0.17392537, -0.25485421],
[ 0.92786078, 0.46717949],
[ 0.66028376, -0.35296967],
[ 0.23610499, -0.33361077],
[ 0.94473373, -0.54314555],
[ 0.04522698, -0.58383438],
[ 1.11628318, -0.08461685],
[ 0.35788842, -0.06892503],
[ 1.29818388, -0.32778731],
[ 0.92172892, -0.18273779],
[ 0.71485333, 0.14905594],
[ 0.90017437, 0.32850447],
[ 1.33202444, 0.24444088],
[ 1.55780216, 0.26749545],
[ 0.81329065, -0.1633503 ],
[-0.30558378, -0.36826219],
[-0.06812649, -0.70517213],
[-0.18962247, -0.68028676],
[ 0.13642871, -0.31403244],
[ 1.38002644, -0.42095429],
[ 0.58800644, -0.48428742],
[ 0.80685831, 0.19418231],
[ 1.22069088, 0.40761959],
[ 0.81509524, -0.37203706],
[ 0.24595768, -0.2685244 ],
[ 0.16641322, -0.68192672],
[ 0.46480029, -0.67071154],
[ 0.8908152 , -0.03446444],
[ 0.23054802, -0.40438585],
[-0.70453176, -1.01224823],
[ 0.35698149, -0.50491009],
[ 0.33193448, -0.21265468],
[ 0.37621565, -0.29321893],
[ 0.64257601, 0.01773819],
[-0.90646986, -0.75609337],
[ 0.29900084, -0.34889781],
[ 2.53119273, -0.00984911],
[ 1.41523588, -0.57491635],
[ 2.61667602, 0.34390315],
[ 1.97153105, -0.1797279 ],
[ 2.35000592, -0.04026095],
[ 3.39703874, 0.55083667],
[ 0.52123224, -1.19275873],
[ 2.93258707, 0.3555 ],
[ 2.32122882, -0.2438315 ],
[ 2.91675097, 0.78279195],
[ 1.66177415, 0.24222841],
[ 1.80340195, -0.21563762],
[ 2.1655918 , 0.21627559],
[ 1.34616358, -0.77681835],
[ 1.58592822, -0.53964071],
[ 1.90445637, 0.11925069],
[ 1.94968906, 0.04194326],
[ 3.48705536, 1.17573933],
[ 3.79564542, 0.25732297],
[ 1.30079171, -0.76114964],
[ 2.42781791, 0.37819601],
[ 1.19900111, -0.60609153],
[ 3.49992004, 0.4606741 ],
[ 1.38876613, -0.20439933],
[ 2.2754305 , 0.33499061],
[ 2.61409047, 0.56090136],
[ 1.25850816, -0.17970479],
[ 1.29113206, -0.11666865],
[ 2.12360872, -0.20972948],
[ 2.38800302, 0.4646398 ],
[ 2.84167278, 0.37526917],
[ 3.23067366, 1.37416509],
[ 2.15943764, -0.21727758],
[ 1.44416124, -0.14341341],
[ 1.78129481, -0.49990168],
[ 3.07649993, 0.68808568],
[ 2.14424331, 0.1400642 ],
[ 1.90509815, 0.04930053],
[ 1.16932634, -0.16499026],
[ 2.10761114, 0.37228787],
[ 2.31415471, 0.18365128],
[ 1.9222678 , 0.40920347],
[ 1.41523588, -0.57491635],
[ 2.56301338, 0.2778626 ],
[ 2.41874618, 0.3047982 ],
[ 1.94410979, 0.1875323 ],
[ 1.52716661, -0.37531698],
[ 1.76434572, 0.07885885],
[ 1.90094161, 0.11662796],
[ 1.39018886, -0.28266094]])
LDA线性判别分析法
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
LDA(n_components=2).fit_transform(x,y)
array([[ 8.06179978e+00, 3.00420621e-01],
[ 7.12868772e+00, -7.86660426e-01],
[ 7.48982797e+00, -2.65384488e-01],
[ 6.81320057e+00, -6.70631068e-01],
[ 8.13230933e+00, 5.14462530e-01],
[ 7.70194674e+00, 1.46172097e+00],
[ 7.21261762e+00, 3.55836209e-01],
[ 7.60529355e+00, -1.16338380e-02],
[ 6.56055159e+00, -1.01516362e+00],
[ 7.34305989e+00, -9.47319209e-01],
[ 8.39738652e+00, 6.47363392e-01],
[ 7.21929685e+00, -1.09646389e-01],
[ 7.32679599e+00, -1.07298943e+00],
[ 7.57247066e+00, -8.05464137e-01],
[ 9.84984300e+00, 1.58593698e+00],
[ 9.15823890e+00, 2.73759647e+00],
[ 8.58243141e+00, 1.83448945e+00],
[ 7.78075375e+00, 5.84339407e-01],
[ 8.07835876e+00, 9.68580703e-01],
[ 8.02097451e+00, 1.14050366e+00],
[ 7.49680227e+00, -1.88377220e-01],
[ 7.58648117e+00, 1.20797032e+00],
[ 8.68104293e+00, 8.77590154e-01],
[ 6.25140358e+00, 4.39696367e-01],
[ 6.55893336e+00, -3.89222752e-01],
[ 6.77138315e+00, -9.70634453e-01],
[ 6.82308032e+00, 4.63011612e-01],
[ 7.92461638e+00, 2.09638715e-01],
[ 7.99129024e+00, 8.63787128e-02],
[ 6.82946447e+00, -5.44960851e-01],
[ 6.75895493e+00, -7.59002759e-01],
[ 7.37495254e+00, 5.65844592e-01],
[ 9.12634625e+00, 1.22443267e+00],
[ 9.46768199e+00, 1.82522635e+00],
[ 7.06201386e+00, -6.63400423e-01],
[ 7.95876243e+00, -1.64961722e-01],
[ 8.61367201e+00, 4.03253602e-01],
[ 8.33041759e+00, 2.28133530e-01],
[ 6.93412007e+00, -7.05519379e-01],
[ 7.68823131e+00, -9.22362309e-03],
[ 7.91793715e+00, 6.75121313e-01],
[ 5.66188065e+00, -1.93435524e+00],
[ 7.24101468e+00, -2.72615132e-01],
[ 6.41443556e+00, 1.24730131e+00],
[ 6.85944381e+00, 1.05165396e+00],
[ 6.76470393e+00, -5.05151855e-01],
[ 8.08189937e+00, 7.63392750e-01],
[ 7.18676904e+00, -3.60986823e-01],
[ 8.31444876e+00, 6.44953177e-01],
[ 7.67196741e+00, -1.34893840e-01],
[-1.45927545e+00, 2.85437643e-02],
[-1.79770574e+00, 4.84385502e-01],
[-2.41694888e+00, -9.27840307e-02],
[-2.26247349e+00, -1.58725251e+00],
[-2.54867836e+00, -4.72204898e-01],
[-2.42996725e+00, -9.66132066e-01],
[-2.44848456e+00, 7.95961954e-01],
[-2.22666513e-01, -1.58467318e+00],
[-1.75020123e+00, -8.21180130e-01],
[-1.95842242e+00, -3.51563753e-01],
[-1.19376031e+00, -2.63445570e+00],
[-1.85892567e+00, 3.19006544e-01],
[-1.15809388e+00, -2.64340991e+00],
[-2.66605725e+00, -6.42504540e-01],
[-3.78367218e-01, 8.66389312e-02],
[-1.20117255e+00, 8.44373592e-02],
[-2.76810246e+00, 3.21995363e-02],
[-7.76854039e-01, -1.65916185e+00],
[-3.49805433e+00, -1.68495616e+00],
[-1.09042788e+00, -1.62658350e+00],
[-3.71589615e+00, 1.04451442e+00],
[-9.97610366e-01, -4.90530602e-01],
[-3.83525931e+00, -1.40595806e+00],
[-2.25741249e+00, -1.42679423e+00],
[-1.25571326e+00, -5.46424197e-01],
[-1.43755762e+00, -1.34424979e-01],
[-2.45906137e+00, -9.35277280e-01],
[-3.51848495e+00, 1.60588866e-01],
[-2.58979871e+00, -1.74611728e-01],
[ 3.07487884e-01, -1.31887146e+00],
[-1.10669179e+00, -1.75225371e+00],
[-6.05524589e-01, -1.94298038e+00],
[-8.98703769e-01, -9.04940034e-01],
[-4.49846635e+00, -8.82749915e-01],
[-2.93397799e+00, 2.73791065e-02],
[-2.10360821e+00, 1.19156767e+00],
[-2.14258208e+00, 8.87797815e-02],
[-2.47945603e+00, -1.94073927e+00],
[-1.32552574e+00, -1.62869550e-01],
[-1.95557887e+00, -1.15434826e+00],
[-2.40157020e+00, -1.59458341e+00],
[-2.29248878e+00, -3.32860296e-01],
[-1.27227224e+00, -1.21458428e+00],
[-2.93176055e-01, -1.79871509e+00],
[-2.00598883e+00, -9.05418042e-01],
[-1.18166311e+00, -5.37570242e-01],
[-1.61615645e+00, -4.70103580e-01],
[-1.42158879e+00, -5.51244626e-01],
[ 4.75973788e-01, -7.99905482e-01],
[-1.54948259e+00, -5.93363582e-01],
[-7.83947399e+00, 2.13973345e+00],
[-5.50747997e+00, -3.58139892e-02],
[-6.29200850e+00, 4.67175777e-01],
[-5.60545633e+00, -3.40738058e-01],
[-6.85055995e+00, 8.29825394e-01],
[-7.41816784e+00, -1.73117995e-01],
[-4.67799541e+00, -4.99095015e-01],
[-6.31692685e+00, -9.68980756e-01],
[-6.32773684e+00, -1.38328993e+00],
[-6.85281335e+00, 2.71758963e+00],
[-4.44072512e+00, 1.34723692e+00],
[-5.45009572e+00, -2.07736942e-01],
[-5.66033713e+00, 8.32713617e-01],
[-5.95823722e+00, -9.40175447e-02],
[-6.75926282e+00, 1.60023206e+00],
[-5.80704331e+00, 2.01019882e+00],
[-5.06601233e+00, -2.62733839e-02],
[-6.60881882e+00, 1.75163587e+00],
[-9.17147486e+00, -7.48255067e-01],
[-4.76453569e+00, -2.15573720e+00],
[-6.27283915e+00, 1.64948141e+00],
[-5.36071189e+00, 6.46120732e-01],
[-7.58119982e+00, -9.80722934e-01],
[-4.37150279e+00, -1.21297458e-01],
[-5.72317531e+00, 1.29327553e+00],
[-5.27915920e+00, -4.24582377e-02],
[-4.08087208e+00, 1.85936572e-01],
[-4.07703640e+00, 5.23238483e-01],
[-6.51910397e+00, 2.96976389e-01],
[-4.58371942e+00, -8.56815813e-01],
[-6.22824009e+00, -7.12719638e-01],
[-5.22048773e+00, 1.46819509e+00],
[-6.80015000e+00, 5.80895175e-01],
[-3.81515972e+00, -9.42985932e-01],
[-5.10748966e+00, -2.13059000e+00],
[-6.79671631e+00, 8.63090395e-01],
[-6.52449599e+00, 2.44503527e+00],
[-4.99550279e+00, 1.87768525e-01],
[-3.93985300e+00, 6.14020389e-01],
[-5.20383090e+00, 1.14476808e+00],
[-6.65308685e+00, 1.80531976e+00],
[-5.10555946e+00, 1.99218201e+00],
[-5.50747997e+00, -3.58139892e-02],
[-6.79601924e+00, 1.46068695e+00],
[-6.84735943e+00, 2.42895067e+00],
[-5.64500346e+00, 1.67771734e+00],
[-5.17956460e+00, -3.63475041e-01],
[-4.96774090e+00, 8.21140550e-01],
[-5.88614539e+00, 2.34509051e+00],
[-4.68315426e+00, 3.32033811e-01]])
赛题特征工程
异常值分析
plt.figure(figsize=(18,10))
plt.boxplot(x=train_data.values,labels=train_data.columns)
plt.hlines([-7.5,7.5],0,40,colors='r')
plt.show()
train_data=train_data[train_data['V9']>-7.5]
test_data=test_data[test_data['V9']>-7.5]
display(train_data.describe())
display(test_data.describe())
V0 | V1 | V2 | V3 | V4 | ... | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 2886.000000 | 2886.000000 | 2886.000000 | 2886.000000 | 2886.000000 | ... | 2886.000000 | 2886.000000 | 2886.000000 | 2886.000000 | 2886.000000 |
mean | 0.123725 | 0.056856 | 0.290340 | -0.068364 | 0.012254 | ... | 0.006959 | 0.198513 | 0.030099 | -0.131957 | 0.127451 |
std | 0.927984 | 0.941269 | 0.911231 | 0.970357 | 0.888037 | ... | 1.003411 | 0.985058 | 0.970258 | 1.015666 | 0.983144 |
min | -4.335000 | -5.122000 | -3.420000 | -3.956000 | -4.742000 | ... | -4.789000 | -5.695000 | -2.608000 | -3.630000 | -3.044000 |
25% | -0.292000 | -0.224250 | -0.310000 | -0.652750 | -0.385000 | ... | -0.290000 | -0.199750 | -0.412750 | -0.798750 | -0.347500 |
50% | 0.359500 | 0.273000 | 0.386000 | -0.045000 | 0.109500 | ... | 0.160000 | 0.364000 | 0.137000 | -0.186000 | 0.314000 |
75% | 0.726000 | 0.599000 | 0.918750 | 0.623500 | 0.550000 | ... | 0.273000 | 0.602000 | 0.643750 | 0.493000 | 0.793750 |
max | 2.121000 | 1.918000 | 2.828000 | 2.457000 | 2.689000 | ... | 5.110000 | 2.324000 | 5.238000 | 3.000000 | 2.538000 |
8 rows × 39 columns
V0 | V1 | V2 | V3 | V4 | ... | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | ... | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 |
mean | -0.184404 | -0.083912 | -0.434762 | 0.101671 | -0.019172 | ... | -0.011433 | -0.009985 | -0.296895 | -0.046270 | 0.195735 |
std | 1.073333 | 1.076670 | 0.969541 | 1.034925 | 1.147286 | ... | 0.989732 | 0.995213 | 0.946896 | 1.040854 | 0.940599 |
min | -4.814000 | -5.488000 | -4.283000 | -3.276000 | -4.921000 | ... | -4.627000 | -4.789000 | -7.477000 | -2.608000 | -3.346000 |
25% | -0.664000 | -0.451000 | -0.978000 | -0.644000 | -0.497000 | ... | -0.460000 | -0.290000 | -0.349000 | -0.593000 | -0.432000 |
50% | 0.065000 | 0.195000 | -0.267000 | 0.220000 | 0.118000 | ... | -0.040000 | 0.160000 | -0.270000 | 0.083000 | 0.152000 |
75% | 0.549000 | 0.589000 | 0.278000 | 0.793000 | 0.610000 | ... | 0.419000 | 0.273000 | 0.364000 | 0.651000 | 0.797000 |
max | 2.100000 | 2.120000 | 1.946000 | 2.603000 | 4.475000 | ... | 5.465000 | 5.110000 | 1.671000 | 2.861000 | 3.021000 |
8 rows × 38 columns
最大值和最小值的归一化
from sklearn.preprocessing import MinMaxScaler
features=[col for col in train_data.columns if col not in ['target']]
Scaler=MinMaxScaler()
Scaler=Scaler.fit(train_data[features])
train_data_scaler=Scaler.transform(train_data[features])
test_data_scaler=Scaler.transform(test_data[features])
train_data_scaler=pd.DataFrame(train_data_scaler)
train_data_scaler.columns=features
test_data_scaler=pd.DataFrame(test_data_scaler)
test_data_scaler.columns=features
train_data_scaler['target']=train_data['target']
display(train_data_scaler.describe())
display(test_data_scaler.describe())
V0 | V1 | V2 | V3 | V4 | ... | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 2886.000000 | 2886.000000 | 2886.000000 | 2886.000000 | 2886.000000 | ... | 2886.000000 | 2886.000000 | 2886.000000 | 2886.000000 | 2884.000000 |
mean | 0.690633 | 0.735633 | 0.593844 | 0.606212 | 0.639787 | ... | 0.484489 | 0.734944 | 0.336235 | 0.527608 | 0.127274 |
std | 0.143740 | 0.133703 | 0.145844 | 0.151311 | 0.119504 | ... | 0.101365 | 0.122840 | 0.123663 | 0.153192 | 0.983462 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -3.044000 |
25% | 0.626239 | 0.695703 | 0.497759 | 0.515087 | 0.586328 | ... | 0.454490 | 0.685279 | 0.279792 | 0.427036 | -0.348500 |
50% | 0.727153 | 0.766335 | 0.609155 | 0.609855 | 0.652873 | ... | 0.499949 | 0.755580 | 0.349860 | 0.519457 | 0.313000 |
75% | 0.783922 | 0.812642 | 0.694422 | 0.714096 | 0.712152 | ... | 0.511365 | 0.785260 | 0.414447 | 0.621870 | 0.794250 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 2.538000 |
8 rows × 39 columns
V0 | V1 | V2 | V3 | V4 | ... | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | ... | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 |
mean | 0.642905 | 0.715637 | 0.477791 | 0.632726 | 0.635558 | ... | 0.457349 | 0.482778 | 0.673164 | 0.326501 | 0.577034 |
std | 0.166253 | 0.152936 | 0.155176 | 0.161379 | 0.154392 | ... | 0.098071 | 0.100537 | 0.118082 | 0.132661 | 0.141870 |
min | -0.074195 | -0.051989 | -0.138124 | 0.106035 | -0.024088 | ... | 0.000000 | 0.000000 | -0.222222 | 0.000000 | 0.042836 |
25% | 0.568618 | 0.663494 | 0.390845 | 0.516451 | 0.571256 | ... | 0.412901 | 0.454490 | 0.666667 | 0.256819 | 0.482353 |
50% | 0.681537 | 0.755256 | 0.504641 | 0.651177 | 0.654017 | ... | 0.454518 | 0.499949 | 0.676518 | 0.342977 | 0.570437 |
75% | 0.756506 | 0.811222 | 0.591869 | 0.740527 | 0.720226 | ... | 0.500000 | 0.511365 | 0.755580 | 0.415371 | 0.667722 |
max | 0.996747 | 1.028693 | 0.858835 | 1.022766 | 1.240345 | ... | 1.000000 | 1.000000 | 0.918568 | 0.697043 | 1.003167 |
8 rows × 38 columns
查看数据分布
特征相关性
plt.figure(figsize=(20,16))
column=train_data_scaler.columns.tolist()
mcorr=train_data_scaler[column].corr(method='spearman')
mask=np.zeros_like(mcorr,dtype=np.bool)
mask[np.triu_indices_from(mask)]=True
cmap=sns.diverging_palette(220,10,as_cmap=True)
g=sns.heatmap(mcorr,mask=mask,cmap=cmap,square=True,annot=True,fmt='0.2f')
plt.show()
多重共线性分析
from statsmodels.stats.outliers_influence import variance_inflation_factor
new_numerical=['V0','V2','V3','V4','V5','V6','V10','V11','V13','V15','V16','V18','V19','V20','V22','V24','V30','V31','V37']
X=np.matrix(train_data_scaler[new_numerical])
VIF_list=[variance_inflation_factor(X,i) for i in range(X.shape[1])]
VIF_list
[216.73387180903222,
114.38118723828812,
27.863778129686356,
201.96436579080174,
78.93722825798903,
151.06983667656212,
14.519604941508451,
82.69750284665385,
28.479378440614585,
27.759176471505945,
526.6483470743831,
23.50166642638334,
19.920315849901424,
24.640481765008683,
11.816055964845381,
4.958208708452915,
37.09877416736591,
298.26442986612767,
47.854002539887034]
PCA处理
from sklearn.decomposition import PCA
pca=PCA(n_components=0.9)
new_train_pca_90=pca.fit_transform(train_data_scaler.iloc[:,:-1])
new_test_pca_90=pca.fit_transform(test_data_scaler.iloc[:,:-1])
new_train_pca_90=pd.DataFrame(new_train_pca_90)
new_test_pca_90=pd.DataFrame(new_test_pca_90)
new_train_pca_90['target']=train_data_scaler['target']
new_train_pca_90.describe()
0 | 1 | 2 | 3 | 4 | ... | 12 | 13 | 14 | 15 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 2.886000e+03 | 2.886000e+03 | 2.886000e+03 | 2.886000e+03 | 2.886000e+03 | ... | 2.886000e+03 | 2.886000e+03 | 2.886000e+03 | 2.886000e+03 | 2884.000000 |
mean | -1.969626e-17 | -2.954440e-17 | 1.969626e-17 | -4.924066e-17 | 7.878506e-17 | ... | -8.001607e-18 | -5.908879e-17 | -9.848132e-18 | 1.274102e-16 | 0.127274 |
std | 3.998976e-01 | 3.500240e-01 | 2.938631e-01 | 2.728023e-01 | 2.077128e-01 | ... | 1.193301e-01 | 1.149758e-01 | 1.133507e-01 | 1.019259e-01 | 0.983462 |
min | -1.071795e+00 | -9.429479e-01 | -9.948314e-01 | -7.103087e-01 | -7.703987e-01 | ... | -4.175153e-01 | -4.310613e-01 | -4.170535e-01 | -3.601627e-01 | -3.044000 |
25% | -2.804085e-01 | -2.613727e-01 | -2.090797e-01 | -1.945196e-01 | -1.315620e-01 | ... | -7.139961e-02 | -7.474073e-02 | -7.709743e-02 | -6.603914e-02 | -0.348500 |
50% | -1.417104e-02 | -1.277241e-02 | 2.112166e-02 | -2.337401e-02 | -5.122797e-03 | ... | -4.140670e-03 | 1.054915e-03 | -1.758387e-03 | -7.533392e-04 | 0.313000 |
75% | 2.287306e-01 | 2.317720e-01 | 2.069571e-01 | 1.657590e-01 | 1.281660e-01 | ... | 6.786199e-02 | 7.574868e-02 | 7.116829e-02 | 6.357449e-02 | 0.794250 |
max | 1.597730e+00 | 1.382802e+00 | 1.010250e+00 | 1.448007e+00 | 1.034061e+00 | ... | 5.156118e-01 | 4.978126e-01 | 4.673189e-01 | 4.570870e-01 | 2.538000 |
8 rows × 17 columns
new_train_pca_16=pca.fit_transform(train_data_scaler.iloc[:,:-1])
new_test_pca_16=pca.fit_transform(test_data_scaler.iloc[:,:-1])
new_train_pca_16=pd.DataFrame(new_train_pca_16)
new_test_pca_16=pd.DataFrame(new_test_pca_16)
new_train_pca_16['target']=train_data_scaler['target']
模型训练
导入相关库
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
切分数据
new_train_pca_16=new_train_pca_16.fillna(0)
x=new_train_pca_16[new_test_pca_16.columns]
y=new_train_pca_16['target']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)
多元线性回归
model=LinearRegression()
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
score=mean_squared_error(y_test,y_pred)
print('多元线性回归:',score)
多元线性回归: 0.2767801458340533
K近邻回归
model=KNeighborsRegressor(n_neighbors=10)
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
score=mean_squared_error(y_test,y_pred)
print('K近邻:',score)
K近邻: 0.2616963992906574
随机森林回归
model=RandomForestRegressor(n_estimators=200)
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
score=mean_squared_error(y_test,y_pred)
print('随机森林回归:',score)
随机森林回归: 0.25248205414117647
LGB回归
model=lgb.LGBMRegressor(
learning_rate=0.01,
max_depth=-1,
n_estimators=5000,
boosting_type='gbdt',
random_state=2019,
objective='regression'
)
model.fit(X=x_train,y=y_train,eval_metric='MSE',verbose=50)
LGBMRegressor(learning_rate=0.01, n_estimators=5000, objective='regression',
random_state=2019)
y_pred=model.predict(x_test)
score=mean_squared_error(y_test,y_pred)
print('LGB模型回归:',score)
LGB模型回归: 0.2658406071425124
支持向量机回归
model=SVR()
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
score=mean_squared_error(y_test,y_pred)
print('支持向量机回归:',score)
支持向量机回归: 0.23747772569513606
模型验证
欠拟合与过拟合
rng=np.random.RandomState(0)
x=rng.uniform(-3,3,100)
y=0.5*x**2+x+2+np.random.normal(0,1,size=100)
plt.scatter(x,y)
plt.show()
x=x.reshape(-1,1)
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(x,y)
mean_squared_error(y,model.predict(x))
2.9123067960253923
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
def PolynomialRegression(degree):
return Pipeline([('poly',PolynomialFeatures(degree=degree)),
('std_scaler',StandardScaler()),
('lin_reg',LinearRegression())])
poly2_reg=PolynomialRegression(degree=10)
poly2_reg.fit(x,y)
y_pred=poly2_reg.predict(x)
print(mean_squared_error(y,y_pred))
plt.scatter(x,y)
plt.plot(np.sort(x.reshape(1,-1).ravel()),y_pred[np.argsort(x.reshape(1,-1).ravel())],color='r')
plt.show()
1.005649633099633
poly2_reg=PolynomialRegression(degree=100)
poly2_reg.fit(x,y)
y_pred=poly2_reg.predict(x)
print(mean_squared_error(y,y_pred))
plt.scatter(x,y)
plt.plot(np.sort(x.reshape(1,-1).ravel()),y_pred[np.argsort(x.reshape(1,-1).ravel())],color='r')
plt.show()
0.5642131672188152
交叉验证
简单交叉验证
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(load_iris().data,load_iris().target,test_size=0.2,random_state=0)
K折交叉验证
from sklearn.model_selection import KFold
kf=KFold(n_splits=10)
kf.split(load_iris().data,load_iris().target)
<generator object _BaseKFold.split at 0x000001766560CF20>
留一法交叉验证
from sklearn.model_selection import LeaveOneOut
loo=LeaveOneOut()
留P法交叉验证
from sklearn.model_selection import LeavePOut
lpo=LeavePOut(p=5)
模型调参
网格搜索
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
x=load_iris().data
y=load_iris().target
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)
best_score=0
for gamma in [0.001,0.01,0.1,1,10,100]:
for C in [0.001,0.01,0.1,1,10,100]:
model=SVC(gamma=gamma,C=C)
model.fit(x_train,y_train)
score=model.score(x_test,y_test)
if score>best_score:
best_score=score
best_params={'gramma':gamma,'C':C}
print(best_score,best_params)
1.0 {'gramma': 0.001, 'C': 100}
学习曲线
赛题模型验证和调参
模型正则化
L2范数正则化
from sklearn.linear_model import SGDRegressor
model=SGDRegressor(max_iter=1000,tol=1e-3,penalty='L2',alpha=0.0001)
model.fit(x_train,y_train)
score_train=mean_squared_error(y_train,model.predict(x_train))
score_test=mean_squared_error(y_test,model.predict(x_test))
print('train:',score_train)
print('test:',score_test)
train: 0.05647435493144304
test: 0.05792926777671479
L1范数正则化
from sklearn.linear_model import SGDRegressor
model=SGDRegressor(max_iter=1000,tol=1e-3,penalty='L1',alpha=0.0001)
model.fit(x_train,y_train)
score_train=mean_squared_error(y_train,model.predict(x_train))
score_test=mean_squared_error(y_test,model.predict(x_test))
print('train:',score_train)
print('test:',score_test)
train: 0.05984852253374636
test: 0.05781197002659376
ElasticNet联合L1和L2范数加权正则化
from sklearn.linear_model import SGDRegressor
model=SGDRegressor(max_iter=1000,tol=1e-3,penalty='elasticnet',alpha=0.0001)
model.fit(x_train,y_train)
score_train=mean_squared_error(y_train,model.predict(x_train))
score_test=mean_squared_error(y_test,model.predict(x_test))
print('train:',score_train)
print('test:',score_test)
train: 0.05359655625009887
test: 0.06449737961501686
模型交叉验证
简单交叉验证
# x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.1,random_state=0)
model=SGDRegressor(max_iter=1000,tol=1e-3,penalty='L1',alpha=0.0001)
model.fit(x_train,y_train)
score_train=mean_squared_error(y_train,model.predict(x_train))
score_test=mean_squared_error(y_test,model.predict(x_test))
print('train:',score_train)
print('test:',score_test)
train: 0.05308657115834889
test: 0.061714302326653835
new_train_pca_16=new_train_pca_16.fillna(0)
x=new_train_pca_16[new_test_pca_16.columns]
y=new_train_pca_16['target']
K折交叉验证
from sklearn.model_selection import KFold
kf=KFold(n_splits=10)
for k,(train_index,test_index) in enumerate(kf.split(x)):
x_train,x_test,y_train,y_test=x.values[train_index],x.values[test_index],y.values[train_index],y.values[test_index]
model=SGDRegressor(max_iter=1000,tol=1e-3)
model.fit(x_train,y_train)
score_train=mean_squared_error(y_train,model.predict(x_train))
score_test=mean_squared_error(y_test,model.predict(x_test))
print('{}折 train_score:{:.10f} test_score:{:.10f}'.format(k,score_train,score_test))
0折 train_score:0.3602318315 test_score:0.1767761698
1折 train_score:0.3501239272 test_score:0.2484806010
2折 train_score:0.3556734961 test_score:0.2017659596
3折 train_score:0.3324155551 test_score:0.4088211803
4折 train_score:0.3573550484 test_score:0.2108892370
5折 train_score:0.3322709623 test_score:0.4482557026
6折 train_score:0.3244468803 test_score:0.5228384581
7折 train_score:0.3288228718 test_score:0.4946368648
8折 train_score:0.3500469469 test_score:0.2672951469
9折 train_score:0.3042545411 test_score:0.7095465815
留一法交叉验证
from sklearn.model_selection import LeaveOneOut
loo=LeaveOneOut()
for k,(train_index,test_index) in enumerate(loo.split(x)):
x_train,x_test,y_train,y_test=x.values[train_index],x.values[test_index],y.values[train_index],y.values[test_index]
model=SGDRegressor(max_iter=1000,tol=1e-3)
model.fit(x_train,y_train)
score_train=mean_squared_error(y_train,model.predict(x_train))
score_test=mean_squared_error(y_test,model.predict(x_test))
print('{} train_score:{:.10f} test_score:{:.10f}'.format(k,score_train,score_test))
if k>10:
break
0 train_score:0.3406805076 test_score:0.5057733852
1 train_score:0.3407447403 test_score:0.4344293538
2 train_score:0.3407808103 test_score:0.1584206032
3 train_score:0.3409301300 test_score:0.0324807358
4 train_score:0.3407225085 test_score:0.1596873904
5 train_score:0.3407800526 test_score:0.0046326254
6 train_score:0.3408423258 test_score:0.0320690996
7 train_score:0.3409134343 test_score:0.0560273184
8 train_score:0.3406269465 test_score:0.4012466323
9 train_score:0.3408646877 test_score:0.0149950572
10 train_score:0.3408549372 test_score:0.0269544246
11 train_score:0.3408678993 test_score:0.1228838022
留p法交叉验证
from sklearn.model_selection import LeavePOut
lpo=LeaveOneOut()
for k,(train_index,test_index) in enumerate(lpo.split(x)):
x_train,x_test,y_train,y_test=x.values[train_index],x.values[test_index],y.values[train_index],y.values[test_index]
model=SGDRegressor(max_iter=1000,tol=1e-3)
model.fit(x_train,y_train)
score_train=mean_squared_error(y_train,model.predict(x_train))
score_test=mean_squared_error(y_test,model.predict(x_test))
print('{} train_score:{:.10f} test_score:{:.10f}'.format(k,score_train,score_test))
if k>10:
break
0 train_score:0.3407367010 test_score:0.4906048379
1 train_score:0.3408172840 test_score:0.4394575062
2 train_score:0.3408328069 test_score:0.1551440570
3 train_score:0.3409097661 test_score:0.0316359796
4 train_score:0.3408101333 test_score:0.1584800014
5 train_score:0.3403482646 test_score:0.0049208166
6 train_score:0.3408397215 test_score:0.0283428261
7 train_score:0.3407767680 test_score:0.0595601493
8 train_score:0.3408073577 test_score:0.4095043626
9 train_score:0.3408192984 test_score:0.0118659336
10 train_score:0.3408125770 test_score:0.0280156182
11 train_score:0.3407741379 test_score:0.1156643127
模型超参空间及调参
穷举网格搜索
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)
model=RandomForestRegressor()
params={'n_estimators':[50,100,200],
'max_depth':[1,2,3]}
GV=GridSearchCV(model,params,cv=10)
GV.fit(x_train,y_train)
GridSearchCV(cv=10, estimator=RandomForestRegressor(),
param_grid={'max_depth': [1, 2, 3],
'n_estimators': [50, 100, 200]})
GV.best_score_
0.5352571593888877
GV.best_params_
{'max_depth': 3, 'n_estimators': 100}
mean_squared_error(y_test,GV.predict(x_test))
0.3555098696794756
随机参数优化
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)
model=RandomForestRegressor()
params={'n_estimators':[50,100,200],
'max_depth':[1,2,3]}
RV=RandomizedSearchCV(model,params,cv=10)
RV.fit(x_train,y_train)
RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(),
param_distributions={'max_depth': [1, 2, 3],
'n_estimators': [50, 100, 200]})
RV.best_score_
0.5349094783965866
RV.best_params_
{'n_estimators': 50, 'max_depth': 3}
mean_squared_error(y_test,RV.predict(x_test))
0.3606795415655421
学习曲线和验证曲线
学习曲线
特征优化
赛题特征优化
导入数据
train_data=pd.read_csv('data/zhengqi_train.txt',sep='\t',encoding='utf-8')
test_data=pd.read_csv('data/zhengqi_test.txt',sep='\t',encoding='utf-8')
特征构造方法
epsilon=1e-5
func_dict={'add':lambda x,y:x+y,
'mins':lambda x,y:x-y,
'multi':lambda x,y:x*y,
'div':lambda x,y:x/(y+epsilon)
}
特征构造函数
def make_features(train_data,test_data,func_dict,col_list):
train_data,test_data=train_data.copy(),test_data.copy()
for coli in col_list:
for colj in col_list:
for func_name,func in func_dict.items():
for data in [train_data,test_data]:
feature=func(data[coli],data[colj])
feature_name='-'.join([coli,func_name,colj])
data[feature_name]=feature
return train_data,test_data
train_data2,test_data2=make_features(train_data,test_data,func_dict,test_data.columns)
特征降维处理
from sklearn.decomposition import PCA
pca=PCA(n_components=500)
pca=pca.fit(train_data2.iloc[:,:-1])
train_data2_pca=pca.transform(train_data2.iloc[:,:-1])
test_data2_pca=pca.transform(test_data2)
train_data2_pca=pd.DataFrame(train_data2_pca)
test_data2_pca=pd.DataFrame(test_data2_pca)
train_data2_pca['target']=train_data2['target']
x=train_data2_pca[test_data2_pca.columns].values
y=train_data2_pca['target']
模型训练与评估
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np
Folds=5
kf=KFold(n_splits=Folds,shuffle=True,random_state=2021)
mse_dict={'train_mse':[],'test_mse':[]}
for i,(train_index,test_index) in enumerate(kf.split(x)):
model=lgb.LGBMRegressor(learning_rate=0.01,
max_depth=-1,
n_estimators=5000,
boosting_type='gbdt',
random_state=2021,
objective='regression')
x_train=x[train_index]
x_test=x[test_index]
y_train=y[train_index]
y_test=y[test_index]
model.fit(X=x_train,
y=y_train,
eval_set=[(x_train,y_train),
(x_test,y_test)],
eval_names=['Train','Test'],
early_stopping_rounds=100,
eval_metric='MSE',
verbose=50)
y_train_pred=model.predict(x_train,num_iteration=model.best_iteration_)
y_test_pred=model.predict(x_test,num_iteration=model.best_iteration_)
print('{}折训练和预测MSE'.format(i+1))
train_mse=mean_squared_error(y_train,y_train_pred)
test_mse=mean_squared_error(y_test,y_test_pred)
print('train_mse:{:.10f}'.format(train_mse))
print('test_mse:{:.10f}'.format(test_mse))
mse_dict['train_mse'].append(train_mse)
mse_dict['test_mse'].append(test_mse)
print('************************************')
print('train_mse:{:.10f}'.format(np.mean(mse_dict['train_mse'])))
print('test_mse:{:.10f}'.format(np.mean(mse_dict['test_mse'])))
Training until validation scores don't improve for 100 rounds
[50] Train's l2: 0.543717 Test's l2: 0.604572
[100] Train's l2: 0.340544 Test's l2: 0.463274
[150] Train's l2: 0.2306 Test's l2: 0.388697
[200] Train's l2: 0.167484 Test's l2: 0.351103
[250] Train's l2: 0.127077 Test's l2: 0.332442
[300] Train's l2: 0.0993325 Test's l2: 0.322074
[350] Train's l2: 0.079262 Test's l2: 0.315931
[400] Train's l2: 0.0637016 Test's l2: 0.309216
[450] Train's l2: 0.0517757 Test's l2: 0.303667
[500] Train's l2: 0.0424371 Test's l2: 0.300606
[550] Train's l2: 0.0350781 Test's l2: 0.29846
[600] Train's l2: 0.0291966 Test's l2: 0.296649
[650] Train's l2: 0.0243927 Test's l2: 0.295125
[700] Train's l2: 0.0204512 Test's l2: 0.294314
[750] Train's l2: 0.01721 Test's l2: 0.294088
[800] Train's l2: 0.0145537 Test's l2: 0.29318
[850] Train's l2: 0.0123332 Test's l2: 0.292481
[900] Train's l2: 0.0104506 Test's l2: 0.291354
[950] Train's l2: 0.00889746 Test's l2: 0.290487
[1000] Train's l2: 0.00758584 Test's l2: 0.289841
[1050] Train's l2: 0.0064809 Test's l2: 0.289542
[1100] Train's l2: 0.00557043 Test's l2: 0.288911
[1150] Train's l2: 0.00477429 Test's l2: 0.288043
[1200] Train's l2: 0.00409921 Test's l2: 0.287434
[1250] Train's l2: 0.00352492 Test's l2: 0.287135
[1300] Train's l2: 0.00303538 Test's l2: 0.286735
[1350] Train's l2: 0.00262049 Test's l2: 0.286454
[1400] Train's l2: 0.00226543 Test's l2: 0.286244
[1450] Train's l2: 0.00196362 Test's l2: 0.285955
[1500] Train's l2: 0.00170203 Test's l2: 0.285766
[1550] Train's l2: 0.00148613 Test's l2: 0.285561
[1600] Train's l2: 0.00130007 Test's l2: 0.285492
[1650] Train's l2: 0.00114392 Test's l2: 0.285355
[1700] Train's l2: 0.00100922 Test's l2: 0.285303
[1750] Train's l2: 0.000892295 Test's l2: 0.285215
[1800] Train's l2: 0.00079264 Test's l2: 0.28507
[1850] Train's l2: 0.000704515 Test's l2: 0.28501
[1900] Train's l2: 0.000627791 Test's l2: 0.284935
[1950] Train's l2: 0.000561643 Test's l2: 0.284884
[2000] Train's l2: 0.000505995 Test's l2: 0.284834
[2050] Train's l2: 0.000458034 Test's l2: 0.284834
[2100] Train's l2: 0.000415711 Test's l2: 0.284781
[2150] Train's l2: 0.000378973 Test's l2: 0.284741
[2200] Train's l2: 0.000346808 Test's l2: 0.284702
[2250] Train's l2: 0.000318028 Test's l2: 0.284677
[2300] Train's l2: 0.00029307 Test's l2: 0.284681
[2350] Train's l2: 0.000270541 Test's l2: 0.284679
Early stopping, best iteration is:
[2265] Train's l2: 0.000310325 Test's l2: 0.284666
1折训练和预测MSE
train_mse:0.0003103246
test_mse:0.2846657916
Training until validation scores don't improve for 100 rounds
[50] Train's l2: 0.531303 Test's l2: 0.684378
[100] Train's l2: 0.335423 Test's l2: 0.524272
[150] Train's l2: 0.228201 Test's l2: 0.442181
[200] Train's l2: 0.166035 Test's l2: 0.403058
[250] Train's l2: 0.125805 Test's l2: 0.377363
[300] Train's l2: 0.0984113 Test's l2: 0.361378
[350] Train's l2: 0.078351 Test's l2: 0.351406
[400] Train's l2: 0.0629611 Test's l2: 0.34581
[450] Train's l2: 0.0511864 Test's l2: 0.341351
[500] Train's l2: 0.0419425 Test's l2: 0.338046
[550] Train's l2: 0.0347016 Test's l2: 0.335465
[600] Train's l2: 0.0288848 Test's l2: 0.333737
[650] Train's l2: 0.0240954 Test's l2: 0.331649
[700] Train's l2: 0.0201629 Test's l2: 0.330009
[750] Train's l2: 0.0169717 Test's l2: 0.328117
[800] Train's l2: 0.0143134 Test's l2: 0.327074
[850] Train's l2: 0.0121158 Test's l2: 0.326252
[900] Train's l2: 0.0102999 Test's l2: 0.325618
[950] Train's l2: 0.0087568 Test's l2: 0.325154
[1000] Train's l2: 0.00746425 Test's l2: 0.324834
[1050] Train's l2: 0.006398 Test's l2: 0.324458
[1100] Train's l2: 0.00549152 Test's l2: 0.323998
[1150] Train's l2: 0.0047352 Test's l2: 0.323608
[1200] Train's l2: 0.00409785 Test's l2: 0.323079
[1250] Train's l2: 0.00356497 Test's l2: 0.322605
[1300] Train's l2: 0.00310907 Test's l2: 0.322257
[1350] Train's l2: 0.00272473 Test's l2: 0.322146
[1400] Train's l2: 0.00239642 Test's l2: 0.321893
[1450] Train's l2: 0.00211186 Test's l2: 0.321744
[1500] Train's l2: 0.00187086 Test's l2: 0.321672
[1550] Train's l2: 0.00166422 Test's l2: 0.321655
[1600] Train's l2: 0.00148658 Test's l2: 0.321614
[1650] Train's l2: 0.0013316 Test's l2: 0.321636
[1700] Train's l2: 0.00119632 Test's l2: 0.321584
[1750] Train's l2: 0.00107889 Test's l2: 0.321541
[1800] Train's l2: 0.000979346 Test's l2: 0.321536
Early stopping, best iteration is:
[1733] Train's l2: 0.00111674 Test's l2: 0.321515
2折训练和预测MSE
train_mse:0.0011167390
test_mse:0.3215149248
Training until validation scores don't improve for 100 rounds
[50] Train's l2: 0.540152 Test's l2: 0.63719
[100] Train's l2: 0.339946 Test's l2: 0.471552
[150] Train's l2: 0.23119 Test's l2: 0.395507
[200] Train's l2: 0.167178 Test's l2: 0.35583
[250] Train's l2: 0.126276 Test's l2: 0.338397
[300] Train's l2: 0.0984726 Test's l2: 0.326181
[350] Train's l2: 0.078372 Test's l2: 0.318223
[400] Train's l2: 0.0631969 Test's l2: 0.313255
[450] Train's l2: 0.0516128 Test's l2: 0.309614
[500] Train's l2: 0.042384 Test's l2: 0.307441
[550] Train's l2: 0.035031 Test's l2: 0.305475
[600] Train's l2: 0.0291888 Test's l2: 0.304062
[650] Train's l2: 0.0244644 Test's l2: 0.302963
[700] Train's l2: 0.020578 Test's l2: 0.302588
[750] Train's l2: 0.0174154 Test's l2: 0.302293
[800] Train's l2: 0.0147865 Test's l2: 0.302077
[850] Train's l2: 0.0126023 Test's l2: 0.301957
[900] Train's l2: 0.0107725 Test's l2: 0.301826
[950] Train's l2: 0.00925973 Test's l2: 0.301487
[1000] Train's l2: 0.00798974 Test's l2: 0.301299
[1050] Train's l2: 0.00690221 Test's l2: 0.301037
[1100] Train's l2: 0.00598077 Test's l2: 0.300697
[1150] Train's l2: 0.00519833 Test's l2: 0.300648
[1200] Train's l2: 0.00452834 Test's l2: 0.300608
[1250] Train's l2: 0.00396283 Test's l2: 0.300488
[1300] Train's l2: 0.00347648 Test's l2: 0.300355
[1350] Train's l2: 0.00306102 Test's l2: 0.300381
[1400] Train's l2: 0.00269621 Test's l2: 0.300229
[1450] Train's l2: 0.0023818 Test's l2: 0.300237
[1500] Train's l2: 0.0021141 Test's l2: 0.300083
[1550] Train's l2: 0.00187851 Test's l2: 0.300017
[1600] Train's l2: 0.00167574 Test's l2: 0.299953
[1650] Train's l2: 0.00149718 Test's l2: 0.299943
[1700] Train's l2: 0.00134817 Test's l2: 0.299958
[1750] Train's l2: 0.00121449 Test's l2: 0.299946
Early stopping, best iteration is:
[1663] Train's l2: 0.00145717 Test's l2: 0.299902
3折训练和预测MSE
train_mse:0.0014571665
test_mse:0.2999015382
Training until validation scores don't improve for 100 rounds
[50] Train's l2: 0.553251 Test's l2: 0.57018
[100] Train's l2: 0.3481 Test's l2: 0.426055
[150] Train's l2: 0.237178 Test's l2: 0.358121
[200] Train's l2: 0.172136 Test's l2: 0.321284
[250] Train's l2: 0.130327 Test's l2: 0.30248
[300] Train's l2: 0.101715 Test's l2: 0.290685
[350] Train's l2: 0.0805702 Test's l2: 0.282179
[400] Train's l2: 0.0649358 Test's l2: 0.276455
[450] Train's l2: 0.0527739 Test's l2: 0.27231
[500] Train's l2: 0.0433096 Test's l2: 0.269285
[550] Train's l2: 0.0357324 Test's l2: 0.26767
[600] Train's l2: 0.0296251 Test's l2: 0.265957
[650] Train's l2: 0.0246598 Test's l2: 0.264329
[700] Train's l2: 0.0206447 Test's l2: 0.263283
[750] Train's l2: 0.0173093 Test's l2: 0.262082
[800] Train's l2: 0.0145206 Test's l2: 0.261284
[850] Train's l2: 0.0122394 Test's l2: 0.260971
[900] Train's l2: 0.0103238 Test's l2: 0.260822
[950] Train's l2: 0.00874249 Test's l2: 0.260449
[1000] Train's l2: 0.00743096 Test's l2: 0.26005
[1050] Train's l2: 0.00629931 Test's l2: 0.259851
[1100] Train's l2: 0.005356 Test's l2: 0.259677
[1150] Train's l2: 0.00457325 Test's l2: 0.259395
[1200] Train's l2: 0.00390414 Test's l2: 0.259323
[1250] Train's l2: 0.00332433 Test's l2: 0.259174
[1300] Train's l2: 0.00283924 Test's l2: 0.259046
[1350] Train's l2: 0.00243486 Test's l2: 0.258849
[1400] Train's l2: 0.00208991 Test's l2: 0.258884
[1450] Train's l2: 0.00179522 Test's l2: 0.258815
[1500] Train's l2: 0.001541 Test's l2: 0.258647
[1550] Train's l2: 0.00132456 Test's l2: 0.258518
[1600] Train's l2: 0.00113926 Test's l2: 0.25847
[1650] Train's l2: 0.000977932 Test's l2: 0.258347
[1700] Train's l2: 0.000843273 Test's l2: 0.258304
[1750] Train's l2: 0.000728304 Test's l2: 0.258191
[1800] Train's l2: 0.000627096 Test's l2: 0.258132
[1850] Train's l2: 0.000541629 Test's l2: 0.258123
[1900] Train's l2: 0.000467428 Test's l2: 0.258073
[1950] Train's l2: 0.000404477 Test's l2: 0.258052
[2000] Train's l2: 0.000351368 Test's l2: 0.258051
[2050] Train's l2: 0.000304616 Test's l2: 0.257984
[2100] Train's l2: 0.000263359 Test's l2: 0.257944
[2150] Train's l2: 0.00022769 Test's l2: 0.25789
[2200] Train's l2: 0.000197298 Test's l2: 0.257826
[2250] Train's l2: 0.000170155 Test's l2: 0.257816
[2300] Train's l2: 0.000147563 Test's l2: 0.257776
[2350] Train's l2: 0.000127935 Test's l2: 0.257767
[2400] Train's l2: 0.000111268 Test's l2: 0.257763
[2450] Train's l2: 9.69653e-05 Test's l2: 0.257755
[2500] Train's l2: 8.48729e-05 Test's l2: 0.257736
[2550] Train's l2: 7.43364e-05 Test's l2: 0.257733
[2600] Train's l2: 6.49667e-05 Test's l2: 0.257718
[2650] Train's l2: 5.67423e-05 Test's l2: 0.257705
[2700] Train's l2: 4.96559e-05 Test's l2: 0.257674
[2750] Train's l2: 4.34688e-05 Test's l2: 0.257662
[2800] Train's l2: 3.83567e-05 Test's l2: 0.257652
[2850] Train's l2: 3.38728e-05 Test's l2: 0.257646
[2900] Train's l2: 2.98855e-05 Test's l2: 0.257639
[2950] Train's l2: 2.63891e-05 Test's l2: 0.257639
[3000] Train's l2: 2.33554e-05 Test's l2: 0.257642
Early stopping, best iteration is:
[2931] Train's l2: 2.76319e-05 Test's l2: 0.257634
4折训练和预测MSE
train_mse:0.0000276319
test_mse:0.2576340931
Training until validation scores don't improve for 100 rounds
[50] Train's l2: 0.542659 Test's l2: 0.617169
[100] Train's l2: 0.340008 Test's l2: 0.477799
[150] Train's l2: 0.231568 Test's l2: 0.406837
[200] Train's l2: 0.168562 Test's l2: 0.367475
[250] Train's l2: 0.127808 Test's l2: 0.346231
[300] Train's l2: 0.0996046 Test's l2: 0.332399
[350] Train's l2: 0.079321 Test's l2: 0.324124
[400] Train's l2: 0.0639138 Test's l2: 0.318926
[450] Train's l2: 0.051887 Test's l2: 0.31537
[500] Train's l2: 0.0425211 Test's l2: 0.312224
[550] Train's l2: 0.0350794 Test's l2: 0.310244
[600] Train's l2: 0.0291183 Test's l2: 0.309378
[650] Train's l2: 0.024318 Test's l2: 0.308742
[700] Train's l2: 0.0203776 Test's l2: 0.307923
[750] Train's l2: 0.0171659 Test's l2: 0.307359
[800] Train's l2: 0.01449 Test's l2: 0.306549
[850] Train's l2: 0.0122634 Test's l2: 0.306337
[900] Train's l2: 0.01044 Test's l2: 0.30606
[950] Train's l2: 0.00890911 Test's l2: 0.305977
[1000] Train's l2: 0.00759587 Test's l2: 0.305823
[1050] Train's l2: 0.00654359 Test's l2: 0.305626
[1100] Train's l2: 0.00564639 Test's l2: 0.305201
[1150] Train's l2: 0.00487897 Test's l2: 0.305092
[1200] Train's l2: 0.00424035 Test's l2: 0.305018
[1250] Train's l2: 0.00369461 Test's l2: 0.304932
[1300] Train's l2: 0.00322024 Test's l2: 0.304962
[1350] Train's l2: 0.00281733 Test's l2: 0.304944
Early stopping, best iteration is:
[1261] Train's l2: 0.00358574 Test's l2: 0.304864
5折训练和预测MSE
train_mse:0.0035857405
test_mse:0.3048641005
************************************
train_mse:0.0012995205
test_mse:0.2937160896