制造业
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
from scipy import stats
from scipy.stats import norm, skew
# Load the manufacturing-sector financial statements dataset.
t1 = pd.read_csv("制造业.csv")
# Feature matrix: everything except the fraud label column "FLAG".
t1_train = t1.drop("FLAG", axis=1)
t1
TICKER_SYMBOL
ACT_PUBTIME
PUBLISH_DATE
END_DATE_REP
END_DATE
REPORT_TYPE
FISCAL_PERIOD
MERGED_FLAG
ACCOUTING_STANDARDS
CURRENCY_CD
...
CA_TURNOVER
OPER_CYCLE
INVEN_TURNOVER
FA_TURNOVER
TFA_TURNOVER
DAYS_AP
DAYS_INVEN
TA_TURNOVER
AR_TURNOVER
FLAG
0
4019
3
3
2
1
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
1
8166
3
3
2
1
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
2
11737
3
3
2
1
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
3
16479
3
3
2
1
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
4
16842
4
4
3
1
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
13965
4992204
7
7
7
6
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
13966
4992858
7
7
7
6
A
12
1
CHAS_2007
CNY
...
NaN
0.000
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
13967
4993201
7
7
7
6
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
13968
4998808
7
7
7
6
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
13969
4999709
7
7
7
6
A
12
1
CHAS_2007
CNY
...
2.6656
23.084
21.9179
0.6571
0.6256
33.6589
16.4249
0.3692
54.0618
NaN
13970 rows × 363 columns
1 数据预处理
1.1 计算缺失率,并降序排序
# Per-column percentage of missing values, sorted largest first.
# (.isnull().mean() * 100 == .isnull().sum() / len(df) * 100 for a non-empty frame)
all_data_na = (t1_train.isnull().mean() * 100).sort_values(ascending=False)
missing_data = pd.DataFrame({'missing_data': all_data_na})
missing_data
missing_data
ACCRUED_EXP
99.971367
N_INC_BORR_OTH_FI
99.806729
PERPETUAL_BOND_L
99.634932
PREFERRED_STOCK_L
99.606299
PREFERRED_STOCK_E
99.591983
...
...
T_COMPR_INCOME
0.000000
N_INCOME_ATTR_P
0.000000
FINAN_EXP
0.000000
ACT_PUBTIME
0.000000
TICKER_SYMBOL
0.000000
362 rows × 1 columns
将缺失率用图表的方式展示
# Bar chart of the per-feature missing-value percentages computed above.
fig, ax = plt.subplots(figsize=(30, 15))
plt.xticks(rotation=90)  # vertical labels so 360+ feature names stay readable
sns.barplot(x=all_data_na.index, y=all_data_na)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)
Text(0.5 , 1.0 , 'Percent missing data by feature')
# Partition feature names by missing rate:
#   count1 -> more than 80% missing (candidates for deletion)
#   count2 -> less than 20% missing (candidates for KNN imputation)
missing_data_count1 = all_data_na.index[all_data_na > 80]
missing_data_count2 = all_data_na.index[all_data_na < 20]
print(missing_data_count1.shape, missing_data_count2.shape)
# Frame of the columns whose missing rate exceeds 80%.
# The original hard-coded 93 here; that is exactly len(missing_data_count1)
# for this dataset (the displayed result below is "93 rows x 1 columns"),
# so derive the count from the threshold mask instead of duplicating the
# magic number — same behaviour, robust to a refreshed CSV.
n_drop = len(missing_data_count1)  # == 93 on the current data
a = missing_data.values[:n_drop]
x = pd.DataFrame(a, index=missing_data.index[:n_drop])
x
0
ACCRUED_EXP
99.971367
N_INC_BORR_OTH_FI
99.806729
PERPETUAL_BOND_L
99.634932
PREFERRED_STOCK_L
99.606299
PREFERRED_STOCK_E
99.591983
...
...
OP_CL
81.338583
R_D
81.159628
N_CF_OPA_LIAB
80.952040
N_CF_NFA_LIAB
80.952040
OP_TL
80.916249
93 rows × 1 columns
1.2 删除缺失率在80%以上的特征
# Drop the 93 features whose missing rate exceeds 80% (their names are x.index).
t2 = t1_train.drop(columns=x.index)
t2
TICKER_SYMBOL
ACT_PUBTIME
PUBLISH_DATE
END_DATE_REP
END_DATE
REPORT_TYPE
FISCAL_PERIOD
MERGED_FLAG
ACCOUTING_STANDARDS
CURRENCY_CD
...
AP_TURNOVER
CA_TURNOVER
OPER_CYCLE
INVEN_TURNOVER
FA_TURNOVER
TFA_TURNOVER
DAYS_AP
DAYS_INVEN
TA_TURNOVER
AR_TURNOVER
0
4019
3
3
2
1
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
1
8166
3
3
2
1
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2
11737
3
3
2
1
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
3
16479
3
3
2
1
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
4
16842
4
4
3
1
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
13965
4992204
7
7
7
6
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
13966
4992858
7
7
7
6
A
12
1
CHAS_2007
CNY
...
NaN
NaN
0.000
NaN
NaN
NaN
NaN
NaN
NaN
NaN
13967
4993201
7
7
7
6
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
13968
4998808
7
7
7
6
A
12
1
CHAS_2007
CNY
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
13969
4999709
7
7
7
6
A
12
1
CHAS_2007
CNY
...
10.6956
2.6656
23.084
21.9179
0.6571
0.6256
33.6589
16.4249
0.3692
54.0618
13970 rows × 269 columns
1.3 对缺失率20%到80%的数据填充中位数
# Features ranked 93..277 by missing rate (between 20% and 80% missing):
# impute each with its own column median.
# NOTE(review): the 93/278 boundaries are hard-coded to match this dataset's
# missing-rate ranking — confirm if the input CSV ever changes.
b = missing_data.index[93:278]
# Vectorised: DataFrame.fillna with a per-column median Series fills each
# column from its own median — identical to the original per-column loop,
# without Python-level iteration.
t2[b] = t2[b].fillna(t2[b].median())
t2
TICKER_SYMBOL
ACT_PUBTIME
PUBLISH_DATE
END_DATE_REP
END_DATE
REPORT_TYPE
FISCAL_PERIOD
MERGED_FLAG
ACCOUTING_STANDARDS
CURRENCY_CD
...
AP_TURNOVER
CA_TURNOVER
OPER_CYCLE
INVEN_TURNOVER
FA_TURNOVER
TFA_TURNOVER
DAYS_AP
DAYS_INVEN
TA_TURNOVER
AR_TURNOVER
0
4019
3
3
2
1
A
12
1
CHAS_2007
CNY
...
4.8617
1.0942
149.7293
4.1120
3.0696
2.7145
74.30515
87.75175
0.5354
8.49245
1
8166
3
3
2
1
A
12
1
CHAS_2007
CNY
...
4.8617
1.0942
149.7293
4.1120
3.0696
2.7145
74.30515
87.75175
0.5354
8.49245
2
11737
3
3
2
1
A
12
1
CHAS_2007
CNY
...
4.8617
1.0942
149.7293
4.1120
3.0696
2.7145
74.30515
87.75175
0.5354
8.49245
3
16479
3
3
2
1
A
12
1
CHAS_2007
CNY
...
4.8617
1.0942
149.7293
4.1120
3.0696
2.7145
74.30515
87.75175
0.5354
8.49245
4
16842
4
4
3
1
A
12
1
CHAS_2007
CNY
...
4.8617
1.0942
149.7293
4.1120
3.0696
2.7145
74.30515
87.75175
0.5354
8.49245
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
13965
4992204
7
7
7
6
A
12
1
CHAS_2007
CNY
...
4.8617
1.0942
149.7293
4.1120
3.0696
2.7145
74.30515
87.75175
0.5354
8.49245
13966
4992858
7
7
7
6
A
12
1
CHAS_2007
CNY
...
4.8617
1.0942
0.0000
4.1120
3.0696
2.7145
74.30515
87.75175
0.5354
8.49245
13967
4993201
7
7
7
6
A
12
1
CHAS_2007
CNY
...
4.8617
1.0942
149.7293
4.1120
3.0696
2.7145
74.30515
87.75175
0.5354
8.49245
13968
4998808
7
7
7
6
A
12
1
CHAS_2007
CNY
...
4.8617
1.0942
149.7293
4.1120
3.0696
2.7145
74.30515
87.75175
0.5354
8.49245
13969
4999709
7
7
7
6
A
12
1
CHAS_2007
CNY
...
10.6956
2.6656
23.0840
21.9179
0.6571
0.6256
33.65890
16.42490
0.3692
54.06180
13970 rows × 269 columns
1.4 对缺失率20%以下的数据使用KNN填充
# Features ranked 278..335 by missing rate (under 20% missing):
# impute with the average of the 10 nearest neighbours.
d = missing_data.index[278:336]
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=10)
t2[d] = imputer.fit_transform(t2[d])
# Sanity check: no missing values should remain in any column.
print(t2.isnull().sum())
TICKER_SYMBOL 0
ACT_PUBTIME 0
PUBLISH_DATE 0
END_DATE_REP 0
END_DATE 0
..
TFA_TURNOVER 0
DAYS_AP 0
DAYS_INVEN 0
TA_TURNOVER 0
AR_TURNOVER 0
Length: 269 , dtype: int64
1.5 删除与预测是否造假结果无关的特征因子
删除股票代码,实际披露时间,发布时间,报告截止日期,截止日期,报告类型,会计区间,合并标志:1-合并,2-母公司,会计准则,货币代码共 10 个与预测是否造假结果无关的特征因子
t2=t2.drop(["TICKER_SYMBOL" ,"ACT_PUBTIME" ,"PUBLISH_DATE" ,"END_DATE_REP" ,"END_DATE" ,"REPORT_TYPE" ,"FISCAL_PERIOD" ,"MERGED_FLAG" ,"ACCOUTING_STANDARDS" ,"CURRENCY_CD" ],axis=1 )
1.6 查看是否还存在缺失值
1.7 对数据进行标准化
# Z-score standardise every remaining feature (zero mean, unit variance),
# keeping the original column names.
from sklearn.preprocessing import StandardScaler
t4 = pd.DataFrame(StandardScaler().fit_transform(t2), columns=t2.columns)
t4
CASH_C_EQUIV
NOTES_RECEIV
AR
PREPAYMENT
INT_RECEIV
OTH_RECEIV
INVENTORIES
OTH_CA
T_CA
AVAIL_FOR_SALE_FA
...
AP_TURNOVER
CA_TURNOVER
OPER_CYCLE
INVEN_TURNOVER
FA_TURNOVER
TFA_TURNOVER
DAYS_AP
DAYS_INVEN
TA_TURNOVER
AR_TURNOVER
0
-0.110544
-0.106696
-0.161667
-0.182694
-0.067294
-0.177580
-0.271929
-0.054680
-0.201905
-0.087742
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
1
-0.036496
1.088871
-0.182107
-0.052401
-0.085668
-0.026558
0.016419
-0.171927
0.060346
-0.087742
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
2
0.070766
-0.189223
0.057981
-0.140868
0.021829
-0.115114
-0.100801
0.073932
-0.023286
-0.110754
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
3
-0.039637
-0.205146
-0.184401
-0.159863
-0.062639
-0.060387
-0.197651
0.346521
-0.105029
-0.087742
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
4
-0.244743
-0.199970
-0.265148
-0.148300
-0.085668
-0.182752
-0.279125
-0.178592
-0.283117
-0.087742
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
13965
-0.245654
-0.175257
-0.248184
-0.192613
-0.085668
-0.180662
-0.279316
-0.178050
-0.279115
-0.087742
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
13966
-0.204023
-0.205182
-0.257308
-0.191965
-0.085668
-0.175087
-0.270255
-0.175323
-0.266459
-0.087742
...
-0.071554
-0.120861
-0.063158
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
13967
-0.227119
-0.204127
-0.201336
-0.164736
-0.085668
-0.164288
-0.183161
-0.162139
-0.237732
-0.087742
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
13968
0.100220
-0.204577
-0.038156
-0.128786
-0.085668
-0.128173
0.075970
-0.152256
-0.019633
-0.068500
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
13969
1.609554
0.222399
1.025478
0.436742
-0.085668
0.581671
0.691256
0.019537
1.080323
-0.087742
...
0.306581
2.042642
-0.058198
0.017576
-0.069761
-0.013200
-0.009263
-0.050238
-0.593860
-0.006262
13970 rows × 259 columns
2 划分数据集
以前5年数据为训练集、验证集train,第6年为测试集test
# First 5 years (rows 0..11309) -> train/validation; year 6 -> test.
# .copy() is required: without it `train` is a view of t4, and the FLAG
# assignment below triggers pandas' chained-assignment problem
# (SettingWithCopyWarning, potentially a silent no-op).
train = t4.iloc[:11310, :].copy()
test = t4.iloc[11310:, :259]
# Re-attach the label; t1 and t4 share the same row order, so positional
# index alignment holds for the first 11310 rows.
train["FLAG"] = t1["FLAG"]
train
CASH_C_EQUIV
NOTES_RECEIV
AR
PREPAYMENT
INT_RECEIV
OTH_RECEIV
INVENTORIES
OTH_CA
T_CA
AVAIL_FOR_SALE_FA
...
CA_TURNOVER
OPER_CYCLE
INVEN_TURNOVER
FA_TURNOVER
TFA_TURNOVER
DAYS_AP
DAYS_INVEN
TA_TURNOVER
AR_TURNOVER
FLAG
0
-0.110544
-0.106696
-0.161667
-0.182694
-0.067294
-0.177580
-0.271929
-0.054680
-0.201905
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
1
-0.036496
1.088871
-0.182107
-0.052401
-0.085668
-0.026558
0.016419
-0.171927
0.060346
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
2
0.070766
-0.189223
0.057981
-0.140868
0.021829
-0.115114
-0.100801
0.073932
-0.023286
-0.110754
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
3
-0.039637
-0.205146
-0.184401
-0.159863
-0.062639
-0.060387
-0.197651
0.346521
-0.105029
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
4
-0.244743
-0.199970
-0.265148
-0.148300
-0.085668
-0.182752
-0.279125
-0.178592
-0.283117
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
11305
-0.248180
-0.177748
-0.244404
-0.195324
-0.085668
-0.182942
-0.277525
-0.177054
-0.279415
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
11306
-0.218672
-0.196336
-0.255531
-0.193333
-0.085668
-0.160477
-0.268560
-0.174560
-0.270623
-0.087742
...
-1.587291
2.125000
-0.031456
-0.041665
-0.012392
0.005011
2.477695
-1.585968
-0.054737
0.0
11307
-0.200565
-0.204200
-0.232985
-0.177734
-0.085668
-0.175507
-0.207126
-0.160690
-0.242246
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
11308
-0.101380
-0.197020
-0.049710
-0.100780
-0.085668
-0.178231
0.042636
-0.123428
-0.095392
-0.064501
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
11309
1.326097
0.141651
0.889289
0.170126
0.029234
0.515964
0.529339
0.024492
0.854325
2.263116
...
0.710450
-0.058365
0.016279
-0.071072
-0.013249
-0.009230
-0.050121
-0.719755
0.004747
0.0
11310 rows × 260 columns
import pandas as pd

# Persist both splits to Excel for later reuse.
train.to_excel("训练集、验证集.xlsx")
test.to_excel("测试集.xlsx")
2.1 样本不均衡处理
# Numpy arrays for resampling: 259 feature columns and the FLAG labels.
X_train1 = np.array(train.iloc[:11310, :259])
y_train1 = train.FLAG.values
from collections import Counter
# Show the class imbalance before oversampling.
print(Counter(y_train1))
Counter ({0.0 : 11219 , 1.0 : 91 })
import matplotlib.pyplot as plt

# Pie chart of the label distribution: 11219 genuine (0) vs 91 fraud (1).
x = [11219, 91]
labels = ['0', '1']
fig, ax = plt.subplots()
ax.pie(
    x,
    radius=3,
    center=(4, 4),
    labels=labels,
    wedgeprops={"linewidth": 1, "edgecolor": "white"},
    autopct='%.1f%%',
    frame=True,
)
ax.set(xlim=(0, 8), xticks=np.arange(1, 8),
       ylim=(0, 8), yticks=np.arange(1, 8))
plt.show()
# SMOTE oversampling: synthesise minority (fraud) samples until the
# minority:majority ratio reaches 0.2; seeded for reproducibility.
from imblearn.over_sampling import SMOTE
oversample = SMOTE(sampling_strategy=0.2, random_state=42)
X_os, y_os = oversample.fit_resample(X_train1, y_train1)
print(Counter(y_os))
Counter ({0.0 : 11219 , 1.0 : 2243 })
import pandas as pd

# Rebuild a labelled DataFrame from the resampled arrays.
a1 = pd.DataFrame(X_os)
a1["259"] = y_os  # temporary column name; the next line renames it to FLAG
a1.columns = train.columns
a1
CASH_C_EQUIV
NOTES_RECEIV
AR
PREPAYMENT
INT_RECEIV
OTH_RECEIV
INVENTORIES
OTH_CA
T_CA
AVAIL_FOR_SALE_FA
...
CA_TURNOVER
OPER_CYCLE
INVEN_TURNOVER
FA_TURNOVER
TFA_TURNOVER
DAYS_AP
DAYS_INVEN
TA_TURNOVER
AR_TURNOVER
FLAG
0
-0.110544
-0.106696
-0.161667
-0.182694
-0.067294
-0.177580
-0.271929
-0.054680
-0.201905
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
1
-0.036496
1.088871
-0.182107
-0.052401
-0.085668
-0.026558
0.016419
-0.171927
0.060346
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
2
0.070766
-0.189223
0.057981
-0.140868
0.021829
-0.115114
-0.100801
0.073932
-0.023286
-0.110754
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
3
-0.039637
-0.205146
-0.184401
-0.159863
-0.062639
-0.060387
-0.197651
0.346521
-0.105029
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
4
-0.244743
-0.199970
-0.265148
-0.148300
-0.085668
-0.182752
-0.279125
-0.178592
-0.283117
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
13457
-0.194605
-0.204111
-0.235016
-0.192665
-0.095194
-0.166159
-0.266567
-0.157112
-0.255136
-0.082809
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
1.0
13458
-0.231584
-0.196071
-0.240270
-0.175277
-0.085668
0.100651
-0.104562
-0.135625
-0.215914
-0.093468
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
1.0
13459
-0.172396
-0.090448
-0.126067
-0.083162
-0.085668
-0.109957
-0.217281
-0.088728
-0.182285
-0.087742
...
-0.459342
-0.013455
-0.022908
-0.067021
-0.013149
-0.008900
-0.020502
-0.563696
-0.051747
1.0
13460
0.220213
0.428407
0.539064
0.129878
0.930931
0.152119
0.261990
0.167506
0.343409
-0.036143
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
1.0
13461
0.015222
0.092724
0.156657
0.054408
-0.085668
-0.104527
0.074379
-0.154946
0.042608
-0.085088
...
-0.462918
0.015973
-0.025581
0.770781
0.029360
-0.008998
0.032707
-0.261838
0.219923
1.0
13462 rows × 260 columns
# Feature-only view of the resampled data (label column removed).
a2 = a1.drop("FLAG", axis=1)
a2
CASH_C_EQUIV
NOTES_RECEIV
AR
PREPAYMENT
INT_RECEIV
OTH_RECEIV
INVENTORIES
OTH_CA
T_CA
AVAIL_FOR_SALE_FA
...
AP_TURNOVER
CA_TURNOVER
OPER_CYCLE
INVEN_TURNOVER
FA_TURNOVER
TFA_TURNOVER
DAYS_AP
DAYS_INVEN
TA_TURNOVER
AR_TURNOVER
0
-0.110544
-0.106696
-0.161667
-0.182694
-0.067294
-0.177580
-0.271929
-0.054680
-0.201905
-0.087742
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
1
-0.036496
1.088871
-0.182107
-0.052401
-0.085668
-0.026558
0.016419
-0.171927
0.060346
-0.087742
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
2
0.070766
-0.189223
0.057981
-0.140868
0.021829
-0.115114
-0.100801
0.073932
-0.023286
-0.110754
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
3
-0.039637
-0.205146
-0.184401
-0.159863
-0.062639
-0.060387
-0.197651
0.346521
-0.105029
-0.087742
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
4
-0.244743
-0.199970
-0.265148
-0.148300
-0.085668
-0.182752
-0.279125
-0.178592
-0.283117
-0.087742
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
13457
-0.194605
-0.204111
-0.235016
-0.192665
-0.095194
-0.166159
-0.266567
-0.157112
-0.255136
-0.082809
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
13458
-0.231584
-0.196071
-0.240270
-0.175277
-0.085668
0.100651
-0.104562
-0.135625
-0.215914
-0.093468
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
13459
-0.172396
-0.090448
-0.126067
-0.083162
-0.085668
-0.109957
-0.217281
-0.088728
-0.182285
-0.087742
...
0.103209
-0.459342
-0.013455
-0.022908
-0.067021
-0.013149
-0.008900
-0.020502
-0.563696
-0.051747
13460
0.220213
0.428407
0.539064
0.129878
0.930931
0.152119
0.261990
0.167506
0.343409
-0.036143
...
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
13461
0.015222
0.092724
0.156657
0.054408
-0.085668
-0.104527
0.074379
-0.154946
0.042608
-0.085088
...
-0.040714
-0.462918
0.015973
-0.025581
0.770781
0.029360
-0.008998
0.032707
-0.261838
0.219923
13462 rows × 259 columns
2.2 划分训练集、验证集
from sklearn.model_selection import train_test_split
import pandas as pd

# Hold out 20% of the resampled data as a validation set (fixed seed).
train_data, test_data1 = train_test_split(a1, test_size=0.2, random_state=0)
test_data1
CASH_C_EQUIV
NOTES_RECEIV
AR
PREPAYMENT
INT_RECEIV
OTH_RECEIV
INVENTORIES
OTH_CA
T_CA
AVAIL_FOR_SALE_FA
...
CA_TURNOVER
OPER_CYCLE
INVEN_TURNOVER
FA_TURNOVER
TFA_TURNOVER
DAYS_AP
DAYS_INVEN
TA_TURNOVER
AR_TURNOVER
FLAG
10307
-0.237276
-0.149240
-0.209804
-0.164470
-0.085668
-0.159014
-0.254104
-0.178466
-0.256564
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
6913
2.860131
1.594538
0.480208
2.647223
-0.085668
1.097509
8.681112
1.615339
3.903271
0.161419
...
-1.074846
-0.055897
-0.007669
-0.055792
-0.012656
-0.009343
-0.045661
-0.929869
-0.047602
0.0
7530
-0.236536
-0.178238
-0.227843
-0.184537
-0.085668
-0.175392
-0.267297
-0.177575
-0.268344
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
8204
-0.247465
-0.196230
-0.194191
-0.192289
-0.085668
-0.174138
-0.264405
-0.158679
-0.264602
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
11212
0.204918
-0.167106
0.011418
-0.155947
0.087527
-0.077206
0.006010
-0.041689
0.031319
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
11098
-0.204823
-0.191856
-0.257564
-0.195262
-0.085668
-0.181840
-0.262093
-0.140187
-0.261923
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
451
-0.146761
-0.059989
-0.141490
-0.150467
-0.087393
-0.178277
-0.149462
-0.135039
-0.148100
-0.064996
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
5634
-0.217310
-0.048995
-0.150073
-0.185264
-0.085668
-0.147963
-0.269352
-0.144624
-0.223935
-0.085432
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
4379
-0.225533
-0.153682
-0.128768
-0.162863
-0.111326
-0.171458
-0.151384
0.109614
-0.174713
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
7712
-0.229782
-0.188186
-0.269640
-0.172389
-0.085668
-0.171999
-0.278531
-0.171582
-0.279448
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
2693 rows × 260 columns
CASH_C_EQUIV
NOTES_RECEIV
AR
PREPAYMENT
INT_RECEIV
OTH_RECEIV
INVENTORIES
OTH_CA
T_CA
AVAIL_FOR_SALE_FA
...
CA_TURNOVER
OPER_CYCLE
INVEN_TURNOVER
FA_TURNOVER
TFA_TURNOVER
DAYS_AP
DAYS_INVEN
TA_TURNOVER
AR_TURNOVER
FLAG
11732
-0.115619
-0.159654
-0.215413
-0.066286
-0.055511
-0.135853
-0.226678
-0.152113
-0.203331
-0.087742
...
2.124952
-0.047612
-0.005608
-0.059694
-0.012875
-0.009210
-0.042106
0.674642
0.081817
1.0
2849
-0.232070
-0.138400
-0.136509
-0.174268
-0.085668
-0.166162
-0.091276
-0.171693
-0.196572
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
4938
0.147070
0.316345
0.139180
0.228339
0.028565
1.242537
-0.003413
-0.167509
0.159136
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
10029
-0.214163
-0.154632
-0.230625
-0.187055
-0.085668
-0.165990
-0.213737
-0.150521
-0.240806
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
5420
-0.210466
-0.171303
-0.145414
-0.145187
-0.085668
-0.153739
-0.053173
-0.062340
-0.168963
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
13123
-0.244645
-0.113926
-0.200190
-0.112910
-0.095363
-0.145159
-0.109428
-0.093625
-0.201378
-0.092453
...
-0.958218
0.086090
-0.028829
-0.063935
-0.013042
-0.007976
0.044354
-1.065248
-0.053682
1.0
3264
-0.230011
-0.182265
-0.250535
-0.182594
-0.114804
-0.176142
-0.289794
-0.178597
-0.276976
-0.087742
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
9845
-0.078908
-0.201787
-0.191256
-0.195170
-0.085668
-0.155015
-0.258678
-0.178345
-0.205729
-0.087742
...
-0.812152
-0.054565
0.101067
-0.071234
-0.013280
-0.009285
-0.052963
-1.054317
-0.045676
1.0
10799
-0.239831
-0.174050
-0.177255
-0.192585
-0.085668
-0.101249
-0.273990
-0.151724
-0.254010
-0.103202
...
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
0.0
2732
1.702119
0.274794
-0.037051
0.189707
2.197139
-0.087228
0.200058
-0.144783
0.668527
-0.087742
...
0.010898
-0.026220
-0.022478
-0.055215
-0.012707
-0.009076
-0.031109
0.198842
-0.051366
0.0
10769 rows × 260 columns
test_data2=test_data1.drop("FLAG" ,axis=1 )
3 造假指标模型建立
# Metrics and plotting imports for the fraud model.
# Fixed: the original imported `auc` twice (once alongside roc_curve and
# again on its own line); the redundant duplicate is removed.
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
from xgboost import plot_importance

# Model-ready arrays: 259 features + FLAG labels for fitting,
# and the validation labels for evaluation.
X_train = np.array(train_data.iloc[:, :259])
y_train = np.array(train_data["FLAG"])
y = np.array(test_data1["FLAG"])
# Feature-name frame, used later for importance plots.
feature_1 = a1.drop('FLAG', axis=1)
feature_1
CASH_C_EQUIV
NOTES_RECEIV
AR
PREPAYMENT
INT_RECEIV
OTH_RECEIV
INVENTORIES
OTH_CA
T_CA
AVAIL_FOR_SALE_FA
LT_EQUITY_INVEST
INVEST_REAL_ESTATE
FIXED_ASSETS
CIP
INTAN_ASSETS
GOODWILL
LT_AMOR_EXP
DEFER_TAX_ASSETS
OTH_NCA
T_NCA
T_ASSETS
ST_BORR
NOTES_PAYABLE
AP
ADVANCE_RECEIPTS
PAYROLL_PAYABLE
TAXES_PAYABLE
INT_PAYABLE
DIV_PAYABLE
OTH_PAYABLE
NCL_WITHIN_1Y
OTH_CL
T_CL
LT_BORR
LT_PAYABLE
ESTIMATED_LIAB
DEFER_REVENUE
DEFER_TAX_LIAB
T_NCL
T_LIAB
PAID_IN_CAPITAL
CAPITAL_RESER
SPECIAL_RESER
SURPLUS_RESER
RETAINED_EARNINGS
T_EQUITY_ATTR_P
MINORITY_INT
T_SH_EQUITY
T_LIAB_EQUITY
OTH_COMPRE_INCOME
C_PAID_OTH_FINAN_A
N_CF_FR_INVEST_A
C_FR_BORR
N_CF_OPERATE_A
C_FR_CAP_CONTR
C_PAID_INVEST
C_FR_OTH_FINAN_A
C_PAID_OTH_INVEST_A
C_INF_FR_INVEST_A
C_PAID_G_S
...
TSE_TA
C_TA
TEAP_IC
LT_AMOR_EXP_TA
NCA_TA
ST_BORR_TA
NCL_TA
EQU_MULTIPLIER
CAP_FIX_RATIO
N_TAN_A_TA
REPAY_TA
ID_IC
AP_TA
INVEN_TA
CL_TA
ADV_R_TA
AR_TA
TEAP_TA
T_FIXED_A_TA
FIXED_A_TA
TRE_TA
CA_TA
INTAN_A_TA
AIL_TR
VAL_CHG_P_TR
COGS_TR
SELL_EXP_TR
PERIOD_EXP_TR
INV_INC_TR
IT_TP
OPA_P_TP
OP_TR
FINAN_EXP_TR
VAL_CHG_P_TP
NI_CUT_NI
OPA_P_TR
N_NOPI_TP
R_TR
NOPG_TR
NI_TR
TCOGS_TR
TP_TR
NOPL_TR
ADMIN_EXP_TR
EBITDA_TR
BTAX_SURCHG_TR
IT_TR
EBIT_TR
OP_TP
DAYS_AR
AP_TURNOVER
CA_TURNOVER
OPER_CYCLE
INVEN_TURNOVER
FA_TURNOVER
TFA_TURNOVER
DAYS_AP
DAYS_INVEN
TA_TURNOVER
AR_TURNOVER
0
-0.110544
-0.106696
-0.161667
-0.182694
-0.067294
-0.177580
-0.271929
-0.054680
-0.201905
-0.087742
-0.142028
-0.086266
-0.200848
-0.164105
-0.277532
-0.163757
-0.234584
-0.116027
-0.114840
-0.232795
-0.228647
-0.229273
-0.225301
-0.204865
-0.163052
-0.185749
-0.160000
-0.198005
-0.077002
-0.183300
-0.187825
-0.081143
-0.254193
-0.203991
-0.139548
-0.074228
-0.097049
-0.109180
-0.209598
-0.257919
-0.252988
0.005258
-0.146179
-0.153425
-0.171217
-0.157934
-0.154017
-0.167159
-0.228648
-0.048196
-0.157168
0.275978
-0.286845
-0.143991
-0.194064
-0.048096
-0.153902
-0.159625
-0.019451
-0.183022
...
0.011147
-0.127655
0.007816
-0.090990
-0.018944
-0.014594
-0.009392
-0.020234
-0.079771
0.011144
-0.158489
-0.020358
-0.016036
-0.144829
-0.011974
-0.010509
-0.168016
0.011145
-0.132080
-0.126937
0.011562
0.019110
-0.139211
-0.008475
-0.044078
0.087840
-0.156706
-0.008489
-0.041648
0.010630
0.016545
0.008475
-0.008480
-0.011569
0.034511
0.008481
-0.022897
0.038886
-0.008784
0.008472
-0.008481
0.008471
-0.008531
-0.008533
-0.003868
-0.044050
-0.054146
0.008459
0.022926
-0.012153
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
1
-0.036496
1.088871
-0.182107
-0.052401
-0.085668
-0.026558
0.016419
-0.171927
0.060346
-0.087742
-0.153631
-0.086266
-0.186912
-0.173041
-0.111736
-0.163757
-0.172424
-0.023101
-0.114840
-0.209060
-0.067471
-0.229273
0.827724
0.093545
0.092487
0.069331
0.087165
-0.198005
-0.077002
-0.053152
-0.187825
-0.076646
0.043979
-0.150792
-0.139548
-0.074228
0.100215
-0.109180
-0.186920
-0.010804
-0.109135
-0.403590
-0.146179
0.015458
0.038215
-0.141095
-0.154017
-0.152307
-0.067472
-0.048196
-0.157168
0.182495
-0.232185
0.099247
-0.194064
-0.121115
-0.153902
-0.159625
-0.154439
-0.020811
...
0.011147
-0.127655
0.007816
-0.090990
-0.018944
-0.014594
-0.009392
-0.020234
-0.079771
0.011144
-0.158489
-0.020358
-0.016036
-0.144829
-0.011974
-0.010509
-0.168016
0.011145
-0.132080
-0.126937
0.011562
0.019110
-0.139211
-0.008475
-0.044078
0.087840
-0.156706
-0.008489
-0.041648
0.010630
0.016545
0.008475
-0.008480
-0.011569
0.034511
0.008481
-0.022897
0.038886
-0.008784
0.008472
-0.008481
0.008471
-0.008531
-0.008533
-0.003868
-0.044050
-0.054146
0.008459
0.022926
-0.012153
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
2
0.070766
-0.189223
0.057981
-0.140868
0.021829
-0.115114
-0.100801
0.073932
-0.023286
-0.110754
-0.067454
-0.120642
-0.200108
0.077432
-0.148904
0.586702
-0.089601
-0.113231
-0.059961
-0.105702
-0.064816
-0.316157
-0.231363
-0.137904
-0.011700
-0.191030
-0.024851
-0.198005
-0.105756
-0.108134
-0.187825
-0.076646
-0.200263
-0.150792
-0.139548
-0.074228
0.260842
-0.062466
-0.163225
-0.202685
-0.029154
0.651345
-0.146179
-0.042070
-0.044831
0.188303
-0.064006
0.157133
-0.064817
-0.048196
-0.157168
-0.431703
-0.250391
0.005059
-0.243632
0.327603
-0.153902
-0.159625
0.228867
-0.158202
...
0.011147
-0.127655
0.007816
-0.090990
-0.018944
-0.014594
-0.009392
-0.020234
-0.079771
0.011144
-0.158489
-0.020358
-0.016036
-0.144829
-0.011974
-0.010509
-0.168016
0.011145
-0.132080
-0.126937
0.011562
0.019110
-0.139211
-0.008475
-0.044078
0.087840
-0.156706
-0.008489
-0.041648
0.010630
0.016545
0.008475
-0.008480
-0.011569
0.034511
0.008481
-0.022897
0.038886
-0.008784
0.008472
-0.008481
0.008471
-0.008531
-0.008533
-0.003868
-0.044050
-0.054146
0.008459
0.022926
-0.012153
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
3
-0.039637
-0.205146
-0.184401
-0.159863
-0.062639
-0.060387
-0.197651
0.346521
-0.105029
-0.087742
-0.120960
-0.086266
-0.221463
-0.199303
-0.177723
-0.171951
-0.027574
-0.127111
-0.117747
-0.236460
-0.175166
-0.230604
-0.164503
-0.125771
-0.142193
-0.045072
-0.156540
-0.209112
-0.077002
-0.148004
-0.242135
-0.076646
-0.188544
-0.116849
-0.139548
-0.074228
-0.128884
-0.039623
-0.162012
-0.192893
-0.286260
0.076386
-0.146179
-0.171182
-0.131627
-0.123079
-0.177217
-0.135453
-0.175168
-0.062454
-0.157168
0.120646
-0.198916
-0.093394
-0.245538
0.044030
-0.189950
0.105294
0.073860
-0.109958
...
0.011147
-0.127655
0.007816
-0.090990
-0.018944
-0.014594
-0.009392
-0.020234
-0.079771
0.011144
-0.158489
-0.020358
-0.016036
-0.144829
-0.011974
-0.010509
-0.168016
0.011145
-0.132080
-0.126937
0.011562
0.019110
-0.139211
-0.008475
-0.044078
0.087840
-0.156706
-0.008489
-0.041648
0.010630
0.016545
0.008475
-0.008480
-0.011569
0.034511
0.008481
-0.022897
0.038886
-0.008784
0.008472
-0.008481
0.008471
-0.008531
-0.008533
-0.003868
-0.044050
-0.054146
0.008459
0.022926
-0.012153
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
4
-0.244743
-0.199970
-0.265148
-0.148300
-0.085668
-0.182752
-0.279125
-0.178592
-0.283117
-0.087742
-0.120960
-0.086266
-0.282834
-0.196378
-0.306345
-0.163757
-0.189353
-0.135653
-0.114840
-0.288391
-0.302069
-0.312094
-0.194654
-0.204394
-0.168588
-0.229982
-0.185895
-0.217441
-0.108710
-0.183572
-0.187825
-0.076646
-0.259576
-0.150792
-0.139548
-0.074228
-0.162071
-0.109180
-0.218931
-0.264605
-0.401198
-0.402463
-0.146179
-0.176925
-0.219024
-0.355697
-0.179565
-0.341002
-0.302070
-0.048196
-0.176421
0.224868
-0.285642
-0.201230
-0.194064
-0.121115
-0.153902
-0.159625
-0.156845
-0.177266
...
0.011147
-0.127655
0.007816
-0.090990
-0.018944
-0.014594
-0.009392
-0.020234
-0.079771
0.011144
-0.158489
-0.020358
-0.016036
-0.144829
-0.011974
-0.010509
-0.168016
0.011145
-0.132080
-0.126937
0.011562
0.019110
-0.139211
-0.008475
-0.044078
0.087840
-0.156706
-0.008489
-0.041648
0.010630
0.016545
0.008475
-0.008480
-0.011569
0.034511
0.008481
-0.022897
0.038886
-0.008784
0.008472
-0.008481
0.008471
-0.008531
-0.008533
-0.003868
-0.044050
-0.054146
0.008459
0.022926
-0.012153
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
13457
-0.194605
-0.204111
-0.235016
-0.192665
-0.095194
-0.166159
-0.266567
-0.157112
-0.255136
-0.082809
-0.144837
-0.086266
-0.234505
-0.179108
-0.247421
-0.173251
-0.202890
-0.117034
-0.115165
-0.249601
-0.267204
-0.241517
-0.233453
-0.203503
-0.160887
-0.201682
-0.171888
-0.198005
-0.077002
-0.156267
-0.187825
-0.076646
-0.251653
-0.150792
-0.139548
-0.085263
-0.142678
-0.124578
-0.215774
-0.257401
-0.138162
-0.345235
-0.146179
-0.164382
-0.194570
-0.269117
-0.180658
-0.264808
-0.267205
-0.048232
-0.157168
0.282769
-0.277337
-0.214423
-0.237723
-0.089799
-0.174903
-0.159625
-0.081430
-0.168416
...
0.011147
-0.127655
0.007816
-0.090990
-0.018944
-0.014594
-0.009392
-0.020234
-0.079771
0.011144
-0.158489
-0.020358
-0.016036
-0.144829
-0.011974
-0.010509
-0.168016
0.011145
-0.132080
-0.126937
0.011562
0.019110
-0.139211
-0.008475
-0.044078
0.087840
-0.156706
-0.008489
-0.041648
0.010630
0.016545
0.008475
-0.008480
-0.011569
0.034511
0.008481
-0.022897
0.038886
-0.008784
0.008472
-0.008481
0.008471
-0.008531
-0.008533
-0.003868
-0.044050
-0.054146
0.008459
0.022926
-0.012153
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
13458
-0.231584
-0.196071
-0.240270
-0.175277
-0.085668
0.100651
-0.104562
-0.135625
-0.215914
-0.093468
-0.075270
0.064067
-0.170791
-0.197127
-0.291951
-0.174709
-0.235937
-0.132742
-0.119921
-0.205796
-0.223482
-0.183847
-0.242438
-0.113028
-0.165859
-0.198643
-0.352867
-0.101828
-0.078867
0.014294
-0.243134
-0.077372
-0.178927
-0.182819
-0.139548
0.072025
-0.158947
-0.109180
-0.191181
-0.192376
-0.015725
-0.131649
-0.146179
-0.104321
-0.381811
-0.255928
-0.208644
-0.257609
-0.223483
-0.037582
-0.157168
0.219093
-0.215293
-0.297951
-0.196203
-0.126649
0.038585
-0.159625
-0.140010
-0.169934
...
0.011147
-0.127655
0.007816
-0.090990
-0.018944
-0.014594
-0.009392
-0.020234
-0.079771
0.011144
-0.158489
-0.020358
-0.016036
-0.144829
-0.011974
-0.010509
-0.168016
0.011145
-0.132080
-0.126937
0.011562
0.019110
-0.139211
-0.008475
-0.044078
0.087840
-0.156706
-0.008489
-0.041648
0.010630
0.016545
0.008475
-0.008480
-0.011569
0.034511
0.008481
-0.022897
0.038886
-0.008784
0.008472
-0.008481
0.008471
-0.008531
-0.008533
-0.003868
-0.044050
-0.054146
0.008459
0.022926
-0.012153
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
13459
-0.172396
-0.090448
-0.126067
-0.083162
-0.085668
-0.109957
-0.217281
-0.088728
-0.182285
-0.087742
-0.123844
-0.120632
-0.234071
-0.054888
-0.188660
-0.163757
-0.128242
-0.031042
-0.114840
-0.221879
-0.212135
-0.252805
-0.194654
-0.079903
-0.078436
-0.206288
-0.167967
0.184241
-0.077002
-0.023900
-0.187825
-0.076646
-0.186352
-0.150792
-0.139548
0.849701
-0.147337
-0.109180
-0.130924
-0.183368
0.092909
-0.037259
-0.222978
-0.104361
-0.472813
-0.247493
-0.145101
-0.243325
-0.212136
-0.048196
-0.157168
0.284709
-0.227859
-0.224250
-0.194064
-0.041249
-0.132306
-0.192049
-0.032639
-0.181799
...
0.011101
-0.091564
-0.763986
-0.071247
0.996515
-0.014163
-0.009115
0.068415
0.295576
0.011159
-0.254426
0.988053
0.000208
0.124612
-0.011954
-0.011675
0.593479
0.011130
2.383589
2.401008
0.011301
-1.000220
0.351214
-0.008270
-0.051766
1.752778
-0.611433
-0.008477
-0.053741
-0.113486
0.027801
0.008361
-0.008459
-0.013867
0.070320
0.008370
-0.032261
0.038886
-0.008778
0.008361
-0.008370
0.008355
-0.008532
-0.008450
-0.472817
0.014996
-0.339346
0.008169
0.032168
0.010236
0.103209
-0.459342
-0.013455
-0.022908
-0.067021
-0.013149
-0.008900
-0.020502
-0.563696
-0.051747
13460
0.220213
0.428407
0.539064
0.129878
0.930931
0.152119
0.261990
0.167506
0.343409
-0.036143
-0.133788
-0.086266
0.574080
1.685421
0.752913
-0.025926
-0.010816
0.002251
0.311119
0.653651
0.514408
0.162320
-0.251056
0.137400
-0.154652
-0.148068
0.305776
2.339665
-0.077002
-0.083185
-0.018381
0.503922
0.097436
-0.155138
-0.158031
-0.074228
0.376548
0.404251
1.020602
0.333509
-0.014059
1.199035
-0.146179
0.190460
0.870264
0.871184
-0.024417
0.765729
0.514407
-0.079826
0.026608
-0.935087
0.135455
0.398947
-0.243537
-0.125528
-0.021834
-0.024758
-0.114136
0.050575
...
0.011147
-0.127655
0.007816
-0.090990
-0.018944
-0.014594
-0.009392
-0.020234
-0.079771
0.011144
-0.158489
-0.020358
-0.016036
-0.144829
-0.011974
-0.010509
-0.168016
0.011145
-0.132080
-0.126937
0.011562
0.019110
-0.139211
-0.008475
-0.044078
0.087840
-0.156706
-0.008489
-0.041648
0.010630
0.016545
0.008475
-0.008480
-0.011569
0.034511
0.008481
-0.022897
0.038886
-0.008784
0.008472
-0.008481
0.008471
-0.008531
-0.008533
-0.003868
-0.044050
-0.054146
0.008459
0.022926
-0.012153
-0.071554
-0.120861
-0.030988
-0.022326
-0.046674
-0.012361
-0.008952
-0.031444
-0.112856
-0.047602
13461
0.015222
0.092724
0.156657
0.054408
-0.085668
-0.104527
0.074379
-0.154946
0.042608
-0.085088
-0.117501
-0.086266
0.083504
-0.131142
-0.256025
-0.170433
0.338018
-0.074333
0.013288
-0.046307
0.001719
-0.154665
0.307370
0.160793
-0.096559
-0.026013
-0.139392
-0.198598
-0.077002
0.065349
0.128240
-0.076646
0.029736
0.175793
-0.115495
-0.083274
0.024891
-0.113892
0.041858
0.034694
-0.113173
0.029863
-0.146179
-0.023991
-0.028819
-0.031130
-0.140034
-0.050337
0.001718
-0.020687
-0.167773
0.198412
-0.077565
0.225320
-0.194064
-0.131067
-0.063616
-0.128632
-0.114486
-0.004891
...
0.011075
0.102449
-0.414292
-0.090990
-0.811893
-0.014594
-0.009182
0.034719
-0.130274
0.011120
-0.128349
0.499511
-0.014519
1.970255
-0.011936
-0.004084
-0.587565
0.011085
-0.768414
-0.818383
0.011551
0.806651
-0.422075
-0.008475
-0.038851
0.377146
-0.372627
-0.008509
-0.038106
0.069489
0.016263
0.008474
-0.008479
-0.008330
0.041396
0.008481
-0.024758
0.015333
-0.008814
0.008469
-0.008481
0.008468
-0.008534
-0.008605
-0.058418
0.213152
-0.037802
0.008452
0.024738
-0.018517
-0.040714
-0.462918
0.015973
-0.025581
0.770781
0.029360
-0.008998
0.032707
-0.261838
0.219923
13462 rows × 259 columns
3.1 Logistics Regression 调参过程
在模型中先固定参数的默认值,然后进行参数调节,进行网格搜索与专业文献查阅寻找精确度最高而又不引起模型过拟合的参数值.模型优化评价指标为 AUC 值.
在逻辑回归模型中,需要调整的参数共有 2 个:penalty 与 C, 其中 penalty 是正则化方法,C 为逻辑回归中的超参数,表示正则化强度的倒数,在模型中默认为 1,表示正则项与损失函数的比值为 1:1.当模型中的 C 越小时,正则项在目标函数中的占比越大,从而对模型参数的惩罚越重,正则化作用越强.
from sklearn.linear_model import LogisticRegression

# L2-penalised (ridge) logistic regression with C=0.2, i.e. stronger
# regularisation than the default C=1.  max_iter is raised from the default
# 100 because lbfgs emitted a ConvergenceWarning ("TOTAL NO. of ITERATIONS
# REACHED LIMIT") on this data — see the captured warning output below.
clf1 = LogisticRegression(C=0.2, penalty="l2", max_iter=1000).fit(X_train, y_train)
# Positive-class probabilities on the evaluation set.
y_pred_gbc = clf1.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D: \anaconda3\envs\Rshimmering\lib\site-packages\sklearn\linear_model\_logistic.py: 763 : ConvergenceWarning: lbfgs failed to converge (status=1 ):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https: //scikit-learn.org/stable /modules/preprocessing .html
Please also refer to the documentation for alternative solver options:
https: //scikit-learn.org/stable /modules/linear _model.html
n_iter_i = _check_optimize_result(
0.884673004897724
# Unpenalised logistic-regression baseline for comparison against clf1.
# NOTE(review): penalty="none" was removed in scikit-learn 1.2+ in favour of
# penalty=None — keep in mind when upgrading the environment.
# max_iter raised from the default 100 because of the captured
# ConvergenceWarning on this data set.
clf2 = LogisticRegression(penalty="none", max_iter=1000).fit(X_train, y_train)
y_pred_gbc = clf2.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D: \anaconda3\envs\Rshimmering\lib\site-packages\sklearn\linear_model\_logistic.py: 763 : ConvergenceWarning: lbfgs failed to converge (status=1 ):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https: //scikit-learn.org/stable /modules/preprocessing .html
Please also refer to the documentation for alternative solver options:
https: //scikit-learn.org/stable /modules/linear _model.html
n_iter_i = _check_optimize_result(
0.9013195044655719
逻辑回归属于线性判别模型,而本文所处理的数据集维度较高,故可能存在其他非线性模型能够表现的更好.
3.2 SVM 调参过程
SVM 需要调整的参数有 2 个,分别为 kernel 和 C,
其中 kernel 代表核方法,可选的函数有:“poly”:多项式核函数,“rbf”:高斯核函数 (径向基函数),“linear”:线性核函数,“sigmoid”:S 型核函数.核函数在 SVM 中发挥着重要功能,在简化向量内积运算中起着重要作用,其中高斯核函数在非线性分类问题上广泛应用.
C 代表错误项的惩罚系数,在软间隔分类中应用较多.C 越大,对错误样本的惩罚力度就越大,训练的样本准确率越高.但是容易产生过拟合现象,机器模型的泛化能力降低.相反,C 取较小的值时,允许训练样本中存在错误分类的样本,能够增强模型的泛化能力.
# RBF-kernel SVM; probability=True enables predict_proba via internal
# Platt scaling so an AUC can be computed from class-1 scores.
svm1 = svm.SVC(kernel='rbf', probability=True)
svm1.fit(X_train, y_train)
y_pred_gbc = svm1.predict_proba(test_data2)[:, 1]  # positive-class score
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
# SVM with a very small penalty C=0.003 (default RBF kernel): weak penalty on
# misclassified samples, i.e. a much softer margin than the default C=1.
svm2 = svm.SVC(C=0.003, probability=True)
svm2.fit(X_train, y_train)
y_pred_gbc = svm2.predict_proba(test_data2)[:, 1]  # positive-class score
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
3.3 RF 调参过程
在随机森林模型中需要调节的参数有 4 个,分别为
max_depth:树的最大深度、
n_estimators:树模型的数量、
min_samples_split:中间节点分支所需的最小样本数量、
min_samples_leaf:叶节点存在所需的最小样本数量.
为了防止模型出现过拟合现象,本文在调节其他参数时控制 max_depth=3.
from sklearn.ensemble import RandomForestClassifier


def _rf_auc(**params):
    """Fit a RandomForestClassifier with the given params and return
    (fitted model, test-set AUC).

    random_state is fixed at 0 for reproducibility, matching the original
    per-cell calls.  Factored out because the fit / predict_proba /
    roc_curve / auc boilerplate was repeated verbatim for every setting.
    """
    model = RandomForestClassifier(random_state=0, **params).fit(X_train, y_train)
    scores = model.predict_proba(test_data2)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y, scores, pos_label=1)
    return model, metrics.auc(fpr, tpr)


# Baseline: shallow trees (max_depth=3) to guard against overfitting while
# the other hyper-parameters are varied one at a time.
RF1, roc_auc = _rf_auc(max_depth=3)
roc_auc
# More trees in the ensemble.
RF2, roc_auc = _rf_auc(n_estimators=600, max_depth=3)
roc_auc
# Require larger leaves.
RF3, roc_auc = _rf_auc(min_samples_leaf=30, max_depth=3)
roc_auc
# Require more samples before an internal node may split.
RF4, roc_auc = _rf_auc(min_samples_split=80, max_depth=3)
roc_auc
3.4 DT 调参过程
在决策树模型中需要调整的参数共有 4 个,分别为
max_depth:树的最大深度、
min_samples_split:中间节点分支所需要的最小样本量、
min_samples_leaf:叶节点存在所需的最小样本量、
max_leaf_nodes:最大叶子节点数.
为了防止模型出现过拟合现象,本文调节其他参数对模型的 AUC 影响时控制 max_depth=6
def _dt_auc(**params):
    """Fit a DecisionTreeClassifier with the given params and return
    (fitted model, test-set AUC).

    Factored out because the fit / predict_proba / roc_curve / auc
    boilerplate was repeated verbatim for every parameter setting.
    """
    model = tree.DecisionTreeClassifier(**params).fit(X_train, y_train)
    scores = model.predict_proba(test_data2)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y, scores, pos_label=1)
    return model, metrics.auc(fpr, tpr)


# Baseline: max_depth=6 to limit overfitting while other params are varied.
DT1, roc_auc = _dt_auc(max_depth=6)
roc_auc
# NOTE(review): this rebinds DT1 and discards the baseline model above —
# matches the original notebook, but rename if both models are needed later.
DT1, roc_auc = _dt_auc(min_samples_leaf=3, max_depth=6)
roc_auc
DT2, roc_auc = _dt_auc(max_leaf_nodes=70, max_depth=6)
roc_auc
DT3, roc_auc = _dt_auc(min_samples_split=3, max_depth=6)
roc_auc
——筛选在DT算法中特征重要性系数前20个指标
# Top-20 decision-tree feature importances, scaled by 1e4 for readability.
DT_importances = DT3.feature_importances_ * 10000
DT = (pd.Series(DT_importances, index=a2.columns)
        .sort_values(ascending=False)
        .to_frame('feature_importances'))
DT.head(20)
feature_importances
DILUTED_EPS
1876.459630
ESTIMATED_LIAB
1418.956658
RETAINED_EARNINGS
1274.737100
ASSETS_DISP_GAIN
1189.341616
C_FR_CAP_CONTR
350.146297
CASH_C_EQUIV
305.869870
DEFER_TAX_LIAB
305.319042
INT_RECEIV
278.811896
CURRENT_RATIO
254.761597
N_CF_FR_FINAN_A
245.185566
OTH_GAIN
214.434895
GOODWILL
207.893979
N_INCOME
199.309895
CL_TA
190.726453
NOPERATE_EXP
180.024039
OTH_CL
167.889986
GAIN_INVEST
160.657452
DIV_PAYABLE
157.439514
IT_TR
117.051125
A_J_INVEST_INCOME
101.434231
3.5 XGBoost 调参过程
XGBoost 需要调节的参数共有 9 个,下面本文只介绍对该模型相对重要的两个参数:
第一个参数是 n_estimators,在 XGBoost 模型中这个参数发挥着重要作用,表示该模型中分类器的个数,该参数的值越大,模型的学习能力就会越强.
第二个参数是learning_rate,learning_rate 表示集成模型中的学习速率,又被称之为步长控制迭代速率,有效的调节该参数值能够防止模型出现过拟合现象,默认值为 0.1,调节范围为[0,1].
本文为了尽可能防止模型出现过拟合,在调节其他参数的值时将学习率设定为0.001.
from xgboost import XGBClassifier
from xgboost import plot_importance

# Baseline XGBoost with a very small learning rate (0.001) to curb
# overfitting.  use_label_encoder=False and an explicit eval_metric follow the
# captured xgboost 1.4 warnings ('logloss' is already the 1.3+ default for
# binary:logistic, so behaviour is unchanged — only the warnings go away).
XGBoost1 = XGBClassifier(learning_rate=0.001, use_label_encoder=False,
                         eval_metric='logloss').fit(X_train, y_train)
y_pred_gbc = XGBoost1.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146 : UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release . To remove this warning , do the following: 1 ) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2 ) Encode your labels (y) as integers starting with 0 , i.e. 0 , 1 , 2 , ..., [num_class - 1 ].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[22 :58 :21 ] WARNING : C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0 /src/learner.cc:1095 : Starting in XGBoost 1.3 .0 , the default evaluation metric used with the objective 'binary :logistic' was changed from 'error ' to 'logloss '. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9089825218476904
# Vary n_estimators (number of boosted trees) at the fixed small learning
# rate.  use_label_encoder/eval_metric silence the xgboost 1.4 deprecation
# warnings without changing behaviour.
XGBoost2 = XGBClassifier(n_estimators=120, learning_rate=0.001,
                         use_label_encoder=False,
                         eval_metric='logloss').fit(X_train, y_train)
y_pred_gbc = XGBoost2.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146 : UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release . To remove this warning , do the following: 1 ) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2 ) Encode your labels (y) as integers starting with 0 , i.e. 0 , 1 , 2 , ..., [num_class - 1 ].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[22 :58 :59 ] WARNING : C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0 /src/learner.cc:1095 : Starting in XGBoost 1.3 .0 , the default evaluation metric used with the objective 'binary :logistic' was changed from 'error ' to 'logloss '. Explicitly set eval_metric if you'd like to restore the old behavior.
0.911076058772688
# Vary max_depth (tree depth) at the fixed small learning rate.
# use_label_encoder/eval_metric silence the xgboost 1.4 deprecation warnings.
XGBoost3 = XGBClassifier(max_depth=6, learning_rate=0.001,
                         use_label_encoder=False,
                         eval_metric='logloss').fit(X_train, y_train)
y_pred_gbc = XGBoost3.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146 : UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release . To remove this warning , do the following: 1 ) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2 ) Encode your labels (y) as integers starting with 0 , i.e. 0 , 1 , 2 , ..., [num_class - 1 ].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[22 :59 :42 ] WARNING : C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0 /src/learner.cc:1095 : Starting in XGBoost 1.3 .0 , the default evaluation metric used with the objective 'binary :logistic' was changed from 'error ' to 'logloss '. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9089825218476904
# Vary min_child_weight (minimum sum of instance weight needed in a child)
# at the fixed small learning rate.  use_label_encoder/eval_metric silence
# the xgboost 1.4 deprecation warnings.
XGBoost4 = XGBClassifier(min_child_weight=3, learning_rate=0.001,
                         use_label_encoder=False,
                         eval_metric='logloss').fit(X_train, y_train)
y_pred_gbc = XGBoost4.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146 : UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release . To remove this warning , do the following: 1 ) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2 ) Encode your labels (y) as integers starting with 0 , i.e. 0 , 1 , 2 , ..., [num_class - 1 ].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[23 :00 :24 ] WARNING : C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0 /src/learner.cc:1095 : Starting in XGBoost 1.3 .0 , the default evaluation metric used with the objective 'binary :logistic' was changed from 'error ' to 'logloss '. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9118438490348603
# Vary gamma (minimum loss reduction required to make a further split) at
# the fixed small learning rate.  use_label_encoder/eval_metric silence the
# xgboost 1.4 deprecation warnings.
XGBoost5 = XGBClassifier(gamma=0.4, learning_rate=0.001,
                         use_label_encoder=False,
                         eval_metric='logloss').fit(X_train, y_train)
y_pred_gbc = XGBoost5.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146 : UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release . To remove this warning , do the following: 1 ) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2 ) Encode your labels (y) as integers starting with 0 , i.e. 0 , 1 , 2 , ..., [num_class - 1 ].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[23 :01 :06 ] WARNING : C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0 /src/learner.cc:1095 : Starting in XGBoost 1.3 .0 , the default evaluation metric used with the objective 'binary :logistic' was changed from 'error ' to 'logloss '. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9089844425237683
# BUG FIX: the original passed the misspelled keyword 'colsample_btree',
# which xgboost silently ignored (see the captured "Parameters: might not be
# used" warning) — so this run was identical to the baseline.  The correct
# parameter is 'colsample_bytree' (column subsample ratio per tree).
XGBoost7 = XGBClassifier(colsample_bytree=0.85, learning_rate=0.001,
                         use_label_encoder=False,
                         eval_metric='logloss').fit(X_train, y_train)
y_pred_gbc = XGBoost7.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
[23 :01 :49 ] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0 /src/learner.cc:573 :
Parameters: might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146 : UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1 ) Pass option use_label_encoder=False when constructing XGBClassifier object ; and 2 ) Encode your labels (y) as integers starting with 0 , i.e. 0 , 1 , 2 , ..., [num_class - 1 ].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[23 :01 :49 ] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0 /src/learner.cc:1095 : Starting in XGBoost 1.3 .0 , the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss' . Explicitly set eval_metric if you'd like to restore the old behavior.
0.9089825218476904
# Vary reg_alpha (L1 regularisation on leaf weights) at the fixed small
# learning rate.  use_label_encoder/eval_metric silence the xgboost 1.4
# deprecation warnings.
XGBoost8 = XGBClassifier(reg_alpha=0.2, learning_rate=0.001,
                         use_label_encoder=False,
                         eval_metric='logloss').fit(X_train, y_train)
y_pred_gbc = XGBoost8.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146 : UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release . To remove this warning , do the following: 1 ) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2 ) Encode your labels (y) as integers starting with 0 , i.e. 0 , 1 , 2 , ..., [num_class - 1 ].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[23 :02 :35 ] WARNING : C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0 /src/learner.cc:1095 : Starting in XGBoost 1.3 .0 , the default evaluation metric used with the objective 'binary :logistic' was changed from 'error ' to 'logloss '. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9087702871410737
# Vary reg_lambda (L2 regularisation on leaf weights) at the fixed small
# learning rate — the best-scoring XGBoost configuration in this write-up.
# use_label_encoder/eval_metric silence the xgboost 1.4 deprecation warnings.
XGBoost9 = XGBClassifier(reg_lambda=0.3, learning_rate=0.001,
                         use_label_encoder=False,
                         eval_metric='logloss').fit(X_train, y_train)
y_pred_gbc = XGBoost9.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146 : UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release . To remove this warning , do the following: 1 ) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2 ) Encode your labels (y) as integers starting with 0 , i.e. 0 , 1 , 2 , ..., [num_class - 1 ].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[23 :03 :49 ] WARNING : C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0 /src/learner.cc:1095 : Starting in XGBoost 1.3 .0 , the default evaluation metric used with the objective 'binary :logistic' was changed from 'error ' to 'logloss '. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9160871026601364
——筛选在XGBoost算法中特征重要性系数前20个指标
# Top-20 XGBoost feature importances (best model XGBoost9), scaled by 1e4
# for readability.
XGBoost_importances = XGBoost9.feature_importances_ * 10000
XGBoost = (pd.Series(XGBoost_importances, index=a2.columns)
             .sort_values(ascending=False)
             .to_frame('feature_importances'))
XGBoost.head(20)
feature_importances
DILUTED_EPS
1092.639893
ASSETS_DISP_GAIN
839.089539
RETAINED_EARNINGS
501.631378
T_CA
428.929138
ESTIMATED_LIAB
363.835968
DEFER_TAX_LIAB
311.677368
CURRENT_RATIO
294.901703
N_CF_FR_FINAN_A
284.164001
GOODWILL
239.289185
N_INCOME
238.080658
CL_TA
224.853226
INVENTORIES
224.492996
ROE_A
214.091843
NOPERATE_EXP
196.429474
OTH_CA
192.453537
OTH_CL
191.517349
C_FR_MINO_S_SUBS
191.478973
GAIN_INVEST
187.904297
C_INF_FR_INVEST_A
180.430847
CASH_C_EQUIV
178.665329
3.6 GBM 调参过程
该模型需要添加的参数共有 7 个,本文选取了对该模型相对重要的几个参数进行调节.
第一个参数是:max_depth:模型中树的最大深度.
第二个参数是 n_estimators:模型中分类器的数量,该参数在模型中的作用较为强大,可以有效的提升模型的学习能力.
第三个参数是 learning_rate:学习率,该参数的有效调节对模型是否会过拟合发挥着重要作用,参数的取值范围为 [0,1],默认值为 0.1.为了能够有效的提升模型的泛化能力并且防止模型出现过拟合现象,本文经过网格搜索法并且查阅大量机器学习专业文献将 learning_rate 设置为 0.0088.
from sklearn.ensemble import GradientBoostingClassifier


def _gbm_auc(**params):
    """Fit a GradientBoostingClassifier with the given params and return
    (fitted model, test-set AUC).

    learning_rate is fixed at 0.0088 (chosen via grid search per the
    write-up).  Factored out because the fit / predict_proba / roc_curve /
    auc boilerplate was repeated verbatim for all seven settings.
    """
    model = GradientBoostingClassifier(learning_rate=0.0088,
                                       **params).fit(X_train, y_train)
    scores = model.predict_proba(test_data2)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y, scores, pos_label=1)
    return model, metrics.auc(fpr, tpr)


# Baseline with all other parameters at their defaults.
GBM1, roc_auc = _gbm_auc()
roc_auc
# More boosting stages.
GBM2, roc_auc = _gbm_auc(n_estimators=130)
roc_auc
# Stochastic gradient boosting: fit each tree on 30% of the samples.
GBM3, roc_auc = _gbm_auc(subsample=0.3)
roc_auc
# Require more samples before an internal node may split.
GBM4, roc_auc = _gbm_auc(min_samples_split=4)
roc_auc
# Require larger leaves.
GBM5, roc_auc = _gbm_auc(min_samples_leaf=3)
roc_auc
# Limit tree depth.
GBM6, roc_auc = _gbm_auc(max_depth=3)
roc_auc
# Hold out a fraction of training data for early-stopping validation.
GBM7, roc_auc = _gbm_auc(validation_fraction=0.1)
roc_auc
——筛选在GBM算法中特征重要性系数前20个指标
# Top-20 GBM feature importances (model GBM2), scaled by 1e4 for readability.
GBM_importances = GBM2.feature_importances_ * 10000
GBM = (pd.Series(GBM_importances, index=a2.columns)
         .sort_values(ascending=False)
         .to_frame('feature_importances'))
GBM.head(20)
feature_importances
DILUTED_EPS
2545.405622
ASSETS_DISP_GAIN
1393.862699
RETAINED_EARNINGS
1201.324132
ESTIMATED_LIAB
1123.032285
NCA_DISPLOSS
496.913323
C_FR_CAP_CONTR
429.072252
OTH_GAIN
392.266813
NOPERATE_EXP
216.007424
PROC_SELL_INVEST
191.362051
T_CA
179.957164
DEFER_TAX_LIAB
172.171302
N_CF_FR_INVEST_A
157.682156
INT_PAYABLE
157.357831
DIV_PAYABLE
103.399260
C_PAID_OTH_FINAN_A
95.313147
INT_RECEIV
87.237988
REV_PS
86.175900
C_INF_FR_INVEST_A
76.066347
ADVANCE_RECEIPTS
64.396530
T_EQUITY_ATTR_P
62.686717
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步