Python——基于数据挖掘的上市公司财务造假识别(制造业)
制造业
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
from scipy import stats
from scipy.stats import norm, skew
# Load the manufacturing-sector dataset from CSV.
t1 = pd.read_csv("制造业.csv")
# Feature-only frame: everything except the FLAG column (FLAG is used as the label later on).
t1_train = t1.drop(columns="FLAG")
t1
TICKER_SYMBOL | ACT_PUBTIME | PUBLISH_DATE | END_DATE_REP | END_DATE | REPORT_TYPE | FISCAL_PERIOD | MERGED_FLAG | ACCOUTING_STANDARDS | CURRENCY_CD | ... | CA_TURNOVER | OPER_CYCLE | INVEN_TURNOVER | FA_TURNOVER | TFA_TURNOVER | DAYS_AP | DAYS_INVEN | TA_TURNOVER | AR_TURNOVER | FLAG | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4019 | 3 | 3 | 2 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
1 | 8166 | 3 | 3 | 2 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
2 | 11737 | 3 | 3 | 2 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
3 | 16479 | 3 | 3 | 2 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
4 | 16842 | 4 | 4 | 3 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13965 | 4992204 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13966 | 4992858 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | 0.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13967 | 4993201 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13968 | 4998808 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13969 | 4999709 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | 2.6656 | 23.084 | 21.9179 | 0.6571 | 0.6256 | 33.6589 | 16.4249 | 0.3692 | 54.0618 | NaN |
13970 rows × 363 columns
1 数据预处理
1.1计算缺失率,并降序排序
# Percentage of missing values per feature, sorted from highest to lowest.
na_counts = t1_train.isnull().sum()
all_data_na = (na_counts / len(t1_train) * 100).sort_values(ascending=False)
# Same values as a one-column DataFrame for display.
missing_data = all_data_na.to_frame(name='missing_data')
missing_data
missing_data | |
---|---|
ACCRUED_EXP | 99.971367 |
N_INC_BORR_OTH_FI | 99.806729 |
PERPETUAL_BOND_L | 99.634932 |
PREFERRED_STOCK_L | 99.606299 |
PREFERRED_STOCK_E | 99.591983 |
... | ... |
T_COMPR_INCOME | 0.000000 |
N_INCOME_ATTR_P | 0.000000 |
FINAN_EXP | 0.000000 |
ACT_PUBTIME | 0.000000 |
TICKER_SYMBOL | 0.000000 |
362 rows × 1 columns
将缺失率用图表的方式展示
# Bar chart of the missing-value percentage of every feature.
f, ax = plt.subplots(figsize=(30, 15))
# Rotation must be a number (or 'vertical'/'horizontal'); the string '90'
# is not a documented value and is rejected by newer matplotlib releases.
plt.xticks(rotation=90)
sns.barplot(x=all_data_na.index, y=all_data_na)  # one bar per feature
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)
Text(0.5, 1.0, 'Percent missing data by feature')
# Features whose missing rate is above 80%
missing_data_count1 = all_data_na[all_data_na > 80].index
# Features whose missing rate is below 20%
missing_data_count2 = all_data_na[all_data_na < 20].index
# Show how many features fall into each group
print(missing_data_count1.shape, missing_data_count2.shape)
(93,) (84,)
# Features with a missing rate above 80%.
# missing_data is sorted in descending order, so these features are exactly
# its first len(missing_data_count1) rows (93 for this dataset). Deriving the
# count keeps this cell correct if the input data changes.
n_high_missing = len(missing_data_count1)
a = missing_data.values[:n_high_missing]
x = pd.DataFrame(a, index=missing_data.index[:n_high_missing])
x
0 | |
---|---|
ACCRUED_EXP | 99.971367 |
N_INC_BORR_OTH_FI | 99.806729 |
PERPETUAL_BOND_L | 99.634932 |
PREFERRED_STOCK_L | 99.606299 |
PREFERRED_STOCK_E | 99.591983 |
... | ... |
OP_CL | 81.338583 |
R_D | 81.159628 |
N_CF_OPA_LIAB | 80.952040 |
N_CF_NFA_LIAB | 80.952040 |
OP_TL | 80.916249 |
93 rows × 1 columns
1.2 删除缺失率在80%以上的特征
# Remove every feature listed in x (the features with a missing rate above 80%).
t2 = t1_train.drop(x.index, axis=1)
t2
TICKER_SYMBOL | ACT_PUBTIME | PUBLISH_DATE | END_DATE_REP | END_DATE | REPORT_TYPE | FISCAL_PERIOD | MERGED_FLAG | ACCOUTING_STANDARDS | CURRENCY_CD | ... | AP_TURNOVER | CA_TURNOVER | OPER_CYCLE | INVEN_TURNOVER | FA_TURNOVER | TFA_TURNOVER | DAYS_AP | DAYS_INVEN | TA_TURNOVER | AR_TURNOVER | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4019 | 3 | 3 | 2 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 8166 | 3 | 3 | 2 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 11737 | 3 | 3 | 2 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 16479 | 3 | 3 | 2 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 16842 | 4 | 4 | 3 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13965 | 4992204 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13966 | 4992858 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | 0.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13967 | 4993201 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13968 | 4998808 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13969 | 4999709 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | 10.6956 | 2.6656 | 23.084 | 21.9179 | 0.6571 | 0.6256 | 33.6589 | 16.4249 | 0.3692 | 54.0618 |
13970 rows × 269 columns
1.3 对缺失率20%到80%的数据填充中位数
# Median-fill the features whose missing rate lies between 20% and 80%.
# Selecting by threshold (instead of the positional slice [93:278]) picks the
# same columns for this dataset and stays correct if the ranking changes.
b = all_data_na.index[(all_data_na >= 20) & (all_data_na <= 80)]
for o in b:
    # Replace NaN in this column with the column's own median
    t2[o] = t2[o].fillna(t2[o].median())
t2
TICKER_SYMBOL | ACT_PUBTIME | PUBLISH_DATE | END_DATE_REP | END_DATE | REPORT_TYPE | FISCAL_PERIOD | MERGED_FLAG | ACCOUTING_STANDARDS | CURRENCY_CD | ... | AP_TURNOVER | CA_TURNOVER | OPER_CYCLE | INVEN_TURNOVER | FA_TURNOVER | TFA_TURNOVER | DAYS_AP | DAYS_INVEN | TA_TURNOVER | AR_TURNOVER | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4019 | 3 | 3 | 2 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | 4.8617 | 1.0942 | 149.7293 | 4.1120 | 3.0696 | 2.7145 | 74.30515 | 87.75175 | 0.5354 | 8.49245 |
1 | 8166 | 3 | 3 | 2 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | 4.8617 | 1.0942 | 149.7293 | 4.1120 | 3.0696 | 2.7145 | 74.30515 | 87.75175 | 0.5354 | 8.49245 |
2 | 11737 | 3 | 3 | 2 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | 4.8617 | 1.0942 | 149.7293 | 4.1120 | 3.0696 | 2.7145 | 74.30515 | 87.75175 | 0.5354 | 8.49245 |
3 | 16479 | 3 | 3 | 2 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | 4.8617 | 1.0942 | 149.7293 | 4.1120 | 3.0696 | 2.7145 | 74.30515 | 87.75175 | 0.5354 | 8.49245 |
4 | 16842 | 4 | 4 | 3 | 1 | A | 12 | 1 | CHAS_2007 | CNY | ... | 4.8617 | 1.0942 | 149.7293 | 4.1120 | 3.0696 | 2.7145 | 74.30515 | 87.75175 | 0.5354 | 8.49245 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13965 | 4992204 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | 4.8617 | 1.0942 | 149.7293 | 4.1120 | 3.0696 | 2.7145 | 74.30515 | 87.75175 | 0.5354 | 8.49245 |
13966 | 4992858 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | 4.8617 | 1.0942 | 0.0000 | 4.1120 | 3.0696 | 2.7145 | 74.30515 | 87.75175 | 0.5354 | 8.49245 |
13967 | 4993201 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | 4.8617 | 1.0942 | 149.7293 | 4.1120 | 3.0696 | 2.7145 | 74.30515 | 87.75175 | 0.5354 | 8.49245 |
13968 | 4998808 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | 4.8617 | 1.0942 | 149.7293 | 4.1120 | 3.0696 | 2.7145 | 74.30515 | 87.75175 | 0.5354 | 8.49245 |
13969 | 4999709 | 7 | 7 | 7 | 6 | A | 12 | 1 | CHAS_2007 | CNY | ... | 10.6956 | 2.6656 | 23.0840 | 21.9179 | 0.6571 | 0.6256 | 33.65890 | 16.42490 | 0.3692 | 54.06180 |
13970 rows × 269 columns
1.4 对缺失率20%以下的数据使用KNN填充
from sklearn.impute import KNNImputer

# Column names at positions 278-335 of the descending missing-rate ranking.
# NOTE(review): the slice bounds are tied to this dataset's ranking — confirm
# them whenever the input data changes.
d = missing_data.index[278:336]
# Impute the selected columns with a 10-nearest-neighbour imputer.
imputer = KNNImputer(n_neighbors=10)
knn_filled = imputer.fit_transform(t2[d])
t2[d] = knn_filled
# Remaining NaN count per column
print(t2.isnull().sum())
TICKER_SYMBOL 0
ACT_PUBTIME 0
PUBLISH_DATE 0
END_DATE_REP 0
END_DATE 0
..
TFA_TURNOVER 0
DAYS_AP 0
DAYS_INVEN 0
TA_TURNOVER 0
AR_TURNOVER 0
Length: 269, dtype: int64
1.5 删除与预测是否造假结果无关的特征因子
删除股票代码、实际披露时间、发布时间、报告截止日期、截止日期、报告类型、会计区间、合并标志(1-合并,2-母公司)、会计准则、货币代码,共 10 个与预测是否造假结果无关的特征因子。
# The 10 identifier / reporting-metadata columns that are not used as model features.
cols_to_drop = [
    "TICKER_SYMBOL",
    "ACT_PUBTIME",
    "PUBLISH_DATE",
    "END_DATE_REP",
    "END_DATE",
    "REPORT_TYPE",
    "FISCAL_PERIOD",
    "MERGED_FLAG",
    "ACCOUTING_STANDARDS",
    "CURRENCY_CD",
]
t2 = t2.drop(columns=cols_to_drop)
1.6 查看是否还存在缺失值
# Number of columns that still contain at least one missing value.
t2.isna().any().sum()
0
1.7 对数据进行标准化
from sklearn.preprocessing import StandardScaler

# Standardize every remaining feature; the column names of t2 are kept.
# NOTE(review): the scaler is fitted on all rows of t2, including the rows
# that are later held out as `test`.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(t2)
t4 = pd.DataFrame(scaled_values, columns=t2.columns)
t4
CASH_C_EQUIV | NOTES_RECEIV | AR | PREPAYMENT | INT_RECEIV | OTH_RECEIV | INVENTORIES | OTH_CA | T_CA | AVAIL_FOR_SALE_FA | ... | AP_TURNOVER | CA_TURNOVER | OPER_CYCLE | INVEN_TURNOVER | FA_TURNOVER | TFA_TURNOVER | DAYS_AP | DAYS_INVEN | TA_TURNOVER | AR_TURNOVER | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.110544 | -0.106696 | -0.161667 | -0.182694 | -0.067294 | -0.177580 | -0.271929 | -0.054680 | -0.201905 | -0.087742 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
1 | -0.036496 | 1.088871 | -0.182107 | -0.052401 | -0.085668 | -0.026558 | 0.016419 | -0.171927 | 0.060346 | -0.087742 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
2 | 0.070766 | -0.189223 | 0.057981 | -0.140868 | 0.021829 | -0.115114 | -0.100801 | 0.073932 | -0.023286 | -0.110754 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
3 | -0.039637 | -0.205146 | -0.184401 | -0.159863 | -0.062639 | -0.060387 | -0.197651 | 0.346521 | -0.105029 | -0.087742 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
4 | -0.244743 | -0.199970 | -0.265148 | -0.148300 | -0.085668 | -0.182752 | -0.279125 | -0.178592 | -0.283117 | -0.087742 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13965 | -0.245654 | -0.175257 | -0.248184 | -0.192613 | -0.085668 | -0.180662 | -0.279316 | -0.178050 | -0.279115 | -0.087742 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
13966 | -0.204023 | -0.205182 | -0.257308 | -0.191965 | -0.085668 | -0.175087 | -0.270255 | -0.175323 | -0.266459 | -0.087742 | ... | -0.071554 | -0.120861 | -0.063158 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
13967 | -0.227119 | -0.204127 | -0.201336 | -0.164736 | -0.085668 | -0.164288 | -0.183161 | -0.162139 | -0.237732 | -0.087742 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
13968 | 0.100220 | -0.204577 | -0.038156 | -0.128786 | -0.085668 | -0.128173 | 0.075970 | -0.152256 | -0.019633 | -0.068500 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
13969 | 1.609554 | 0.222399 | 1.025478 | 0.436742 | -0.085668 | 0.581671 | 0.691256 | 0.019537 | 1.080323 | -0.087742 | ... | 0.306581 | 2.042642 | -0.058198 | 0.017576 | -0.069761 | -0.013200 | -0.009263 | -0.050238 | -0.593860 | -0.006262 |
13970 rows × 259 columns
2 划分数据集
以前5年数据为训练集、验证集train,第6年为测试集test
# Per the notebook text: the first five years (rows 0-11309) form the
# training/validation pool `train`, the sixth year (remaining rows) is `test`.
# .copy() makes `train` an independent frame, so adding the FLAG column below
# does not assign onto a slice of t4 (avoids SettingWithCopyWarning).
train = t4.iloc[:11310, :].copy()
test = t4.iloc[11310:, :259]
# Attach the label; assignment aligns on the shared row index of t1 and t4.
train["FLAG"] = t1["FLAG"]
train
CASH_C_EQUIV | NOTES_RECEIV | AR | PREPAYMENT | INT_RECEIV | OTH_RECEIV | INVENTORIES | OTH_CA | T_CA | AVAIL_FOR_SALE_FA | ... | CA_TURNOVER | OPER_CYCLE | INVEN_TURNOVER | FA_TURNOVER | TFA_TURNOVER | DAYS_AP | DAYS_INVEN | TA_TURNOVER | AR_TURNOVER | FLAG | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.110544 | -0.106696 | -0.161667 | -0.182694 | -0.067294 | -0.177580 | -0.271929 | -0.054680 | -0.201905 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
1 | -0.036496 | 1.088871 | -0.182107 | -0.052401 | -0.085668 | -0.026558 | 0.016419 | -0.171927 | 0.060346 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
2 | 0.070766 | -0.189223 | 0.057981 | -0.140868 | 0.021829 | -0.115114 | -0.100801 | 0.073932 | -0.023286 | -0.110754 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
3 | -0.039637 | -0.205146 | -0.184401 | -0.159863 | -0.062639 | -0.060387 | -0.197651 | 0.346521 | -0.105029 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
4 | -0.244743 | -0.199970 | -0.265148 | -0.148300 | -0.085668 | -0.182752 | -0.279125 | -0.178592 | -0.283117 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11305 | -0.248180 | -0.177748 | -0.244404 | -0.195324 | -0.085668 | -0.182942 | -0.277525 | -0.177054 | -0.279415 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
11306 | -0.218672 | -0.196336 | -0.255531 | -0.193333 | -0.085668 | -0.160477 | -0.268560 | -0.174560 | -0.270623 | -0.087742 | ... | -1.587291 | 2.125000 | -0.031456 | -0.041665 | -0.012392 | 0.005011 | 2.477695 | -1.585968 | -0.054737 | 0.0 |
11307 | -0.200565 | -0.204200 | -0.232985 | -0.177734 | -0.085668 | -0.175507 | -0.207126 | -0.160690 | -0.242246 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
11308 | -0.101380 | -0.197020 | -0.049710 | -0.100780 | -0.085668 | -0.178231 | 0.042636 | -0.123428 | -0.095392 | -0.064501 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
11309 | 1.326097 | 0.141651 | 0.889289 | 0.170126 | 0.029234 | 0.515964 | 0.529339 | 0.024492 | 0.854325 | 2.263116 | ... | 0.710450 | -0.058365 | 0.016279 | -0.071072 | -0.013249 | -0.009230 | -0.050121 | -0.719755 | 0.004747 | 0.0 |
11310 rows × 260 columns
import pandas as pd

# Write the training/validation pool and the test set to Excel workbooks.
for frame, path in ((train, "训练集、验证集.xlsx"), (test, "测试集.xlsx")):
    frame.to_excel(path)
2.1 样本不均衡处理
# Feature matrix: every column of `train` except the FLAG label. Dropping by
# name replaces the positional slice [:11310, :259] and yields the same array.
X_train1 = np.array(train.drop(columns="FLAG"))
# Label vector
y_train1 = train.FLAG.values
from collections import Counter
# Class distribution of the labels. The data is heavily imbalanced: for this
# dataset the counts are 11219 (class 0) vs 91 (class 1), roughly 123:1.
print(Counter(y_train1))
Counter({0.0: 11219, 1.0: 91})
import matplotlib.pyplot as plt
from collections import Counter

# Pie chart of the class balance before oversampling.
# The counts are read from the labels instead of being typed in by hand, so
# the chart cannot drift out of sync with the data.
class_counts = Counter(y_train1)
x = [class_counts[0.0], class_counts[1.0]]
labels = ['0', '1']
# plot
fig, ax = plt.subplots()
ax.pie(x, radius=3, center=(4, 4), labels=labels,
       wedgeprops={"linewidth": 1, "edgecolor": "white"}, autopct='%.1f%%', frame=True)
ax.set(xlim=(0, 8), xticks=np.arange(1, 8),
       ylim=(0, 8), yticks=np.arange(1, 8))
plt.show()
from imblearn.over_sampling import SMOTE
# Oversample the minority class (1) with SMOTE. sampling_strategy=0.2 asks for
# a minority/majority ratio of 0.2, i.e. about 5:1 (class 0 : class 1), not 3:1.
# NOTE(review): resampling is done before the later train/validation split of a1,
# so the validation part will also contain synthetic rows — confirm this is intended.
oversample = SMOTE(sampling_strategy=0.2,random_state=42)
X_os, y_os = oversample.fit_resample(X_train1,y_train1)
# Class distribution after oversampling
print(Counter(y_os))
Counter({0.0: 11219, 1.0: 2243})
# (rows, columns) of the resampled feature matrix
X_os.shape
(13462, 259)
import pandas as pd

# Rebuild a labelled DataFrame from the resampled arrays, reusing the column
# names of `train` (the feature columns followed by FLAG).
a1 = pd.DataFrame(X_os, columns=train.columns[:-1])
a1[train.columns[-1]] = y_os
a1
CASH_C_EQUIV | NOTES_RECEIV | AR | PREPAYMENT | INT_RECEIV | OTH_RECEIV | INVENTORIES | OTH_CA | T_CA | AVAIL_FOR_SALE_FA | ... | CA_TURNOVER | OPER_CYCLE | INVEN_TURNOVER | FA_TURNOVER | TFA_TURNOVER | DAYS_AP | DAYS_INVEN | TA_TURNOVER | AR_TURNOVER | FLAG | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.110544 | -0.106696 | -0.161667 | -0.182694 | -0.067294 | -0.177580 | -0.271929 | -0.054680 | -0.201905 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
1 | -0.036496 | 1.088871 | -0.182107 | -0.052401 | -0.085668 | -0.026558 | 0.016419 | -0.171927 | 0.060346 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
2 | 0.070766 | -0.189223 | 0.057981 | -0.140868 | 0.021829 | -0.115114 | -0.100801 | 0.073932 | -0.023286 | -0.110754 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
3 | -0.039637 | -0.205146 | -0.184401 | -0.159863 | -0.062639 | -0.060387 | -0.197651 | 0.346521 | -0.105029 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
4 | -0.244743 | -0.199970 | -0.265148 | -0.148300 | -0.085668 | -0.182752 | -0.279125 | -0.178592 | -0.283117 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13457 | -0.194605 | -0.204111 | -0.235016 | -0.192665 | -0.095194 | -0.166159 | -0.266567 | -0.157112 | -0.255136 | -0.082809 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 1.0 |
13458 | -0.231584 | -0.196071 | -0.240270 | -0.175277 | -0.085668 | 0.100651 | -0.104562 | -0.135625 | -0.215914 | -0.093468 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 1.0 |
13459 | -0.172396 | -0.090448 | -0.126067 | -0.083162 | -0.085668 | -0.109957 | -0.217281 | -0.088728 | -0.182285 | -0.087742 | ... | -0.459342 | -0.013455 | -0.022908 | -0.067021 | -0.013149 | -0.008900 | -0.020502 | -0.563696 | -0.051747 | 1.0 |
13460 | 0.220213 | 0.428407 | 0.539064 | 0.129878 | 0.930931 | 0.152119 | 0.261990 | 0.167506 | 0.343409 | -0.036143 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 1.0 |
13461 | 0.015222 | 0.092724 | 0.156657 | 0.054408 | -0.085668 | -0.104527 | 0.074379 | -0.154946 | 0.042608 | -0.085088 | ... | -0.462918 | 0.015973 | -0.025581 | 0.770781 | 0.029360 | -0.008998 | 0.032707 | -0.261838 | 0.219923 | 1.0 |
13462 rows × 260 columns
# Resampled features without the FLAG label column.
a2 = a1.drop(columns="FLAG")
a2
CASH_C_EQUIV | NOTES_RECEIV | AR | PREPAYMENT | INT_RECEIV | OTH_RECEIV | INVENTORIES | OTH_CA | T_CA | AVAIL_FOR_SALE_FA | ... | AP_TURNOVER | CA_TURNOVER | OPER_CYCLE | INVEN_TURNOVER | FA_TURNOVER | TFA_TURNOVER | DAYS_AP | DAYS_INVEN | TA_TURNOVER | AR_TURNOVER | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.110544 | -0.106696 | -0.161667 | -0.182694 | -0.067294 | -0.177580 | -0.271929 | -0.054680 | -0.201905 | -0.087742 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
1 | -0.036496 | 1.088871 | -0.182107 | -0.052401 | -0.085668 | -0.026558 | 0.016419 | -0.171927 | 0.060346 | -0.087742 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
2 | 0.070766 | -0.189223 | 0.057981 | -0.140868 | 0.021829 | -0.115114 | -0.100801 | 0.073932 | -0.023286 | -0.110754 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
3 | -0.039637 | -0.205146 | -0.184401 | -0.159863 | -0.062639 | -0.060387 | -0.197651 | 0.346521 | -0.105029 | -0.087742 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
4 | -0.244743 | -0.199970 | -0.265148 | -0.148300 | -0.085668 | -0.182752 | -0.279125 | -0.178592 | -0.283117 | -0.087742 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13457 | -0.194605 | -0.204111 | -0.235016 | -0.192665 | -0.095194 | -0.166159 | -0.266567 | -0.157112 | -0.255136 | -0.082809 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
13458 | -0.231584 | -0.196071 | -0.240270 | -0.175277 | -0.085668 | 0.100651 | -0.104562 | -0.135625 | -0.215914 | -0.093468 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
13459 | -0.172396 | -0.090448 | -0.126067 | -0.083162 | -0.085668 | -0.109957 | -0.217281 | -0.088728 | -0.182285 | -0.087742 | ... | 0.103209 | -0.459342 | -0.013455 | -0.022908 | -0.067021 | -0.013149 | -0.008900 | -0.020502 | -0.563696 | -0.051747 |
13460 | 0.220213 | 0.428407 | 0.539064 | 0.129878 | 0.930931 | 0.152119 | 0.261990 | 0.167506 | 0.343409 | -0.036143 | ... | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
13461 | 0.015222 | 0.092724 | 0.156657 | 0.054408 | -0.085668 | -0.104527 | 0.074379 | -0.154946 | 0.042608 | -0.085088 | ... | -0.040714 | -0.462918 | 0.015973 | -0.025581 | 0.770781 | 0.029360 | -0.008998 | 0.032707 | -0.261838 | 0.219923 |
13462 rows × 259 columns
2.2 划分训练集、验证集
# Split the (resampled) first-five-year manufacturing data into a training
# part and a validation part.
from sklearn.model_selection import train_test_split
import pandas as pd

# 80% training / 20% validation, with a fixed seed for reproducibility.
train_data, test_data1 = train_test_split(
    a1,
    test_size=0.2,
    random_state=0,
)
# Validation set
test_data1
CASH_C_EQUIV | NOTES_RECEIV | AR | PREPAYMENT | INT_RECEIV | OTH_RECEIV | INVENTORIES | OTH_CA | T_CA | AVAIL_FOR_SALE_FA | ... | CA_TURNOVER | OPER_CYCLE | INVEN_TURNOVER | FA_TURNOVER | TFA_TURNOVER | DAYS_AP | DAYS_INVEN | TA_TURNOVER | AR_TURNOVER | FLAG | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10307 | -0.237276 | -0.149240 | -0.209804 | -0.164470 | -0.085668 | -0.159014 | -0.254104 | -0.178466 | -0.256564 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
6913 | 2.860131 | 1.594538 | 0.480208 | 2.647223 | -0.085668 | 1.097509 | 8.681112 | 1.615339 | 3.903271 | 0.161419 | ... | -1.074846 | -0.055897 | -0.007669 | -0.055792 | -0.012656 | -0.009343 | -0.045661 | -0.929869 | -0.047602 | 0.0 |
7530 | -0.236536 | -0.178238 | -0.227843 | -0.184537 | -0.085668 | -0.175392 | -0.267297 | -0.177575 | -0.268344 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
8204 | -0.247465 | -0.196230 | -0.194191 | -0.192289 | -0.085668 | -0.174138 | -0.264405 | -0.158679 | -0.264602 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
11212 | 0.204918 | -0.167106 | 0.011418 | -0.155947 | 0.087527 | -0.077206 | 0.006010 | -0.041689 | 0.031319 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11098 | -0.204823 | -0.191856 | -0.257564 | -0.195262 | -0.085668 | -0.181840 | -0.262093 | -0.140187 | -0.261923 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
451 | -0.146761 | -0.059989 | -0.141490 | -0.150467 | -0.087393 | -0.178277 | -0.149462 | -0.135039 | -0.148100 | -0.064996 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
5634 | -0.217310 | -0.048995 | -0.150073 | -0.185264 | -0.085668 | -0.147963 | -0.269352 | -0.144624 | -0.223935 | -0.085432 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
4379 | -0.225533 | -0.153682 | -0.128768 | -0.162863 | -0.111326 | -0.171458 | -0.151384 | 0.109614 | -0.174713 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
7712 | -0.229782 | -0.188186 | -0.269640 | -0.172389 | -0.085668 | -0.171999 | -0.278531 | -0.171582 | -0.279448 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
2693 rows × 260 columns
# Training set
train_data
CASH_C_EQUIV | NOTES_RECEIV | AR | PREPAYMENT | INT_RECEIV | OTH_RECEIV | INVENTORIES | OTH_CA | T_CA | AVAIL_FOR_SALE_FA | ... | CA_TURNOVER | OPER_CYCLE | INVEN_TURNOVER | FA_TURNOVER | TFA_TURNOVER | DAYS_AP | DAYS_INVEN | TA_TURNOVER | AR_TURNOVER | FLAG | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
11732 | -0.115619 | -0.159654 | -0.215413 | -0.066286 | -0.055511 | -0.135853 | -0.226678 | -0.152113 | -0.203331 | -0.087742 | ... | 2.124952 | -0.047612 | -0.005608 | -0.059694 | -0.012875 | -0.009210 | -0.042106 | 0.674642 | 0.081817 | 1.0 |
2849 | -0.232070 | -0.138400 | -0.136509 | -0.174268 | -0.085668 | -0.166162 | -0.091276 | -0.171693 | -0.196572 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
4938 | 0.147070 | 0.316345 | 0.139180 | 0.228339 | 0.028565 | 1.242537 | -0.003413 | -0.167509 | 0.159136 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
10029 | -0.214163 | -0.154632 | -0.230625 | -0.187055 | -0.085668 | -0.165990 | -0.213737 | -0.150521 | -0.240806 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
5420 | -0.210466 | -0.171303 | -0.145414 | -0.145187 | -0.085668 | -0.153739 | -0.053173 | -0.062340 | -0.168963 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13123 | -0.244645 | -0.113926 | -0.200190 | -0.112910 | -0.095363 | -0.145159 | -0.109428 | -0.093625 | -0.201378 | -0.092453 | ... | -0.958218 | 0.086090 | -0.028829 | -0.063935 | -0.013042 | -0.007976 | 0.044354 | -1.065248 | -0.053682 | 1.0 |
3264 | -0.230011 | -0.182265 | -0.250535 | -0.182594 | -0.114804 | -0.176142 | -0.289794 | -0.178597 | -0.276976 | -0.087742 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
9845 | -0.078908 | -0.201787 | -0.191256 | -0.195170 | -0.085668 | -0.155015 | -0.258678 | -0.178345 | -0.205729 | -0.087742 | ... | -0.812152 | -0.054565 | 0.101067 | -0.071234 | -0.013280 | -0.009285 | -0.052963 | -1.054317 | -0.045676 | 1.0 |
10799 | -0.239831 | -0.174050 | -0.177255 | -0.192585 | -0.085668 | -0.101249 | -0.273990 | -0.151724 | -0.254010 | -0.103202 | ... | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 | 0.0 |
2732 | 1.702119 | 0.274794 | -0.037051 | 0.189707 | 2.197139 | -0.087228 | 0.200058 | -0.144783 | 0.668527 | -0.087742 | ... | 0.010898 | -0.026220 | -0.022478 | -0.055215 | -0.012707 | -0.009076 | -0.031109 | 0.198842 | -0.051366 | 0.0 |
10769 rows × 260 columns
# Validation features: the validation set without its FLAG label column.
test_data2 = test_data1.drop(columns="FLAG")
3 造假指标模型建立
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
from sklearn.metrics import auc
#特征重要性选择
from xgboost import plot_importance
# Training split: feature matrix (first 259 columns) and label vector.
train_features = train_data.iloc[:, :259]
X_train = np.array(train_features)
y_train = np.array(train_data["FLAG"])
# Validation split: ground-truth labels.
y = np.array(test_data1["FLAG"])
# All resampled feature columns, without the label.
feature_1 = a1.drop(columns='FLAG')
feature_1
CASH_C_EQUIV | NOTES_RECEIV | AR | PREPAYMENT | INT_RECEIV | OTH_RECEIV | INVENTORIES | OTH_CA | T_CA | AVAIL_FOR_SALE_FA | LT_EQUITY_INVEST | INVEST_REAL_ESTATE | FIXED_ASSETS | CIP | INTAN_ASSETS | GOODWILL | LT_AMOR_EXP | DEFER_TAX_ASSETS | OTH_NCA | T_NCA | T_ASSETS | ST_BORR | NOTES_PAYABLE | AP | ADVANCE_RECEIPTS | PAYROLL_PAYABLE | TAXES_PAYABLE | INT_PAYABLE | DIV_PAYABLE | OTH_PAYABLE | NCL_WITHIN_1Y | OTH_CL | T_CL | LT_BORR | LT_PAYABLE | ESTIMATED_LIAB | DEFER_REVENUE | DEFER_TAX_LIAB | T_NCL | T_LIAB | PAID_IN_CAPITAL | CAPITAL_RESER | SPECIAL_RESER | SURPLUS_RESER | RETAINED_EARNINGS | T_EQUITY_ATTR_P | MINORITY_INT | T_SH_EQUITY | T_LIAB_EQUITY | OTH_COMPRE_INCOME | C_PAID_OTH_FINAN_A | N_CF_FR_INVEST_A | C_FR_BORR | N_CF_OPERATE_A | C_FR_CAP_CONTR | C_PAID_INVEST | C_FR_OTH_FINAN_A | C_PAID_OTH_INVEST_A | C_INF_FR_INVEST_A | C_PAID_G_S | ... | TSE_TA | C_TA | TEAP_IC | LT_AMOR_EXP_TA | NCA_TA | ST_BORR_TA | NCL_TA | EQU_MULTIPLIER | CAP_FIX_RATIO | N_TAN_A_TA | REPAY_TA | ID_IC | AP_TA | INVEN_TA | CL_TA | ADV_R_TA | AR_TA | TEAP_TA | T_FIXED_A_TA | FIXED_A_TA | TRE_TA | CA_TA | INTAN_A_TA | AIL_TR | VAL_CHG_P_TR | COGS_TR | SELL_EXP_TR | PERIOD_EXP_TR | INV_INC_TR | IT_TP | OPA_P_TP | OP_TR | FINAN_EXP_TR | VAL_CHG_P_TP | NI_CUT_NI | OPA_P_TR | N_NOPI_TP | R_TR | NOPG_TR | NI_TR | TCOGS_TR | TP_TR | NOPL_TR | ADMIN_EXP_TR | EBITDA_TR | BTAX_SURCHG_TR | IT_TR | EBIT_TR | OP_TP | DAYS_AR | AP_TURNOVER | CA_TURNOVER | OPER_CYCLE | INVEN_TURNOVER | FA_TURNOVER | TFA_TURNOVER | DAYS_AP | DAYS_INVEN | TA_TURNOVER | AR_TURNOVER | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.110544 | -0.106696 | -0.161667 | -0.182694 | -0.067294 | -0.177580 | -0.271929 | -0.054680 | -0.201905 | -0.087742 | -0.142028 | -0.086266 | -0.200848 | -0.164105 | -0.277532 | -0.163757 | -0.234584 | -0.116027 | -0.114840 | -0.232795 | -0.228647 | -0.229273 | -0.225301 | -0.204865 | -0.163052 | -0.185749 | -0.160000 | -0.198005 | -0.077002 | -0.183300 | -0.187825 | -0.081143 | -0.254193 | -0.203991 | -0.139548 | -0.074228 | -0.097049 | -0.109180 | -0.209598 | -0.257919 | -0.252988 | 0.005258 | -0.146179 | -0.153425 | -0.171217 | -0.157934 | -0.154017 | -0.167159 | -0.228648 | -0.048196 | -0.157168 | 0.275978 | -0.286845 | -0.143991 | -0.194064 | -0.048096 | -0.153902 | -0.159625 | -0.019451 | -0.183022 | ... | 0.011147 | -0.127655 | 0.007816 | -0.090990 | -0.018944 | -0.014594 | -0.009392 | -0.020234 | -0.079771 | 0.011144 | -0.158489 | -0.020358 | -0.016036 | -0.144829 | -0.011974 | -0.010509 | -0.168016 | 0.011145 | -0.132080 | -0.126937 | 0.011562 | 0.019110 | -0.139211 | -0.008475 | -0.044078 | 0.087840 | -0.156706 | -0.008489 | -0.041648 | 0.010630 | 0.016545 | 0.008475 | -0.008480 | -0.011569 | 0.034511 | 0.008481 | -0.022897 | 0.038886 | -0.008784 | 0.008472 | -0.008481 | 0.008471 | -0.008531 | -0.008533 | -0.003868 | -0.044050 | -0.054146 | 0.008459 | 0.022926 | -0.012153 | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
1 | -0.036496 | 1.088871 | -0.182107 | -0.052401 | -0.085668 | -0.026558 | 0.016419 | -0.171927 | 0.060346 | -0.087742 | -0.153631 | -0.086266 | -0.186912 | -0.173041 | -0.111736 | -0.163757 | -0.172424 | -0.023101 | -0.114840 | -0.209060 | -0.067471 | -0.229273 | 0.827724 | 0.093545 | 0.092487 | 0.069331 | 0.087165 | -0.198005 | -0.077002 | -0.053152 | -0.187825 | -0.076646 | 0.043979 | -0.150792 | -0.139548 | -0.074228 | 0.100215 | -0.109180 | -0.186920 | -0.010804 | -0.109135 | -0.403590 | -0.146179 | 0.015458 | 0.038215 | -0.141095 | -0.154017 | -0.152307 | -0.067472 | -0.048196 | -0.157168 | 0.182495 | -0.232185 | 0.099247 | -0.194064 | -0.121115 | -0.153902 | -0.159625 | -0.154439 | -0.020811 | ... | 0.011147 | -0.127655 | 0.007816 | -0.090990 | -0.018944 | -0.014594 | -0.009392 | -0.020234 | -0.079771 | 0.011144 | -0.158489 | -0.020358 | -0.016036 | -0.144829 | -0.011974 | -0.010509 | -0.168016 | 0.011145 | -0.132080 | -0.126937 | 0.011562 | 0.019110 | -0.139211 | -0.008475 | -0.044078 | 0.087840 | -0.156706 | -0.008489 | -0.041648 | 0.010630 | 0.016545 | 0.008475 | -0.008480 | -0.011569 | 0.034511 | 0.008481 | -0.022897 | 0.038886 | -0.008784 | 0.008472 | -0.008481 | 0.008471 | -0.008531 | -0.008533 | -0.003868 | -0.044050 | -0.054146 | 0.008459 | 0.022926 | -0.012153 | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
2 | 0.070766 | -0.189223 | 0.057981 | -0.140868 | 0.021829 | -0.115114 | -0.100801 | 0.073932 | -0.023286 | -0.110754 | -0.067454 | -0.120642 | -0.200108 | 0.077432 | -0.148904 | 0.586702 | -0.089601 | -0.113231 | -0.059961 | -0.105702 | -0.064816 | -0.316157 | -0.231363 | -0.137904 | -0.011700 | -0.191030 | -0.024851 | -0.198005 | -0.105756 | -0.108134 | -0.187825 | -0.076646 | -0.200263 | -0.150792 | -0.139548 | -0.074228 | 0.260842 | -0.062466 | -0.163225 | -0.202685 | -0.029154 | 0.651345 | -0.146179 | -0.042070 | -0.044831 | 0.188303 | -0.064006 | 0.157133 | -0.064817 | -0.048196 | -0.157168 | -0.431703 | -0.250391 | 0.005059 | -0.243632 | 0.327603 | -0.153902 | -0.159625 | 0.228867 | -0.158202 | ... | 0.011147 | -0.127655 | 0.007816 | -0.090990 | -0.018944 | -0.014594 | -0.009392 | -0.020234 | -0.079771 | 0.011144 | -0.158489 | -0.020358 | -0.016036 | -0.144829 | -0.011974 | -0.010509 | -0.168016 | 0.011145 | -0.132080 | -0.126937 | 0.011562 | 0.019110 | -0.139211 | -0.008475 | -0.044078 | 0.087840 | -0.156706 | -0.008489 | -0.041648 | 0.010630 | 0.016545 | 0.008475 | -0.008480 | -0.011569 | 0.034511 | 0.008481 | -0.022897 | 0.038886 | -0.008784 | 0.008472 | -0.008481 | 0.008471 | -0.008531 | -0.008533 | -0.003868 | -0.044050 | -0.054146 | 0.008459 | 0.022926 | -0.012153 | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
3 | -0.039637 | -0.205146 | -0.184401 | -0.159863 | -0.062639 | -0.060387 | -0.197651 | 0.346521 | -0.105029 | -0.087742 | -0.120960 | -0.086266 | -0.221463 | -0.199303 | -0.177723 | -0.171951 | -0.027574 | -0.127111 | -0.117747 | -0.236460 | -0.175166 | -0.230604 | -0.164503 | -0.125771 | -0.142193 | -0.045072 | -0.156540 | -0.209112 | -0.077002 | -0.148004 | -0.242135 | -0.076646 | -0.188544 | -0.116849 | -0.139548 | -0.074228 | -0.128884 | -0.039623 | -0.162012 | -0.192893 | -0.286260 | 0.076386 | -0.146179 | -0.171182 | -0.131627 | -0.123079 | -0.177217 | -0.135453 | -0.175168 | -0.062454 | -0.157168 | 0.120646 | -0.198916 | -0.093394 | -0.245538 | 0.044030 | -0.189950 | 0.105294 | 0.073860 | -0.109958 | ... | 0.011147 | -0.127655 | 0.007816 | -0.090990 | -0.018944 | -0.014594 | -0.009392 | -0.020234 | -0.079771 | 0.011144 | -0.158489 | -0.020358 | -0.016036 | -0.144829 | -0.011974 | -0.010509 | -0.168016 | 0.011145 | -0.132080 | -0.126937 | 0.011562 | 0.019110 | -0.139211 | -0.008475 | -0.044078 | 0.087840 | -0.156706 | -0.008489 | -0.041648 | 0.010630 | 0.016545 | 0.008475 | -0.008480 | -0.011569 | 0.034511 | 0.008481 | -0.022897 | 0.038886 | -0.008784 | 0.008472 | -0.008481 | 0.008471 | -0.008531 | -0.008533 | -0.003868 | -0.044050 | -0.054146 | 0.008459 | 0.022926 | -0.012153 | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
4 | -0.244743 | -0.199970 | -0.265148 | -0.148300 | -0.085668 | -0.182752 | -0.279125 | -0.178592 | -0.283117 | -0.087742 | -0.120960 | -0.086266 | -0.282834 | -0.196378 | -0.306345 | -0.163757 | -0.189353 | -0.135653 | -0.114840 | -0.288391 | -0.302069 | -0.312094 | -0.194654 | -0.204394 | -0.168588 | -0.229982 | -0.185895 | -0.217441 | -0.108710 | -0.183572 | -0.187825 | -0.076646 | -0.259576 | -0.150792 | -0.139548 | -0.074228 | -0.162071 | -0.109180 | -0.218931 | -0.264605 | -0.401198 | -0.402463 | -0.146179 | -0.176925 | -0.219024 | -0.355697 | -0.179565 | -0.341002 | -0.302070 | -0.048196 | -0.176421 | 0.224868 | -0.285642 | -0.201230 | -0.194064 | -0.121115 | -0.153902 | -0.159625 | -0.156845 | -0.177266 | ... | 0.011147 | -0.127655 | 0.007816 | -0.090990 | -0.018944 | -0.014594 | -0.009392 | -0.020234 | -0.079771 | 0.011144 | -0.158489 | -0.020358 | -0.016036 | -0.144829 | -0.011974 | -0.010509 | -0.168016 | 0.011145 | -0.132080 | -0.126937 | 0.011562 | 0.019110 | -0.139211 | -0.008475 | -0.044078 | 0.087840 | -0.156706 | -0.008489 | -0.041648 | 0.010630 | 0.016545 | 0.008475 | -0.008480 | -0.011569 | 0.034511 | 0.008481 | -0.022897 | 0.038886 | -0.008784 | 0.008472 | -0.008481 | 0.008471 | -0.008531 | -0.008533 | -0.003868 | -0.044050 | -0.054146 | 0.008459 | 0.022926 | -0.012153 | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13457 | -0.194605 | -0.204111 | -0.235016 | -0.192665 | -0.095194 | -0.166159 | -0.266567 | -0.157112 | -0.255136 | -0.082809 | -0.144837 | -0.086266 | -0.234505 | -0.179108 | -0.247421 | -0.173251 | -0.202890 | -0.117034 | -0.115165 | -0.249601 | -0.267204 | -0.241517 | -0.233453 | -0.203503 | -0.160887 | -0.201682 | -0.171888 | -0.198005 | -0.077002 | -0.156267 | -0.187825 | -0.076646 | -0.251653 | -0.150792 | -0.139548 | -0.085263 | -0.142678 | -0.124578 | -0.215774 | -0.257401 | -0.138162 | -0.345235 | -0.146179 | -0.164382 | -0.194570 | -0.269117 | -0.180658 | -0.264808 | -0.267205 | -0.048232 | -0.157168 | 0.282769 | -0.277337 | -0.214423 | -0.237723 | -0.089799 | -0.174903 | -0.159625 | -0.081430 | -0.168416 | ... | 0.011147 | -0.127655 | 0.007816 | -0.090990 | -0.018944 | -0.014594 | -0.009392 | -0.020234 | -0.079771 | 0.011144 | -0.158489 | -0.020358 | -0.016036 | -0.144829 | -0.011974 | -0.010509 | -0.168016 | 0.011145 | -0.132080 | -0.126937 | 0.011562 | 0.019110 | -0.139211 | -0.008475 | -0.044078 | 0.087840 | -0.156706 | -0.008489 | -0.041648 | 0.010630 | 0.016545 | 0.008475 | -0.008480 | -0.011569 | 0.034511 | 0.008481 | -0.022897 | 0.038886 | -0.008784 | 0.008472 | -0.008481 | 0.008471 | -0.008531 | -0.008533 | -0.003868 | -0.044050 | -0.054146 | 0.008459 | 0.022926 | -0.012153 | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
13458 | -0.231584 | -0.196071 | -0.240270 | -0.175277 | -0.085668 | 0.100651 | -0.104562 | -0.135625 | -0.215914 | -0.093468 | -0.075270 | 0.064067 | -0.170791 | -0.197127 | -0.291951 | -0.174709 | -0.235937 | -0.132742 | -0.119921 | -0.205796 | -0.223482 | -0.183847 | -0.242438 | -0.113028 | -0.165859 | -0.198643 | -0.352867 | -0.101828 | -0.078867 | 0.014294 | -0.243134 | -0.077372 | -0.178927 | -0.182819 | -0.139548 | 0.072025 | -0.158947 | -0.109180 | -0.191181 | -0.192376 | -0.015725 | -0.131649 | -0.146179 | -0.104321 | -0.381811 | -0.255928 | -0.208644 | -0.257609 | -0.223483 | -0.037582 | -0.157168 | 0.219093 | -0.215293 | -0.297951 | -0.196203 | -0.126649 | 0.038585 | -0.159625 | -0.140010 | -0.169934 | ... | 0.011147 | -0.127655 | 0.007816 | -0.090990 | -0.018944 | -0.014594 | -0.009392 | -0.020234 | -0.079771 | 0.011144 | -0.158489 | -0.020358 | -0.016036 | -0.144829 | -0.011974 | -0.010509 | -0.168016 | 0.011145 | -0.132080 | -0.126937 | 0.011562 | 0.019110 | -0.139211 | -0.008475 | -0.044078 | 0.087840 | -0.156706 | -0.008489 | -0.041648 | 0.010630 | 0.016545 | 0.008475 | -0.008480 | -0.011569 | 0.034511 | 0.008481 | -0.022897 | 0.038886 | -0.008784 | 0.008472 | -0.008481 | 0.008471 | -0.008531 | -0.008533 | -0.003868 | -0.044050 | -0.054146 | 0.008459 | 0.022926 | -0.012153 | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
13459 | -0.172396 | -0.090448 | -0.126067 | -0.083162 | -0.085668 | -0.109957 | -0.217281 | -0.088728 | -0.182285 | -0.087742 | -0.123844 | -0.120632 | -0.234071 | -0.054888 | -0.188660 | -0.163757 | -0.128242 | -0.031042 | -0.114840 | -0.221879 | -0.212135 | -0.252805 | -0.194654 | -0.079903 | -0.078436 | -0.206288 | -0.167967 | 0.184241 | -0.077002 | -0.023900 | -0.187825 | -0.076646 | -0.186352 | -0.150792 | -0.139548 | 0.849701 | -0.147337 | -0.109180 | -0.130924 | -0.183368 | 0.092909 | -0.037259 | -0.222978 | -0.104361 | -0.472813 | -0.247493 | -0.145101 | -0.243325 | -0.212136 | -0.048196 | -0.157168 | 0.284709 | -0.227859 | -0.224250 | -0.194064 | -0.041249 | -0.132306 | -0.192049 | -0.032639 | -0.181799 | ... | 0.011101 | -0.091564 | -0.763986 | -0.071247 | 0.996515 | -0.014163 | -0.009115 | 0.068415 | 0.295576 | 0.011159 | -0.254426 | 0.988053 | 0.000208 | 0.124612 | -0.011954 | -0.011675 | 0.593479 | 0.011130 | 2.383589 | 2.401008 | 0.011301 | -1.000220 | 0.351214 | -0.008270 | -0.051766 | 1.752778 | -0.611433 | -0.008477 | -0.053741 | -0.113486 | 0.027801 | 0.008361 | -0.008459 | -0.013867 | 0.070320 | 0.008370 | -0.032261 | 0.038886 | -0.008778 | 0.008361 | -0.008370 | 0.008355 | -0.008532 | -0.008450 | -0.472817 | 0.014996 | -0.339346 | 0.008169 | 0.032168 | 0.010236 | 0.103209 | -0.459342 | -0.013455 | -0.022908 | -0.067021 | -0.013149 | -0.008900 | -0.020502 | -0.563696 | -0.051747 |
13460 | 0.220213 | 0.428407 | 0.539064 | 0.129878 | 0.930931 | 0.152119 | 0.261990 | 0.167506 | 0.343409 | -0.036143 | -0.133788 | -0.086266 | 0.574080 | 1.685421 | 0.752913 | -0.025926 | -0.010816 | 0.002251 | 0.311119 | 0.653651 | 0.514408 | 0.162320 | -0.251056 | 0.137400 | -0.154652 | -0.148068 | 0.305776 | 2.339665 | -0.077002 | -0.083185 | -0.018381 | 0.503922 | 0.097436 | -0.155138 | -0.158031 | -0.074228 | 0.376548 | 0.404251 | 1.020602 | 0.333509 | -0.014059 | 1.199035 | -0.146179 | 0.190460 | 0.870264 | 0.871184 | -0.024417 | 0.765729 | 0.514407 | -0.079826 | 0.026608 | -0.935087 | 0.135455 | 0.398947 | -0.243537 | -0.125528 | -0.021834 | -0.024758 | -0.114136 | 0.050575 | ... | 0.011147 | -0.127655 | 0.007816 | -0.090990 | -0.018944 | -0.014594 | -0.009392 | -0.020234 | -0.079771 | 0.011144 | -0.158489 | -0.020358 | -0.016036 | -0.144829 | -0.011974 | -0.010509 | -0.168016 | 0.011145 | -0.132080 | -0.126937 | 0.011562 | 0.019110 | -0.139211 | -0.008475 | -0.044078 | 0.087840 | -0.156706 | -0.008489 | -0.041648 | 0.010630 | 0.016545 | 0.008475 | -0.008480 | -0.011569 | 0.034511 | 0.008481 | -0.022897 | 0.038886 | -0.008784 | 0.008472 | -0.008481 | 0.008471 | -0.008531 | -0.008533 | -0.003868 | -0.044050 | -0.054146 | 0.008459 | 0.022926 | -0.012153 | -0.071554 | -0.120861 | -0.030988 | -0.022326 | -0.046674 | -0.012361 | -0.008952 | -0.031444 | -0.112856 | -0.047602 |
13461 | 0.015222 | 0.092724 | 0.156657 | 0.054408 | -0.085668 | -0.104527 | 0.074379 | -0.154946 | 0.042608 | -0.085088 | -0.117501 | -0.086266 | 0.083504 | -0.131142 | -0.256025 | -0.170433 | 0.338018 | -0.074333 | 0.013288 | -0.046307 | 0.001719 | -0.154665 | 0.307370 | 0.160793 | -0.096559 | -0.026013 | -0.139392 | -0.198598 | -0.077002 | 0.065349 | 0.128240 | -0.076646 | 0.029736 | 0.175793 | -0.115495 | -0.083274 | 0.024891 | -0.113892 | 0.041858 | 0.034694 | -0.113173 | 0.029863 | -0.146179 | -0.023991 | -0.028819 | -0.031130 | -0.140034 | -0.050337 | 0.001718 | -0.020687 | -0.167773 | 0.198412 | -0.077565 | 0.225320 | -0.194064 | -0.131067 | -0.063616 | -0.128632 | -0.114486 | -0.004891 | ... | 0.011075 | 0.102449 | -0.414292 | -0.090990 | -0.811893 | -0.014594 | -0.009182 | 0.034719 | -0.130274 | 0.011120 | -0.128349 | 0.499511 | -0.014519 | 1.970255 | -0.011936 | -0.004084 | -0.587565 | 0.011085 | -0.768414 | -0.818383 | 0.011551 | 0.806651 | -0.422075 | -0.008475 | -0.038851 | 0.377146 | -0.372627 | -0.008509 | -0.038106 | 0.069489 | 0.016263 | 0.008474 | -0.008479 | -0.008330 | 0.041396 | 0.008481 | -0.024758 | 0.015333 | -0.008814 | 0.008469 | -0.008481 | 0.008468 | -0.008534 | -0.008605 | -0.058418 | 0.213152 | -0.037802 | 0.008452 | 0.024738 | -0.018517 | -0.040714 | -0.462918 | 0.015973 | -0.025581 | 0.770781 | 0.029360 | -0.008998 | 0.032707 | -0.261838 | 0.219923 |
13462 rows × 259 columns
3.1 Logistic Regression 调参过程
在模型中先固定参数的默认值,然后进行参数调节,进行网格搜索与专业文献查阅寻找精确度最高而又不引起模型过拟合的参数值.模型优化评价指标为 AUC 值.
在逻辑回归模型中,需要调整的参数共有 2 个:penalty 与 C. 其中 penalty 是正则化方法;C 为逻辑回归中的超参数,表示正则化强度的倒数,在模型中默认为 1,表示正则项与损失函数的比值为 1:1. C 越小,损失函数相对于正则项的权重越小,对模型参数的惩罚越重,正则化作用越强.
from sklearn.linear_model import LogisticRegression
# Logistic regression with an L2 penalty, tuning the inverse regularisation
# strength C (values tried: 0.05, 0.1, 0.2, 0.3).
# max_iter is raised above the default of 100 because lbfgs stopped at the
# iteration limit without converging on this data (ConvergenceWarning), which
# made the reported AUC depend on where the optimiser happened to be cut off.
clf1 = LogisticRegression(C=0.2, penalty="l2", max_iter=1000).fit(X_train, y_train)
# Predicted probability of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = clf1.predict_proba(test_data2)[:, 1]
# ROC curve on the validation labels and the area under it (AUC, not recall).
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\sklearn\linear_model\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
0.884673004897724
# Logistic regression, tuning the penalty type (options listed: l1, l2, none).
# NOTE(review): the recorded output of this cell shows lbfgs hitting the
# iteration limit (ConvergenceWarning); consider raising max_iter.
clf2 = LogisticRegression(penalty="none")
clf2.fit(X_train, y_train)
# Column 1 of predict_proba holds the score for the positive class (FLAG == 1).
y_pred_gbc = clf2.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\sklearn\linear_model\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
0.9013195044655719
逻辑回归属于线性判别模型,而本文所处理的数据集维度较高,故可能存在其他非线性模型能够表现的更好.
3.2 SVM 调参过程
SVM 需要调整的参数有 2 个,分别为 kernel 和 C,
其中 kernel 代表核方法,可选的函数有:“poly”:多项式核函数,“rbf”:高斯核函数 (径向基函数),“linear”:线性核函数,“sigmoid”:Sigmoid 核函数.核函数在 SVM 中发挥着重要功能,在简化向量内积运算方面起着重要作用,其中高斯核函数在非线性分类问题上广泛应用.
C 代表错误项的惩罚系数,在软间隔分类中应用较多.C 越大,对错误样本的惩罚力度就越大,训练的样本准确率越高.但是容易产生过拟合现象,机器模型的泛化能力降低.相反,C 取较小的值时,允许训练样本中存在错误分类的样本,能够增强模型的泛化能力.
from sklearn import svm
# SVM, tuning the kernel (options: "linear", "rbf", "sigmoid", "poly").
# probability=True is required so that predict_proba is available below.
svm1 = svm.SVC(probability=True, kernel='rbf')
svm1.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = svm1.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
0.8988533563814463
# SVM, tuning the error-penalty coefficient C; the kernel is left at the
# library default. (The original comment listed the kernel options, but this
# cell varies C; the range of C values tried is not recorded.)
svm2 = svm.SVC(probability=True, C=0.003)
svm2.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = svm2.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
0.8741169691731491
3.3 RF 调参过程
在随机森林模型中需要调节的参数有 4 个,分别为
max_depth:树的最大深度、
n_estimators:树模型的数量、
min_samples_split:中间节点分支所需的最小样本数量、
min_samples_leaf:叶节点存在所需的最小样本数量.
为了防止模型出现过拟合现象,本文在调节其他参数时控制 max_depth=3.
from sklearn.ensemble import RandomForestClassifier
# Random forest, tuning max_depth (values tried: 3, 5, 7, 8, 11, 13).
RF1 = RandomForestClassifier(random_state=0, max_depth=3)
RF1.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = RF1.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
0.8762738884087198
# Random forest, tuning n_estimators (values tried: 300, 400, 500, 600, 700)
# with max_depth fixed at 3.
RF2 = RandomForestClassifier(max_depth=3, n_estimators=600, random_state=0)
RF2.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = RF2.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
0.8856746374723902
# Random forest, tuning min_samples_leaf (listed range: 10, 20, 40, 60, 70,
# 80, 100) with max_depth fixed at 3.
# NOTE(review): the value used here, 30, is not in the listed range.
RF3 = RandomForestClassifier(max_depth=3, min_samples_leaf=30, random_state=0)
RF3.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = RF3.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
0.8766320944972631
# Random forest, tuning min_samples_split (values tried: 60, 70, 80, 90, 110,
# 130) with max_depth fixed at 3.
RF4 = RandomForestClassifier(max_depth=3, min_samples_split=80, random_state=0)
RF4.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = RF4.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
0.8757802746566791
3.4 DT 调参过程
在决策树模型中需要调整的参数共有 4 个,分别为
max_depth:树的最大深度、
min_samples_split:中间节点分支所需要的最小样本量、
min_samples_leaf:叶节点存在所需的最小样本量、
max_leaf_nodes:最大叶子节点数.
为了防止模型出现过拟合现象,本文调节其他参数对模型的 AUC 影响时控制 max_depth=6
from sklearn import tree
# Decision tree, tuning max_depth (values tried: 5, 6, 7, 8, 9, 10).
DT1 = tree.DecisionTreeClassifier(max_depth=6)
DT1.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = DT1.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
0.9026846249879958
# Decision tree, tuning min_samples_leaf (values tried: 2, 3, 6, 8) with
# max_depth fixed at 6. Note that this rebinds DT1 from the max_depth cell.
DT1 = tree.DecisionTreeClassifier(max_depth=6, min_samples_leaf=3)
DT1.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = DT1.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
0.9008700662633248
# Decision tree, tuning max_leaf_nodes (values tried: 50, 60, 70, 80, 100)
# with max_depth fixed at 6.
DT2 = tree.DecisionTreeClassifier(max_depth=6, max_leaf_nodes=70)
DT2.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = DT2.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
0.9006472678382791
# To limit overfitting, max_depth is held at 6 while other parameters are tuned.
# Decision tree, tuning min_samples_split (values tried: 2, 3, 4, 5, 6, 8).
DT3 = tree.DecisionTreeClassifier(max_depth=6, min_samples_split=3)
DT3.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = DT3.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
0.9005358686257564
——筛选在DT算法中特征重要性系数前20个指标
# Rank features by DT3's importance scores (scaled by 10000 for readability)
# and display the 20 highest.
# NOTE(review): assumes a2.columns lists the features in the same order as
# the columns of X_train - confirm against where a2 is built.
DT_importances = 10000 * DT3.feature_importances_
DT = (
    pd.Series(DT_importances, index=a2.columns)
    .sort_values(ascending=False)
    .to_frame(name='feature_importances')
)
DT.head(20)
feature_importances | |
---|---|
DILUTED_EPS | 1876.459630 |
ESTIMATED_LIAB | 1418.956658 |
RETAINED_EARNINGS | 1274.737100 |
ASSETS_DISP_GAIN | 1189.341616 |
C_FR_CAP_CONTR | 350.146297 |
CASH_C_EQUIV | 305.869870 |
DEFER_TAX_LIAB | 305.319042 |
INT_RECEIV | 278.811896 |
CURRENT_RATIO | 254.761597 |
N_CF_FR_FINAN_A | 245.185566 |
OTH_GAIN | 214.434895 |
GOODWILL | 207.893979 |
N_INCOME | 199.309895 |
CL_TA | 190.726453 |
NOPERATE_EXP | 180.024039 |
OTH_CL | 167.889986 |
GAIN_INVEST | 160.657452 |
DIV_PAYABLE | 157.439514 |
IT_TR | 117.051125 |
A_J_INVEST_INCOME | 101.434231 |
3.5 XGBoost 调参过程
XGBoost 需要调节的参数共有 9 个,下面本文只介绍对该模型相对重要的两个参数:
第一个参数是 n_estimators,在 XGBoost 模型中这个参数发挥着重要作用,表示该模型中分类器的个数,该参数的值越大,模型的学习能力就会越强.
第二个参数是learning_rate,learning_rate 表示集成模型中的学习速率,又被称之为步长控制迭代速率,有效的调节该参数值能够防止模型出现过拟合现象,默认值为 0.1,调节范围为[0,1].
本文为了尽可能防止模型出现过拟合,在调节其他参数的值时将学习率设定为0.001.
from xgboost import XGBClassifier
from xgboost import plot_importance
# XGBoost, tuning learning_rate (values tried: 0.001, 0.002, 0.003, 0.0035).
XGBoost1 = XGBClassifier(learning_rate=0.001)
XGBoost1.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = XGBoost1.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[22:58:21] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9089825218476904
# To limit overfitting, learning_rate is held at 0.001 while other parameters
# are tuned.
# XGBoost, tuning n_estimators (values tried: 100, 110, 120, 200, 300).
XGBoost2 = XGBClassifier(learning_rate=0.001, n_estimators=120)
XGBoost2.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = XGBoost2.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[22:58:59] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
0.911076058772688
# To limit overfitting, learning_rate is held at 0.001 while other parameters
# are tuned.
# XGBoost, tuning max_depth (values tried: 2, 3, 5, 6, 7, 10).
XGBoost3 = XGBClassifier(learning_rate=0.001, max_depth=6)
XGBoost3.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = XGBoost3.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[22:59:42] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9089825218476904
# To limit overfitting, learning_rate is held at 0.001 while other parameters
# are tuned.
# XGBoost, tuning min_child_weight (values tried: 1, 3, 4, 5, 7, 8).
XGBoost4 = XGBClassifier(learning_rate=0.001, min_child_weight=3)
XGBoost4.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = XGBoost4.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[23:00:24] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9118438490348603
# To limit overfitting, learning_rate is held at 0.001 while other parameters
# are tuned.
# XGBoost, tuning gamma (listed range: 0.2, 0.3, 0.5, 0.6, 0.7, 0.8).
# NOTE(review): the value used here, 0.4, is not in the listed range.
XGBoost5 = XGBClassifier(learning_rate=0.001, gamma=0.4)
XGBoost5.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = XGBoost5.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[23:01:06] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9089844425237683
# To limit overfitting, learning_rate is held at 0.001 while other parameters
# are tuned.
# XGBoost, tuning colsample_bytree (values tried: 0.6, 0.7, 0.8, 0.85, 0.9).
# The keyword must be spelled colsample_bytree: the earlier spelling
# "colsample_btree" is not an XGBoost parameter, so the library warned that it
# might not be used and the run scored exactly the same AUC as the
# default-parameter model.
XGBoost7 = XGBClassifier(colsample_bytree=0.85, learning_rate=0.001).fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = XGBoost7.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
[23:01:49] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "colsample_btree" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[23:01:49] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9089825218476904
# To limit overfitting, learning_rate is held at 0.001 while other parameters
# are tuned.
# XGBoost, tuning reg_alpha (values tried: 0.1, 0.2, 0.25, 0.3, 0.35).
XGBoost8 = XGBClassifier(learning_rate=0.001, reg_alpha=0.2)
XGBoost8.fit(X_train, y_train)
# Score of the positive class (FLAG == 1) for each validation row.
y_pred_gbc = XGBoost8.predict_proba(test_data2)[:, 1]
# Validation ROC curve and the area under it.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[23:02:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9087702871410737
# Tuning step for reg_lambda; values tried: [0.15, 0.3, 0.5, 0.8].
# NOTE(review): the original comment said learning_rate is held at 1 while the
# other parameters are tuned (to guard against overfitting), but the code
# passes learning_rate=0.001 -- confirm which value is intended.
XGBoost9 = XGBClassifier(learning_rate=0.001, reg_lambda=0.3)
XGBoost9.fit(X_train, y_train)
# Keep only column 1 of the predicted probabilities as the score for roc_curve.
y_pred_gbc = XGBoost9.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
# Bare expression so the notebook displays the AUC for this setting.
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[23:03:49] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
0.9160871026601364
——筛选在XGBoost算法中特征重要性系数前20个指标
# Feature importances of the XGBoost9 model, scaled by 10000.
XGBoost_importances = 10000 * XGBoost9.feature_importances_
# Label each importance with the matching column of a2, sort from largest to
# smallest, and wrap the result in a one-column DataFrame.
XGBoost = pd.DataFrame({
    'feature_importances': pd.Series(
        XGBoost_importances, index=a2.columns
    ).sort_values(ascending=False)
})
# Display the 20 highest-ranked features.
XGBoost.head(n=20)
feature_importances | |
---|---|
DILUTED_EPS | 1092.639893 |
ASSETS_DISP_GAIN | 839.089539 |
RETAINED_EARNINGS | 501.631378 |
T_CA | 428.929138 |
ESTIMATED_LIAB | 363.835968 |
DEFER_TAX_LIAB | 311.677368 |
CURRENT_RATIO | 294.901703 |
N_CF_FR_FINAN_A | 284.164001 |
GOODWILL | 239.289185 |
N_INCOME | 238.080658 |
CL_TA | 224.853226 |
INVENTORIES | 224.492996 |
ROE_A | 214.091843 |
NOPERATE_EXP | 196.429474 |
OTH_CA | 192.453537 |
OTH_CL | 191.517349 |
C_FR_MINO_S_SUBS | 191.478973 |
GAIN_INVEST | 187.904297 |
C_INF_FR_INVEST_A | 180.430847 |
CASH_C_EQUIV | 178.665329 |
3.6 GBM 调参过程
该模型需要添加的参数共有 7 个,本文选取了对该模型相对重要的几个参数进行调节.
第一个参数是:max_depth:模型中树的最大深度.
第二个参数是 n_estimators:模型中分类器的数量,该参数在模型中的作用较为强大,可以有效地提升模型的学习能力.
第三个参数是 learning_rate:学习率,该参数的有效调节对模型是否会过拟合发挥着重要作用,参数的取值范围为 [0,1],默认值为 0.1.为了能够有效地提升模型的泛化能力并且防止模型出现过拟合现象,本文经过网格搜索法并且查阅大量机器学习专业文献将 learning_rate 设置为 0.0088.
from sklearn.ensemble import GradientBoostingClassifier
# Tuning step for learning_rate; values tried:
# [0.004, 0.007, 0.0076, 0.0088, 0.009].
GBM1 = GradientBoostingClassifier(learning_rate=0.0088)
GBM1.fit(X_train, y_train)
# Keep only column 1 of the predicted probabilities as the score for roc_curve.
y_pred_gbc = GBM1.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
# Bare expression so the notebook displays the AUC for this setting.
roc_auc
0.9016575434552964
# Tuning step for n_estimators; values tried: [110, 120, 130, 140, 160].
GBM2 = GradientBoostingClassifier(learning_rate=0.0088, n_estimators=130)
GBM2.fit(X_train, y_train)
# Keep only column 1 of the predicted probabilities as the score for roc_curve.
y_pred_gbc = GBM2.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
# Bare expression so the notebook displays the AUC for this setting.
roc_auc
0.9171185057140113
# Tuning step for subsample; values tried: [0.1, 0.2, 0.25, 0.3, 0.4].
# NOTE(review): no random_state is passed, and subsample < 1 fits each tree on
# a random fraction of the rows, so the AUC is presumably not reproducible
# from run to run -- confirm whether a fixed seed is wanted.
GBM3 = GradientBoostingClassifier(learning_rate=0.0088, subsample=0.3)
GBM3.fit(X_train, y_train)
# Keep only column 1 of the predicted probabilities as the score for roc_curve.
y_pred_gbc = GBM3.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
# Bare expression so the notebook displays the AUC for this setting.
roc_auc
0.9131902429655239
# Tuning step for min_samples_split; values tried: [2, 3, 4, 5, 6].
GBM4 = GradientBoostingClassifier(learning_rate=0.0088, min_samples_split=4)
GBM4.fit(X_train, y_train)
# Keep only column 1 of the predicted probabilities as the score for roc_curve.
y_pred_gbc = GBM4.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
# Bare expression so the notebook displays the AUC for this setting.
roc_auc
0.9016575434552964
# Tuning step for min_samples_leaf; values tried: [2, 3, 4, 6, 7, 9].
GBM5 = GradientBoostingClassifier(learning_rate=0.0088, min_samples_leaf=3)
GBM5.fit(X_train, y_train)
# Keep only column 1 of the predicted probabilities as the score for roc_curve.
y_pred_gbc = GBM5.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
# Bare expression so the notebook displays the AUC for this setting.
roc_auc
0.9021089023336215
# Tuning step for max_depth; values tried: [2, 3, 4, 5, 8].
# NOTE(review): 3 is scikit-learn's documented default for max_depth, so this
# configuration presumably matches GBM1 (the displayed AUC is identical).
GBM6 = GradientBoostingClassifier(learning_rate=0.0088, max_depth=3)
GBM6.fit(X_train, y_train)
# Keep only column 1 of the predicted probabilities as the score for roc_curve.
y_pred_gbc = GBM6.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
# Bare expression so the notebook displays the AUC for this setting.
roc_auc
0.9016575434552964
# Tuning step for validation_fraction; values tried:
# [0.1, 0.3, 0.4, 0.5, 0.7, 0.8].
# NOTE(review): per the scikit-learn docs, validation_fraction is only used
# when n_iter_no_change is set, which it is not here; the displayed AUC is
# identical to GBM1's -- confirm whether early stopping was intended.
GBM7 = GradientBoostingClassifier(learning_rate=0.0088, validation_fraction=0.1)
GBM7.fit(X_train, y_train)
# Keep only column 1 of the predicted probabilities as the score for roc_curve.
y_pred_gbc = GBM7.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(x=fpr, y=tpr)
# Bare expression so the notebook displays the AUC for this setting.
roc_auc
0.9016575434552964
——筛选在GBM算法中特征重要性系数前20个指标
# Feature importances of the GBM2 model, scaled by 10000.
GBM_importances = 10000 * GBM2.feature_importances_
# Label each importance with the matching column of a2, sort from largest to
# smallest, and wrap the result in a one-column DataFrame.
GBM = pd.DataFrame({
    'feature_importances': pd.Series(
        GBM_importances, index=a2.columns
    ).sort_values(ascending=False)
})
# Display the 20 highest-ranked features.
GBM.head(n=20)
feature_importances | |
---|---|
DILUTED_EPS | 2545.405622 |
ASSETS_DISP_GAIN | 1393.862699 |
RETAINED_EARNINGS | 1201.324132 |
ESTIMATED_LIAB | 1123.032285 |
NCA_DISPLOSS | 496.913323 |
C_FR_CAP_CONTR | 429.072252 |
OTH_GAIN | 392.266813 |
NOPERATE_EXP | 216.007424 |
PROC_SELL_INVEST | 191.362051 |
T_CA | 179.957164 |
DEFER_TAX_LIAB | 172.171302 |
N_CF_FR_INVEST_A | 157.682156 |
INT_PAYABLE | 157.357831 |
DIV_PAYABLE | 103.399260 |
C_PAID_OTH_FINAN_A | 95.313147 |
INT_RECEIV | 87.237988 |
REV_PS | 86.175900 |
C_INF_FR_INVEST_A | 76.066347 |
ADVANCE_RECEIPTS | 64.396530 |
T_EQUITY_ATTR_P | 62.686717 |