python: 模型的统计信息

</head> <body>
In [57]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.graphics.mosaicplot import mosaic

%pylab inline
mpl.rcParams['font.sans-serif'] = ['SimHei'] # 指定默认字体
mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号 '-' 显示为方块的问题

Populating the interactive namespace from numpy and matplotlib

载入和分析数据

In [4]:
def readData(path):
    """
    使用pandas读取数据
    """
    data = pd.read_csv(path)
    cols = ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week", "label"]
    return data[cols]

data = readData('dataset/adult.data')

In [5]:
data.hist(rwidth=0.9, grid=False, figsize=(8, 8), alpha=0.6)   # 各个变量的直方图
plt.show()
In [6]:
data.head()
Out[6]:
age education_num capital_gain capital_loss hours_per_week label
0 39 13 2174 0 40 <=50K
1 50 13 0 0 13 <=50K
2 38 9 0 0 40 <=50K
3 53 7 0 0 40 <=50K
4 28 13 0 0 40 <=50K
In [8]:
data['label_code'] = pd.Categorical(data.label).codes   # 将文字变量转化为数字变量
In [10]:
data[['label', 'label_code']].head(8)
Out[10]:
label label_code
0 <=50K 0
1 <=50K 0
2 <=50K 0
3 <=50K 0
4 <=50K 0
5 <=50K 0
6 <=50K 0
7 >50K 1
In [18]:
def visualData(df):
    """
    画直方图,直观了解数据
    """
    df.hist(
        rwidth=0.9, grid=False, figsize=(8, 8), alpha=0.6, color="grey")
    plt.show(block=False)
In [19]:
df = data[["age", "education_num", "capital_gain", "capital_loss", "hours_per_week", "label_code"]]
visualData(df)
In [20]:
data.describe()  # 仅仅显示数值型变量的基本统计信息,若要显示全部变量的统计信息只需传入参数 `include='all'`
Out[20]:
age education_num capital_gain capital_loss hours_per_week label_code
count 32561.000000 32561.000000 32561.000000 32561.000000 32561.000000 32561.000000
mean 38.581647 10.080679 1077.648844 87.303830 40.437456 0.240810
std 13.640433 2.572720 7385.292085 402.960219 12.347429 0.427581
min 17.000000 1.000000 0.000000 0.000000 1.000000 0.000000
25% 28.000000 9.000000 0.000000 0.000000 40.000000 0.000000
50% 37.000000 10.000000 0.000000 0.000000 40.000000 0.000000
75% 48.000000 12.000000 0.000000 0.000000 45.000000 0.000000
max 90.000000 16.000000 99999.000000 4356.000000 99.000000 1.000000

交叉报表

交叉报表用来描述两个变量之间的关系。

  • 按四分位数划分为 $4$ 个区间
In [23]:
qeducation_num = pd.qcut(data['education_num'], [.25, .5, .75, 1])  
qeducation_num
Out[23]:
0         (12.0, 16.0]
1         (12.0, 16.0]
2        (8.999, 10.0]
3                  NaN
4         (12.0, 16.0]
5         (12.0, 16.0]
6                  NaN
7        (8.999, 10.0]
8         (12.0, 16.0]
9         (12.0, 16.0]
10       (8.999, 10.0]
11        (12.0, 16.0]
12        (12.0, 16.0]
13        (10.0, 12.0]
14        (10.0, 12.0]
15                 NaN
16       (8.999, 10.0]
17       (8.999, 10.0]
18                 NaN
19        (12.0, 16.0]
20        (12.0, 16.0]
21       (8.999, 10.0]
22                 NaN
23                 NaN
24       (8.999, 10.0]
25        (12.0, 16.0]
26       (8.999, 10.0]
27       (8.999, 10.0]
28       (8.999, 10.0]
29       (8.999, 10.0]
             ...      
32531     (12.0, 16.0]
32532     (12.0, 16.0]
32533     (12.0, 16.0]
32534    (8.999, 10.0]
32535              NaN
32536     (12.0, 16.0]
32537    (8.999, 10.0]
32538     (12.0, 16.0]
32539     (12.0, 16.0]
32540    (8.999, 10.0]
32541    (8.999, 10.0]
32542    (8.999, 10.0]
32543     (10.0, 12.0]
32544     (12.0, 16.0]
32545     (10.0, 12.0]
32546     (10.0, 12.0]
32547    (8.999, 10.0]
32548     (12.0, 16.0]
32549    (8.999, 10.0]
32550    (8.999, 10.0]
32551              NaN
32552     (10.0, 12.0]
32553     (12.0, 16.0]
32554     (12.0, 16.0]
32555    (8.999, 10.0]
32556     (10.0, 12.0]
32557    (8.999, 10.0]
32558    (8.999, 10.0]
32559    (8.999, 10.0]
32560    (8.999, 10.0]
Name: education_num, Length: 32561, dtype: category
Categories (3, interval[float64]): [(8.999, 10.0] < (10.0, 12.0] < (12.0, 16.0]]
  • 计算 education_numlabel 的交叉报表
In [48]:
cross1 = pd.crosstab(data['label'], qeducation_num)
cross1
Out[48]:
education_num (8.999, 10.0] (10.0, 12.0] (12.0, 16.0]
label
<=50K 14730 1823 4158
>50K 3062 626 3909

cross1 图表化

In [65]:
props = lambda key: {"color": "0.45"} if ' >50K' in key else {"color": "#C6E2FF"}
mosaic(cross1.stack(), gap=0.05,properties=props, title='label 和 education_num 的交叉报表')
plt.show()

搭建模型,并训练模型

statsmodels 建模时可以使用类似文字的表达式:

formula 定义了模型的形式, ~ 相当于等号.

In [67]:
def trainModel(data):
    """
    搭建逻辑回归模型,并训练模型
    """
    formula = "label_code ~ age + education_num + capital_gain + capital_loss + hours_per_week"
    model = sm.Logit.from_formula(formula, data=data)
    re = model.fit()
    return re
In [70]:
def modelSummary(re):
    """
    分析逻辑回归模型的统计性质
    """
    # 整体统计分析结果
    print(re.summary())
    # 用f test检验education_num的系数是否显著
    print("检验假设education_num的系数等于0:")
    print(re.f_test("education_num=0"))
    # 用f test检验两个假设是否同时成立
    print("检验假设education_num的系数等于0.32和hours_per_week的系数等于0.04同时成立:")
    print(re.f_test("education_num=0.32, hours_per_week=0.04"))

def interpretModel(re):
"""
理解模型结果

参数
----
re :BinaryResults,训练好的逻辑回归模型
"""
conf = re.conf_int()
conf['OR'] = re.params
# 计算各个变量对事件发生比的影响
# conf里面的三列,分别对应着估计值的下界、上界和估计值本身
conf.columns = ['2.5%', '97.5%', 'OR']
print("各个变量对事件发生比的影响:")
print(np.exp(conf))
# 计算各个变量的边际效应
print("各个变量的边际效应:")
print(re.get_margeff(at="overall").summary())

def makePrediction(re, testSet, alpha=0.5):
"""
使用训练好的模型对测试数据做预测
"""
# 关闭pandas有关chain_assignment的警告
pd.options.mode.chained_assignment = None
# 计算事件发生的概率
testSet["prob"] = re.predict(testSet)
print("事件发生概率(预测概率)大于0.6的数据个数:")
print(testSet[testSet["prob"] > 0.6].shape[0]) # 输出值为576
print("事件发生概率(预测概率)大于0.5的数据个数:")
print(testSet[testSet["prob"] > 0.5].shape[0]) # 输出值为834
# 根据预测的概率,得出最终的预测
testSet["pred"] = testSet.apply(lambda x: 1 if x["prob"] > alpha else 0, axis=1)
return testSet

def evaluation(re):
"""
计算预测结果的查准查全率以及f1

参数
----
re :DataFrame,预测结果,里面包含两列:真实值‘lable_code’、预测值‘pred’
"""
bins = np.array([0, 0.5, 1])
label = re["label_code"]
pred = re["pred"]
tp, fp, fn, tn = np.histogram2d(label, pred, bins=bins)[0].flatten()
precision = tp / (tp + fp) # 0.951
recall = tp / (tp + fn) # 0.826
f1 = 2 precision recall / (precision + recall) # 0.884
print("查准率: %.3f, 查全率: %.3f, f1: %.3f" % (precision, recall, f1))

In [72]:
# 将数据分为训练集和测试集
trainSet, testSet = train_test_split(data, test_size=0.2, random_state=2310)

# 训练模型并分析模型效果
re = trainModel(trainSet)
modelSummary(re)
interpretModel(re)

Optimization terminated successfully.
         Current function value: 0.406094
         Iterations 8
                           Logit Regression Results                           
==============================================================================
Dep. Variable:             label_code   No. Observations:                26048
Model:                          Logit   Df Residuals:                    26042
Method:                           MLE   Df Model:                            5
Date:                Wed, 22 Aug 2018   Pseudo R-squ.:                  0.2639
Time:                        19:29:31   Log-Likelihood:                -10578.
converged:                       True   LL-Null:                       -14370.
                                        LLR p-value:                     0.000
==================================================================================
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -8.2970      0.128    -64.623      0.000      -8.549      -8.045
age                0.0435      0.001     31.726      0.000       0.041       0.046
education_num      0.3215      0.008     42.231      0.000       0.307       0.336
capital_gain       0.0003   1.07e-05     29.650      0.000       0.000       0.000
capital_loss       0.0007   3.64e-05     20.055      0.000       0.001       0.001
hours_per_week     0.0399      0.001     26.995      0.000       0.037       0.043
==================================================================================
检验假设education_num的系数等于0:
<F test: F=array([[1783.4276255]]), p=0.0, df_denom=26042, df_num=1>
检验假设education_num的系数等于0.32和hours_per_week的系数等于0.04同时成立:
<F test: F=array([[0.01940236]]), p=0.9807846677772952, df_denom=26042, df_num=2>
各个变量对事件发生比的影响:
                    2.5%     97.5%        OR
Intercept       0.000194  0.000321  0.000249
age             1.041611  1.047218  1.044411
education_num   1.358725  1.399879  1.379149
capital_gain    1.000298  1.000340  1.000319
capital_loss    1.000659  1.000802  1.000731
hours_per_week  1.037733  1.043769  1.040746
各个变量的边际效应:
        Logit Marginal Effects       
=====================================
Dep. Variable:             label_code
Method:                          dydx
At:                           overall
==================================================================================
                    dy/dx    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
age                0.0056      0.000     33.563      0.000       0.005       0.006
education_num      0.0413      0.001     47.313      0.000       0.040       0.043
capital_gain     4.09e-05    1.3e-06     31.500      0.000    3.84e-05    4.34e-05
capital_loss    9.372e-05   4.54e-06     20.648      0.000    8.48e-05       0.000
hours_per_week     0.0051      0.000     28.167      0.000       0.005       0.005
==================================================================================
In [73]:
re = makePrediction(re, testSet)
evaluation(re)
事件发生概率(预测概率)大于0.6的数据个数:
576
事件发生概率(预测概率)大于0.5的数据个数:
834
查准率: 0.951, 查全率: 0.826, f1: 0.884
posted @ 2018-08-28 14:11  xinet  阅读(886)  评论(0编辑  收藏  举报