信用卡欺诈检测

  数据集在kaggle在搜索即可。此数据正样本和负样本的的比例不平衡,因此可以采用下采样或者过采样的方法使样本均衡。

 

  

import matplotlib.pylab as plt
import numpy as np
import pandas as pd

data = pd.read_csv("D:/creditcard.csv")
# print(data.head())
#0 - 正常的样本,1 - 有问题的样本

count_classes = pd.value_counts(data['Class'], sort=True).sort_index()
count_classes.plot(kind = 'bar', alpha=0.5)
plt.title('Fraud class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
# plt.show()


from sklearn.preprocessing import StandardScaler

#fit_transform():对数据进行一个变化,变化号的列作为一个新属性添加到data
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].reshape(-1, 1))
data = data.drop(['Time', 'Amount'], axis=1)
# print(data.head())

#下采样实现
X = data.ix[:, data.columns != 'Class']
y = data.ix[:, data.columns == 'Class']
# print(X)
# print(y)

#样本的个数
number_records_fraud = len(data[data.Class == 1])
#样本的索引,所有值为1的索引
fraud_indices = np.array(data[data.Class == 1].index)
#值为0的数据的索引
normal_indices = data[data.Class == 0].index

# print(number_records_fraud)
# print(fraud_indices)
# print(normal_indices)

#使两个样本一样少
#随机选取样本中的数据,随机选择:np.random.choice(Class为0的数据索引=>样本,选择多少数据量,是否选择代替=False)
random_normal_indeics = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indeics = np.array(random_normal_indeics)

#连接Class=1的数据索引和随机选取的Class=0的数据索引
under_sample_indeice = np.concatenate([fraud_indices, random_normal_indeics])
# print(under_sample_indeice)

#下采样(选取该索引下的数据)
under_sample_data = data.iloc[under_sample_indeice, :]

X_undersample = under_sample_data.ix[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.ix[:, under_sample_data.columns == 'Class']

#Showing Ratio
# print('Percentage oif normal transactions:', len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
# print('Percentage oif fraud transactions:', len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
# print('Total number of transactions in resampled data:', len(under_sample_data))


#交叉验证, train_test_split: 切分数据
from sklearn.cross_validation import  train_test_split

#将数据集分成训练集0.7和测试集0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# print('Number transactions train dataset:', len(X_train))
# print('Number transactions test dataset:', len(X_test))
# print('Total number of transaction:', len(X_train) + len(X_test))

#对下采样数据集进行相同的操作
#只采用下采样数据集进行训练,最终用原始数据集测试
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample, y_undersample, test_size=0.3, random_state=0)
# print('Number transactions train dataset:', len(X_train_undersample))
# print('Number transactions test dataset:', len(X_test_undersample))
# print('Total number of transaction:', len(X_train_undersample) + len(X_test_undersample))


#在类不平衡问题中,用精度来衡量指标是骗人的!没有意义(1000个人,全部预测为正常, 0个癌症) 精度 =  TP / Total
# 这里使用Recall(召回率, 查全率) =  TP / TP+FN , 1000个人(990个正常,10个癌症),如果检测出0个病人  0/10=0,检测出2个病人2/10=0.2
# 检测任务上,通常用Recall作为模型的评估标准
from sklearn.linear_model import LogisticRegression
#KFold->做几倍的交叉验证
from sklearn.cross_validation import KFold
# confusion_matrix:混淆矩阵
from sklearn.metrics import confusion_matrix, recall_score


def printing_Kfold_score(x_train_data, y_train_data):
    #切分成5部分,将原始数据集进行切分
    fold = KFold(len(y_train_data), 5, shuffle=False)

    #正则化惩罚项
    #希望当前模型的泛化能力更好,不仅满足训练数据,还要在测试数据上尽可能满足
    #浮动的差异小 ====> 过拟合的风险小
    c_param_range = [0.01, 0.1, 1, 10, 100]

    results_table = pd.DataFrame(index = range(len(c_param_range), 2), columns = ['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range

    j = 0
    #查看哪一个c值比较好
    for c_param in c_param_range:
        print('------------------------------------')
        print('C paramter:', c_param)
        print('------------------------------------')
        print('')

        recall_accs = []
        for iteration, indices in enumerate(fold, start=1):
            #L1惩罚或者L2惩罚
            lr = LogisticRegression(C = c_param, penalty='l1')

            #进行训练,交叉验证数据 ---> 建立模型
            lr.fit(x_train_data.iloc[indices[0], :], y_train_data.iloc[indices[0], :].values.ravel())

            #进行预测,用交叉验证中的验证集 再进行一个验证测操作
            y_pred_undersameple = lr.predict(x_train_data.iloc[indices[1], :].values)

            #计算当前模型的recall(indices[1]:测试集)
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersameple)
            recall_accs.append(recall_acc)
            print("Iterator", iteration, ': recall score = ', recall_acc)


        results_table.ix[j, 'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('')
        print('Mean recall score', np.mean(recall_accs))
        print('')

    # print(results_table)

    # best_c = results_table.loc[results_table['Mean recall score'].idxmax()]['C_parameter']

    # Finally, we can check which C parameter is the best amongst the chosen.
    # print('*********************************************************************************')
    # print('Best model to choose from cross validation is with C parameter = ', best_c)
    # print('*********************************************************************************')
    # return best_c

printing_Kfold_score(X_train_undersample, y_train_undersample)


#混淆矩阵
def plot_confusion_matrix(cm, classes, title = 'Confusion matrix', cmap = plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

import itertools
lr = LogisticRegression(C = 0.1, penalty='l1')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample)

#计算混淆矩阵
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)

print('Recal metric in the testing dataset: ', cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title = 'Confusion matrix')
# plt.show()



lr = LogisticRegression(C = 0.1, penalty='l1')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred = lr.predict(X_test.values)

#用原数据来生成混淆矩阵
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

print('Recall metric in the testing dataset:', cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')

# plt.show()


#测试阈值对结果的影响
lr = LogisticRegression(C = 0.01, penalty='l1')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)

#设置不同的阈值
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

plt.figure(figsize=(10, 10))

j = 1
for i in thresholds:
    y_test_undersample_high_recall = y_pred_undersample_proba[:, 1] > i

    plt.subplot(3, 3, j)
    j += 1

    #计算混淆矩阵
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_undersample_high_recall)
    np.set_printoptions(precision=2)

    print('Recal metrix in the testing dataset:', cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

    class_names = [0, i]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold >= %s' %i)

# plt.show()



# #采取过采样的策略 ---- SMOTE样本生成策略
# from imblearn.over_sampling import SMOTE
# from sklearn.ensemble import RandomForestClassifier
# 
# 
# credit_cards = pd.read_csv('creditcard.csv')
# 
# columns = credit_cards.columns
# # The labels are in the last column ('Class'), Simply remove it to obtain features cloumns
# features_columns = columns.delete(len(columns) - 1)
# 
# features = credit_cards[features_columns]
# labels = credit_cards['Class']
# features_train, features_test, labels_train, labels_test = train_test_split(features,
#                                                                             labels,
#                                                                             test_size=0.2,
#                                                                             random_state=0)
# oversampler = SMOTE(random_state=0)
# os_features, os_labels = oversampler.fit_sample(features_train, labels_train)
# 
# len(os_labels[os_labels == 1])

 

posted @ 2018-03-23 13:59  SHUXU  阅读(313)  评论(0)    收藏  举报