SMOTE + RF/MLP demo: using cross_val_score to find the best hyperparameters. Demo code for handling imbalanced data: first oversample with SMOTE, then use cross-validation to find the best model parameters. In practice the MLP worked best.
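Before the full script, here is a minimal sketch of the core recipe on synthetic stand-in data (everything in it is illustrative, not the production feature set): scale, oversample with SMOTE, and let cross_val_score compare candidate hidden-layer sizes. Using imblearn's Pipeline keeps the SMOTE step inside each CV training fold, so the held-out fold is never resampled.

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# synthetic imbalanced stand-in for the real flow features
X, y = make_classification(n_samples=5000, n_features=20,
                           weights=[0.95, 0.05], random_state=0)
for hidden in (128, 64, 32):
    pipe = Pipeline([
        ("scale", StandardScaler()),       # SMOTE uses KNN, so scale first
        ("smote", SMOTE(random_state=0)),  # resamples only the training folds
        ("mlp", MLPClassifier(hidden_layer_sizes=(hidden,), max_iter=500,
                              random_state=0)),
    ])
    scores = cross_val_score(pipe, X, y, cv=5, scoring="recall")
    print("hidden_layer_sizes=%d recall=%.3f" % (hidden, scores.mean()))

The full demo script: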
# -*- coding: UTF-8 -*-
from io import StringIO  # was sklearn.externals.six.StringIO (removed in modern sklearn)
from sklearn import tree
import pydot
import sklearn
import numpy as np
import sys
import pickle
import os
from sklearn.model_selection import train_test_split  # was sklearn.cross_validation (removed)
import sklearn.ensemble
from sklearn.model_selection import cross_val_score
# from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
import pdb
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
import collections
import imblearn


def iterbrowse(path):
    # Recursively yield every file path under the given directory.
    for home, dirs, files in os.walk(path):
        for filename in files:
            yield os.path.join(home, filename)


def get_data(filename):
    # Parse a tab-separated feature file: each row must have 78 fields,
    # of which the first 3 are metadata and the rest are numeric features.
    white_verify = []
    with open(filename) as f:
        for line in f:
            a = line.split("\t")
            if len(a) != 78:
                print(line)
                raise Exception("unexpected field count: %d" % len(a))
            white_verify.append([float(n) for n in a[3:]])
    return white_verify


def show_cm(cm, labels):
    # Display test results: per-cell percentages of the confusion matrix.
    percent = cm * 100.0 / cm.sum(axis=1, keepdims=True)
    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, percent[i][j],
                                             cm[i][j], cm[i].sum()))


def save_model_to_disk(name, model, model_dir='.'):
    serialized_model = pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL)
    model_path = os.path.join(model_dir, name + '.model')
    print('Storing Serialized Model to Disk (%s:%.2fMeg)' %
          (name, len(serialized_model) / 1024.0 / 1024.0))
    with open(model_path, 'wb') as f:
        f.write(serialized_model)


wanted_feature = {
    15,  # forward header histogram median                           -----H
    12,  # forward header histogram min                              -----H
    14,  # forward header histogram mean                             -----H
    13,  # forward header histogram max                              -----H
    16,  # forward header histogram std                              -----H
    52,  # backward header histogram, number of distinct length types -----M
    51,  # backward header histogram mean                            -----H
    47,  # backward header histogram min                             -----H
    48,  # backward header histogram max                             -----H
    49,  # backward header histogram mean                            -----H
    50,  # backward header histogram mean                            -----H
    23,  # forward payload histogram max                             -----H
    24,  # forward payload histogram mean                            -----H
    25,  # forward payload histogram median                          -----H
    26,  # forward payload histogram std                             -----H
    17,  # forward header histogram, number of distinct length types -----H
    46,  # backward packet inter-arrival time (time / packet count)  -----H
    28,  # forward payload histogram, count < 128 bytes              -----H
    29,  # forward payload histogram, count >= 128 and < 512 bytes   -----H
    30,  # forward payload histogram, count >= 512 and < 1024 bytes  -----H
    31,  # forward payload histogram, count > 1024 bytes             -----H
    57,  # backward payload histogram min                            -----H
    60,  # backward payload histogram median                         -----H
    59,  # backward payload histogram mean                           -----H
    61,  # backward payload histogram std                            -----H
    58,  # backward payload histogram max                            -----H
    42,  # backward packet count of the current flow
    21,  # forward header histogram, count >= 40 bytes               -----H
    56,  # backward header histogram, count >= 40 bytes              -----H
    65,  # backward payload histogram, count > 1024 bytes            -----H
    63,  # backward payload histogram, count < 128 bytes             -----H
    64,  # backward payload histogram, count >= 128 and < 512 bytes  -----H
    66,  # backward payload histogram, count >= 512 and < 1024 bytes -----H
}
unwanted_features = {6, 7, 8, 41, 42, 43, 67, 68, 69, 70, 71, 72, 73, 74, 75}


def get_wanted_data(x):
    # Drop the unwanted feature columns (column indices are offset by 6).
    ans = []
    for item in x:
        # row = [data for i, data in enumerate(item) if i + 6 in wanted_feature]
        row = [data for i, data in enumerate(item) if i + 6 not in unwanted_features]
        ans.append(row)
        # assert len(row) == len(wanted_feature)
        assert len(row) == len(x[0]) - len(unwanted_features)
    return ans


if __name__ == '__main__':
    # pdb.set_trace()
    neg_file = "cc_data/black/black_all.txt"
    pos_file = "cc_data/white/white_all.txt"
    X = []
    y = []
    if os.path.isfile(pos_file):
        if pos_file.endswith('.txt'):
            pos_set = np.genfromtxt(pos_file)
        elif pos_file.endswith('.npy'):
            pos_set = np.load(pos_file)
        X.extend(pos_set)
        y += [0] * len(pos_set)
    print("len of white X:", len(X))
    l = len(X)
    if os.path.isfile(neg_file):
        if neg_file.endswith('.txt'):
            neg_set = np.genfromtxt(neg_file)
        elif neg_file.endswith('.npy'):
            neg_set = np.load(neg_file)
        # X.extend(list(neg_set) * 5)
        # y += [1] * (5 * len(neg_set))
        X.extend(neg_set)
        y += [1] * len(neg_set)
    print("len of black X:", len(X) - l)
    print("len of X:", len(X))
    print("X sample:", X[:3])
    print("len of y:", len(y))
    print("y sample:", y[:3])

    # Drop the 3 metadata columns, then the unwanted feature columns.
    X = [x[3:] for x in X]
    X = get_wanted_data(X)
    print("filtered X sample:", X[:1])

    # Samples that MUST be detected as malicious.
    black_verify = []
    for f in iterbrowse("todo/top"):
        print(f)
        black_verify += get_data(f)
    # ValueError: operands could not be broadcast together with shapes (1,74) (75,) (1,74)
    black_verify = get_wanted_data(black_verify)
    print(black_verify)
    black_verify_labels = [1] * len(black_verify)

    white_verify = get_data("todo/white_verify.txt")
    white_verify = get_wanted_data(white_verify)
    print(white_verify)
    white_verify_labels = [0] * len(white_verify)

    unknown_verify = get_data("todo/pek_feature74.txt")
    unknown_verify = get_wanted_data(unknown_verify)
    print(unknown_verify)

    black_verify2 = get_data("todo/x_rat.txt")
    black_verify2 = get_wanted_data(black_verify2)
    print(black_verify2)
    black_verify_labels2 = [1] * len(black_verify2)

    # SMOTE uses KNN internally, so standardize the features first.
    scaler = preprocessing.StandardScaler().fit(X)
    # scaler = preprocessing.MinMaxScaler().fit(X)
    X = scaler.transform(X)
    print("standard X sample:", X[:3])
    black_verify = scaler.transform(black_verify)
    print(black_verify)
    white_verify = scaler.transform(white_verify)
    print(white_verify)
    unknown_verify = scaler.transform(unknown_verify)
    print(unknown_verify)
    black_verify2 = scaler.transform(black_verify2)
    print(black_verify2)
    # ValueError: operands could not be broadcast together with shapes (756140,75) (42,75) (756140,75)

    for i in range(200):
        # Upweight the must-detect samples: there are only 10+ of them,
        # so replicate them 200x.
        X = np.concatenate((X, black_verify))
        y += black_verify_labels
    y = np.array(y)

    labels = ['white', 'CC']
    # if True:
    for depth in (128, 64, 32):
        print("***" * 20)
        print("hidden_layer_sizes=>", depth)
        sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
        # (only the last of the 5 stratified splits is kept)
        # ratio_of_train = 0.8
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - ratio_of_train))

        print("smote before:")
        print(sorted(collections.Counter(y_train).items()))
        print(sorted(collections.Counter(y_test).items()))
        from imblearn.over_sampling import SMOTE
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)  # fit_sample in old imblearn
        print("smote after:")
        print(sorted(collections.Counter(y_train).items()))
        X_test2, y_test2 = SMOTE().fit_resample(X_test, y_test)
        # X_train = preprocessing.normalize(X_train)
        # X_test = preprocessing.normalize(X_test)

        """
        Alternative classifiers that were tried:
        from sklearn.linear_model import LogisticRegression
        clf = LogisticRegression(C=0.1, penalty='l2', tol=0.01)
        import xgboost as xgb
        clf = xgb.XGBClassifier(learning_rate=0.1, n_estimators=50, max_depth=6,
                                objective='binary:logistic', nthread=40,
                                scale_pos_weight=0.02, seed=666)
        clf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, n_jobs=10,
                                                      max_depth=3, random_state=666,
                                                      oob_score=True)
        """
        clf = MLPClassifier(batch_size=128, learning_rate='adaptive', max_iter=1024,
                            hidden_layer_sizes=(depth,), random_state=666)
        clf.fit(X_train, y_train)

        print("test confusion_matrix:")
        # print(clf.feature_importances_)
        y_pred = clf.predict(X_test)
        print(sklearn.metrics.confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))

        print("test confusion_matrix (SMOTE):")
        y_pred2 = clf.predict(X_test2)
        print(sklearn.metrics.confusion_matrix(y_test2, y_pred2))
        print(classification_report(y_test2, y_pred2))

        print("all confusion_matrix:")
        y_pred = clf.predict(X)
        print(sklearn.metrics.confusion_matrix(y, y_pred))
        print(classification_report(y, y_pred))

        print("black verify confusion_matrix:")
        black_verify_pred = clf.predict(black_verify)
        print(black_verify_pred)
        print(classification_report(black_verify_labels, black_verify_pred))

        print("black verify2 confusion_matrix:")
        black_verify_pred2 = clf.predict(black_verify2)
        print(black_verify_pred2)
        print(classification_report(black_verify_labels2, black_verify_pred2))

        print("white verify confusion_matrix:")
        white_verify_pred = clf.predict(white_verify)
        print(white_verify_pred)
        print(classification_report(white_verify_labels, white_verify_pred))

        print("unknown_verify:")
        print(clf.predict(unknown_verify))
        print("hidden_layer_sizes=>", depth)
        print("***" * 20)

    # Alternative path (originally the else-branch of the commented-out
    # "if True:" above): evaluate a previously saved model instead of training.
    # clf = pickle.loads(open("mpl-acc97-recall98.pkl", 'rb').read())
    # clf = pickle.loads(open("mlp-add-topx10.model", 'rb').read())
    # y_pred = clf.predict(X)
    # print(sklearn.metrics.confusion_matrix(y, y_pred))
    # print(classification_report(y, y_pred))
    # sys.exit(0)

    """
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("iris.pdf")
    """
    model_name = "rf_smote"
    save_model_to_disk(model_name, clf)
    # print(clf.oob_score_)
    scores = cross_val_score(clf, X, y, cv=5)
    print("scores:")
    print(scores)
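For completeness, a short sketch of loading the serialized model back, the mirror image of save_model_to_disk above; new_rows is a hypothetical array of already column-filtered feature rows:

import pickle

with open("rf_smote.model", "rb") as f:  # path produced by save_model_to_disk
    clf = pickle.load(f)
# new_rows must go through the same preprocessing as the training data,
# i.e. the StandardScaler fitted in the script:
# print(clf.predict(scaler.transform(new_rows)))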
Experiment results:
MLP, hidden layer with 128 neurons:
test confusion_matrix (SMOTE): # confusion matrix on the test data
[[131946 120]
[ 299 131767]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 132066
avg / total 1.00 1.00 1.00 264132
all confusion_matrix: # confusion matrix on the full dataset
[[659846 483]
[ 52 32474]]
precision recall f1-score support
0 1.00 1.00 1.00 660329
1 0.99 1.00 0.99 32526
avg / total 1.00 1.00 1.00 692855
black verify confusion_matrix: # samples that must all be detected; OK, all of them were
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1]
precision recall f1-score support
1 1.00 1.00 1.00 42
avg / total 1.00 1.00 1.00 42
black verify2 confusion_matrix: # production traffic known to be malicious, very hard to separate
[0 0 0 0 0 0 0 1 1 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.36 0.53 11
avg / total 1.00 0.36 0.53 11
white verify confusion_matrix: # production traffic known to be benign, very hard to separate
[1 1 1 1 0 0 0]
precision recall f1-score support
0 1.00 0.43 0.60 7
1 0.00 0.00 0.00 0
avg / total 1.00 0.43 0.60 7
unknown_verify: # collected from production; quite a few are malicious; we want a high detection rate, but not an excessive one
[1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1
0 1 1 1 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1]
The detection rate on this production sample looks decent.
Hidden layer size 64:
************************************************************
hidden_layer_sizes=> 64
smote before:
[(0, 528263), (1, 26021)]
[(0, 132066), (1, 6505)]
smote after:
[(0, 528263), (1, 528263)]
test confusion_matrix:
[[131912 154]
[ 24 6481]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 0.98 1.00 0.99 6505
avg / total 1.00 1.00 1.00 138571
test confusion_matrix (SMOTE):
[[131912 154]
[ 193 131873]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 132066
avg / total 1.00 1.00 1.00 264132
all confusion_matrix:
[[659566 763]
[ 34 32492]]
precision recall f1-score support
0 1.00 1.00 1.00 660329
1 0.98 1.00 0.99 32526
avg / total 1.00 1.00 1.00 692855
black verify confusion_matrix:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1]
precision recall f1-score support
1 1.00 1.00 1.00 42
avg / total 1.00 1.00 1.00 42
black verify2 confusion_matrix:
[0 0 0 0 0 0 0 1 1 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.36 0.53 11
avg / total 1.00 0.36 0.53 11
white verify confusion_matrix:
[1 1 0 1 0 0 0]
precision recall f1-score support
0 1.00 0.57 0.73 7
1 0.00 0.00 0.00 0
avg / total 1.00 0.57 0.73 7
unknown_verify:
[1 0 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 0 0 1 0 1 1 0 1
0 0 1 1 1 0 0 1 1 1 1 0 0 1 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1]
Also looks decent!
Now let's look at the random forest: depth=15, 100 trees.
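The classifier for these runs is presumably the commented-out RandomForestClassifier from the script, with max_depth raised to 15 (then 14 and 13 below); a sketch of the assumed configuration:

import sklearn.ensemble

# assumed settings: depth and tree count from the text, remaining arguments
# from the commented-out line in the script above
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_depth=15,
                                              n_jobs=10, random_state=666,
                                              oob_score=True)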
test confusion_matrix:
[[132045 21]
[ 16 4818]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 4834
avg / total 1.00 1.00 1.00 136900
test confusion_matrix (SMOTE):
[[132045 21]
[ 246 131820]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 132066
avg / total 1.00 1.00 1.00 264132
all confusion_matrix:
[[660227 102]
[ 29 24139]]
precision recall f1-score support
0 1.00 1.00 1.00 660329
1 1.00 1.00 1.00 24168
avg / total 1.00 1.00 1.00 684497
black verify confusion_matrix:
[0 1 0 0 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1
1 1 1 1 1] # these must all be detected
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.67 0.80 42
avg / total 1.00 0.67 0.80 42
white verify confusion_matrix:
[0 0 0 0 0 0 1]
precision recall f1-score support
0 1.00 0.86 0.92 7
1 0.00 0.00 0.00 0
avg / total 1.00 0.86 0.92 7
unknown_verify: # detection on production traffic is far too low! Overfitting is quite severe...
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
A run with depth=14:
test confusion_matrix:
[[132038 28]
[ 16 4818]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 0.99 1.00 1.00 4834
avg / total 1.00 1.00 1.00 136900
test confusion_matrix (SMOTE):
[[132038 28]
[ 257 131809]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 132066
avg / total 1.00 1.00 1.00 264132
all confusion_matrix:
[[660220 109]
[ 34 24134]]
precision recall f1-score support
0 1.00 1.00 1.00 660329
1 1.00 1.00 1.00 24168
avg / total 1.00 1.00 1.00 684497
black verify confusion_matrix:
[1 1 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1
1 1 1 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.64 0.78 42
avg / total 1.00 0.64 0.78 42
white verify confusion_matrix:
[0 0 0 0 0 1 1]
precision recall f1-score support
0 1.00 0.71 0.83 7
1 0.00 0.00 0.00 0
avg / total 1.00 0.71 0.83 7
unknown_verify:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
Slightly better.
And with depth=13:
test confusion_matrix (SMOTE):
[[132037 29]
[ 301 131765]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 132066
avg / total 1.00 1.00 1.00 264132
all confusion_matrix:
[[660217 112]
[ 36 24132]]
precision recall f1-score support
0 1.00 1.00 1.00 660329
1 1.00 1.00 1.00 24168
avg / total 1.00 1.00 1.00 684497
black verify confusion_matrix:
[0 1 0 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 1 1 1 1
0 1 1 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.55 0.71 42
avg / total 1.00 0.55 0.71 42
white verify confusion_matrix:
[0 0 0 0 0 1 1]
precision recall f1-score support
0 1.00 0.71 0.83 7
1 0.00 0.00 0.00 0
avg / total 1.00 0.71 0.83 7
unknown_verify:
[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
About the same; tuning depth further makes little difference.
Overall, the random forest is not as good as the MLP!
Now let's look at logistic regression:
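These numbers presumably come from the commented-out LogisticRegression line in the script:

from sklearn.linear_model import LogisticRegression

# configuration as in the commented-out line of the script above
clf = LogisticRegression(C=0.1, penalty='l2', tol=0.01)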
test confusion_matrix (SMOTE):
[[114699 17367]
[ 11921 120145]]
precision recall f1-score support
0 0.91 0.87 0.89 132066
1 0.87 0.91 0.89 132066
avg / total 0.89 0.89 0.89 264132
all confusion_matrix:
[[573083 87246]
[ 2877 29649]]
precision recall f1-score support
0 1.00 0.87 0.93 660329
1 0.25 0.91 0.40 32526
avg / total 0.96 0.87 0.90 692855
black verify confusion_matrix:
[1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0
1 1 0 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.88 0.94 42
avg / total 1.00 0.88 0.94 42
black verify2 confusion_matrix:
[1 1 0 0 0 0 0 1 1 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.55 0.71 11
avg / total 1.00 0.55 0.71 11
white verify confusion_matrix:
[1 1 1 1 1 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 7
1 0.00 0.00 0.00 0
avg / total 0.00 0.00 0.00 7
unknown_verify:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Overall precision is not good enough: only 0.25 for the positive class (29649 / (29649 + 87246)).
Now let's look at xgboost:
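Presumably the commented-out XGBClassifier line in the script (nthread and seed are the old xgboost parameter names; recent releases use n_jobs and random_state):

import xgboost as xgb

# configuration as in the commented-out line of the script above
clf = xgb.XGBClassifier(learning_rate=0.1, n_estimators=50, max_depth=6,
                        objective='binary:logistic', nthread=40,
                        scale_pos_weight=0.02, seed=666)

test confusion_matrix: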
[[132018 48]
[ 11 6494]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 0.99 1.00 1.00 6505
avg / total 1.00 1.00 1.00 138571
test confusion_matrix (SMOTE):
[[132018 48]
[ 82 131984]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 132066
avg / total 1.00 1.00 1.00 264132
all confusion_matrix:
[[660134 195]
[ 29 32497]]
precision recall f1-score support
0 1.00 1.00 1.00 660329
1 0.99 1.00 1.00 32526
avg / total 1.00 1.00 1.00 692855
black verify confusion_matrix:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1]
precision recall f1-score support
1 1.00 1.00 1.00 42
avg / total 1.00 1.00 1.00 42
black verify2 confusion_matrix:
[0 0 0 0 0 0 0 1 0 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.27 0.43 11
avg / total 1.00 0.27 0.43 11
white verify confusion_matrix:
[0 0 1 0 1 0 1]
precision recall f1-score support
0 1.00 0.57 0.73 7
1 0.00 0.00 0.00 0
avg / total 1.00 0.57 0.73 7
unknown_verify:
[0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0
0 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0]
Overall, xgboost looks better than the random forest!