malware_detection_ML
Identifying malware with machine learning (ML)
import pandas as pd
data = pd.read_csv("MalwareData.csv", sep='|')
print(data.columns)
legit = data[0:41323].drop('legitimate', axis=1)   # the first 41323 rows are legitimate samples
mal = data[41323::].drop('legitimate', axis=1)     # the remaining rows are malware
print("Legitimate samples: %s, features: %s" % (legit.shape[0], legit.shape[1]))
print("Malware samples: %s, features: %s" % (mal.shape[0], mal.shape[1]))
Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',
'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
'SizeOfInitializedData', 'SizeOfUninitializedData',
'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',
'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',
'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',
'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',
'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',
'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',
'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',
'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',
'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',
'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',
'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',
'ImportsNbOrdinal', 'ExportNb', 'ResourcesNb', 'ResourcesMeanEntropy',
'ResourcesMinEntropy', 'ResourcesMaxEntropy', 'ResourcesMeanSize',
'ResourcesMinSize', 'ResourcesMaxSize', 'LoadConfigurationSize',
'VersionInformationSize', 'legitimate'],
dtype='object')
Legitimate samples: 41323, features: 56
Malware samples: 96724, features: 56
data.head(5)
mal.head(5)
data['legitimate']
0 1
1 1
2 1
3 1
4 1
..
138042 0
138043 0
138044 0
138045 0
138046 0
Name: legitimate, Length: 138047, dtype: int64
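The label column confirms the split: 1 marks a legitimate file and 0 marks malware. A quick check of the class balance (a minimal sketch, reusing the data frame loaded above):

# fraction of each class; roughly 30% legitimate vs. 70% malware
print(data['legitimate'].value_counts(normalize=True))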
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
data_in = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
labels = data['legitimate'].values
extratree = ExtraTreesClassifier().fit(data_in, labels)
select = SelectFromModel(extratree, prefit=True)
data_in_new = select.transform(data_in)
print(data_in.shape, data_in_new.shape)
(138047, 54) (138047, 13)
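SelectFromModel cuts the 54 raw PE-header features down to 13. To see which columns survived, the boolean support mask can be mapped back to the column names (a sketch assuming the data and select objects defined above):

# column names in the same order as data_in, then filter by the selector's support mask
feature_names = data.drop(['Name', 'md5', 'legitimate'], axis=1).columns
print(list(feature_names[select.get_support()]))   # names of the 13 selected features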
import numpy as np
features = data_in_new.shape[1]
importances = extratree.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(features):
    # importances were computed on all 54 numeric features; offset by 2 to skip
    # the 'Name' and 'md5' columns when mapping an index back to a column name
    print('%d' % (f + 1), data.columns[2 + indices[f]], importances[indices[f]])
1 DllCharacteristics 0.13159378465829966
2 Characteristics 0.12898426863444917
3 Machine 0.10501819146671434
4 VersionInformationSize 0.059924950764671775
5 Subsystem 0.05488002624255342
6 SectionsMaxEntropy 0.05349804545740858
7 MajorSubsystemVersion 0.052116049448295995
8 ResourcesMaxEntropy 0.044890679185710124
9 ImageBase 0.04252264661828136
10 SizeOfOptionalHeader 0.04178342613704668
11 ResourcesMinEntropy 0.039008608631288154
12 SectionsMinEntropy 0.028637455162767493
13 MajorOperatingSystemVersion 0.019217124710739743
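For a tree-based estimator, SelectFromModel's default threshold is the mean feature importance, which is why exactly the features ranked above the mean are kept. A quick sanity check under that assumption:

mean_importance = importances.mean()
print(mean_importance, int((importances > mean_importance).sum()))   # expect 13 features above the mean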
from sklearn.ensemble import RandomForestClassifier
# note: despite the names, these are feature/label splits (X_train, X_test, y_train, y_test)
legit_train, legit_test, mal_train, mal_test = train_test_split(data_in_new, labels)
classifier = RandomForestClassifier(n_estimators=50)
classifier.fit(legit_train, mal_train)
RandomForestClassifier(n_estimators=50)
print("the score of the algorithm:", classifier.score(legit_test, mal_test) * 100)
the score of the algorithm: 99.4233889661567
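A single train/test split can be optimistic; cross-validation gives a more stable estimate. A minimal sketch on the selected features (5-fold is an arbitrary choice):

from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(RandomForestClassifier(n_estimators=50), data_in_new, labels, cv=5)
print("cross-validated accuracy: %.4f +/- %.4f" % (cv_scores.mean(), cv_scores.std()))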
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
def print_stats_metrics(y_test, y_pred):
    print('Accuracy: %f' % accuracy_score(y_test, y_pred))
    confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion matrix")
    print(confmat)
    print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
    print('Precision: %f' % precision_score(y_true=y_test, y_pred=y_pred, average='binary'))
    print('Recall: %f' % recall_score(y_true=y_test, y_pred=y_pred))
    print('F1-measure: %f' % f1_score(y_true=y_test, y_pred=y_pred))
predictions = classifier.predict(legit_test)
print_stats_metrics(mal_test,predictions)
Accuracy: 0.994234
confusion matrix
[[24109 111]
[ 88 10204]]
Predicted 0 1 All
True
0 24109 111 24220
1 88 10204 10292
All 24197 10315 34512
Precision: 0.989239
Recall: 0.991450
F1-measure: 0.990343
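To use the detector outside the notebook, both the fitted feature selector and the classifier need to be saved, since a new sample's 54 raw features must go through the same transform before prediction. A sketch with joblib (file names are hypothetical):

import joblib
joblib.dump(select, 'pe_feature_selector.pkl')        # hypothetical path
joblib.dump(classifier, 'rf_malware_classifier.pkl')  # hypothetical path
# later:
#   clf = joblib.load('rf_malware_classifier.pkl')
#   sel = joblib.load('pe_feature_selector.pkl')
#   clf.predict(sel.transform(raw_features))          # raw_features: array of shape (n, 54)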
from sklearn.ensemble import GradientBoostingClassifier
grad_boost = GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(legit_train, mal_train)
GradientBoostingClassifier(n_estimators=50)
predictions = grad_boost.predict(legit_test)
print_stats_metrics(mal_test, predictions)
Accuracy: 0.988497
confusion matrix
[[24039 181]
[ 216 10076]]
Predicted 0 1 All
True
0 24039 181 24220
1 216 10076 10292
All 24255 10257 34512
Precision: 0.982354
Recall: 0.979013
F1-measure: 0.980680
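Accuracy alone hides the trade-off between missed malware and false alarms; ROC AUC on the held-out split compares the two models on how well they rank samples (a sketch reusing the fitted models and split from above):

from sklearn.metrics import roc_auc_score
for name, model in [('RandomForest', classifier), ('GradientBoosting', grad_boost)]:
    probs = model.predict_proba(legit_test)[:, 1]   # probability of the 'legitimate' class (label 1)
    print(name, roc_auc_score(mal_test, probs))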