malware_detection_ML

使用ML对恶意软件进行识别

import pandas as pd

# Load the pipe-separated table of PE-header features.
data = pd.read_csv("MalwareData.csv", sep='|')

print(data.columns)

# Split on the label column (1 = legitimate, anything else = malware) instead
# of a hard-coded row position, so the split stays correct if the file is
# reordered, filtered, or extended.
is_legit = data['legitimate'] == 1
legit = data[is_legit].drop('legitimate', axis=1)
mal = data[~is_legit].drop('legitimate', axis=1)

print("合法数据有:%s 个,特征有:%s "%(legit.shape[0], legit.shape[1]))
print("恶意数据有:%s 个,特征有:%s "%(mal.shape[0], mal.shape[1]))

Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',
'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
'SizeOfInitializedData', 'SizeOfUninitializedData',
'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',
'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',
'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',
'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',
'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',
'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',
'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',
'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',
'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',
'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',
'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',
'ImportsNbOrdinal', 'ExportNb', 'ResourcesNb', 'ResourcesMeanEntropy',
'ResourcesMinEntropy', 'ResourcesMaxEntropy', 'ResourcesMeanSize',
'ResourcesMinSize', 'ResourcesMaxSize', 'LoadConfigurationSize',
'VersionInformationSize', 'legitimate'],
dtype='object')
合法数据有:41323 个,特征有:56
恶意数据有:96724 个,特征有:56

# With a negative n, head(-5) returns every row except the last five.
# The result is neither assigned nor printed.
data.head(-5)
# First five rows of the malware subset; result likewise unused.
mal.head(5)
# Label column. As the final bare expression of the cell, this is the value
# a notebook would display.
data['legitimate']

0 1
1 1
2 1
3 1
4 1
..
138042 0
138043 0
138044 0
138045 0
138046 0
Name: legitimate, Length: 138047, dtype: int64

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
# Feature matrix: every column except the two identifier columns and the label.
feature_frame = data.drop(columns=['Name', 'md5', 'legitimate'])
data_in = feature_frame.values
labels = data['legitimate'].values

# Fit an extra-trees ensemble and let SelectFromModel keep the features it
# ranks as important (prefit=True: reuse the already-fitted estimator).
# NOTE(review): the ensemble is fitted on ALL rows, before the train/test
# split made later in the file, so the selection has seen the test rows.
extratree = ExtraTreesClassifier()
extratree.fit(data_in, labels)
select = SelectFromModel(extratree, prefit=True)
data_in_new = select.transform(data_in)

# Shapes before and after selection.
print(data_in.shape, data_in_new.shape)

(138047, 54) (138047, 13)

import numpy as np
# Number of columns that survived selection; only that many rows are printed.
features = data_in_new.shape[1]
importances = extratree.feature_importances_
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# The feature matrix was built without the two leading columns of `data`
# ('Name', 'md5'), so feature position i corresponds to data.columns[i + 2].
for rank, col in enumerate(indices[:features], start=1):
    print('%d' % rank, data.columns[2 + col], importances[col])

1 DllCharacteristics 0.13159378465829966
2 Characteristics 0.12898426863444917
3 Machine 0.10501819146671434
4 VersionInformationSize 0.059924950764671775
5 Subsystem 0.05488002624255342
6 SectionsMaxEntropy 0.05349804545740858
7 MajorSubsystemVersion 0.052116049448295995
8 ResourcesMaxEntropy 0.044890679185710124
9 ImageBase 0.04252264661828136
10 SizeOfOptionalHeader 0.04178342613704668
11 ResourcesMinEntropy 0.039008608631288154
12 SectionsMinEntropy 0.028637455162767493
13 MajorOperatingSystemVersion 0.019217124710739743

from sklearn.ensemble import RandomForestClassifier
# train_test_split returns (X_train, X_test, y_train, y_test). Despite their
# names, legit_train / legit_test hold feature rows of BOTH classes, and
# mal_train / mal_test hold the matching labels.
# No random_state is passed here or to the forest, so results vary per run.
legit_train, legit_test, mal_train, mal_test = train_test_split(data_in_new, labels)
classifier = RandomForestClassifier(n_estimators=50)
# Fit on the training features (legit_train) and training labels (mal_train).
classifier.fit(legit_train,mal_train)
RandomForestClassifier(n_estimators=50)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Score the forest on the held-out split (legit_test = features,
# mal_test = labels) and scale the result to a percentage.
print("the score of the algorithm:", classifier.score(legit_test, mal_test) * 100)

the score of the algorithm: 99.4233889661567

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

def print_stats_metrics(y_test, y_pred):
    """Print a classification report for one set of predictions.

    Writes, in order: accuracy, the confusion matrix, a pandas crosstab
    of true vs. predicted labels (with margins), precision, recall and F1.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. Everything is written to stdout.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: %f' % accuracy)

    confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion matrix")
    print(confmat)

    # Same counts as the confusion matrix, but with labelled axes and totals.
    contingency = pd.crosstab(
        y_test,
        y_pred,
        rownames=['True'],
        colnames=['Predicted'],
        margins=True,
    )
    print(contingency)

    precision = precision_score(y_true=y_test, y_pred=y_pred, average='binary')
    print('Precision: %f' % precision)

    recall = recall_score(y_true=y_test, y_pred=y_pred)
    print('Recall: %f' % recall)

    f1 = f1_score(y_true=y_test, y_pred=y_pred)
    print('F1-measure: %f' % f1)

# Predict on the held-out feature rows (legit_test) and report metrics
# against the held-out labels (mal_test).
predictions = classifier.predict(legit_test)
print_stats_metrics(mal_test,predictions)

Accuracy: 0.994234
confusion matrix
[[24109 111]
[ 88 10204]]
Predicted 0 1 All
True
0 24109 111 24220
1 88 10204 10292
All 24197 10315 34512
Precision: 0.989239
Recall: 0.991450
F1-measure: 0.990343

from sklearn.ensemble import GradientBoostingClassifier

# Second model, fitted on the same training features (legit_train) and
# training labels (mal_train) as the random forest.
grad_boost = GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(legit_train, mal_train)

GradientBoostingClassifier(n_estimators=50)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Overwrites the earlier `predictions` with the gradient-boosting output on
# the same held-out features, then prints the same metrics report.
predictions = grad_boost.predict(legit_test)
print_stats_metrics(mal_test, predictions)

Accuracy: 0.988497
confusion matrix
[[24039 181]
[ 216 10076]]
Predicted 0 1 All
True
0 24039 181 24220
1 216 10076 10292
All 24255 10257 34512
Precision: 0.982354
Recall: 0.979013
F1-measure: 0.980680


posted @ 2024-03-20 14:41  crabin88  阅读(34)  评论(0)    收藏  举报