malware_detection_ML

使用ML对恶意软件进行识别

import pandas as pd

# Load the PE-header feature table; the file is pipe-delimited, not comma-delimited.
data = pd.read_csv("MalwareData.csv", sep='|')

print(data.columns)

# Split the samples by their label rather than by hard-coded row offsets, so the
# split stays correct even if the CSV is re-ordered, shuffled or extended.
# 'legitimate' is 1 for benign files; every other row is treated as malware.
is_legit = data['legitimate'] == 1
legit = data[is_legit].drop('legitimate', axis=1)
mal = data[~is_legit].drop('legitimate', axis=1)

# Report "<row count>, <column count>" for the legitimate and the malicious subset.
print("合法数据有：%s 个，特征有：%s "%(legit.shape[0], legit.shape[1]))
print("恶意数据有：%s 个，特征有：%s "%(mal.shape[0], mal.shape[1]))

Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',
       'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
       'SizeOfInitializedData', 'SizeOfUninitializedData',
       'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',
       'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',
       'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',
       'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',
       'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',
       'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',
       'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',
       'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',
       'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',
       'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',
       'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',
       'ImportsNbOrdinal', 'ExportNb', 'ResourcesNb', 'ResourcesMeanEntropy',
       'ResourcesMinEntropy', 'ResourcesMaxEntropy', 'ResourcesMeanSize',
       'ResourcesMinSize', 'ResourcesMaxSize', 'LoadConfigurationSize',
       'VersionInformationSize', 'legitimate'],
      dtype='object')
合法数据有：41323 个，特征有：56 
恶意数据有：96724 个，特征有：56

# Notebook preview of the table. With a negative n, head() returns every row
# except the last |n| rows (here: all but the final 5).
data.head(-5)

# First five rows of the malware subset.
mal.head(5)

# The label column on its own.
data['legitimate']

0         1
1         1
2         1
3         1
4         1
         ..
138042    0
138043    0
138044    0
138045    0
138046    0
Name: legitimate, Length: 138047, dtype: int64

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Model input: every column except the two identifier columns and the label.
data_in = data.drop(columns=['Name', 'md5', 'legitimate']).values
labels = data['legitimate'].values

# Fit an extra-trees ensemble purely to obtain per-feature importances.
# NOTE(review): the ensemble is fitted on ALL rows, before the train/test split
# that happens later, so held-out rows influence which features are kept.
# Consider fitting the selector on the training portion only.
extratree = ExtraTreesClassifier()
extratree.fit(data_in, labels)

# prefit=True makes SelectFromModel reuse the already-fitted ensemble;
# transform() then keeps only the columns it selects.
select = SelectFromModel(extratree, prefit=True)
data_in_new = select.transform(data_in)

# Shapes before and after selection.
print(data_in.shape, data_in_new.shape)

(138047, 54) (138047, 13)

import numpy as np

# Column names in exactly the order of the columns of data_in. Deriving them from
# the same drop() avoids a positional "+2" offset, which would mislabel features
# if the dropped columns were not the first two columns of the table.
feature_names = data.drop(['Name', 'md5', 'legitimate'], axis=1).columns

features = data_in_new.shape[1]              # number of features kept by the selector
importances = extratree.feature_importances_
indices = np.argsort(importances)[::-1]      # feature indices, most important first

# Print "<rank> <feature name> <importance>" for the top `features` entries.
for rank, idx in enumerate(indices[:features], start=1):
    print('%d' % rank, feature_names[idx], importances[idx])

1 DllCharacteristics 0.13159378465829966
2 Characteristics 0.12898426863444917
3 Machine 0.10501819146671434
4 VersionInformationSize 0.059924950764671775
5 Subsystem 0.05488002624255342
6 SectionsMaxEntropy 0.05349804545740858
7 MajorSubsystemVersion 0.052116049448295995
8 ResourcesMaxEntropy 0.044890679185710124
9 ImageBase 0.04252264661828136
10 SizeOfOptionalHeader 0.04178342613704668
11 ResourcesMinEntropy 0.039008608631288154
12 SectionsMinEntropy 0.028637455162767493
13 MajorOperatingSystemVersion 0.019217124710739743

from sklearn.ensemble import RandomForestClassifier

# NOTE: despite their names, these are not "legit"/"malware" subsets.
# train_test_split returns (X_train, X_test, y_train, y_test), so:
#   legit_train / legit_test -> feature rows of the training / held-out split
#   mal_train   / mal_test   -> the matching labels
# The names are kept because later cells refer to them.
#
# stratify=labels keeps the legitimate/malware ratio the same in both splits
# (the classes are imbalanced); random_state makes the split repeatable.
legit_train, legit_test, mal_train, mal_test = train_test_split(
    data_in_new, labels, stratify=labels, random_state=42)
classifier = RandomForestClassifier(n_estimators=50)
classifier.fit(legit_train, mal_train)

RandomForestClassifier(n_estimators=50)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

# score() on the held-out features/labels, scaled from a fraction to a percentage.
score_percent = classifier.score(legit_test, mal_test) * 100
print("the score of the algorithm:", score_percent)

the score of the algorithm: 99.4233889661567

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

def print_stats_metrics(y_test, y_pred):
    """Print a short evaluation report for a binary classifier.

    Printed in order: accuracy, the confusion matrix (as a raw array and as
    a pandas cross-tabulation with row/column totals), precision, recall
    and F1.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned element-wise with ``y_test``.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: %f' % accuracy)

    matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion matrix")
    print(matrix)

    # Same counts again, this time labelled and with marginal totals ("All").
    labelled = pd.crosstab(
        y_test,
        y_pred,
        rownames=['True'],
        colnames=['Predicted'],
        margins=True,
    )
    print(labelled)

    precision = precision_score(y_true=y_test, y_pred=y_pred, average='binary')
    print('Precision: %f' % precision)

    recall = recall_score(y_true=y_test, y_pred=y_pred)
    print('Recall: %f' % recall)

    f1 = f1_score(y_true=y_test, y_pred=y_pred)
    print('F1-measure: %f' % f1)

# Predict on the held-out feature rows. `legit_test` is the X_test part and
# `mal_test` the y_test part of the earlier train_test_split, despite the names.
predictions = classifier.predict(legit_test)
# Report accuracy, confusion matrix, precision, recall and F1 for the random forest.
print_stats_metrics(mal_test,predictions)

Accuracy: 0.994234
confusion matrix
[[24109   111]
 [   88 10204]]
Predicted      0      1    All
True                          
0          24109    111  24220
1             88  10204  10292
All        24197  10315  34512
Precision: 0.989239
Recall: 0.991450
F1-measure: 0.990343

from sklearn.ensemble import GradientBoostingClassifier

# Second model for comparison: gradient boosting with the same number of
# estimators (50) as the random forest, trained on the same split.
# `legit_train` / `mal_train` are the training features / labels.
grad_boost = GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(legit_train, mal_train)

GradientBoostingClassifier(n_estimators=50)

# Predict on the same held-out rows (`legit_test` = features, `mal_test` = labels)
# and print the same report, now for the gradient-boosting model.
predictions = grad_boost.predict(legit_test)
print_stats_metrics(mal_test, predictions)

Accuracy: 0.988497
confusion matrix
[[24039   181]
 [  216 10076]]
Predicted      0      1    All
True                          
0          24039    181  24220
1            216  10076  10292
All        24255  10257  34512
Precision: 0.982354
Recall: 0.979013
F1-measure: 0.980680

posted @ 2024-03-20 14:41 crabin88 阅读(13) 评论(0) 编辑收藏举报

刷新页面返回顶部

crabin

malware_detection_ML

使用ML对恶意软件进行识别

公告