malware_detection_ML

Identifying malware with machine learning

import pandas as pd

data = pd.read_csv("MalwareData.csv", sep='|')

print(data.columns)

# The first 41323 rows of the dataset are legitimate samples; the rest are malicious.
legit = data[0:41323].drop('legitimate', axis=1)
mal = data[41323:].drop('legitimate', axis=1)

print("Legitimate samples: %s, features: %s" % (legit.shape[0], legit.shape[1]))
print("Malicious samples: %s, features: %s" % (mal.shape[0], mal.shape[1]))

Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',
       'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
       'SizeOfInitializedData', 'SizeOfUninitializedData',
       'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',
       'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',
       'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',
       'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',
       'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',
       'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',
       'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',
       'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',
       'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',
       'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',
       'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',
       'ImportsNbOrdinal', 'ExportNb', 'ResourcesNb', 'ResourcesMeanEntropy',
       'ResourcesMinEntropy', 'ResourcesMaxEntropy', 'ResourcesMeanSize',
       'ResourcesMinSize', 'ResourcesMaxSize', 'LoadConfigurationSize',
       'VersionInformationSize', 'legitimate'],
      dtype='object')
Legitimate samples: 41323, features: 56
Malicious samples: 96724, features: 56
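Note that the slice above relies on the first 41323 rows being the legitimate samples. If the CSV were ever reordered, filtering on the 'legitimate' column itself would be more robust; a minimal sketch on the same DataFrame:

legit = data[data['legitimate'] == 1].drop('legitimate', axis=1)  # label 1 = legitimate
mal = data[data['legitimate'] == 0].drop('legitimate', axis=1)    # label 0 = malicious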
data.head(5)        # preview the first few rows
mal.head(5)         # preview the malicious samples
data['legitimate']  # label column: 1 = legitimate, 0 = malicious
0         1
1         1
2         1
3         1
4         1
         ..
138042    0
138043    0
138044    0
138045    0
138046    0
Name: legitimate, Length: 138047, dtype: int64
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Drop the identifier columns ('Name', 'md5') and the label, fit an ExtraTrees
# model, and keep only the features whose importance exceeds the default (mean) threshold.
data_in = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
labels = data['legitimate'].values
extratree = ExtraTreesClassifier().fit(data_in, labels)
select = SelectFromModel(extratree, prefit=True)
data_in_new = select.transform(data_in)
print(data_in.shape, data_in_new.shape)
(138047, 54) (138047, 13)
import numpy as np

features = data_in_new.shape[1]
importances = extratree.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the top features by importance; the +2 offset skips the dropped
# 'Name' and 'md5' columns so each index maps back to its column name.
for f in range(features):
    print('%d' % (f + 1), data.columns[2 + indices[f]], importances[indices[f]])
1 DllCharacteristics 0.13159378465829966
2 Characteristics 0.12898426863444917
3 Machine 0.10501819146671434
4 VersionInformationSize 0.059924950764671775
5 Subsystem 0.05488002624255342
6 SectionsMaxEntropy 0.05349804545740858
7 MajorSubsystemVersion 0.052116049448295995
8 ResourcesMaxEntropy 0.044890679185710124
9 ImageBase 0.04252264661828136
10 SizeOfOptionalHeader 0.04178342613704668
11 ResourcesMinEntropy 0.039008608631288154
12 SectionsMinEntropy 0.028637455162767493
13 MajorOperatingSystemVersion 0.019217124710739743
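As a cross-check, the columns actually retained by SelectFromModel can be listed directly from its boolean support mask; a minimal sketch reusing the select object and the same feature columns:

feature_cols = data.drop(['Name', 'md5', 'legitimate'], axis=1).columns
print(list(feature_cols[select.get_support()]))  # names of the 13 selected features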
from sklearn.ensemble import RandomForestClassifier

# Split the selected features and labels into training and test sets
# (train_test_split defaults to a 75%/25% split).
X_train, X_test, y_train, y_test = train_test_split(data_in_new, labels)
classifier = RandomForestClassifier(n_estimators=50)
classifier.fit(X_train, y_train)
RandomForestClassifier(n_estimators=50)
print("the score of the algorithm:", classifier.score(legit_test, mal_test) * 100)
the score of the algorithm: 99.4233889661567
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

def print_stats_metrics(y_test, y_pred):
    # Report accuracy, the confusion matrix (as an array and as a cross-tab),
    # and precision/recall/F1 for the positive (legitimate = 1) class.
    print('Accuracy: %f' % accuracy_score(y_test, y_pred))
    confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion matrix")
    print(confmat)
    print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
    print('Precision: %f' % precision_score(y_true=y_test, y_pred=y_pred, average='binary'))
    print('Recall: %f' % recall_score(y_true=y_test, y_pred=y_pred))
    print('F1-measure: %f' % f1_score(y_true=y_test, y_pred=y_pred))

predictions = classifier.predict(X_test)
print_stats_metrics(y_test, predictions)
Accuracy: 0.994234
confusion matrix
[[24109   111]
 [   88 10204]]
Predicted      0      1    All
True                          
0          24109    111  24220
1             88  10204  10292
All        24197  10315  34512
Precision: 0.989239
Recall: 0.991450
F1-measure: 0.990343
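The figures above come from a single random split. As a sanity check, a cross-validated score on the selected features shows how much the accuracy varies across splits (a minimal sketch reusing data_in_new and labels; exact numbers will differ from run to run):

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the same random forest configuration.
cv_scores = cross_val_score(RandomForestClassifier(n_estimators=50), data_in_new, labels, cv=5)
print("CV accuracy: %.4f +/- %.4f" % (cv_scores.mean(), cv_scores.std()))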
from sklearn.ensemble import GradientBoostingClassifier

grad_boost = GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(X_train, y_train)

GradientBoostingClassifier(n_estimators=50)
predictions = grad_boost.predict(X_test)
print_stats_metrics(y_test, predictions)
Accuracy: 0.988497
confusion matrix
[[24039   181]
 [  216 10076]]
Predicted      0      1    All
True                          
0          24039    181  24220
1            216  10076  10292
All        24255  10257  34512
Precision: 0.982354
Recall: 0.979013
F1-measure: 0.980680
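For an additional comparison between the two trained models, a hedged sketch computing ROC-AUC from the predicted probability of the positive (legitimate = 1) class on the same test split:

from sklearn.metrics import roc_auc_score

# Compare both fitted models on the held-out split using their class-1 probabilities.
for name, model in [('RandomForest', classifier), ('GradientBoosting', grad_boost)]:
    proba = model.predict_proba(X_test)[:, 1]
    print(name, "ROC-AUC: %.4f" % roc_auc_score(y_test, proba))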
