使用ML对恶意软件进行识别
import pandas as pd

# The dataset is a pipe-delimited CSV; the 'legitimate' column is the label
# (the first rows, treated as legitimate below, carry the value 1).
data = pd.read_csv("MalwareData.csv", sep='|')

print(data.columns)

# Split by label instead of by a hard-coded row position (previously
# data[0:41323] / data[41323::]), so the split stays correct if the file is
# re-sorted, shuffled or extended with new samples.
is_legit = data['legitimate'] == 1
legit = data[is_legit].drop('legitimate', axis=1)
mal = data[~is_legit].drop('legitimate', axis=1)

print("合法数据有:%s 个,特征有:%s "%(legit.shape[0], legit.shape[1]))
print("恶意数据有:%s 个,特征有:%s "%(mal.shape[0], mal.shape[1]))
| |
| Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics', |
| 'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode', |
| 'SizeOfInitializedData', 'SizeOfUninitializedData', |
| 'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase', |
| 'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion', |
| 'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion', |
| 'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage', |
| 'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics', |
| 'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve', |
| 'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb', |
| 'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy', |
| 'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize', |
| 'SectionsMeanVirtualsize', 'SectionsMinVirtualsize', |
| 'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb', |
| 'ImportsNbOrdinal', 'ExportNb', 'ResourcesNb', 'ResourcesMeanEntropy', |
| 'ResourcesMinEntropy', 'ResourcesMaxEntropy', 'ResourcesMeanSize', |
| 'ResourcesMinSize', 'ResourcesMaxSize', 'LoadConfigurationSize', |
| 'VersionInformationSize', 'legitimate'], |
| dtype='object') |
| 合法数据有:41323 个,特征有:56 |
| 恶意数据有:96724 个,特征有:56 |
| 0 1 |
| 1 1 |
| 2 1 |
| 3 1 |
| 4 1 |
| .. |
| 138042 0 |
| 138043 0 |
| 138044 0 |
| 138045 0 |
| 138046 0 |
| Name: legitimate, Length: 138047, dtype: int64 |
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the two identifier columns and the label.
data_in = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
labels = data['legitimate'].values

# Extra-trees are randomized; a fixed seed keeps the importances (and hence
# the set of selected features) identical from run to run.
# NOTE(review): the selector is fitted on ALL rows, before the train/test
# split performed later in this file, so the held-out rows influence which
# features are kept -- consider fitting on the training split only.
extratree = ExtraTreesClassifier(random_state=42).fit(data_in, labels)

# prefit=True reuses the already-fitted estimator instead of refitting it.
select = SelectFromModel(extratree, prefit=True)
data_in_new = select.transform(data_in)
print(data_in.shape, data_in_new.shape)
| (138047, 54) (138047, 13) |
# numpy was used below without ever being imported in this notebook.
import numpy as np

# Number of features kept by the selector.
features = data_in_new.shape[1]
importances = extratree.feature_importances_
# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Column names aligned with the columns of data_in: derived from the same
# drop() that built the matrix rather than from a magic "+2" offset into
# data.columns.
feature_names = data.drop(['Name', 'md5', 'legitimate'], axis=1).columns

# Print rank, name and importance of the top `features` entries.
for rank, idx in enumerate(indices[:features], start=1):
    print(rank, feature_names[idx], importances[idx])
| 1 DllCharacteristics 0.13159378465829966 |
| 2 Characteristics 0.12898426863444917 |
| 3 Machine 0.10501819146671434 |
| 4 VersionInformationSize 0.059924950764671775 |
| 5 Subsystem 0.05488002624255342 |
| 6 SectionsMaxEntropy 0.05349804545740858 |
| 7 MajorSubsystemVersion 0.052116049448295995 |
| 8 ResourcesMaxEntropy 0.044890679185710124 |
| 9 ImageBase 0.04252264661828136 |
| 10 SizeOfOptionalHeader 0.04178342613704668 |
| 11 ResourcesMinEntropy 0.039008608631288154 |
| 12 SectionsMinEntropy 0.028637455162767493 |
| 13 MajorOperatingSystemVersion 0.019217124710739743 |
from sklearn.ensemble import RandomForestClassifier

# NOTE: despite their names, these four variables are the usual
# (X_train, X_test, y_train, y_test) tuple returned by train_test_split:
#   legit_train / legit_test -> feature rows (both classes mixed)
#   mal_train   / mal_test   -> the matching 'legitimate' labels
# The names are kept because later cells refer to them.
# A fixed seed makes the split, and therefore every reported metric,
# reproducible.
legit_train, legit_test, mal_train, mal_test = train_test_split(
    data_in_new, labels, random_state=42
)

# Seed the forest as well so repeated runs build the same trees.
classifier = RandomForestClassifier(n_estimators=50, random_state=42)
classifier.fit(legit_train, mal_train)
RandomForestClassifier(n_estimators=50)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Score of the random forest on the held-out split, shown as a percentage.
accuracy = classifier.score(legit_test, mal_test)
print("the score of the algorithm:", accuracy * 100)
| the score of the algorithm: 99.4233889661567 |
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score


def print_stats_metrics(y_test, y_pred):
    """Print a classification report for a set of predictions.

    Prints, in order: accuracy, the raw confusion matrix, a pandas
    cross-tabulation of true vs. predicted labels (with margins),
    precision, recall and F1.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: %f' % accuracy)

    confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion matrix")
    print(confmat)

    # Same counts as above, but labelled and with row/column totals.
    table = pd.crosstab(
        y_test,
        y_pred,
        rownames=['True'],
        colnames=['Predicted'],
        margins=True,
    )
    print(table)

    precision = precision_score(y_true=y_test, y_pred=y_pred, average='binary')
    print('Precision: %f' % precision)

    recall = recall_score(y_true=y_test, y_pred=y_pred)
    print('Recall: %f' % recall)

    f1 = f1_score(y_true=y_test, y_pred=y_pred)
    print('F1-measure: %f' % f1)


# Evaluate the random forest on the held-out split
# (legit_test holds the features, mal_test the true labels).
predictions = classifier.predict(legit_test)
print_stats_metrics(y_test=mal_test, y_pred=predictions)
| Accuracy: 0.994234 |
| confusion matrix |
| [[24109 111] |
| [ 88 10204]] |
| Predicted 0 1 All |
| True |
| 0 24109 111 24220 |
| 1 88 10204 10292 |
| All 24197 10315 34512 |
| Precision: 0.989239 |
| Recall: 0.991450 |
| F1-measure: 0.990343 |
from sklearn.ensemble import GradientBoostingClassifier

# Second model for comparison, trained on the same split as the random forest
# (legit_train holds the features, mal_train the labels).
# A fixed seed makes the fitted ensemble, and the metrics computed from it,
# reproducible across runs.
grad_boost = GradientBoostingClassifier(n_estimators=50, random_state=42)
grad_boost.fit(legit_train, mal_train)
| |
GradientBoostingClassifier(n_estimators=50)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Evaluate the gradient-boosting model on the held-out split
# (legit_test holds the features, mal_test the true labels).
predictions = grad_boost.predict(legit_test)
print_stats_metrics(y_test=mal_test, y_pred=predictions)
| Accuracy: 0.988497 |
| confusion matrix |
| [[24039 181] |
| [ 216 10076]] |
| Predicted 0 1 All |
| True |
| 0 24039 181 24220 |
| 1 216 10076 10292 |
| All 24255 10257 34512 |
| Precision: 0.982354 |
| Recall: 0.979013 |
| F1-measure: 0.980680 |
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步