malware detection and machine learning(EMBER)
EMBER
https://github.com/elastic/ember
paper: https://arxiv.org/abs/1804.04637
特征
9个特征组,可以分为两大部分
文件结构无关特征
-
字节直方图
-
字节熵直方图
-
可打印字符串统计
{'numstrings': 3967, 'avlength': 16.07159062263675, 'printabledist': [3729,65,……], 'printables': 63756, 'entropy': 5.877838134765625, 'paths': 4, 'urls': 26, 'registry': 0, 'MZ': 11}
文件结构相关特征
- general
- file header
- sections
- imports
- exports
- datadirectories
分别如下:
-
general
# 直接使用数值作为特征数值 {'size': 1237896, 'vsize': 1241088, 'has_debug': 1, 'exports': 0, 'imports': 314, 'has_relocations': 1, 'has_resources': 1, 'has_signature': 1, 'has_tls': 1, 'symbols': 0}
-
file header
- coff header
- optional header
# 数值保持原始;文本进行hash {'coff': {'timestamp': 1639042586, 'machine': 'I386', 'characteristics': ['CHARA_32BIT_MACHINE', 'EXECUTABLE_IMAGE']}, 'optional': {'subsystem': 'WINDOWS_GUI', 'dll_characteristics': ['DYNAMIC_BASE', 'NX_COMPAT', 'TERMINAL_SERVER_AWARE'], 'magic': 'PE32', 'major_image_version': 0, 'minor_image_version': 0, 'major_linker_version': 14, 'minor_linker_version': 29, 'major_operating_system_version': 6, 'minor_operating_system_version': 0, 'major_subsystem_version': 6, 'minor_subsystem_version': 0, 'sizeof_code': 368640, 'sizeof_headers': 1024, 'sizeof_heap_commit': 4096}}
-
sections
# 数值+hash {'entry': '.text', 'sections': [{'name': '.text', 'size': 368640, 'entropy': 6.463957857941052, 'vsize': 368140, 'props': ['CNT_CODE', 'MEM_EXECUTE', 'MEM_READ']}, {'name': '.rdata', 'size': 104960, 'entropy': 4.837026560868303, 'vsize': 104760, 'props': ['CNT_INITIALIZED_DATA', 'MEM_READ']}, {'name': '.data', 'size': 28672, 'entropy': 0.6108592144000272, 'vsize': 32760, 'props': ['CNT_INITIALIZED_DATA', 'MEM_READ', 'MEM_WRITE']}, {'name': '.rsrc', 'size': 703488, 'entropy': 5.868256562445707, 'vsize': 703408, 'props': ['CNT_INITIALIZED_DATA', 'MEM_READ']}, {'name': '.reloc', 'size': 22016, 'entropy': 6.754089624508025, 'vsize': 21584, 'props': ['CNT_INITIALIZED_DATA', 'MEM_DISCARDABLE', 'MEM_READ']}]}
-
imports
# dll+导入函数名: hash {'NETAPI32.dll': ['NetUserGetGroups', 'NetUserGetLocalGroups'], 'RPCRT4.dll': ['UuidFromStringW'], 'VERSION.dll': ['GetFileVersionInfoW', 'GetFileVersionInfoSizeW', 'VerQueryValueW'], 'KERNEL32.dll': ['FindFirstFileExW', 'FindClose', 'GetConsoleOutputCP', 'SetFilePointerEx', 'GetFileSizeEx', 'ReadConsoleW', 'ReadConsoleInputW', 'SetConsoleMode', ……}
-
exports
# 导出函数: hash
-
datadirectories
# 直接使用 size 和 virtual_address 数值作为特征数值 [{'name': 'EXPORT_TABLE', 'size': 0, 'virtual_address': 0}, {'name': 'IMPORT_TABLE', 'size': 300, 'virtual_address': 470148}, {'name': 'RESOURCE_TABLE', 'size': 703408, 'virtual_address': 512000}, {'name': 'EXCEPTION_TABLE', 'size': 0, 'virtual_address': 0}, {'name': 'CERTIFICATE_TABLE', 'size': 9096, 'virtual_address': 1228800}, {'name': 'BASE_RELOCATION_TABLE', 'size': 21584, 'virtual_address': 1216512}, {'name': 'DEBUG', 'size': 112, 'virtual_address': 452584}, {'name': 'ARCHITECTURE', 'size': 0, 'virtual_address': 0}, {'name': 'GLOBAL_PTR', 'size': 0, 'virtual_address': 0}, {'name': 'TLS_TABLE', 'size': 24, 'virtual_address': 452928}, {'name': 'LOAD_CONFIG_TABLE', 'size': 64, 'virtual_address': 452696}, {'name': 'BOUND_IMPORT', 'size': 0, 'virtual_address': 0}, {'name': 'IAT', 'size': 1368, 'virtual_address': 372736}, {'name': 'DELAY_IMPORT_DESCRIPTOR', 'size': 0, 'virtual_address': 0}, {'name': 'CLR_RUNTIME_HEADER', 'size': 0, 'virtual_address': 0}]
模型
lightgbm
# LightGBM training parameters for the binary (two-class) model,
# using gradient-boosted decision trees ("gbdt").
params = dict(
    boosting="gbdt",
    objective="binary",
    num_iterations=1000,
    learning_rate=0.05,
    num_leaves=2048,
    max_depth=15,
    min_data_in_leaf=50,
    feature_fraction=0.5,
)
malconv
maxlen = 2**20 # 1MB
embedding_size = 8
# Vocabulary size of the byte embedding: the 256 possible byte values plus one
# extra index reserved for padding. Value taken from the EMBER MalConv
# reference implementation -- NOTE(review): confirm it matches the padding
# symbol used by the preprocessing that feeds this model.
input_dim = 257
# define model structure
# Input is a fixed-length sequence of `maxlen` integer tokens.
inp = Input( shape=(maxlen,))
emb = Embedding( input_dim, embedding_size )( inp )
# Two parallel convolutions read the same embedding over non-overlapping
# windows (kernel_size == strides == 500): a ReLU feature branch and a
# sigmoid gate branch.
filt = Conv1D( filters=128, kernel_size=500, strides=500, use_bias=True, activation='relu', padding='valid' )(emb)
attn = Conv1D( filters=128, kernel_size=500, strides=500, use_bias=True, activation='sigmoid', padding='valid')(emb)
# Gate the features by multiplying the two branches together.
gated = Multiply()([filt,attn])
# Collapse the sequence axis by keeping the maximum of each of the 128 filters.
feat = GlobalMaxPooling1D()( gated )
dense = Dense(128, activation='relu')(feat)
# Single sigmoid unit: one score in (0, 1) per input.
outp = Dense(1, activation='sigmoid')(dense)
basemodel = Model( inp, outp )