Import packages:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from preprocessing.build_vocab import build_vocab
import gensim
from gensim.models import Word2Vec
from gensim.corpora import Dictionary
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
Build the vocabulary
Read the data:
dataset_folder = "../../benchmarks/BIG2015/"
data_path = "../../benchmarks/BIG2015/opcode/opcode.csv"
data_folder = "../../benchmarks/BIG2015/opcode/"
df = pd.read_csv(data_path)
df = df.fillna("")  # fillna returns a new DataFrame, so assign the result back
label = LabelEncoder().fit_transform(df['label'])
opcode_seq_list = df['opcode'].tolist()
Build the vocabulary: gensim's Dictionary class is used to process the corpus. It expects input like [['add', 'push'], ...], so each document (one opcode sequence) must first be split into a list of tokens. Because the vocabulary is built on the entire training dataset, no unknown token is added; a pad token is added for padding each document to a fixed length.
vocab_save_path = "../../benchmarks/BIG2015/word2id.json"
opcode_seq_list_split = [seq.split() for seq in opcode_seq_list]
dct = Dictionary(opcode_seq_list_split)
print(len(opcode_seq_list_split))
print(dct)
print(len(list(dct.token2id.keys())))
special_tokens = {"pad": 0}
dct.patch_with_special_tokens(special_tokens)
print(len(list(dct.token2id.keys())))
Length distribution of the documents in the corpus:
len_list = [len(seq) for seq in opcode_seq_list_split]
plt.hist(len_list)
print(f"Proportion of sequences shorter than 1000: {sum(1 for value in len_list if value < 1000) / len(len_list):.2f}")
print(f"Proportion of sequences shorter than 10000: {sum(1 for value in len_list if value < 10000) / len(len_list):.2f}")
Save the vocabulary in JSON format:
import json
with open(vocab_save_path, "w") as file:
    json.dump(dct.token2id, file, indent=4)
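For later reuse (for example at inference time), the saved vocabulary can be loaded back into a plain dict; a minimal sketch, assuming the same vocab_save_path:
with open(vocab_save_path, "r") as file:
    word2id = json.load(file)        # token -> integer id, as saved above
print(len(word2id), word2id["pad"])  # the pad token should map to 0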
Build the integer-indexed corpus
Use doc2idx to convert the corpus into integer indices, then pad or truncate each sequence to a fixed length.
int_opcode_list = []
desired_size = 1000
for opcode in opcode_seq_list_split:
    int_opcode = dct.doc2idx(opcode)
    if len(int_opcode) < desired_size:
        int_opcode += [0] * (desired_size - len(int_opcode))  # pad with the "pad" id (0)
    else:
        int_opcode = int_opcode[:desired_size]  # truncate to the fixed length
    int_opcode_list.append(int_opcode)
print(len(int_opcode_list))
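Note that Dictionary.doc2idx maps out-of-vocabulary tokens to -1 by default. Since the vocabulary is built on the entire dataset, no -1 appears here, but if new samples were indexed later those -1 values would break nn.Embedding; a minimal sketch of remapping them to the pad id (the token "unknown_op" below is purely illustrative):
# Illustrative only: handling opcodes that are not in the vocabulary.
new_doc = "mov push unknown_op".split()                 # "unknown_op" stands in for an unseen token
int_doc = dct.doc2idx(new_doc)                          # unseen tokens are mapped to -1 by default
int_doc = [idx if idx >= 0 else 0 for idx in int_doc]   # remap them to the pad id (0)
print(int_doc)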
Save the labels and the converted data:
np.save(os.path.join(data_folder, "opcode_int_top1000.npy"), np.array(int_opcode_list))
np.save(os.path.join(dataset_folder, "label.npy"), np.array(label))
Build the dataset and dataloader
Read the saved data and split it into training, validation, and test sets; with test_size=0.2 followed by a 50/50 split of the held-out part, the resulting ratio is 8:1:1.
data = np.load(os.path.join(data_folder, "opcode_int_top1000.npy"))
label = np.load(os.path.join(dataset_folder, "label.npy"))
print(data.shape, label.shape)
x_train, x_temp, y_train, y_temp = train_test_split(data, label, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)
print(x_train.shape, x_val.shape, x_test.shape)
Build the dataloaders:
def build_dataloader(x_train=None, y_train=None, x_val=None, y_val=None, x_test=None, y_test=None):
    """Build the train/validation/test dataloaders."""
    batch_size = 32
    y_train, y_test, y_val = y_train.reshape(-1), y_test.reshape(-1), y_val.reshape(-1)
    train_set = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
    train_loader = DataLoader(
        dataset=train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=1,
    )
    val_set = TensorDataset(torch.from_numpy(x_val), torch.from_numpy(y_val))
    val_loader = DataLoader(
        dataset=val_set,
        shuffle=True,
        batch_size=batch_size,
        num_workers=1,
    )
    test_set = TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))
    test_loader = DataLoader(
        dataset=test_set,
        shuffle=True,
        batch_size=batch_size,
        num_workers=1,
    )
    return train_loader, val_loader, test_loader
Build the training and inference functions
In both functions, all predictions are collected and then accuracy, precision, recall, and F1 are computed against the ground-truth labels.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
def train(epoch, model, train_loader, optimizer, criterion):
    """Train the model for one epoch.
    Args:
        epoch: current epoch index (used in the progress bar).
        model: the model being trained.
        train_loader: dataloader over the training set.
        optimizer: optimizer for the model parameters.
        criterion: loss function.
    Returns:
        Average loss, accuracy, precision, recall, and F1 over the epoch.
    """
    model.train()
    total_samples = 0
    train_loss = 0
    predictions_all = []
    labels_all = []
    for batch, labels in tqdm(train_loader, ncols=100, desc=f"epoch: {epoch}, training"):
        batch, labels = batch.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(batch)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * batch.size(0)  # accumulate the loss summed over the batch
        _, predictions = outputs.max(1)
        predictions_all.extend(predictions.cpu().numpy())
        labels_all.extend(labels.cpu().numpy())
        total_samples += labels.size(0)
    accuracy = accuracy_score(labels_all, predictions_all)
    train_loss = train_loss / total_samples
    precision, recall, f1, _ = precision_recall_fscore_support(labels_all, predictions_all, average='macro', zero_division=0)
    return train_loss, accuracy, precision, recall, f1
def val(epoch, model, val_loader, criterion):
    """Evaluate the model on the validation (or test) set."""
    model.eval()
    total_samples = 0
    val_loss = 0
    predictions_all = []
    labels_all = []
    with torch.no_grad():
        for batch, labels in tqdm(val_loader, desc=f"epoch: {epoch}, validating", ncols=100):
            batch, labels = batch.to(device), labels.to(device)
            outputs = model(batch)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * batch.size(0)
            _, predictions = outputs.max(1)
            predictions_all.extend(predictions.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())
            total_samples += labels.size(0)
    accuracy = accuracy_score(labels_all, predictions_all)
    val_loss = val_loss / total_samples
    precision, recall, f1, _ = precision_recall_fscore_support(labels_all, predictions_all, average='macro', zero_division=0)
    return val_loss, accuracy, precision, recall, f1
Build the TextCNN model:
import torch.nn.functional as F
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim, dropout):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=num_filters,
                      kernel_size=fs)
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text: [batch_size, seq_len]
        embedded = self.embedding(text)       # [batch_size, seq_len, embedding_dim]
        embedded = embedded.permute(0, 2, 1)  # [batch_size, embedding_dim, seq_len]
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]  # global max pooling per filter size
        cat = self.dropout(torch.cat(pooled, dim=1))
        output = self.fc(cat)
        return output
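Before training, a quick shape check with a dummy batch helps confirm the convolution and pooling wiring; a minimal sketch using the same hyperparameters configured below:
# Sanity check: two random sequences of 1000 token ids should yield a [2, 9] logit matrix.
_model = TextCNN(vocab_size=736, embedding_dim=128, num_filters=300, filter_sizes=[3, 4, 5, 6], output_dim=9, dropout=0.2)
_dummy = torch.randint(0, 736, (2, 1000))
print(_model(_dummy).shape)  # expected: torch.Size([2, 9])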
Plotting function:
def plot_data(save_path, list1, list2, label="Loss"):
    x = range(1, len(list1) + 1)
    plt.plot(x, list1, label=f'Train {label}')
    plt.plot(x, list2, label=f'Validate {label}')
    plt.legend()
    plt.title(f'{label} Curve')
    plt.xlabel('Epoch')
    plt.ylabel(f'{label}')
    plt.savefig(os.path.join(save_path, f'{label}_curve.png'), format='png')
    plt.show()
    plt.close()
Training, inference, and result analysis
Training and validation code:
With a GPU the training time is short; once training finishes, inference code can be appended (see the sketch after the training loop).
epochs = 10
output_dir = "../outputs/big2015/"
vocab_size = 736
embedding_dim = 128
num_filters = 300
filter_sizes = [3, 4, 5, 6]
output_dim = 9
dropout = 0.2
train_loader, val_loader, test_loader = build_dataloader(x_train, y_train, x_val, y_val, x_test, y_test)
model = TextCNN(vocab_size, embedding_dim, num_filters, filter_sizes, output_dim, dropout)
model.to(device)
print(model)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train_loss_list, train_acc_list, val_loss_list, val_acc_list = [], [], [], []
best_epoch = 0
best_val_loss = float('inf')
best_val_acc = 0.0
patience = 5
wait = 0
for epoch in range(1, epochs + 1):
    train_loss, train_acc, train_precision, train_recall, train_f1 = train(epoch, model, train_loader, optimizer, criterion)
    print('Epoch: {}, Train Loss: {:.4f}, Train Acc: {:.2f}%, Train Precision: {:.2f}%, Train Recall: {:.2f}%, Train F1: {:.2f}%'.format(epoch, train_loss, 100 * train_acc, 100 * train_precision, 100 * train_recall, 100 * train_f1))
    val_loss, val_acc, val_precision, val_recall, val_f1 = val(epoch, model, val_loader, criterion)
    print('Epoch: {}, Val Loss: {:.4f}, Val Acc: {:.2f}%, Val Precision: {:.2f}%, Val Recall: {:.2f}%, Val F1: {:.2f}%, (Best Val Acc: {:.2f}%)'.format(epoch, val_loss, 100 * val_acc, 100 * val_precision, 100 * val_recall, 100 * val_f1, 100 * best_val_acc))
    train_loss_list.append(train_loss)
    train_acc_list.append(train_acc)
    val_loss_list.append(val_loss)
    val_acc_list.append(val_acc)
    if val_loss < best_val_loss:
        best_epoch = epoch
        best_val_acc = val_acc
        best_val_loss = val_loss
        wait = 0
        save_dir = os.path.join(output_dir, "test")
        os.makedirs(save_dir, exist_ok=True)
        torch.save(model.state_dict(), os.path.join(save_dir, "textcnn.pth"))
        print(f"--> save model success: {save_dir}")
    else:
        wait += 1
        if wait >= patience:
            print(f'Early stopping at epoch {epoch} ...')
            print(f"--> Best Epoch: {best_epoch}, Best Val Acc: {best_val_acc}, Best Val Loss: {best_val_loss}")
            break
    print("-------------------------------------------------")
plot_data(os.path.join(output_dir, "test"), train_acc_list, val_acc_list, label="Acc")
plot_data(os.path.join(output_dir, "test"), train_loss_list, val_loss_list, label='Loss')
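As noted above, inference can be appended once training finishes. A minimal sketch that reloads the best checkpoint and reuses the val function on the test loader (epoch 0 is only a progress-bar label here):
model.load_state_dict(torch.load(os.path.join(output_dir, "test", "textcnn.pth"), map_location=device))
test_loss, test_acc, test_precision, test_recall, test_f1 = val(0, model, test_loader, criterion)
print('Test Loss: {:.4f}, Test Acc: {:.2f}%, Test Precision: {:.2f}%, Test Recall: {:.2f}%, Test F1: {:.2f}%'.format(test_loss, 100 * test_acc, 100 * test_precision, 100 * test_recall, 100 * test_f1))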
Overfitting starts to appear around epoch 7, but overall it is not severe.
With sequences limited to 1000 tokens, TextCNN reaches a validation accuracy above 97%.
When extracting opcodes, data-definition directives such as dd and dw were not removed, so some opcode sequences are very long. A follow-up could drop such directives and merge semantically similar instructions to further shorten the sequences; whether this actually improves the results still needs to be verified.
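If that preprocessing is tried, it would go before building the vocabulary; a hypothetical sketch (the directive set and merge map below are illustrative, not a verified mapping):
# Illustrative preprocessing: drop data-definition directives and merge similar opcodes.
DATA_DIRECTIVES = {"db", "dw", "dd", "dq"}                          # assumed set of directives to drop
MERGE_MAP = {"je": "jcc", "jne": "jcc", "jz": "jcc", "jnz": "jcc"}  # hypothetical grouping of conditional jumps
def normalize_opcodes(tokens):
    kept = [t for t in tokens if t not in DATA_DIRECTIVES]
    return [MERGE_MAP.get(t, t) for t in kept]
opcode_seq_list_split = [normalize_opcodes(seq.split()) for seq in opcode_seq_list]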
A much more challenging question is whether instruction operands can also help the detection/classification task.