DGA model train and test code

The script below trains a character-level CNN (TFLearn) to separate DGA domains (dga_360_sorted.txt, label 1) from benign domains (top-1m.csv, label 0), saves the model together with the character-encoding metadata, then reloads both to score a list of domains in batches.
# -*- coding: utf-8 -*-
import os
import pickle

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, max_pool_1d
from tflearn.layers.estimator import regression
from tflearn.layers.normalization import batch_normalization
from sklearn.model_selection import train_test_split


def get_cnn_model(max_len, vocab_size=None):
    """Character-level 1D CNN: embedding -> two conv/pool blocks -> dense softmax."""
    if vocab_size is None:
        vocab_size = 10240000  # oversized fallback; the real size comes from the data

    network = input_data(shape=[None, max_len], name='input')
    network = tflearn.embedding(network, input_dim=vocab_size, output_dim=32)
    network = conv_1d(network, 64, 3, activation='relu', regularizer="L2")
    network = max_pool_1d(network, 2)
    network = conv_1d(network, 64, 3, activation='relu', regularizer="L2")
    network = max_pool_1d(network, 2)
    network = batch_normalization(network)
    network = fully_connected(network, 64, activation='relu')
    network = dropout(network, 0.5)
    network = fully_connected(network, 2, activation='softmax')
    sgd = tflearn.SGD(learning_rate=0.1, lr_decay=0.96, decay_step=1000)
    network = regression(network, optimizer=sgd, loss='categorical_crossentropy')
    return tflearn.DNN(network, tensorboard_verbose=0)


def get_data_from(file_name):
    """Read one domain per line."""
    with open(file_name) as f:
        return [line.strip() for line in f]


def get_local_data():
    # dga_360_sorted.txt: known DGA domains (black); top-1m.csv: popular, benign
    # domains (white). Each file is assumed to hold one bare domain per line;
    # a raw "rank,domain" CSV would need the rank column stripped first.
    black_data = get_data_from(file_name="dga_360_sorted.txt")
    white_data = get_data_from(file_name="top-1m.csv")
    return black_data, white_data


def get_data():
    black_x, white_x = get_local_data()
    black_y, white_y = [1] * len(black_x), [0] * len(white_x)
    X = black_x + white_x
    labels = black_y + white_y

    # Build a dictionary of valid characters; id 0 is reserved for
    # padding and unknown characters.
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}
    max_features = len(valid_chars) + 1
    print("max_features:", max_features)
    maxlen = max(len(x) for x in X)
    print("max_len:", maxlen)
    maxlen = min(maxlen, 256)  # cap the sequence length

    # Convert characters to ints and pad to a fixed length
    X = [[valid_chars[y] for y in x] for x in X]
    X = pad_sequences(X, maxlen=maxlen, value=0.)
    # Convert labels to one-hot vectors
    Y = to_categorical(labels, nb_classes=2)

    # Persist the encoding metadata so test_model() can rebuild identical inputs
    data = {"valid_chars": valid_chars, "max_len": maxlen, "vocab_size": max_features}
    with open("vocab.pkl", 'wb') as output:
        pickle.dump(data, output)
    return X, Y, maxlen, max_features
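For intuition, here is a minimal, self-contained sketch of what the encoding in get_data() does to one domain. The valid_chars mapping below is hypothetical; the real one is built at runtime from the character set of the training data:

from tflearn.data_utils import pad_sequences

valid_chars = {'e': 1, 'g': 2, 'l': 3, 'o': 4, '.': 5, 'c': 6, 'm': 7}  # hypothetical ids
encoded = [[valid_chars.get(ch, 0) for ch in "google.com"]]
print(encoded)  # [[2, 4, 4, 2, 3, 1, 5, 6, 4, 7]]
print(pad_sequences(encoded, maxlen=12, value=0.))
# [[2 4 4 2 3 1 5 6 4 7 0 0]]  (tflearn pads at the tail by default)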
def train_model():
    X, Y, max_len, vocab_size = get_data()
    print("X len:", len(X), "Y len:", len(Y))
    trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, random_state=42)
    # Peek at one training sample and one test sample
    print(trainX[:1])
    print(trainY[:1])
    print(testX[-1:])
    print(testY[-1:])

    model = get_cnn_model(max_len, vocab_size)
    model.fit(trainX, trainY, validation_set=(testX, testY),
              show_metric=True, batch_size=1024)

    filename = 'finalized_model.tflearn'
    model.save(filename)
    model.load(filename)  # sanity check: reload what was just saved
    print("Just review 3 sample data test results:")
    print(model.predict(testX[0:3]))


def test_model():
    vocab_file = "vocab.pkl"
    assert os.path.exists(vocab_file)
    with open(vocab_file, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    valid_chars = data["valid_chars"]
    max_document_length = data["max_len"]
    max_features = data["vocab_size"]
    print("max_features:", max_features)
    print("max_len:", max_document_length)

    cnn_model = get_cnn_model(max_document_length, max_features)
    cnn_model.load('finalized_model.tflearn')
    print("predict domains:")

    bls = []  # domains flagged as DGA
    with open("dga_360_sorted.txt") as f:  # or e.g. "todo.txt" with domains to score
        lines = f.readlines()
    print("domain_list len:", len(lines))

    cnt = 1000  # score in chunks of 1000 domains
    for i in range(0, len(lines), cnt):
        domain_list = [line.strip() for line in lines[i:i + cnt]]
        # Convert characters to ints (unseen chars -> 0) and pad
        X = [[valid_chars[y] if y in valid_chars else 0 for y in x] for x in domain_list]
        X = pad_sequences(X, maxlen=max_document_length, value=0.)
        result = cnn_model.predict(X)
        for j, domain in enumerate(domain_list):
            if result[j][1] > .5:  # raise the threshold (e.g. .95) for higher precision
                print(domain, result[j][1])
                bls.append(domain)

    print(len(bls), "DGA domains found!")


if __name__ == "__main__":
    print("train model...")
    train_model()
    print("test model...")
    test_model()
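To score a few domains ad hoc rather than re-reading a whole file, the same pieces can be wrapped in a small helper. predict_domains is a hypothetical addition, not part of the original script; it assumes train_model() has already written vocab.pkl and finalized_model.tflearn:

def predict_domains(domains):
    # Rebuild the training-time encoding from the saved metadata.
    with open("vocab.pkl", 'rb') as f:
        meta = pickle.load(f)
    valid_chars = meta["valid_chars"]

    model = get_cnn_model(meta["max_len"], meta["vocab_size"])
    model.load('finalized_model.tflearn')

    X = [[valid_chars.get(ch, 0) for ch in d] for d in domains]  # unseen chars -> 0
    X = pad_sequences(X, maxlen=meta["max_len"], value=0.)
    return {d: float(p[1]) for d, p in zip(domains, model.predict(X))}

Usage (scores are illustrative and depend on the trained model):

predict_domains(["google.com", "xjpakmdcfuqe.info"])
# -> {'google.com': 0.02, 'xjpakmdcfuqe.info': 0.97}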