DGA 域名检测:基于 n-gram 特征的 KMeans 聚类,并用 t-SNE 降维后绘图

复制代码
# -*- coding:utf-8 -*-

import sys
import re
import numpy as np
from sklearn.externals import joblib
import csv
import matplotlib.pyplot as plt
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import cross_validation
import os
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE


# Minimum domain length (in characters) for a domain to be processed
MIN_LEN=10

# Seed handed to KMeans as its random_state
random_state = 170


def load_alexa(filename, min_len=None):
    """Load domain names from the second column of a CSV file.

    Every row is expected to carry the domain in column index 1
    (presumably ``rank,domain`` rows -- confirm against the data file).
    Rows with fewer than two columns, such as blank lines, are skipped.

    Args:
        filename: Path of the CSV file to read.
        min_len: Minimum number of characters a domain must have to be
            kept. When omitted, the module-level ``MIN_LEN`` is used.

    Returns:
        A list of the domain strings that are at least ``min_len``
        characters long, in file order.
    """
    if min_len is None:
        min_len = MIN_LEN

    domain_list = []
    # The context manager closes the handle even if parsing fails.
    with open(filename) as csv_file:
        for row in csv.reader(csv_file):
            if len(row) < 2:
                # No domain column on this row; nothing to load.
                continue
            domain = row[1]
            # Compare the domain's length, not the string itself, against
            # the threshold: str >= int is always True on Python 2 and a
            # TypeError on Python 3.
            if len(domain) >= min_len:
                domain_list.append(domain)
    return domain_list


def load_dga(filename, min_len=None):
    """Load DGA domain names from a comma-separated text feed.

    The domain is taken from the first comma-separated field of each line.
    Example line:
        xsxqeadsbgvpdke.co.uk,Domain used by Cryptolocker - Flashback DGA for 13 Apr 2017,2017-04-13,
    Feed source: http://osint.bambenekconsulting.com/manual/cl.txt

    Blank lines and lines starting with ``#`` are skipped, and surrounding
    whitespace (including the trailing newline of a line without commas)
    is stripped from the domain.

    Args:
        filename: Path of the feed file to read.
        min_len: Minimum number of characters a domain must have to be
            kept. When omitted, the module-level ``MIN_LEN`` is used.

    Returns:
        A list of the domain strings that are at least ``min_len``
        characters long, in file order.
    """
    if min_len is None:
        min_len = MIN_LEN

    domain_list = []
    with open(filename) as feed:
        for line in feed:
            line = line.strip()
            if not line or line.startswith("#"):
                # Nothing to parse on empty or comment lines.
                continue
            domain = line.split(",")[0].strip()
            # Compare the domain's length, not the string itself, against
            # the threshold: str >= int is always True on Python 2 and a
            # TypeError on Python 3.
            if len(domain) >= min_len:
                domain_list.append(domain)
    return domain_list


def nb_dga():
    """Cross-validate a Gaussian naive Bayes classifier on domain bigrams.

    Reads one CSV domain list and two DGA domain lists, labels them 0, 1
    and 2 in that order, converts every domain into counts of 2-grams over
    single-character tokens, and prints the 3-fold cross-validation scores.
    """
    sources = (
        load_alexa("../data/top-1000.csv"),
        load_dga("../data/dga-cryptolocke-1000.txt"),
        load_dga("../data/dga-post-tovar-goz-1000.txt"),
    )
    x_domain_list = np.concatenate(sources)

    # One class id per source, repeated once for each of its domains.
    y = np.concatenate(
        [[class_id] * len(domains) for class_id, domains in enumerate(sources)]
    )

    print(x_domain_list)

    # token_pattern r"\w" makes each word character its own token, so the
    # (2, 2) n-gram range counts pairs of adjacent characters.
    vectorizer = CountVectorizer(
        ngram_range=(2, 2),
        decode_error="ignore",
        token_pattern=r"\w",
        min_df=1,
    )
    x = vectorizer.fit_transform(x_domain_list).toarray()

    scores = cross_validation.cross_val_score(GaussianNB(), x, y, n_jobs=-1, cv=3)
    print(scores)

def kmeans_dga():
    """Cluster domains with KMeans and plot them in a 2-D t-SNE embedding.

    Reads one CSV domain list and two DGA domain lists, converts every
    domain into counts of 2-grams over single-character tokens, splits the
    vectors into two KMeans clusters, projects them with t-SNE, prints the
    embedding and the domain array, and shows a scatter plot in which
    cluster 1 is drawn with 'o' markers and the other cluster with 'x'.
    """
    csv_domains = load_alexa("../data/dga/top-100.csv")
    cryptolocker_domains = load_dga("../data/dga/dga-cryptolocke-50.txt")
    goz_domains = load_dga("../data/dga/dga-post-tovar-goz-50.txt")

    x_domain_list = np.concatenate(
        (csv_domains, cryptolocker_domains, goz_domains)
    )

    # Reference labels: 0 for the CSV list, 1 for both DGA lists. They are
    # built here but not consumed by the clustering or the plot below.
    y = np.concatenate((
        [0] * len(csv_domains),
        [1] * len(cryptolocker_domains),
        [1] * len(goz_domains),
    ))

    vectorizer = CountVectorizer(
        ngram_range=(2, 2),
        decode_error="ignore",
        token_pattern=r"\w",
        min_df=1,
    )
    features = vectorizer.fit_transform(x_domain_list).toarray()

    cluster_ids = KMeans(n_clusters=2, random_state=random_state).fit_predict(features)

    embedded = TSNE(learning_rate=100).fit_transform(features)
    print(embedded)
    print(x_domain_list)

    # One scatter call per point, in input order, with the marker chosen
    # from the point's cluster id.
    for (px, py), cluster in zip(embedded, cluster_ids):
        plt.scatter(px, py, marker='o' if cluster == 1 else 'x')

    plt.show()

if __name__ == '__main__':
    # Script entry point: the naive Bayes experiment is disabled (left
    # commented out below); only the KMeans/t-SNE plot runs.
    #nb_dga()
    kmeans_dga()
复制代码

 

posted @   bonelee  阅读(3579)  评论(0)  编辑  收藏  举报
编辑推荐:
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
阅读排行:
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
· DeepSeek 开源周回顾「GitHub 热点速览」
历史上的今天:
2016-11-17 ES批量索引写入时的ID自动生成算法
点击右上角即可分享
微信分享提示