HMM-based CDN detection
```python
# -*- coding:utf-8 -*-
import sys
import re
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import tldextract
import os


def iterbrowse(path):
    for home, dirs, files in os.walk(path):
        for filename in files:
            yield os.path.join(home, filename)


def extract_domain(domain):
    suffix = {'.com', '.la', '.io', '.co', '.cn', '.info', '.net', '.org', '.me', '.mobi',
              '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn', '.org.cn', '.mx', '.tv',
              '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag', '.am', '.asia', '.at', '.be', '.com.br',
              '.net.br', '.name', '.live', '.news', '.bz', '.tech', '.pub', '.wang', '.space', '.top',
              '.xin', '.social', '.date', '.site', '.red', '.studio', '.link', '.online', '.help', '.kr',
              '.club', '.com.bz', '.net.bz', '.cc', '.band', '.market', '.com.co', '.net.co', '.nom.co',
              '.lawyer', '.de', '.es', '.com.es', '.nom.es', '.org.es', '.eu', '.wiki', '.design',
              '.software', '.fm', '.fr', '.gs', '.in', '.co.in', '.firm.in', '.gen.in', '.ind.in',
              '.net.in', '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl', '.nu', '.co.nz',
              '.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw', '.com.tw', '.idv.tw', '.org.tw', '.hk',
              '.co.uk', '.me.uk', '.org.uk', '.vg'}
    domain = domain.lower()
    names = domain.split(".")
    if len(names) >= 3:
        if ("." + ".".join(names[-2:])) in suffix:
            return ".".join(names[-3:]), ".".join(names[:-3])
        elif ("." + names[-1]) in suffix:
            return ".".join(names[-2:]), ".".join(names[:-2])
    print "New domain suffix found. Use tldextract to extract domain..."
    pos = domain.rfind("/")
    if pos >= 0:
        # the subdomain may contain '/', e.g. queries generated by DNS tunnel tools
        ext = tldextract.extract(domain[pos + 1:])
        subdomain = domain[:pos + 1] + ext.subdomain
    else:
        ext = tldextract.extract(domain)
        subdomain = ext.subdomain
    if ext.suffix:
        mdomain = ext.domain + "." + ext.suffix
    else:
        mdomain = ext.domain
    return mdomain, subdomain


def parse(log):
    data = log.split('^')
    SRC_PORT_IDX = 5 - 1
    DST_PORT_IDX = 6 - 1
    PROTOCOL_IDX = 7 - 1
    protol = data[PROTOCOL_IDX]
    dstport = data[DST_PORT_IDX]
    if '17' == protol and '53' == dstport:
        DNS_QUERY_NAME_IDX = 55 - 1  # domain field
        if len(data) < 55:
            print "error line:"
            print log
            return ("", "")
        domain = data[DNS_QUERY_NAME_IDX]
        mdomain, subdomain = extract_domain(domain)
        return (mdomain, subdomain)
    else:
        print "error line not a DNS:"
        print log
        return ("", "")


# minimum subdomain length to process
MIN_LEN = 3
# number of hidden states
N = 5
# log-likelihood score threshold
T = -50
# model file name
FILE_MODEL = "hmm-cdn.m"


def get_cdn_domains(dir_path):
    domain_list = []
    for path in iterbrowse(dir_path):
        with open(path) as f:
            for line in f:
                mdomain, sub_domain = parse(line)
                if len(sub_domain) >= MIN_LEN:
                    domain_list.append(sub_domain)
                    if len(domain_list) >= 2000:
                        return domain_list
                # else:
                #     print path, "pass line:", line
    return domain_list


def domain2ver(domain):
    ver = []
    for i in range(0, len(domain)):
        ver.append([ord(domain[i])])
    return ver


def train_hmm(domain_list):
    X = [[0]]
    X_lens = [1]
    for domain in domain_list:
        ver = domain2ver(domain)
        np_ver = np.array(ver)
        # print len(np_ver)
        try:
            X = np.concatenate([X, np_ver])
        except ValueError:
            print domain
            print len(X), len(np_ver)
            print X
            print np_ver
            raise
        X_lens.append(len(np_ver))
    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
    remodel.fit(X, X_lens)
    joblib.dump(remodel, FILE_MODEL)
    return remodel


def test(remodel, domain_list):
    x = []
    y = []
    for domain in domain_list:
        domain_ver = domain2ver(domain)
        np_ver = np.array(domain_ver)
        pro = remodel.score(np_ver)
        print "SCORE:(%d) DOMAIN:(%s) " % (pro, domain)
        x.append(len(domain))
        y.append(pro)
    return x, y


if __name__ == '__main__':
    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_cdn")
    remodel = train_hmm(domain_list)
    remodel = joblib.load(FILE_MODEL)
    x_1, y_1 = test(remodel, domain_list)
    print x_1
    print y_1
    # sys.exit(0)

    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_black")
    x_2, y_2 = test(remodel, domain_list)
    print x_2
    print y_2

    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
    x_3, y_3 = test(remodel, domain_list)
    print x_3
    print y_3

    # %matplotlib inline
    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('HMM Score')
    ax.scatter(x_3, y_3, color='b', label="WHITE")
    ax.scatter(x_2, y_2, color='g', label="BLACK")
    ax.scatter(x_1, y_1, color='r', label="CDN")
    ax.legend(loc='right')
    plt.show()
```
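The threshold T = -50 is defined in the script but never actually applied. A minimal sketch of how it could gate detection of a single subdomain, assuming the trained `remodel` and the `domain2ver` helper above; the cutoff is illustrative, and since the log-likelihood drops with sequence length it would need calibrating against the scatter plot:

```python
import numpy as np

def is_suspicious(remodel, subdomain, threshold=T):
    # Score the subdomain's character sequence under the trained HMM;
    # sequences unlike the training data receive a low log-likelihood.
    np_ver = np.array(domain2ver(subdomain))
    return remodel.score(np_ver) < threshold

# Example with a made-up subdomain:
# print(is_suspicious(remodel, "a3f9c2e871bd04"))
```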
Saving and loading the model with pickle:
```python
# -*- coding:utf-8 -*-
import sys
import re
from hmmlearn import hmm
import numpy as np
# from sklearn.externals import joblib
import matplotlib.pyplot as plt
import tldextract
import os
import pickle


def iterbrowse(path):
    for home, dirs, files in os.walk(path):
        for filename in files:
            yield os.path.join(home, filename)


def extract_domain(domain):
    suffix = {'.com', '.la', '.io', '.co', '.cn', '.info', '.net', '.org', '.me', '.mobi',
              '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn', '.org.cn', '.mx', '.tv',
              '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag', '.am', '.asia', '.at', '.be', '.com.br',
              '.net.br', '.name', '.live', '.news', '.bz', '.tech', '.pub', '.wang', '.space', '.top',
              '.xin', '.social', '.date', '.site', '.red', '.studio', '.link', '.online', '.help', '.kr',
              '.club', '.com.bz', '.net.bz', '.cc', '.band', '.market', '.com.co', '.net.co', '.nom.co',
              '.lawyer', '.de', '.es', '.com.es', '.nom.es', '.org.es', '.eu', '.wiki', '.design',
              '.software', '.fm', '.fr', '.gs', '.in', '.co.in', '.firm.in', '.gen.in', '.ind.in',
              '.net.in', '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl', '.nu', '.co.nz',
              '.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw', '.com.tw', '.idv.tw', '.org.tw', '.hk',
              '.co.uk', '.me.uk', '.org.uk', '.vg'}
    domain = domain.lower()
    names = domain.split(".")
    if len(names) >= 3:
        if ("." + ".".join(names[-2:])) in suffix:
            return ".".join(names[-3:]), ".".join(names[:-3])
        elif ("." + names[-1]) in suffix:
            return ".".join(names[-2:]), ".".join(names[:-2])
    print "New domain suffix found. Use tldextract to extract domain..."
    pos = domain.rfind("/")
    if pos >= 0:
        # the subdomain may contain '/', e.g. queries generated by DNS tunnel tools
        ext = tldextract.extract(domain[pos + 1:])
        subdomain = domain[:pos + 1] + ext.subdomain
    else:
        ext = tldextract.extract(domain)
        subdomain = ext.subdomain
    if ext.suffix:
        mdomain = ext.domain + "." + ext.suffix
    else:
        mdomain = ext.domain
    return mdomain, subdomain


def parse(log):
    data = log.split('^')
    SRC_PORT_IDX = 5 - 1
    DST_PORT_IDX = 6 - 1
    PROTOCOL_IDX = 7 - 1
    protol = data[PROTOCOL_IDX]
    dstport = data[DST_PORT_IDX]
    if '17' == protol and '53' == dstport:
        DNS_QUERY_NAME_IDX = 55 - 1  # domain field
        if len(data) < 55:
            print "error line:"
            print log
            return ("", "")
        domain = data[DNS_QUERY_NAME_IDX]
        mdomain, subdomain = extract_domain(domain)
        return (mdomain, subdomain)
    else:
        print "error line not a DNS:"
        print log
        return ("", "")


# minimum subdomain length to process
MIN_LEN = 1
# number of hidden states
N = 8
# log-likelihood score threshold
T = -50
# model file names
FILE_MODEL = "hmm-cdn.m"
FILE_MODEL2 = "hmm-cdn-white.pkl"


def get_cdn_domains(dir_path):
    domain_list = []
    for path in iterbrowse(dir_path):
        with open(path) as f:
            for line in f:
                mdomain, sub_domain = parse(line)
                if len(sub_domain) >= MIN_LEN:
                    domain_list.append(sub_domain)
                    if len(domain_list) >= 3000:
                        return domain_list
                # else:
                #     print path, "pass line:", line
    return domain_list


def domain2ver(domain):
    ver = []
    for i in range(0, len(domain)):
        ver.append([ord(domain[i])])
    return ver


def train_hmm(domain_list):
    if os.path.exists(FILE_MODEL2):
        print "found model file, use it..."
        file_model = open(FILE_MODEL2, 'rb')
        model = pickle.load(file_model)
        file_model.close()
        return model
    X = [[0]]
    X_lens = [1]
    for domain in domain_list:
        ver = domain2ver(domain)
        np_ver = np.array(ver)
        # print len(np_ver)
        try:
            X = np.concatenate([X, np_ver])
        except ValueError:
            print domain
            print len(X), len(np_ver)
            print X
            print np_ver
            raise
        X_lens.append(len(np_ver))
    # remodel = hmm.GaussianHMM(n_components=N, covariance_type="spherical", n_iter=500)  # spherical, diag, full, tied
    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=500)
    remodel.fit(X, X_lens)
    # joblib.dump(remodel, FILE_MODEL)
    file_model = open(FILE_MODEL2, 'wb')
    pickle.dump(remodel, file_model)
    file_model.close()
    return remodel


def test(remodel, domain_list):
    x = []
    y = []
    for domain in domain_list:
        domain_ver = domain2ver(domain)
        np_ver = np.array(domain_ver)
        pro = remodel.score(np_ver)
        print "SCORE:(%d) DOMAIN:(%s) " % (pro, domain)
        x.append(len(domain))
        y.append(pro)
    return x, y


if __name__ == '__main__':
    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_cdn")
    domain_list2 = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
    # remodel = train_hmm(domain_list)
    remodel = train_hmm(domain_list + domain_list2)
    # remodel = joblib.load(FILE_MODEL)
    x_1, y_1 = test(remodel, domain_list)
    print x_1
    print y_1
    # sys.exit(0)

    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_black")
    x_2, y_2 = test(remodel, domain_list)
    print x_2
    print y_2

    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
    x_3, y_3 = test(remodel, domain_list)
    print x_3
    print y_3

    # %matplotlib inline
    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('HMM Score')
    # ax.scatter(x_3, y_3, color='b', label="WHITE")
    ax.scatter(x_2, y_2, color='g', label="DNS tunnel")
    ax.scatter(x_1, y_1, color='r', label="CDN")
    ax.legend(loc='right')
    plt.show()
```
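Once `hmm-cdn-white.pkl` has been written, the model can be reloaded in another process without retraining. A minimal sketch, assuming the same per-character encoding as `domain2ver` (the example subdomains are made up):

```python
import pickle
import numpy as np

with open("hmm-cdn-white.pkl", "rb") as f:
    remodel = pickle.load(f)

for sub in ["img-cache-01", "a3f9c2e871bd04"]:
    obs = np.array([[ord(c)] for c in sub])  # one row per character, one feature
    print("SCORE:(%d) DOMAIN:(%s)" % (remodel.score(obs), sub))
```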
Alternatively, the X = [[0]] / X_lens = [1] bookkeeping in train_hmm can be written as shown below, building plain Python lists instead of calling np.concatenate repeatedly and dropping the redundant dummy initialization.
```python
def train_hmm(domain_list):
    if os.path.exists(FILE_MODEL2):
        print "found model file, use it..."
        file_model = open(FILE_MODEL2, 'rb')
        model = pickle.load(file_model)
        file_model.close()
        return model
    # X = [[0]]
    # X_lens = [1]
    X = []
    X_lens = []
    for domain in domain_list:
        ver = domain2ver(domain)
        # np_ver = np.array(ver)
        # X = np.concatenate([X, np_ver])
        X = X + ver
        X_lens.append(len(ver))
    # remodel = hmm.GaussianHMM(n_components=N, covariance_type="spherical", n_iter=500)  # spherical, diag, full, tied
    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=500)
    remodel.fit(X, X_lens)
    # joblib.dump(remodel, FILE_MODEL)
    file_model = open(FILE_MODEL2, 'wb')
    pickle.dump(remodel, file_model)
    file_model.close()
    return remodel
```
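For reference, `GaussianHMM.fit` accepts the concatenated observations as any array-like of shape (n_samples, 1) together with the per-sequence lengths, so the plain-list version feeds it the same data as the `np.concatenate` version, just without the dummy first row. A small sketch of the resulting shapes (the domains are illustrative):

```python
import numpy as np

domains = ["img1", "cdn"]
X, X_lens = [], []
for d in domains:
    X += [[ord(c)] for c in d]   # one observation row per character
    X_lens.append(len(d))        # sequence boundary for hmmlearn

print(np.array(X).shape)  # (7, 1): 4 + 3 characters, one feature each
print(X_lens)             # [4, 3]
```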