机器学习样本标记 示意代码
目标:根据各个字段数据的分布(例如 srcIP 和 dstIP 的 top 10)以及其他特征对样本进行标注,最终将样本分别标注为 black/white/white-like/ddos/mddos/cdn/unknown 几类。
效果示意:
-------------choose one--------------
sub domain: DNSQueryName(N)
ip: srcip(S) or dstip(D)
length: DNSRequestLength(R1) or DNSReplyLength(R2)
length too: DNSRequestErrLength(R3) or DNSReplyErrLength(R4)
port: sourcePort(P1) or destPort(P2) or DNSReplyTTL(T)
code: DNSReplyCode(C2) or DNSRequestRRType(C1)
other: DNSRRClass(RR) or DNSReplyIPv4(V)
-------------label or quit------------
black(B) or white(W) or cdn(CDN) or ddos(DDOS) or mddos(M) or unknown(U) or white-like(L)
next(Q) or exit(E)?
***************************************
domain: workgroup. flow count: 206
***************************************
------------srcip-----------------
count 206
unique 9
top 162.105.129.122
freq 150
Name: sourceIP, dtype: object
--------------destip---------------
count 206
unique 12
top 199.7.83.42
freq 82
Name: destIP, dtype: object
代码:
"""Interactive labelling of sampled DNS flow metadata.

For every hourly metadata file the flows are grouped by registered domain,
per-field distributions are printed, and the operator (or a naming heuristic,
or a previously recorded decision) assigns one label per domain. The matching
raw records are copied into a per-label directory and the decision is
remembered in ``history_op.json``.

The syntax is valid on both Python 2.6+ and Python 3.
"""
import sys
import json
import os
import pandas as pd
import tldextract
# import numpy as np

# Python 2 calls the line-reading builtin raw_input; Python 3 renamed it to input.
try:
    read_line = raw_input
except NameError:
    read_line = input

# "<1-based column> = <field name>" for every column kept from the raw records.
medata_field = '''
3 = sourceIP
4 = destIP
5 = sourcePort
6 = destPort
7 = protocol
12 = flowStartSeconds
13 = flowEndSecond
54 = DNSReplyCode
55 = DNSQueryName
56 = DNSRequestRRType
57 = DNSRRClass
58 = DNSDelay
59 = DNSReplyTTL
60 = DNSReplyIPv4
61 = DNSReplyIPv6
62 = DNSReplyRRType
77 = DNSReplyName
81 = payload
88 = DNSRequestLength
89 = DNSRequestErrLength
90 = DNSReplyLength
91 = DNSReplyErrLength
'''

medata_field_num = []   # zero-based column positions, in declaration order
medata_field_info = []  # column names, parallel to medata_field_num
for field_line in medata_field.split("\n"):
    field_line = field_line.strip()
    if not field_line:
        continue
    num, info = field_line.split(" = ")
    medata_field_num.append(int(num) - 1)
    medata_field_info.append(info)
print(medata_field_num)
print(medata_field_info)

HISTORY_PATH = "history_op.json"
SAMPLE_ROOT = "/home/bonelee/latest_metadata_sample"
# Zero-based position of DNSQueryName inside a "^"-separated raw record.
QUERY_NAME_INDEX = 55 - 1

# Label typed by the operator -> directory (under SAMPLE_ROOT) receiving the records.
LABEL_DIRS = {
    "B": "labeled_black",
    "W": "labeled_white",
    "L": "labeled_white_like",
    "CDN": "labeled_cdn",
    "DDOS": "labeled_ddos",
    "M": "labeled_mddos",
    "U": "labeled_unknown",
}

# Menu shortcut -> column whose value distribution should be printed.
DIST_FIELDS = {
    "R1": "DNSRequestLength",
    "R2": "DNSReplyLength",
    "R3": "DNSRequestErrLength",
    "R4": "DNSReplyErrLength",
    "P1": "sourcePort",
    "P2": "destPort",
    "T": "DNSReplyTTL",
    "C2": "DNSReplyCode",
    "C1": "DNSRequestRRType",
    "RR": "DNSRRClass",
    "V": "DNSReplyIPv4",
    "S": "sourceIP",
    "D": "destIP",
    "N": "DNSQueryName",
}


def extract_domain(domain):
    """Return the "domain.suffix" part of a query name.

    When tldextract reports an empty domain the bare suffix is returned.
    When it reports an empty suffix the result keeps a trailing dot
    (e.g. "workgroup."), exactly as the "." join produces it.
    Any failure (for instance a non-string value) is reported on stdout and
    mapped to the literal "unknown" so one bad record cannot stop a run.
    """
    try:
        ext = tldextract.extract(domain)
        if ext.domain == "":
            return ext.suffix
        # Attribute access rather than slicing the result, so this does not
        # depend on the result object being a tuple of exactly three fields.
        return ".".join((ext.domain, ext.suffix))
    except Exception as e:
        print("extract_domain error: %s" % (e,))
        return "unknown"


def parse_metadata(path):
    """Load one "^"-separated metadata file and group its rows by domain.

    Only the columns listed in ``medata_field`` are kept; an extra "mdomain"
    column holds ``extract_domain`` of DNSQueryName and is the grouping key.
    Returns the pandas GroupBy object.
    """
    df = pd.read_csv(path, sep="^", header=None)
    dns_df = df.iloc[:, medata_field_num].copy()
    dns_df.columns = medata_field_info
    dns_df["mdomain"] = dns_df["DNSQueryName"].apply(extract_domain)
    return dns_df.groupby('mdomain')


def get_data_dist(df, col="sourceIP"):
    """Print how many rows of *df* carry each value of *col*, then the top 10."""
    size = df.groupby(col).size()
    print(type(size))
    print(size)
    print("-----------top 10-------------")
    print(size.nlargest(10))


def get_ipv4_dist(df, col="DNSReplyLength"):
    """Print the DNSReplyIPv4 distribution of the rows whose *col* is > 0."""
    df2 = df[df[col] > 0]
    print("filter before length: %d filter after length: %d" % (len(df), len(df2)))
    size = df2.groupby(by="DNSReplyIPv4").size()
    print(size)
    print("-----------top 10-------------")
    print(size.nlargest(10))


def move_to(srcpath, domain, dst_path):
    """Copy the records of *srcpath* that belong to *domain* into *dst_path*.

    Despite the name the source file is left untouched. *dst_path* is
    truncated first, so it ends up empty when nothing matches.
    """
    # Callers pass a lower-cased domain, so compare case-insensitively;
    # otherwise records with a mixed-case query name are silently skipped.
    wanted = domain.lower()
    with open(dst_path, "w") as w:
        with open(srcpath) as r:
            for line in r:
                fields = line.split("^")
                if len(fields) <= QUERY_NAME_INDEX:
                    # Too short to hold a query name: skip, do not crash.
                    continue
                if extract_domain(fields[QUERY_NAME_INDEX]).lower() == wanted:
                    w.write(line)


def _print_menu():
    """Show the distribution shortcuts and the label / navigation choices."""
    print("-------------choose one--------------")
    print("sub domain: DNSQueryName(N)")
    print("ip: srcip(S) or dstip(D)")
    print("length: DNSRequestLength(R1) or DNSReplyLength(R2)")
    print("length too: DNSRequestErrLength(R3) or DNSReplyErrLength(R4)")
    print("port: sourcePort(P1) or destPort(P2) or DNSReplyTTL(T)")
    print("code: DNSReplyCode(C2) or DNSRequestRRType(C1)")
    print("other: DNSRRClass(RR) or DNSReplyIPv4(V)")
    print("-------------label or quit------------")
    print("black(B) or white(W) or cdn(CDN) or ddos(DDOS) or mddos(M) or unknown(U) or white-like(L)")
    print("next(Q) or exit(E)?")


def _auto_label(domain):
    """Return the label implied by the (lower-cased) name alone, or None."""
    if domain.endswith(("win", "site", "vip")):
        return "U"
    if any(tag in domain for tag in ("lan", "local", "dhcp", "workgroup", "home")):
        return "DDOS"
    if "cdn" in domain:
        return "CDN"
    return None


def _label_path(label, day, hour, domain):
    """Build the destination file for one labelled domain."""
    # NOTE(review): the input files are named "2017-09-..." while these are
    # written as "2017-8-..."; kept as-is, confirm which month is intended.
    return "%s/%s/2017-8-%d-%d-%s.txt" % (
        SAMPLE_ROOT, LABEL_DIRS[label], day, hour, domain)


def _save_history(history_op):
    """Persist the domain -> label decisions for later runs."""
    with open(HISTORY_PATH, "w") as f:
        json.dump(history_op, f)
    print("saved history_op.json")


def _review_domain(path, day, hour, domain, group, history_op):
    """Run the prompt loop for one domain until it is labelled or skipped.

    A label chosen by heuristic or by accepting the recorded history ends the
    loop as soon as the records are saved. After a manually typed label the
    menu is shown again, and "Q" moves on to the next domain. "E" exits the
    program through SystemExit.
    """
    domain = domain.lower()
    auto = _auto_label(domain)
    has_judged = False   # the history suggestion is offered only once
    need_break = False   # stays True once a non-manual label was chosen
    while True:
        _print_menu()
        if auto is not None:
            check = auto
            need_break = True
        else:
            if domain in history_op and not has_judged:
                print("found history op: %s" % (history_op[domain],))
                if not read_line("OK(Enter for Y)?"):
                    check = history_op[domain]
                    need_break = True
                else:
                    check = read_line("Input:")
            else:
                check = read_line("Input:")
            has_judged = True

        if check == "Q":
            print("%s next OK!" % path)
            return
        if check == "E":
            print("%s Exit!" % path)
            # The caller's finally block writes the history file.
            sys.exit()
        if check in LABEL_DIRS:
            move_to(path, domain, _label_path(check, day, hour, domain))
            history_op[domain] = check
            print("Saved OK!")
            if need_break:
                return
        elif check in DIST_FIELDS:
            get_data_dist(group, DIST_FIELDS[check])
        else:
            print("unknown input!Choose the following one:")


def main():
    """Walk the hourly sample files (days 24-26, hours 0-23) and label them.

    Earlier decisions are loaded from ``history_op.json`` when it exists and
    are written back however the run ends (finished, "E", or an error), so
    labels from a completed session are no longer lost.
    """
    history_op = {}
    if os.path.exists(HISTORY_PATH):
        with open(HISTORY_PATH) as h:
            history_op = json.load(h)
    print(history_op)
    try:
        for day in range(24, 27):
            for hour in range(0, 24):
                path = "/home/bonelee/latest_metadata_sample/sampled/unknown_sample/debugdogcom-medata_wanted-2017-09-%d-%d.txt" % (day, hour)
                if not os.path.exists(path) or os.path.getsize(path) == 0:
                    print("%s passed, file not exists or empty file." % path)
                    continue
                print("%s running..." % path)
                try:
                    domains_info = parse_metadata(path)
                except IOError as e:
                    print(e)
                    continue
                for domain, group in domains_info:
                    print("***************************************")
                    print("domain: %s flow count: %d" % (domain, len(group)))
                    print("***************************************")
                    print("------------srcip-----------------")
                    print(group["sourceIP"].describe())
                    print("--------------destip---------------")
                    print(group["destIP"].describe())
                    print("----------------------------------------")
                    print("ipv4 address return dist:")
                    get_ipv4_dist(group)
                    print("----------------------------------------")
                    _review_domain(path, day, hour, domain, group, history_op)
                print("*******************************")
                print("%s check over..." % path)
                print("*******************************")
    finally:
        _save_history(history_op)


if __name__ == "__main__":
    main()