Python 用正则表达式处理日志的实例
前提:
了解正则基本语法
import re

# Parse a LinkProof traffic log in two stages:
#   1. cut every raw log line into uniformly structured connection records
#      and store them, one per line, in SLICE_LOG;
#   2. read SLICE_LOG back, match each record against ``log_pattern`` and
#      print one dict per successfully parsed record.
#
# A raw log line looks like this (a header followed by comma-terminated
# records; note that IP addresses may contain padding spaces):
#   2019-04-15 00:00:00 192.168.254.253 info LinkProof: 14/04/2019 22:51:53
#   14/04/2019 22:52:48 114. 80.179.132 210. 29.144.  1 211.65.207.189 UDP
#   17224 53 0.0.0.0 OTHER 84, 14/04/2019 22:51:53 14/04/2019 22:52:48 ...

SOURCE_LOG = 'top10_xiaozhuang_net.log'   # raw log to read
SLICE_LOG = 'slice_log.log'               # intermediate file with one record per line

# Separators between records: everything up to and including the
# "...LinkProof:" header (plus trailing whitespace), or a single comma.
split_pattern = re.compile(r".*LinkProof:\s+|,")

# Dates and times are captured separately as "runs of non-whitespace";
# that keeps the expressions simple and cheap.
# IP addresses may contain spaces after a dot ("210. 29.144.  1"), so they
# need a slightly more involved expression to be captured reliably.
_IP_WITH_SPACES = r"(?:\d{1,3}\.\s*){3}\d{1,3}"
_FIELD_PATTERNS = (
    r"\S*",            # date1
    r"\S*",            # time1
    r"\S*",            # date2
    r"\S*",            # time2
    _IP_WITH_SPACES,   # ip1
    _IP_WITH_SPACES,   # ip2
    _IP_WITH_SPACES,   # ip3
    r"\w{3}",          # protocol, e.g. UDP
    r"\d*",            # sizelike
    r"\d*",            # portlike
    r"\S*",            # ip4
    r"\w*",            # type, e.g. OTHER
    r"\d*",            # num
)

# Pre-compiled record pattern: 13 capture groups separated by whitespace.
log_pattern = re.compile(r"\s+".join("(%s)" % p for p in _FIELD_PATTERNS))


def split_line(line):
    """Cut one raw log line into its connection records.

    The first piece produced by the split (the empty string left of the
    "LinkProof:" header) is discarded. Every remaining piece is stripped of
    surrounding whitespace and empty pieces are dropped, so the function
    works for any number of records per line instead of a fixed count.

    Args:
        line: One line of the raw log, with or without a trailing newline.

    Returns:
        A list of record strings, possibly empty.
    """
    pieces = split_pattern.split(line.strip('\n'))
    stripped = (piece.strip() for piece in pieces[1:])
    return [piece for piece in stripped if piece]


def parse_record(record):
    """Parse one connection record into a dict of named fields.

    Surrounding whitespace is removed first; a leading space (as left behind
    by ", " separators) would otherwise shift every field by one group.

    Args:
        record: A single record string as produced by ``split_line``.

    Returns:
        A dict with the keys date1, date2, ip1, ip2, ip3, protocal, sizelike,
        portlike, ip4, type and num, or ``None`` when the record does not
        match ``log_pattern``.
    """
    match = log_pattern.match(record.strip())
    if match is None:
        return None

    (date1, time1, date2, time2, ip1, ip2, ip3,
     protocol, sizelike, portlike, ip4, kind, num) = match.groups()

    return {
        "date1": date1 + " " + time1,
        "date2": date2 + " " + time2,
        # Remove the padding spaces inside the IP addresses.
        "ip1": ip1.replace(" ", ""),
        "ip2": ip2.replace(" ", ""),
        "ip3": ip3.replace(" ", ""),
        "protocal": protocol,   # key spelling kept for existing consumers
        "sizelike": sizelike,
        "portlike": portlike,
        "ip4": ip4.replace(" ", ""),
        "type": kind,
        "num": num,
    }


def main():
    """Slice SOURCE_LOG into SLICE_LOG, parse the records and print them."""
    with open(SOURCE_LOG, 'r') as source:
        raw_lines = source.readlines()

    # Stage 1: store the sliced records, one per line.
    with open(SLICE_LOG, 'w') as sliced:
        for raw_line in raw_lines:
            for record in split_line(raw_line):
                sliced.write(record + '\n')

    # Stage 2: parse the sliced records, skipping lines that do not match.
    with open(SLICE_LOG, 'r') as sliced:
        parsed = [parse_record(record_line) for record_line in sliced]
    records = [record for record in parsed if record is not None]

    for item in records:
        print(item)


if __name__ == "__main__":
    main()
得到的结果:
可再参考文章: