简单日志处理
import datetime
import re

# Sample access-log line (combined log format) used by the demo at the bottom.
logfile='''58.61.164.141 - - [22/Feb/2010:09:51:46 +0800] "GET /reference-and-source/weblog-format/ HTTP/1.1" 200 6326 "-" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"'''


def resolve_log():
    """Build and return a parser for single access-log lines.

    The regex is compiled once here, so call this factory once and reuse
    the returned function for every line.

    Returns:
        A function ``_extract(line)`` that returns ``None`` when the line
        does not match the expected layout, otherwise a dict with the keys:

        * ``"IP"`` - client address, as text.
        * ``"datetime"`` - timezone-aware ``datetime.datetime``.
        * ``"request"`` - dict built by pairing ``"method"``, ``"url"``,
          ``"protocal"`` with the whitespace-separated request parts.
        * ``"status"`` - ``int`` status code.
        * ``"size"`` - response size as a short string such as ``"6K"``.
        * ``"useraAgent"`` - user-agent string, as text.

        The spellings ``"protocal"`` and ``"useraAgent"`` are kept as they
        are because they are part of the returned data callers may rely on.
    """
    # Raw strings: "\d", "\[" etc. are regex escapes, not string escapes.
    # The size field may be "-" (no bytes sent) and the referer may be any
    # quoted value; the referer is matched but deliberately not captured so
    # the set of returned keys stays the same.
    _pattern = (
        r'(?P<IP>[\d.]{7,}) - - \[(?P<datetime>[^\[\]]+)\] '
        r'"(?P<request>[^"]+)" (?P<status>\d+) (?P<size>\d+|-) '
        r'"[^"]*" "(?P<useraAgent>[^"]+)"'
    )
    _regex = re.compile(_pattern, re.S)

    units = ("", "K", "M", "G", "T", "P")

    def _human_size(size):
        """Turn a byte count (digits, or "-" meaning zero) into e.g. "6K"."""
        value = 0 if size == "-" else int(size)
        index = 0
        # ">=" so that exactly 1000 becomes "1K"; the index bound keeps very
        # large values on the last unit instead of running off the table.
        while value >= 1000 and index < len(units) - 1:
            value //= 1000
            index += 1
        return str(value) + units[index]

    def _parse_time(text):
        """Parse a timestamp such as "22/Feb/2010:09:51:46 +0800"."""
        return datetime.datetime.strptime(text, "%d/%b/%Y:%H:%M:%S %z")

    def _parse_request(request):
        """Pair the request-line parts with their field names."""
        return dict(zip(("method", "url", "protocal"), request.split()))

    # Per-field converters; fields without an entry are returned as text.
    ops = {
        "datetime": _parse_time,
        "size": _human_size,
        "status": int,
        "request": _parse_request,
    }

    def _extract(logfile):
        """Parse one log line; return a dict of fields, or None on no match."""
        matcher = _regex.match(logfile)
        if matcher is None:
            return None
        return {
            key: ops[key](value) if key in ops else value
            for key, value in matcher.groupdict().items()
        }

    return _extract


# Parsed records of "www.log"; filled in when the file is run as a script.
loged = []

if __name__ == "__main__":
    # Quick check against the sample line.
    res = resolve_log()(logfile)
    print(res)

    # Parse a real log file. The parser is built once and reused for every
    # line; lines that do not match are stored as None.
    parse_line = resolve_log()
    with open("www.log", mode="rt", encoding="utf8") as f:
        for line in f:
            loged.append(parse_line(line))
处理结果:
{'size': '6K', 'datetime': datetime.datetime(2010, 2, 22, 9, 51, 46, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 'IP': '58.61.164.141', 'status': 200, 'request': {'url': '/reference-and-source/weblog-format/', 'protocal': 'HTTP/1.1', 'method': 'GET'}, 'useraAgent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'}