文件二分查找
import os
import re
import argparse
import sys
import time

SLEEP_DURATION = 0.001  # 1ms
MAX_LINE_LENGTH = 1024 * 1024 * 128  # 128MB
enable_color = False
enable_verbose = False

# Regular expressions for the timestamp layouts that can be located in a log
# line. Named groups feed extraction_time(); groups a layout lacks (e.g. the
# year in the first layout) are filled in from the requested search time.
DATETIME_FORMAT_LIST = [
    r"\s(?P<month>\d+)\-(?P<day>\d+)\s(?P<hour>\d+):(?P<minute>\d+):(?P<second>\d+):",
    r"(?P<day>\d+)\/(?P<month>[A-Za-z]+)\/(?P<year>\d+):(?P<hour>\d+):(?P<minute>\d+):(?P<second>\d+)",
]

# Maps every accepted spelling of a month to its two-digit form.
MONTH_DICT = {
    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
    "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
    "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
    "01": "01", "02": "02", "03": "03", "04": "04",
    "05": "05", "06": "06", "07": "07", "08": "08",
    "09": "09", "10": "10", "11": "11", "12": "12",
    "1": "01", "2": "02", "3": "03", "4": "04", "5": "05",
    "6": "06", "7": "07", "8": "08", "9": "09",
    " 1": "01", " 2": "02", " 3": "03", " 4": "04", " 5": "05",
    " 6": "06", " 7": "07", " 8": "08", " 9": "09",
}

# Maps every accepted spelling of a day of month to its two-digit form.
DAY_DICT = {
    "01": "01", "02": "02", "03": "03", "04": "04", "05": "05",
    "06": "06", "07": "07", "08": "08", "09": "09",
    " 1": "01", " 2": "02", " 3": "03", " 4": "04", " 5": "05",
    " 6": "06", " 7": "07", " 8": "08", " 9": "09",
    "10": "10", "11": "11", "12": "12", "13": "13", "14": "14",
    "15": "15", "16": "16", "17": "17", "18": "18", "19": "19",
    "20": "20", "21": "21", "22": "22", "23": "23", "24": "24",
    "25": "25", "26": "26", "27": "27", "28": "28", "29": "29",
    "30": "30", "31": "31",
}


def detect_datetime_format(line):
    """
    Find which known timestamp layout occurs in a line.

    :param line: a line of text taken from the log file
    :return: the compiled regex of the first entry of DATETIME_FORMAT_LIST
             that matches ``line``, or None when none of them matches
    """
    for reg in DATETIME_FORMAT_LIST:
        if re.search(reg, line):
            return re.compile(reg)
    return None


def extraction_time(line, reg, st):
    """
    Extract the timestamp of a line as a YYYYmmddHHMMSS integer.

    :param line: the line to inspect (None is accepted and yields None)
    :param reg: compiled regex with named groups (month, day, hour, minute,
                second and optionally year)
    :param st: reference time in YYYYmmddHHMMSS form (int or str); its year,
               month, day and hour are used for groups the regex did not
               capture
    :return: the timestamp as an int, or None when the line carries no
             timestamp or its month is not a known spelling
    """
    if line is None:
        return None
    match = reg.search(line)
    if match is None:
        return None

    st = str(st)
    fallbacks = {
        "year": st[:4],
        "month": st[4:6],
        "day": st[6:8],
        "hour": st[8:10],
    }
    match_dict = match.groupdict()
    for key, fallback in fallbacks.items():
        if match_dict.get(key) is None:
            match_dict[key] = fallback

    # Normalise every field to a fixed width so that the concatenated
    # digits compare correctly as an integer.
    month = MONTH_DICT.get(match_dict["month"])
    if month is None:
        # Unknown month spelling: treat the line as having no timestamp.
        return None
    match_dict["month"] = month
    day = match_dict["day"]
    # The day must be looked up in DAY_DICT (MONTH_DICT only knows 1..12);
    # spellings missing from the table (e.g. "5") are zero-padded.
    match_dict["day"] = DAY_DICT.get(day, day.strip().zfill(2))
    for key in ("hour", "minute", "second"):
        match_dict[key] = match_dict[key].zfill(2)

    times = "{year}{month}{day}{hour}{minute}{second}".format(**match_dict)
    return int(times)


def forward_match(f, st, ed, regprog, ed_inclusive=True):
    """
    Read forward from ``st`` and return the first line matching ``regprog``.

    :param f: seekable file object
    :param st: position to start reading from (may be in the middle of a
               line, in which case the first "line" is only its remainder)
    :param ed: position at which the search stops
    :param regprog: regex (compiled or str) searched in every line
    :param ed_inclusive: when True the line containing ``ed`` is searched
                         too, i.e. ``ed`` is moved to the head of the line
                         that follows it
    :return: (line, pos). On a match ``line`` is the matching line and
             ``pos`` the position where it was read from, with ``f``
             positioned there. Otherwise ``line`` is the last line read
             (None if nothing was read) and ``pos`` is ``ed``.
    """
    if ed_inclusive:
        f.seek(ed)
        f.readline()  # move the cursor to the head of the next line
        ed = f.tell()
    f.seek(st)

    line = None
    while f.tell() < ed:
        # Remember where the line starts instead of seeking back by
        # len(line): relative seeks are rejected by text-mode files and
        # StringIO objects on Python 3.
        line_start = f.tell()
        line = f.readline()
        if not line:
            break
        if re.search(regprog, line):
            f.seek(line_start)
            return line, f.tell()

    f.seek(ed)
    return line, f.tell()


def at_line_head(f):
    """
    Tell whether the read pointer of ``f`` is at the beginning of a line.

    The pointer is left where it was.
    """
    if f.tell() == 0:
        return True
    f.seek(f.tell() - 1)
    return f.read(1) == "\n"


def backward_match(f, ed, st, regprog):
    """
    Search backward from ``ed`` down to ``st`` for a line matching
    ``regprog``.

    The region is read in chunks of 4 KiB from the end; within a chunk the
    lines are tested from last to first, so the match closest to ``ed``
    wins. The line that contains ``ed`` is completed by reading up to its
    end, so that it is tested as a whole line.

    :param f: seekable file object
    :param ed: position the backward search starts from
    :param st: position the backward search must not go below
    :param regprog: regex (compiled or str) searched in every line
    :return: (None, None) when ``ed`` is before ``st``; otherwise
             (line, pos). On a match ``line`` is the matching line and
             ``pos`` the position of its head, with ``f`` positioned there.
             Without a match ``line`` is the last text examined (possibly an
             incomplete line, or None) and ``pos`` is ``st``.
    """
    backward_step_hint = 1024 * 4

    f.seek(ed)
    if f.tell() < st:
        return None, None

    match = None
    line = None
    old_pos = f.tell()

    # last_buffer caches text read in a previous round that belongs to a
    # line whose head has not been reached yet. It starts with the part of
    # the line that lies beyond ed: without it the last line would be tested
    # truncated and a timestamp located after the cut would be missed.
    last_buffer = "" if at_line_head(f) else f.readline()
    backward_step = backward_step_hint

    while (not match) and (old_pos > st):
        new_pos = old_pos - backward_step
        if new_pos < st:
            new_pos = st
        f.seek(new_pos)

        # read [new_pos, old_pos) split at line boundaries
        lines = []
        cur_pos = f.tell()
        while cur_pos < old_pos:
            size = old_pos - f.tell()
            line = f.readline(size)
            lines.append(line)
            cur_pos = f.tell()
        f.seek(new_pos)

        valid_start_index = 0
        if len(lines) == 1:
            if at_line_head(f):
                lines[0] = lines[0] + last_buffer
                last_buffer = ""
            else:
                # the head of this line is in an earlier chunk
                last_buffer = lines[0] + last_buffer
                lines = []
        else:
            # when len(lines) != 1, lines[0] may or may not be a complete
            # line; we can judge by checking whether its first character
            # is at a line head
            lines[-1] = lines[-1] + last_buffer
            last_buffer = ""
            if not at_line_head(f) and new_pos != st:
                # lines[0] is not a complete line
                # nor does lines[0][0] sit at position st
                last_buffer = lines[0]
                valid_start_index = 1

        if new_pos == st and len(last_buffer) > 0:
            # new_pos == st means the loop will end after this round,
            # so the data still held in last_buffer has to be handled now
            lines.append(last_buffer)

        total_lines_length = sum(len(item) for item in lines)

        # handle data from this round, last line first
        cur_lines_length = 0
        for index in reversed(range(valid_start_index, len(lines))):
            line = lines[index]
            cur_lines_length += len(line)
            match = re.search(regprog, line)
            if match:
                # locate f's reading pointer at the head of the line
                f.seek(total_lines_length - cur_lines_length + new_pos)
                return line, f.tell()

        old_pos = new_pos

    f.seek(st)
    return line, f.tell()


def binary_seek_pos(f, st, ed, start_time, cmp_pattern):
    """
    Binary-search the file for the first line at or after ``start_time``.

    The search assumes that timestamps do not decrease through the file.

    :param f: seekable handle of the log file
    :param st: position where the search range starts
    :param ed: position where the search range ends
    :param start_time: target time in YYYYmmddHHMMSS form (int or numeric
                       str)
    :param cmp_pattern: compiled timestamp regex, as returned by
                        detect_datetime_format()
    :return: position of the head of the first line whose timestamp is
             greater than or equal to ``start_time``; ``ed`` when the
             timestamped lines found are all earlier; None when no
             timestamped line is found around the probed position
    """
    start_time = int(start_time)
    while st < ed:
        # floor division: a float offset is rejected by f.seek()
        mid = st + (ed - st) // 2
        f.seek(mid)
        line, res_pos = forward_match(f, mid, ed, cmp_pattern)
        times = extraction_time(line=line, reg=cmp_pattern, st=start_time)
        if times is not None:
            if times >= start_time:
                if res_pos == ed:
                    # The range cannot shrink from the right any more. If
                    # this case were not handled the loop could spin
                    # forever: with only two lines left, one of 10 bytes
                    # and one of 100 bytes, "mid" always falls on the
                    # second line, and when that line is at or after the
                    # target "ed" never changes. Search backward for the
                    # previous line and compare it instead.
                    line, back_res_pos = backward_match(
                        f, mid, st, cmp_pattern)
                    times = extraction_time(
                        line=line, reg=cmp_pattern, st=start_time)
                    if times is None or back_res_pos == res_pos:
                        # only one line left, covering both st and ed
                        return res_pos
                    elif back_res_pos == st:
                        # only two lines left, and they cover positions
                        # st and ed: compare and decide which to return
                        if times >= start_time:
                            return st
                        return res_pos
                    elif times >= start_time:
                        ed = back_res_pos
                    else:
                        st = back_res_pos
                else:
                    ed = res_pos
            else:
                if res_pos == st:
                    # st and ed are covered by the same line
                    return st
                st = res_pos
        else:
            # nothing with a timestamp between mid and ed: look before mid
            line, res_pos = backward_match(f, mid, st, cmp_pattern)
            found = extraction_time(line, cmp_pattern, start_time)
            if found is None:
                # no valid line on either side of mid
                return None
            if found >= start_time:
                ed = res_pos
            else:
                # this line and the lines that follow, until ed, are all
                # before the target time, thus return ed directly
                return ed
    return None if st > ed else ed


def get_start_and_end_pos(file, start_time, end_time, regular):
    """
    Write the lines of ``file`` dated in [start_time, end_time) to stdout.

    :param file: path of the log file
    :param start_time: first timestamp to output, YYYYmmddHHMMSS
    :param end_time: timestamp at which output stops (exclusive); a falsy
                     value means "until the end of the file"
    :param regular: optional regex; when given, only the lines it matches
                    are written
    :return: -1 when the timestamp layout is not recognised or a boundary
             cannot be located, otherwise None
    """
    lpms = 50  # number of lines read between two short sleeps
    line_filter = re.compile(regular) if regular else None

    with open(file, "r") as f:
        # detect the timestamp layout from the first lines of the file
        time_regex = None
        for _ in range(3):
            line = f.readline().strip("\n")
            time_regex = detect_datetime_format(line)
            if time_regex is not None:
                break
        if time_regex is None:
            sys.stderr.write("log date format is not"
                             " supported,file:%s\n" % file)
            return -1

        start_pos = 0
        f.seek(0, os.SEEK_END)
        end_pos = f.tell()

        # position of the first line to output
        start_read_pos = binary_seek_pos(f=f,
                                         st=start_pos,
                                         ed=end_pos,
                                         cmp_pattern=time_regex,
                                         start_time=start_time)
        if start_read_pos is None:
            sys.stderr.write("Error: no matching start line for reading.\n")
            return -1

        # position at which output stops
        if not end_time:
            end_read_pos = end_pos
        else:
            end_read_pos = binary_seek_pos(f=f,
                                           st=start_read_pos,
                                           ed=end_pos,
                                           cmp_pattern=time_regex,
                                           start_time=end_time)
            if end_read_pos is None:
                sys.stderr.write("Error: no matching end line for reading.\n")
                return -1

        f.seek(start_read_pos)
        line_count = 0
        while f.tell() < end_read_pos:
            line = f.readline()
            if not line or f.tell() > end_read_pos:
                break
            if line_filter is None or line_filter.search(line):
                sys.stdout.write(line)
            # throttle the read speed
            line_count += 1
            if lpms > 0 and line_count >= lpms:
                line_count = 0
                time.sleep(SLEEP_DURATION)
def check_param(file_list, start_time, end_time):
    """
    Validate the command line parameters; exit with status 1 when invalid.

    :param file_list: paths of the log files, each must be an existing file
    :param start_time: start time, must be in %Y%m%d%H%M%S form
    :param end_time: end time in %Y%m%d%H%M%S form, or None when not given
    :return: None
    """
    # every file must exist
    for path in file_list:
        if not os.path.isfile(path):
            sys.stderr.write("file path not exist,%s" % path)
            sys.exit(1)

    # The literal %Y%m%d%H%M%S inside the messages has to be escaped as
    # %%Y..., otherwise the % formatting of the message itself fails.
    try:
        time.strptime(start_time, '%Y%m%d%H%M%S')
    except (ValueError, TypeError):
        sys.stderr.write(
            "start_time:%s wrong format,"
            "the supported time formats "
            "are %%Y%%m%%d%%H%%M%%S, eg:20220909121314" % start_time)
        sys.exit(1)

    try:
        if end_time is not None:
            time.strptime(end_time, '%Y%m%d%H%M%S')
    except (ValueError, TypeError):
        sys.stderr.write(
            "end_time:%s wrong format,"
            "the supported time formats are "
            "%%Y%%m%%d%%H%%M%%S, eg:20220909121314" % end_time)
        sys.exit(1)


def file_handle(cmd_namespace):
    """
    Validate the parsed arguments and output the requested range of each
    file.

    :param cmd_namespace: namespace produced by the argument parser, with
                          the attributes file, start, end, regular and check
    :return: None
    """
    file_list = getattr(cmd_namespace, "file")
    start_time = getattr(cmd_namespace, "start")
    end_time = getattr(cmd_namespace, "end")
    regular = getattr(cmd_namespace, "regular")
    check = getattr(cmd_namespace, "check")

    check_param(file_list, start_time, end_time)

    # print the parameters on request
    if check in [1, '1']:
        sys.stdout.write("file_list:%s" % file_list)
        sys.stdout.write("start_time:%s" % start_time)
        sys.stdout.write("end_time:%s" % end_time)
        sys.stdout.write("regular:%s" % regular)

    # The times were validated as 14 digits above; the search compares them
    # with integer timestamps, so convert them once here.
    start_stamp = int(start_time)
    end_stamp = int(end_time) if end_time else None
    for path in file_list:
        get_start_and_end_pos(path, start_stamp, end_stamp, regular)


def init_parser(target_parser):
    """
    Register the command line options on the parser.

    :param target_parser: argparse parser to add the options to
    :return: None
    """
    target_parser.add_argument("-f", "--file",
                               nargs="+",
                               dest="file",
                               required=True,
                               help="-f file1 file2 ")
    target_parser.add_argument("-s", "--start-datetime",
                               dest="start",
                               required=True,
                               help="-s 20220909122300")
    target_parser.add_argument("-e", "--end-datetime",
                               dest="end",
                               required=False,
                               help="-e 20220909122300")
    target_parser.add_argument("-r", "--reg",
                               dest="regular",
                               required=False,
                               help="-r uri=\\s+,"
                                    "Match regular from results")
    # default=0, not const=0: argparse refuses const on a plain "store"
    # option, and the help text documents 0 as the default.
    target_parser.add_argument("-c", "--check",
                               dest="check",
                               required=False,
                               default=0,
                               type=int,
                               help="-c 1, Check"
                                    " parameters,1:print"
                                    " parameters,0:not "
                                    "output parameters,default 0")


def main():
    """
    Parse the command line and process the files.

    :return: None
    """
    parser = argparse.ArgumentParser(
        description="Usage: "
                    "logcat -s '20220101000000' -e '20220101010101' -f "
                    " LOGFILE1.log LOGFILE2.log ... ")
    init_parser(parser)
    cmd_namespace = parser.parse_args()
    file_handle(cmd_namespace)


if __name__ == "__main__":
    try:
        sys.exit(main())
    except Exception as ex:
        # report the failure instead of exiting silently
        sys.stderr.write("Error: %s\n" % ex)
        sys.exit(-255)