文件二分查找

import os
import re
import argparse
import sys
import time

# Pause inserted between batches of output lines to throttle printing.
SLEEP_DURATION = 0.001  # 1ms
# NOTE(review): not referenced by the visible code.
MAX_LINE_LENGTH = 1024 * 1024 * 128  # 128MB

# NOTE(review): these two flags are not referenced by the visible code.
enable_color = False
enable_verbose = False

# Timestamp layouts that can be recognised in a log line, tried in order.
# Raw strings keep the regex escapes (\s, \d, ...) from being treated as
# (invalid) string escape sequences.
#   1. " MM-DD HH:MM:SS:"           e.g. " 10-12 23:09:01:"
#   2. "DD/Mon/YYYY:HH:MM:SS"       e.g. "12/Oct/2022:23:09:01"
DATETIME_FORMAT_LIST = [
    r"\s(?P<month>\d+)\-(?P<day>\d+)\s(?P<hour>\d+):(?P<minute>\d+):(?P<second>\d+):",
    r"(?P<day>\d+)\/(?P<month>[A-Za-z]+)\/(?P<year>\d+):(?P<hour>\d+):(?P<minute>\d+):(?P<second>\d+)",
]

# Maps every accepted spelling of a month (English abbreviation, zero-padded,
# bare digit, space-padded digit) to its two-digit form.
MONTH_DICT = {
    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
    "Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10",
    "Nov": "11", "Dec": "12",
    "01": "01", "02": "02", "03": "03", "04": "04", "05": "05",
    "06": "06", "07": "07", "08": "08", "09": "09", "10": "10",
    "11": "11", "12": "12",
    "1": "01", "2": "02", "3": "03", "4": "04", "5": "05",
    "6": "06", "7": "07", "8": "08", "9": "09",
    " 1": "01", " 2": "02", " 3": "03", " 4": "04", " 5": "05",
    " 6": "06", " 7": "07", " 8": "08", " 9": "09",
}

# Maps every accepted spelling of a day of month to its two-digit form.
# Bare single digits are accepted as well, mirroring MONTH_DICT.
DAY_DICT = {
    "01": "01", "02": "02", "03": "03", "04": "04", "05": "05",
    "06": "06", "07": "07", "08": "08", "09": "09",
    "1": "01", "2": "02", "3": "03", "4": "04", "5": "05",
    "6": "06", "7": "07", "8": "08", "9": "09",
    " 1": "01", " 2": "02", " 3": "03", " 4": "04", " 5": "05",
    " 6": "06", " 7": "07", " 8": "08", " 9": "09",
    "10": "10", "11": "11", "12": "12", "13": "13", "14": "14",
    "15": "15", "16": "16", "17": "17", "18": "18", "19": "19",
    "20": "20", "21": "21", "22": "22", "23": "23", "24": "24",
    "25": "25", "26": "26", "27": "27", "28": "28", "29": "29",
    "30": "30", "31": "31",
}


#
def detect_datetime_format(line):
    """
    Pick the timestamp pattern that applies to a log line.

    The entries of DATETIME_FORMAT_LIST are tried in order.

    :param line: text of one log line
    :return: compiled regex for the first pattern found in the line,
             or None when none of the patterns is found
    """
    for pattern in DATETIME_FORMAT_LIST:
        if re.search(pattern, line) is not None:
            return re.compile(pattern)
    return None


def extraction_time(line, reg, st):
    """
    Extract the timestamp of a log line as a comparable integer.

    The fields captured by ``reg`` are assembled into a YYYYMMDDHHMMSS
    number. Year, month, day and hour that the pattern does not capture are
    borrowed from ``st``.

    :param line: text line to inspect; None is accepted and yields None
    :param reg: compiled regex with the named groups minute and second and,
                optionally, year, month, day and hour
    :param st: reference time in YYYYMMDDHHMMSS form (int or str) used to
               fill in the fields the pattern does not capture
    :return: the timestamp as an int, or None when the line carries no
             timestamp or its month name is not known to MONTH_DICT
    """
    if line is None:
        return None
    match = reg.search(line)
    if match is None:
        return None

    reference = str(st)
    fallback = {
        "year": reference[:4],
        "month": reference[4:6],
        "day": reference[6:8],
        "hour": reference[8:10],
    }
    fields = match.groupdict()
    for name, value in fallback.items():
        if fields.get(name) is None:
            fields[name] = value

    # Numeric months are zero-padded directly; only month names need the
    # lookup table.
    month = fields["month"].strip()
    if month.isdigit():
        month = month.zfill(2)
    else:
        month = MONTH_DICT.get(month)
        if month is None:
            return None
    fields["month"] = month

    # Pad every remaining two-digit field so that the resulting integers
    # always have the same width and therefore compare chronologically.
    # The day must not go through the month table: that only knows 1..12.
    for name in ("day", "hour", "minute", "second"):
        fields[name] = fields[name].strip().zfill(2)

    times = "{year}{month}{day}{hour}{minute}{second}".format(**fields)
    return int(times)


def forward_match(f, st, ed, regprog, ed_inclusive=True):
    """
    Scan forward from ``st`` for the first line that matches ``regprog``.

    :param f: seekable text file object
    :param st: offset at which scanning starts
    :param ed: offset that bounds the scan
    :param regprog: pattern (string or compiled regex) passed to re.search
    :param ed_inclusive: when True, ``ed`` is first moved to the start of
                         the line following the one that contains ``ed``,
                         so that line is scanned as well
    :return: ``(line, pos)``. On a match, the matching line and the offset
             of its first character, with ``f`` positioned there.
             Otherwise the last line read (None if nothing was read) and
             the possibly adjusted ``ed``, with ``f`` positioned at it.
    """
    if ed_inclusive:
        f.seek(ed)
        f.readline()  # move the cursor to the start of the next line
        ed = f.tell()  # ed is now the line-head offset bounding the scan

    f.seek(st)

    line = None
    while f.tell() < ed:
        # Remember where the line begins. Seeking back to an absolute
        # offset works on text-mode files, where a relative seek of
        # -len(line) is rejected, and is immune to multi-byte characters.
        line_start = f.tell()
        chunk = f.readline()
        if not chunk:
            # End of file before reaching ed: stop instead of spinning.
            break
        line = chunk
        if re.search(regprog, line):
            f.seek(line_start)
            return line, f.tell()

    f.seek(ed)
    return line, f.tell()


def at_line_head(f):
    """
    Tell whether the file cursor sits at the beginning of a line.

    Offset 0 counts as a line head. Otherwise the character just before the
    cursor is read, which leaves the cursor back where it started.

    :param f: seekable text file object
    :return: True when the cursor is at offset 0 or right after a newline
    """
    if f.tell() == 0:
        return True
    f.seek(f.tell() - 1)
    previous_char = f.read(1)
    return previous_char == "\n"


def backward_match(f, ed, st, regprog):
    """
    Search backward from ``ed`` toward ``st`` for a line matching ``regprog``.

    The range is walked in steps of up to 4096 positions, from ``ed`` back
    toward ``st``. In every step the lines read are tested from last to
    first. Reads are capped at ``ed``, so content from ``ed`` onward is not
    read.

    :param f: seekable file object read with readline(); lines are
              concatenated with str buffers, so text lines are expected
    :param ed: offset at which the backward search begins
    :param st: offset at which the backward search stops
    :param regprog: pattern handed to re.search for every candidate line
    :return: a 2-tuple.
             ``(None, None)`` when ``ed`` lies before ``st``.
             On a match, ``(line, pos)`` where ``pos`` is the computed start
             offset of the matching line and ``f`` is positioned there.
             Without a match, ``(line, st)`` with ``f`` positioned at
             ``st``; ``line`` is then whatever line was handled last (it
             may be an incomplete line) or None if the loop never ran.
    """

    backward_step_hint = 1024 * 4
    # start from the end of the search range
    f.seek(ed)

    if f.tell() < st:
        return None, None

    match = None
    line = None

    # remember the current position; it is the upper bound of the next step
    old_pos = f.tell()

    # cache backward read content in case failing to read a whole
    # line during a loop round
    last_buffer = ""

    backward_step = backward_step_hint  # size of one backward step (1024 * 4)

    while (not match) and (old_pos > st):
        # step back by one chunk, but never before st
        new_pos = old_pos - backward_step
        if new_pos < st:
            new_pos = st
        f.seek(new_pos)

        # read everything between new_pos and old_pos, split into lines;
        # readline(size) keeps the last read from going past old_pos
        lines = []
        cur_pos = f.tell()
        while cur_pos < old_pos:
            size = old_pos - f.tell()
            line = f.readline(size)
            lines.append(line)
            cur_pos = f.tell()

        f.seek(new_pos)
        valid_start_index = 0
        if len(lines) == 1:
            if at_line_head(f):
                # the chunk starts at a line head: together with the text
                # buffered from later chunks it forms the line to test
                lines[0] = lines[0] + last_buffer
                last_buffer = ""

            else:
                # the chunk starts mid-line: keep accumulating and test
                # nothing in this round
                last_buffer = lines[0] + last_buffer
                lines = []

        else:
            # when len(lines) != 1, there may be the following
            # possibilities:
            #   1. lines[0] is not a complete line
            #   2. lines[0] is a complete line
            #   we can judge by checking if the first character
            #   of lines[0] is at line head

            # the tail buffered from the previous round completes the
            # last line of this chunk
            lines[-1] = lines[-1] + last_buffer
            last_buffer = ""

            if not at_line_head(f) and new_pos != st:
                # lines[0] is not a complete line
                # nor does lines[0][0] at position st
                last_buffer = lines[0]
                valid_start_index = 1

        if new_pos == st and len(last_buffer) > 0:
            # new_pos == st means the loop will end
            # after this round, so we have to handle
            # data in last_buffer
            lines.append(last_buffer)

        # combined length of all lines of this round, used below to turn a
        # line index into a file offset
        total_lines_length = 0
        for line in lines:
            total_lines_length += len(line)

        # handle data from this round
        cur_lines_length = 0

        # test the lines from last to first so the match closest to ed wins
        for index in reversed(range(valid_start_index, len(lines))):
            line = lines[index]
            cur_lines_length += len(line)
            match = re.search(regprog, line)
            if match:
                # locate f's reading pointer: new_pos plus the combined
                # length of the lines that precede the matching one
                f.seek(
                        total_lines_length \
                        - cur_lines_length \
                        + new_pos)
                return line, f.tell()

        # update old_pos
        old_pos = new_pos

    f.seek(st)
    return line, f.tell()


def binary_seek_pos(f, st, ed, start_time, cmp_pattern):
    """
    Binary-search the range [st, ed) of a log file for the position from
    which lines are not earlier than ``start_time``.

    :param f: seekable log file object
    :param st: offset where the searched range starts
    :param ed: offset where the searched range ends
    :param start_time: target time in YYYYMMDDHHMMSS form; an int or a
                       string of digits
    :param cmp_pattern: compiled regex that recognises the timestamp of a
                        line
    :return: the offset that was found, or None when no line carrying a
             timestamp could be located
    """
    # Timestamps extracted from lines are ints, so the target has to be an
    # int too for the comparisons below to be valid.
    start_time = int(start_time)

    def _line_time(text):
        # A search that read nothing reports None instead of a line.
        if text is None:
            return None
        return extraction_time(line=text, reg=cmp_pattern, st=start_time)

    while st < ed:
        # Floor division keeps the midpoint an integer; a float offset is
        # rejected by f.seek().
        mid = st + (ed - st) // 2
        f.seek(mid)
        line, res_pos = forward_match(f, mid, ed, cmp_pattern)
        times = _line_time(line)
        if times:
            # compare the time of the line found after mid with the target
            if times >= start_time:
                if res_pos == ed:
                    # Without special handling this case can loop forever.
                    # Example: two lines are left, the first 10 bytes long
                    # and the second 100 bytes long. mid then always lands
                    # in the second line, and if that line is not earlier
                    # than the target, ed never changes in the next round.
                    # So look one line backward and compare that instead.
                    line, back_res_pos = backward_match(f, mid, st, cmp_pattern)
                    times = _line_time(line)
                    if not times or back_res_pos == res_pos:
                        # only one line is left and it covers both st and
                        # ed, just return res_pos
                        return res_pos

                    elif back_res_pos == st:
                        # this means only two lines left, and
                        # they cover positions st and ed. just
                        # compare and decide which to return

                        if times >= start_time:
                            return st
                        else:
                            return res_pos
                    else:
                        if times >= start_time:
                            ed = back_res_pos
                        else:
                            st = back_res_pos
                else:
                    ed = res_pos
            else:
                if res_pos == st:
                    # st and ed must be covered by the same line, just
                    # return st
                    return st
                st = res_pos
        else:
            # nothing usable after mid: look at the lines before it
            line, res_pos = backward_match(f, mid, st, cmp_pattern)
            match = _line_time(line)
            if not match:
                # the whole file does not contain any valid line
                return None

            # found one valid line, compare with cmp_pattern
            if match >= start_time:
                ed = res_pos
            else:
                # this line and the lines follow, until ed,
                # all locate before the target cmp_pattern,
                # thus return ed directly
                return ed
    return None if st > ed else ed


def get_start_and_end_pos(file, start_time, end_time, regular):
    """
    Print the lines of a log file that fall between two points in time.

    The timestamp layout is detected from the first three lines of the
    file, the read window is located with binary_seek_pos, and the lines
    inside the window are written to stdout.

    :param file: path of the log file
    :param start_time: start of the window, YYYYMMDDHHMMSS
    :param end_time: end of the window, YYYYMMDDHHMMSS; a falsy value means
                     "read up to the end of the file"
    :param regular: optional regex; when given, only lines in which it is
                    found are printed. A falsy value prints every line.
    :return: -1 when the date format is unsupported or no start/end
             position can be found, otherwise None
    :raises re.error: when ``regular`` is not a valid regular expression
    """
    lpms = 50  # number of lines written between two throttling pauses
    line_filter = re.compile(regular) if regular else None

    with open(file, "r") as f:
        # work out which timestamp layout this log file uses
        datetime_regex = None
        for _ in range(3):
            line = f.readline().strip("\n")
            datetime_regex = detect_datetime_format(line)
            if datetime_regex is not None:
                break
        if datetime_regex is None:
            sys.stderr.write("log date format is not"
                             " supported,file:%s\n" % file)
            return -1

        # the search range is the whole file
        start_pos = 0
        f.seek(0, os.SEEK_END)
        end_pos = f.tell()
        # get start read position of the file
        start_read_pos = binary_seek_pos(f=f, st=start_pos, ed=end_pos,
                                         cmp_pattern=datetime_regex,
                                         start_time=start_time)
        if start_read_pos is None:
            sys.stderr.write("Error: no matching start line for reading.\n")
            return -1

        # get end read position of the file
        if not end_time:
            end_read_pos = end_pos
        else:
            end_read_pos = binary_seek_pos(f=f, st=start_read_pos, ed=end_pos,
                                           cmp_pattern=datetime_regex,
                                           start_time=end_time)
            if end_read_pos is None:
                sys.stderr.write("Error: no matching end line for reading.\n")
                return -1

        # emit the lines of the window
        f.seek(start_read_pos)
        line_count = 0  # lines written since the last pause
        while f.tell() < end_read_pos:
            line = f.readline()
            # stop at end of file, or on a line reaching past the window
            if not line or f.tell() > end_read_pos:
                break

            if line_filter is not None and not line_filter.search(line):
                continue

            sys.stdout.write(line)
            line_count += 1
            # throttle the output: pause briefly after every lpms lines
            if lpms > 0 and line_count >= lpms:
                line_count = 0
                time.sleep(SLEEP_DURATION)


def check_param(file_list, start_time, end_time):
    """
    Validate the command-line parameters.

    Writes a message to stderr and exits with status 1 on the first
    problem found.

    :param file_list: paths that must all be existing regular files
    :param start_time: start time, must have the form YYYYMMDDHHMMSS
    :param end_time: end time in the same form, or None to skip the check
    :return: None when every parameter is valid
    :raises SystemExit: with code 1 when a file is missing or a time is
                        malformed
    """
    # every file has to exist
    for path in file_list:
        if not os.path.isfile(path):
            sys.stderr.write("file path not exist,%s\n" % path)
            sys.exit(1)

    time_format = '%Y%m%d%H%M%S'
    to_check = [("start_time", start_time)]
    if end_time is not None:
        to_check.append(("end_time", end_time))

    for label, value in to_check:
        try:
            time.strptime(value, time_format)
        except (ValueError, TypeError):
            # The format is inserted as an argument: written literally into
            # the template, its % directives would be taken for conversion
            # specifiers and the message itself would fail to build.
            sys.stderr.write(
                "%s:%s wrong format,"
                "the supported time formats "
                "are %s, eg:20220909121314\n" % (label, value, time_format))
            sys.exit(1)


def file_handle(cmd_namespace):
    """
    Run the log extraction for every file named on the command line.

    :param cmd_namespace: parsed arguments providing the attributes
                          file, start, end, regular and check
    :return: None
    """
    # collect the parameters
    file_list = getattr(cmd_namespace, "file")
    start_time = getattr(cmd_namespace, "start")
    end_time = getattr(cmd_namespace, "end")
    regular = getattr(cmd_namespace, "regular")
    check = getattr(cmd_namespace, "check")

    # validate the parameters (exits the process when they are invalid)
    check_param(file_list, start_time, end_time)

    # echo the parameters on request, one per line
    if check in [1, '1']:
        sys.stdout.write("file_list:%s\n" % file_list)
        sys.stdout.write("start_time:%s\n" % start_time)
        sys.stdout.write("end_time:%s\n" % end_time)
        sys.stdout.write("regular:%s\n" % regular)

    # The times arrive as strings of digits; the position search compares
    # them with integer timestamps, so convert them once here.
    start_value = int(start_time)
    end_value = int(end_time) if end_time is not None else None

    for file in file_list:
        get_start_and_end_pos(file, start_value, end_value, regular)


def init_parser(target_parser):
    """
    Register the command-line options on an argument parser.

    Options: -f/--file (one or more paths, required), -s/--start-datetime
    (required), -e/--end-datetime, -r/--reg and -c/--check (int, defaults
    to 0).

    :param target_parser: argparse.ArgumentParser to configure
    :return: None
    """
    target_parser.add_argument("-f", "--file", nargs="+",
                               dest="file", required=True,
                               help="-f file1 file2 ")
    target_parser.add_argument("-s", "--start-datetime", dest="start",
                               required=True, help="-s 20220909122300")
    target_parser.add_argument("-e", "--end-datetime", dest="end",
                               required=False, help="-e 20220909122300")
    target_parser.add_argument("-r", "--reg", dest="regular",
                               required=False, help="-r uri=\\s+,"
                                                    "Match regular from results")
    # `default` supplies the documented fallback of 0. `const` is only
    # accepted together with nargs='?'; passing it alone makes
    # add_argument raise ValueError.
    target_parser.add_argument("-c", "--check", dest="check", required=False,
                               default=0, type=int, help="-c 1, Check"
                                                         " parameters,1:print"
                                                         " parameters,0:not "
                                                         "output parameters,default 0")


def main():
    """
    Command-line entry point.

    Builds the argument parser, parses the command line and hands the
    parsed arguments to file_handle.

    :return: None
    """
    usage_text = (
        "Usage: "
        "logcat -s '20220101000000' -e '20220101010101' -f "
        " LOGFILE1.log LOGFILE2.log ... "
    )
    arg_parser = argparse.ArgumentParser(description=usage_text)
    init_parser(arg_parser)
    file_handle(arg_parser.parse_args())


if __name__ == "__main__":
    try:
        sys.exit(main())
    except Exception as ex:
        # Report the failure instead of exiting silently, so the cause is
        # visible to the user. The exit status is unchanged.
        sys.stderr.write("Error: %s: %s\n" % (type(ex).__name__, ex))
        sys.exit(-255)

  

posted @ 2022-10-12 23:09  老鲜肉  阅读(22)  评论(0编辑  收藏  举报