切割web日志

  

# Sample access-log line used as input by the parsers below. The trailing
# backslashes are line continuations inside the string literal, so the three
# source lines form a single logical log line.
log='''10.1.1.95 - e800 [18/Mar/2005:12:21:42 +0800] \
"GET /stats/awstats.pl?config=e800 HTTP/1.1" 200 899 "http://10.1.1.1/pv/" \
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon)"'''

 

def jpm(b: str) -> list:
    """Split one access-log line into a list of field strings.

    The line is split on whitespace, then words belonging to a ``[...]`` or
    ``"..."`` group are re-joined with single spaces and returned without the
    surrounding delimiter.

    Args:
        b: The log line to parse.

    Returns:
        The fields in order of appearance. A bracketed/quoted group that is
        never closed is dropped.
    """
    lst = []
    flag = False  # True while collecting words of an unclosed [...] or "..." group
    tmp = ''

    # Parse the argument; the previous version read the module-level ``log``
    # and silently ignored ``b``.
    for word in b.split():
        if not flag and word.startswith(('[', '"')):
            if word.endswith((']', '"')):
                # Group opened and closed in one word, e.g. "-" or "http://...".
                lst.append(word.strip('[]"'))
            else:
                flag = True
                tmp = word[1:]  # drop the opening delimiter
            continue

        if flag:
            if word.endswith((']', '"')):
                lst.append(tmp + ' ' + word[:-1])  # drop the closing delimiter
                flag = False
            else:
                tmp += ' ' + word
            continue

        lst.append(word)
    return lst

# Demo: run jpm on the sample line and print the resulting field list.
print(jpm(log))

 

def dip(sub: str) -> list:
    """Split one access-log line into a list of field strings.

    Works like a whitespace split, except that the words of a ``[...]`` or
    ``"..."`` group are accumulated (delimiters included) and the delimiters
    are stripped from both ends once the group closes.

    Args:
        sub: The log line to parse.

    Returns:
        The fields in order of appearance. A bracketed/quoted group that is
        never closed is dropped.
    """
    lst = []
    flag = False  # checked first so ordinary words skip the delimiter tests
    tmp = ''

    # Parse the argument; the previous version read the module-level ``log``
    # and silently ignored ``sub``.
    for word in sub.split():
        if not flag and word.startswith(('[', '"')):
            if word.endswith((']', '"')):
                # Special case such as "-": opened and closed in a single word.
                lst.append(word.strip('[]"'))
            else:
                tmp = word
                flag = True
            continue

        if flag:  # inside a multi-word group
            tmp += ' ' + word
            if word.endswith((']', '"')):
                lst.append(tmp.strip('[]"'))
                flag = False
            continue

        lst.append(word)
    return lst
# Demo: run dip on the sample line and print the resulting field list.
print(dip(log))

 

import datetime
# Sample access-log line for this section. The trailing backslashes are line
# continuations inside the string literal, so it is one logical line.
log='''10.1.1.95 - e800 [18/Mar/2005:12:21:42 +0800] \
"GET /stats/awstats.pl?config=e800 HTTP/1.1" 200 899 "http://10.1.1.1/pv/" \
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon)"'''

def convert_time(timestr: str) -> datetime.datetime:
    """Parse a log timestamp such as ``18/Mar/2005:12:21:42 +0800``.

    Prints the parsed value together with its type, then returns it.

    Args:
        timestr: Timestamp text in ``%d/%b/%Y:%H:%M:%S %z`` form.

    Returns:
        The parsed ``datetime`` carrying the UTC offset from the text.
    """
    parsed = datetime.datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S %z')
    print(parsed, type(parsed))
    return parsed

def convert_request(request: str) -> dict:
    """Break a request line into its ``method``, ``url`` and ``protocol`` parts.

    The text is split on whitespace and paired with the three keys in order;
    missing trailing parts are omitted and extra parts are ignored.
    """
    keys = ('method', 'url', 'protocol')
    parts = request.split()
    return {key: part for key, part in zip(keys, parts)}

def jix(b: str) -> list:
    """Split one access-log line into a list of field strings.

    The line is split on whitespace; words that belong to a ``[...]`` or
    ``"..."`` group are re-joined with single spaces and stored without the
    opening and closing delimiter.

    Args:
        b: The log line to parse.

    Returns:
        The fields in order of appearance. A bracketed/quoted group that is
        never closed is dropped.
    """
    lst = []
    flag = False  # True while collecting words of an unclosed group
    tmp = ''

    # Parse the argument; the previous version read the module-level ``log``
    # and silently ignored ``b``.
    for word in b.split():
        if flag:
            if word.endswith((']', '"')):
                lst.append(tmp + ' ' + word[:-1])  # drop the closing delimiter
                flag = False
            else:
                tmp += ' ' + word
        elif not word.startswith(('[', '"')):
            lst.append(word)
        elif word.endswith((']', '"')):
            # Group opened and closed in one word: drop both delimiters.
            lst.append(word[1:-1])
        else:
            flag = True
            tmp = word[1:]  # drop the opening delimiter
    return lst

# Demo: run jix on the sample line and print the resulting field list.
print(jix(log))


def pim():
    """Build a dict of named, converted fields from the module-level ``log``.

    The line is split with ``jix``; each field is stored under the name at the
    same position, after passing through the converter at that position when
    one is configured.

    Returns:
        Mapping of field name to (possibly converted) field value.
    """
    names = ['remote', 'logname', 'username', 'datetime',
             'request', 'status', 'size', 'referer', 'useragent']
    # One converter slot per name; None means "keep the raw string".
    converters = (None, None, None, convert_time,
                  convert_request, int, int, None, None)
    d = {}
    for i, field in enumerate(jix(log)):
        convert = converters[i]
        value = field if convert is None else convert(field)
        d.setdefault(names[i], value)

    return d
# Demo: print the dict of named fields built by pim.
print(pim())

 

import re,datetime
# Sample access-log line for the regex-based section. The trailing backslashes
# are line continuations inside the string literal, so it is one logical line.
log='''10.1.1.95 - e800 [18/Mar/2005:12:21:42 +0800] \
"GET /stats/awstats.pl?config=e800 HTTP/1.1" 200 899 "http://10.1.1.1/pv/" \
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon)"'''

def extract(line):
    """Extract the named fields of one access-log line with a regex.

    Args:
        line: The log line to parse; matching is anchored at its start.

    Returns:
        Dict of the raw string values for ``remote``, ``logname``,
        ``username``, ``time``, ``request``, ``status``, ``size``,
        ``referer`` and ``useragent``.

    Raises:
        ValueError: If the line does not match the expected layout.
    """
    # Raw literals: the pattern relies on \d, \w, \[ ... reaching the regex
    # engine untouched, which a normal string only does via deprecated
    # invalid-escape handling.
    pattern = (
        r'(?P<remote>[\d\.]{7,})(?= ) (?P<logname>[\w-]+) (?P<username>[\w-]+) '
        r'\[(?P<time>[^][]+)\] "(?P<request>[^"]+)" (?P<status>\d{3}) '
        r'(?P<size>\d+) "(?P<referer>[^"]+)" "(?P<useragent>[^"]+)"'
    )
    regex = re.compile(pattern, flags=re.S | re.I)
    # Match the argument; the previous version matched the module-level ``log``
    # and ignored ``line``.
    matcher = regex.match(line)
    if matcher is None:
        # Previously this surfaced as an AttributeError on None.
        raise ValueError('log line does not match the expected format: %r' % (line,))
    return matcher.groupdict()
# Demo: print the raw named groups extracted from the sample line.
print(extract(log))

def convert_time(timestr: str) -> datetime.datetime:
    """Parse a log timestamp such as ``18/Mar/2005:12:21:42 +0800``.

    Args:
        timestr: Timestamp text in ``%d/%b/%Y:%H:%M:%S %z`` form.

    Returns:
        The parsed ``datetime`` carrying the UTC offset from the text.
    """
    parse = datetime.datetime.strptime
    return parse(timestr, '%d/%b/%Y:%H:%M:%S %z')

def convert_request(request: str) -> dict:
    """Break a request line into its ``method``, ``url`` and ``protocol`` parts.

    Args:
        request: Request text such as ``GET /index.html HTTP/1.1``.

    Returns:
        Dict pairing the three keys, in order, with the whitespace-separated
        parts of ``request``; missing trailing parts are omitted.
    """
    # Split first: zipping against the bare string would pair the keys with
    # its first three characters ('G', 'E', 'T') instead of the three words.
    return dict(zip(('method', 'url', 'protocol'), request.split()))

# Converters applied to extracted fields, keyed by regex group name.
# Groups without an entry are kept as raw strings by the code that uses this.
funcs = dict(
    time=convert_time,
    status=int,
    size=int,
    request=convert_request,
)

# Variant 1: explicit loop. Fields with a registered converter are converted,
# all others are stored unchanged.
ciz = {}

for k, v in extract(log).items():
    if k in funcs:
        ciz[k] = funcs[k](v)
    else:
        ciz[k] = v
print(ciz)

# Variant 2: the same mapping built in one expression; the identity lambda is
# the fallback converter for fields without an entry in funcs.
bb = {
    name: funcs.get(name, lambda m: m)(text)
    for name, text in extract(log).items()
}
print(bb)

 

    

import re,datetime
# Sample access-log line for this section. The trailing backslashes are line
# continuations inside the string literal, so it is one logical line.
log='''10.1.1.95 - e800 [18/Mar/2005:12:21:42 +0800] \
"GET /stats/awstats.pl?config=e800 HTTP/1.1" 200 899 "http://10.1.1.1/pv/" \
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon)"'''

# The pattern was cut off after the status group (ending in a dangling "("),
# which makes re.compile raise. The tail (size, referer, useragent) is restored
# here so all nine fields are captured. Raw literals keep the regex escapes
# intact.
pattern = (
    r'(?P<remote>[\d\.]{7,})(?= ) (?P<logname>[\w-]+) (?P<username>[\w-]+) '
    r'\[(?P<time>[^][]+)\] "(?P<request>[^"]+)" (?P<status>\d{3}) '
    r'(?P<size>\d+) "(?P<referer>[^"]+)" "(?P<useragent>[^"]+)"'
)
# Compiled once at module level so every extract() call reuses it.
regex = re.compile(pattern, flags=re.M | re.I)

def extract(line):
    """Parse one log line into a dict of converted fields.

    Each named group matched by the module-level ``regex`` is passed through
    its converter in ``funcs``; groups without a converter are kept as-is.

    Args:
        line: The log line to parse; matching is anchored at its start.

    Returns:
        Dict of group name to converted value, or None when the line does not
        match (callers can test the result for truthiness).
    """
    matcher = regex.match(line)
    if matcher is None:
        return None
    # Dict comprehension: the previous "(k: ... for ...)" form was a syntax error.
    return {k: funcs.get(k, lambda m: m)(v) for k, v in matcher.groupdict().items()}


# Converters applied to extracted fields, keyed by regex group name.
# Groups without an entry are kept as raw strings by extract().
funcs = {
    'time': convert_time,
    'status': int,
    'size': int,
    # Was misspelled "convert_requesta", which raises NameError when this
    # dict is built.
    'request': convert_request,
}

lines = []

# Calling style 1: for an extract() that raises on a non-matching line.
for line in lines:
    try:
        d = extract(line)
    except Exception:
        # Skip lines that cannot be parsed. Catching Exception rather than
        # using a bare "except:" lets KeyboardInterrupt/SystemExit propagate.
        continue

# Calling style 2: for an extract() that returns None on a non-matching line.
for line in lines:
    d = extract(line)
    if d:
        pass
    else:
        # TODO: count the lines that fail to parse
        pass

 

def load(path):
    """Lazily yield the parsed fields of each matching line in a log file.

    Args:
        path: Path of the UTF-8 text file to read.

    Yields:
        The truthy results of ``extract(line)``, one per line; lines for which
        ``extract`` returns a falsy value are skipped.
    """
    # Read-only text mode: the previous 'rt+' also requested write access,
    # which fails on read-only files although nothing is ever written.
    with open(path, mode='rt', encoding='utf8') as f:
        for line in f:
            fields = extract(line)
            if fields:
                yield fields
            # TODO: handle/count lines that do not match
def extract(sub):
    """Return the first substring of ``sub`` matched by the module-level ``regex``.

    Unlike the ``match``-based variants, this searches anywhere in ``sub``.

    Raises:
        Exception: With message 'None match' when nothing matches.
    """
    found = regex.search(sub)
    if found is None:
        raise Exception('None match')
    return found.group()
# NOTE(review): `n` is not defined anywhere in the visible source; confirm
# which input string was intended here.
print(extract(n))

def extract(line) -> dict:
    """Parse one log line into converted fields, raising when it does not match.

    Each named group matched by the module-level ``regex`` is passed through
    its converter in ``funcs``; groups without a converter are kept as-is.

    Raises:
        Exception: With message "None match" when the line does not match.
    """
    matcher = regex.match(line)
    if matcher is None:
        raise Exception("None match")

    result = {}
    for name, text in matcher.groupdict().items():
        convert = funcs.get(name, lambda m: m)  # identity when no converter
        result[name] = convert(text)
    return result

def extract(line) -> dict:
    """Parse one log line into converted fields, returning None when it does not match.

    Each named group matched by the module-level ``regex`` is passed through
    its converter in ``funcs``; groups without a converter are kept as-is.
    """
    matcher = regex.match(line)
    if matcher is None:
        return None
    return {
        name: funcs.get(name, lambda m: m)(text)
        for name, text in matcher.groupdict().items()
    }

 

posted @ 2020-09-26 15:22  ascertain  阅读(137)  评论(0编辑  收藏  举报