使用python找出nginx访问日志中访问次数最多的10个ip排序生成网页

使用python找出nginx访问日志中访问次数最多的10个ip排序生成网页

方法1:
linux下使用awk命令

# cat access1.log | awk '{print $1"  "$7"  "$9}'|sort -n|uniq -c |sort -n -r|head -10

方法2:
通过python处理日志

#encoding=utf-8

# 找到日志中的top 10,日志格式如下
#txt = '''100.116.167.9 - - [22/Oct/2017:03:55:53 +0800] "HEAD /check HTTP/1.0" 200 0 "-" "-" "-" ut = 0.001'''

#nodes = txt.split()
#print 'ip:%s, url:%s, code:%s' % (nodes[0],nodes[6],nodes[8])

# 统计ip,url,code的次数,并且生成字典
def log_analysis(log_file, dpath, topn = 10):
    """Write an HTML report of the most frequent requests in an nginx access log.

    Each log line is split on whitespace; field 0 is taken as the client ip,
    field 6 as the request url and field 8 as the HTTP status code.  Requests
    are counted per (ip, url, code) triple and the ``topn`` most frequent
    triples are written to ``dpath`` as an HTML table, highest count first.

    Args:
        log_file: Path of the access log to read.
        dpath: Path of the HTML file to create (overwritten if it exists).
        topn: Number of rows to emit; defaults to 10.

    Returns:
        None. The result is the file written at ``dpath``.
    """
    from collections import Counter
    from html import escape

    hits = Counter()
    # errors='replace' keeps one undecodable byte in a request from aborting
    # the whole run; the context manager closes the file even on error.
    with open(log_file, 'r', errors='replace') as shandle:
        for line in shandle:
            nodes = line.split()
            # Blank or truncated lines do not have the nine fields we index.
            if len(nodes) < 9:
                continue
            hits[(nodes[0], nodes[6], nodes[8])] += 1

    # most_common() orders by count descending for any topn, replacing the
    # fixed ten bubble-sort passes that were only correct for topn <= 10.
    need_list = hits.most_common(topn)

    title = 'nginx访问日志'
    rows = []
    for (ip, url, code), count in need_list:
        # Values come straight from the log (the url is client-controlled),
        # so escape them before embedding them in markup.
        rows.append(
            '<tr>\n<td>%s</td><td>%s</td><td>%s</td><td>%s</td>\n</tr>\n'
            % (count, escape(ip), escape(url), escape(code))
        )
    tbody = ''.join(rows)

    html_tpl = '''
    <!DOCTYPE html>
    <html>
        <head>
            <meta charset="utf-8">
            <title>{title}</title>
        </head>
        <body>
            <table border="1" cellspacing="0" cellpadding="0" color='pink'>
                <thead>
                    <tr cellspacing="0" cellpadding="0">
                        <th>访问次数</th>
                        <th>ip</th>
                        <th>url</th>
                        <th>http_code</th>
                    </tr>
                </thead>
                {tbody}
            </table>
        </body>
    </html>
    '''
    # The page declares charset utf-8, so write it as utf-8 regardless of the
    # platform's default encoding.
    with open(dpath, 'w', encoding='utf-8') as html_handle:
        html_handle.write(html_tpl.format(title = title, tbody = tbody))

# 函数入口
if __name__ == '__main__':
    # Input nginx access log, output HTML report, and the number of top
    # entries to keep (log_analysis falls back to 10 when topn is omitted).
    log_file, dpath, topn = 'access1.log', 'top10.html', 10
    log_analysis(log_file, dpath, topn)

 

方法3:
通过python处理日志(按ip和状态码统计)

# 统计nginx日志中的前十名

def static_file(file_name):
    """Count requests per (client ip, status code) in an nginx access log.

    Each line is split on whitespace; field 0 is taken as the ip and field 8
    as the status code, e.g.
    ['100.116.x.x', '-', '-', '[08/Feb/2018:14:37:13', '+0800]', '"HEAD',
     '/check', 'HTTP/1.0"', '200', '0', '"-"', '"-"', '"-"', 'ut', '=', '0.002'].

    Args:
        file_name: Path of the access log to read.

    Returns:
        A dict mapping ``(ip, status)`` tuples to their number of occurrences.
    """
    res_dict = {}
    # errors='replace' keeps a single undecodable byte from aborting the run.
    with open(file_name, errors='replace') as f:
        for line in f:
            tmp = line.split()
            # Skip blank, whitespace-only and truncated lines: they do not
            # have the nine fields indexed below.
            if len(tmp) < 9:
                continue
            tup = (tmp[0], tmp[8])
            res_dict[tup] = res_dict.get(tup, 0) + 1
    return res_dict

def generate_html(rst_list):
    """Render ``((ip, status), count)`` pairs as an HTML table string.

    Only the last 20 entries of ``rst_list`` are rendered, in the order given.

    Args:
        rst_list: Sequence of ``((ip, status), count)`` items.

    Returns:
        The complete ``<table>...</table>`` markup as one string.
    """
    from html import escape

    html_tmpl = '<tr><td>%s</td><td>%s</td><td>%s</td></tr>'
    parts = [
        '<table border="1" cellpadding=0 cellspacing=0>',
        "<tr><th>ip地址</th><th>状态码</th><th>次数</th></tr>",
    ]
    for (ip, status), count in rst_list[-20:]:
        # ip/status are raw log fields; a malformed request line can shift
        # client-controlled text into them, so escape before embedding.
        parts.append(html_tmpl % (escape(str(ip)), escape(str(status)), count))
    parts.append('</table>')
    return ''.join(parts)

def write_to_html(html_list, path='res.html'):
    """Write the HTML string ``html_list`` to ``path`` as UTF-8.

    Args:
        html_list: The markup to write (a single string, despite the name).
        path: Destination file; defaults to ``'res.html'`` in the current
            working directory. An existing file is overwritten.
    """
    # The markup contains non-ASCII text, so pin the encoding instead of
    # relying on the platform default, which may be unable to encode it.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html_list)


def main():
    """Build the top-20 (ip, status) report for 'voice20180208.log'.

    Counts requests with static_file(), orders them by count, renders the
    20 most frequent with generate_html() and saves the result with
    write_to_html().
    """
    res_dict = static_file('voice20180208.log')
    # Ascending by count; the busiest entries end up at the tail.
    res_list = sorted(res_dict.items(), key = lambda x:x[1])
    # Walk backwards from the tail for descending order. The stop index is
    # -21 (not -20) because the stop is exclusive: this yields 20 rows,
    # matching the 20-row cap inside generate_html().
    html_content = generate_html(res_list[-1:-21:-1])
    write_to_html(html_content)

# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

 

posted @ 2017-11-23 14:40  reblue520  阅读(1883)  评论(0编辑  收藏  举报