Python统计web日志中每天用户访问情况

背景

统计 Web(Nginx)访问日志中每天的用户访问情况:逐个处理日志目录下的 .log 文件,统计每个文件的总请求数和独立 IP 数。

日志目录为 /var/log/nginx,目录下的日志文件以 .log 结尾(例如 20240811.log)。统计脚本如下:

import os
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Manager

# Directory that contains the log files to analyze.
LOG_DIR = "/var/log/nginx"


def get_log_access(file_name, count_dict):
    """
    Count the requests and distinct client IPs in one log file.

    Every non-blank line counts as one request. The client IP is taken as
    the text before the first ' - -' marker; if that text is a
    comma-separated list, only the first entry is used.

    Parameters:
    - file_name: name of the log file, resolved relative to LOG_DIR
    - count_dict: mapping (shared between processes by the caller) that
      receives ``{"total": <requests>, "ip": <distinct IPs>}`` under the
      key ``file_name``

    Returns:
    - None. Failures are reported on stdout and leave ``count_dict``
      without an entry for this file; nothing is raised for them.
    """
    unique_ips = set()
    total_count = 0

    file_path = os.path.join(LOG_DIR, file_name)

    try:
        # errors="replace": request lines can carry arbitrary bytes (URLs,
        # user agents). A single undecodable byte must not abort the whole
        # file and lose its statistics.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file_object:
            for line in file_object:
                if not line.strip():
                    continue

                user_ip = line.partition(' - -')[0].partition(',')[0]
                total_count += 1
                # set.add already ignores duplicates, no membership test needed.
                unique_ips.add(user_ip)

        # Stored inside the try block so that a failure while writing to the
        # shared mapping is reported like any other processing error.
        count_dict[file_name] = {"total": total_count, "ip": len(unique_ips)}

    except FileNotFoundError:
        print(f"Error: File {file_name} not found in {LOG_DIR}.")
    except Exception as e:
        print(f"Error processing file {file_name}: {e}")

def filter_log_files(log_dir):
    """
    List the ``.log`` files located directly inside a directory.

    Only regular files (or symlinks resolving to one) are returned;
    sub-directories whose name happens to end in ``.log`` are skipped.
    The names are sorted because the operating system hands directory
    entries back in arbitrary order.

    Parameters:
    - log_dir: path of the directory to scan

    Returns:
    - sorted list of matching file names (without the directory part)

    Raises:
    - FileNotFoundError: ``log_dir`` does not exist
    - Exception: any other failure while reading the directory; the
      original error is attached as ``__cause__``
    """
    try:
        with os.scandir(log_dir) as entries:
            return sorted(
                entry.name
                for entry in entries
                if entry.name.endswith(".log") and entry.is_file()
            )
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"The directory {log_dir} does not exist.") from exc
    except Exception as exc:
        raise Exception(f"Error accessing directory {log_dir}: {exc}") from exc

def run():
    """
    Analyze every log file in LOG_DIR in parallel and print one summary
    line per file, ordered by file name.

    Each file is handed to ``get_log_access`` in a worker process; the
    workers report their counts through a manager-backed shared dict.
    Files whose processing failed have no entry in that dict and are
    therefore absent from the summary.

    Raises:
    - whatever ``filter_log_files`` raises when LOG_DIR cannot be listed
    """
    log_files = filter_log_files(LOG_DIR)
    if not log_files:
        # Nothing to do; avoid starting a manager and worker processes.
        return

    with Manager() as manager, ProcessPoolExecutor(max_workers=4) as pool:
        count_dict = manager.dict()

        # Keep the futures so that failures at the worker level (for
        # example a broken process pool) are reported rather than lost.
        submitted = [
            (file_name, pool.submit(get_log_access, file_name, count_dict))
            for file_name in log_files
        ]

        # Future.exception() blocks until the task has finished, so this
        # loop also waits for all the work to complete.
        for file_name, future in submitted:
            error = future.exception()
            if error is not None:
                print(f"Error processing file {file_name}: {error}")

        # Read the shared dict while the manager process is still alive.
        # Sort by file name: entries arrive in task-completion order.
        results = sorted(count_dict.items(), key=lambda item: item[0])

    for file_name, result in results:
        print(f"{file_name}: Total Requests = {result['total']}, Unique IPs = {result['ip']}")

# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run()

运行示例

[root@nginx script]# python3 log_ip_analyzer.py
20240811.log: Total Requests = 20528, Unique IPs = 253
20240812.log: Total Requests = 25023, Unique IPs = 125
20240813.log: Total Requests = 22023, Unique IPs = 323
20240814.log: Total Requests = 26356, Unique IPs = 364
20240815.log: Total Requests = 15563, Unique IPs = 89
posted @ 2024-08-15 12:53  &UnstopPable  阅读(6)  评论(0)  编辑  收藏  举报