统计日志的不同条数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import collections
import itertools
import multiprocessing
import bz2
 
class MapReduce(object):
    """Small map/reduce driver that runs both phases on a process pool.

    ``map_func`` is called once per input item and must return an iterable
    of ``(key, value)`` pairs.  ``reduce_func`` is called once per distinct
    key with a ``(key, [values, ...])`` tuple and its return value becomes one
    element of the final result.

    The instance owns a ``multiprocessing.Pool``.  Call :meth:`close` (or use
    the instance as a context manager) when done so the worker processes are
    released instead of lingering until interpreter shutdown.
    """

    def __init__(self, map_func, reduce_func, num_workers=None):
        """Store the callbacks and start the worker pool.

        ``num_workers`` is forwarded to ``multiprocessing.Pool``; ``None``
        lets the pool pick its own default size.
        """
        self.map_func = map_func
        self.reduce_func = reduce_func
        self.pool = multiprocessing.Pool(num_workers)

    def partition(self, mapped_values):
        """Group ``(key, value)`` pairs by key.

        Returns the items view of a dict mapping each key to the list of
        values seen for it, in first-seen order of the values.
        """
        partition_data = collections.defaultdict(list)
        for key, value in mapped_values:
            partition_data[key].append(value)
        return partition_data.items()

    def __call__(self, inputs, chunksize=1):
        """Run map, partition and reduce over ``inputs`` and return a list.

        ``chunksize`` is passed to the map phase's ``Pool.map`` call only.
        """
        map_responses = self.pool.map(
            self.map_func, inputs, chunksize=chunksize)
        # Each map call returns its own iterable of pairs; flatten them
        # lazily instead of star-unpacking the whole response list.
        partitioned_data = self.partition(
            itertools.chain.from_iterable(map_responses))
        reduce_values = self.pool.map(self.reduce_func, partitioned_data)
        return reduce_values

    def close(self):
        """Stop accepting work and wait for the worker processes to exit."""
        self.pool.close()
        self.pool.join()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
 
 
def mapper_match(one_file):
    """Map phase: emit ``(field4, 1)`` for every matching line of a bz2 log.

    ``one_file`` is the path of a bz2-compressed text file.  Each line is
    split on whitespace; a line matches when its 4th field is ``'web'`` and
    its 6th field is ``'0'``.  For every match the 5th field (presumably the
    cookie id, judging by the reducer's naming) is emitted with a count of 1.

    Returns a list of ``(str, int)`` pairs; lines with fewer than six fields
    are skipped.
    """
    output = []
    # Text mode so the fields are ``str`` and can equal 'web' / '0'; the
    # ``with`` block closes the file even if parsing raises.
    # NOTE(review): UTF-8 is assumed for the log encoding -- confirm.
    with bz2.open(one_file, 'rt', encoding='utf-8', errors='replace') as log:
        for raw_line in log:
            fields = raw_line.split()
            if len(fields) < 6:
                # Blank or truncated line: indexing below would fail.
                continue
            if fields[3] == 'web' and fields[5] == '0':
                output.append((fields[4], 1))
    # Hand the pairs back so the driver can chain and partition them.
    return output
 
def reduce_match(item):
    """Reduce phase: turn ``(key, [counts...])`` into ``(key, total)``."""
    key, counts = item
    total = sum(counts)
    return key, total
 
def mapper_count(item):
    """Map phase: re-key a ``(_, count)`` pair as a one-element ``[(count, 1)]``."""
    _ignored, hits = item
    pair = (hits, 1)
    return [pair]
 
def reducer_count(item):
    """Reduce phase: turn ``(freq, [ones...])`` into ``(freq, total)``."""
    frequency, tallies = item
    how_many = sum(tallies)
    return (frequency, how_many)
 
import glob
import operator
 
# Guard the driver: multiprocessing re-imports this module in worker
# processes on spawn-based platforms, and unguarded top-level code would
# then start new pools recursively.
if __name__ == '__main__':
    # 'sssssss' is a placeholder glob pattern -- replace it with the real log
    # location.  It is expanded to a list of paths because passing the bare
    # string to the pool would iterate over its individual characters.
    input_files = glob.glob('sssssss')

    # Stage 1: count the matching log lines per cookie.
    mapper = MapReduce(mapper_match, reduce_match)
    cookie_freq = mapper(input_files)

    # Stage 2: count how many cookies share each per-cookie total.  The map
    # side must be mapper_count; reducer_count expects a list of values and
    # would fail on the integer totals produced by stage 1.
    mapper = MapReduce(mapper_count, reducer_count)
    freq_counts = mapper(cookie_freq)

    # Most common frequency first.
    freq_counts.sort(key=operator.itemgetter(1), reverse=True)
    for key, value in freq_counts:
        print(key, value)

  

posted @   冰滴的眼泪  阅读(159)  评论(0)  编辑  收藏  举报
编辑推荐:
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
历史上的今天:
2017-06-11 布尔值数据类型
点击右上角即可分享
微信分享提示