布隆过滤器

from bloom_filter import BloomFilter
# Build a filter sized for 100 million elements with error_rate=0.1
bloom = BloomFilter(max_elements=100000000, error_rate=0.1)
# Add a URL to the filter
bloom.add('https://www.xxxx.com/1.html')
# Test membership with the ``in`` operator rather than calling __contains__ directly
print('https://www.xxxx.com/1.html' in bloom)

可变布隆

# -*- coding: utf-8 -*-
# ---------------------------------------
#   布隆过滤器升级版, 可以根据传入的预期数据量和误判率自动计算布隆大小, 最大256MB
# ---------------------------------------

import math
from hashlib import md5

from scrapy import Request
from scrapy.utils.request import request_fingerprint

from crawler.db.redis_client import redis_cli
from crawler.utils.exception import BloomException


class SimpleHash(object):
    """Seeded multiplicative hash that maps a value to a bit-array offset.

    Args:
        cap: Size of the bit array; results are masked with ``cap - 1``.
        seed: Multiplier that makes this hash function differ from the
            others built with different seeds.
    """

    def __init__(self, cap, seed):
        self.cap = cap
        self.seed = seed

    def hash(self, value):
        """Return the offset of *value* inside the bit array.

        Args:
            value: ``str`` or ``bytes``/``bytearray``. Text is hashed by code
                point, binary data by byte value, so ASCII text and its
                encoded bytes produce the same offset.

        Returns:
            ``(cap - 1) & h`` where ``h`` is the accumulated hash; ``0`` for
            an empty value.
        """
        ret = 0
        for item in value:
            # Iterating bytes on Python 3 yields ints, for which ord() raises
            # TypeError; use the int as-is in that case.
            code = item if isinstance(item, int) else ord(item)
            # The recurrence (including ``+=``) is kept exactly as it was:
            # altering it would move every offset of already-stored data.
            ret += self.seed * ret + code
        return (self.cap - 1) & ret


def calculation_bloom_filter(n, p):
    """Compute the bit-array size for a bloom filter and the Redis blocks it needs.

    Uses the standard sizing formula (see
    https://www.jianshu.com/p/c3ed818f9531)::

        m = -(n * ln p) / (ln 2)^2

    where ``n`` is the expected number of elements, ``p`` the target
    false-positive rate and ``m`` the bit-array size. The matching optimal
    number of hash functions would be ``k = (m / n) * ln 2``; it is NOT
    computed or returned here.

    Args:
        n: Expected number of elements; must be greater than 0.
        p: Target false-positive rate; must satisfy ``0 < p < 1``.

    Returns:
        Tuple ``(bit_size, block_num)``: ``bit_size`` is ``m`` rounded up and
        ``block_num`` is how many 512 MB Redis strings are needed to hold it
        (always at least 1 for valid input).

    Raises:
        ValueError: If ``n`` or ``p`` is outside the ranges above. Without
            this check such input yields ``block_num == 0`` (or a negative
            size), which only fails later with an unrelated error.
    """
    if n <= 0:
        raise ValueError("n must be greater than 0, got {!r}".format(n))
    if not 0 < p < 1:
        raise ValueError("p must be between 0 and 1 (exclusive), got {!r}".format(p))

    # The two-argument log form is kept unchanged so the computed size stays
    # bit-for-bit identical to what earlier runs produced.
    m = - (n * (math.log(p, math.e)) / (math.log(2, math.e)) ** 2)
    mem = math.ceil(m / 8 / 1024 / 1024)  # memory needed, in MB
    block_num = math.ceil(mem / 512)  # number of 512 MB Redis blocks needed
    return math.ceil(m), block_num


class BloomFilter(object):
    """Bloom filter stored in Redis bit strings.

    The bit-array size is derived from the expected capacity and the target
    error rate via ``calculation_bloom_filter`` and capped at ``1 << 31``
    bits per key. Seven ``SimpleHash`` functions (one per entry in
    ``self.seeds``) pick the bits for each value. Bits are spread over
    ``block_num`` Redis keys named ``key + <index>``, where the index is the
    value's first two hex characters modulo ``block_num``.
    """

    def __init__(self, key='common_bloom', server=redis_cli, capacity=100000000, error_rate=0.00001):
        """
        Args:
            key: Prefix of the Redis key(s) holding the bit array.
            server: Redis client; must provide ``pipeline()``.
            capacity: Expected number of elements.
            error_rate: Target false-positive rate.
        """
        self.bit_size, block_num = calculation_bloom_filter(capacity, error_rate)
        if self.bit_size > 1 << 31:
            # A Redis string holds at most 512 MB; this filter uses 256 MB
            # (2**31 bits) per key.
            self.bit_size = 1 << 31
        self.seeds = [5, 7, 11, 13, 31, 37, 61]
        self.server = server
        self.key = key
        self.block_num = block_num
        self.hashfunc = [SimpleHash(self.bit_size, seed) for seed in self.seeds]

    def get_md5_info(self, str_input):
        """Return the hexadecimal MD5 digest of *str_input* (``str`` or ``bytes``)."""
        if isinstance(str_input, str):
            # hashlib only accepts bytes on Python 3.
            str_input = str_input.encode()
        return md5(str_input).hexdigest()

    def check_value(self, value, b_hash=False):
        """Validate *value* and normalise it for hashing.

        Args:
            value: ``str`` or ``bytes`` to be checked.
            b_hash: ``True`` when *value* is already a hex digest and must
                not be hashed again.

        Returns:
            The MD5 hex digest (``str``) when ``b_hash`` is false, otherwise
            the value itself as ``bytes``.

        Raises:
            BloomException: If *value* is neither ``str`` nor ``bytes``.
        """
        if isinstance(value, str):
            value = value.encode()
        if not isinstance(value, bytes):
            raise BloomException("传入的布隆去重的参数: {} 非字符串".format(str(value)))
        if not b_hash:
            value = md5(value).hexdigest()
        return value

    def _locate(self, value):
        """Return ``(redis_key, bit_offsets)`` for a value from check_value."""
        if isinstance(value, bytes):
            # latin-1 maps every byte to the code point with the same number,
            # so the hash stays byte-wise while SimpleHash receives text it
            # can call ord() on (raw bytes raise TypeError there on Python 3).
            value = value.decode('latin-1')
        # The first two characters must be hex digits; int() raises
        # ValueError otherwise.
        name = self.key + str(int(value[0:2], 16) % self.block_num)
        return name, [f.hash(value) for f in self.hashfunc]

    def is_contains(self, value, bhash=False):
        """Return ``True`` if every bit for *value* is set, else ``False``.

        A falsy *value* is reported as not contained without touching Redis.
        ``bhash=True`` means *value* is already a hex digest.
        """
        if not value:
            return False
        value = self.check_value(value, bhash)
        name, offsets = self._locate(value)
        with self.server.pipeline() as pipe:
            for loc in offsets:
                pipe.getbit(name, loc)
            decides = pipe.execute()
        return all(decide != 0 for decide in decides)

    def insert(self, value, bhash=False):
        """Set every bit for *value*.

        Returns:
            ``True`` after the bits are written, or ``None`` when *value* is
            falsy (nothing is written).
        """
        if not value:
            return None
        value = self.check_value(value, bhash)
        name, offsets = self._locate(value)
        with self.server.pipeline() as pipe:
            for loc in offsets:
                pipe.setbit(name, loc, 1)
            pipe.execute()
        return True

    def _seen_before(self, digest):
        """Report whether *digest* is already present, inserting it if not."""
        if self.is_contains(digest, True):
            return True
        self.insert(digest, True)
        return False

    def is_contains_url(self, url):
        """Check *url* by its Scrapy request fingerprint and record it if new.

        Returns:
            ``True`` if the fingerprint was already present; ``False`` if it
            was absent (it is inserted before returning).
        """
        return self._seen_before(request_fingerprint(Request(url)))

    def is_contains_info(self, str_input):
        """Check *str_input* as-is (no MD5 step) and record it if new.

        Integers are converted with ``str()`` first. Because the value is not
        hashed, its first two characters must be hex digits (see
        ``get_md5_info`` for producing a digest).

        Returns:
            ``True`` if the value was already present; ``False`` if it was
            absent (it is inserted before returning).
        """
        if isinstance(str_input, int):
            str_input = str(str_input)
        return self._seen_before(str_input)


if __name__ == '__main__':
    # Manual smoke test against the configured Redis server: insert one URL,
    # then probe an unseen URL, the inserted URL, and a non-string value.
    demo_filter = BloomFilter("temp_b")
    demo_filter.insert("http://www.baidu.com")
    for candidate in ("http://www.baidu.com1", "http://www.baidu.com", ['test']):
        print(demo_filter.is_contains(candidate))

posted @ 2019-12-30 22:43  公众号python学习开发  阅读(199)  评论(0)  编辑  收藏  举报