# 布隆过滤器
from bloom_filter import BloomFilter
# Build a Bloom filter with max_elements=100 million and error_rate=0.1.
bloom = BloomFilter(max_elements=100000000, error_rate=0.1)
# Record a URL in the filter.
bloom.add('https://www.xxxx.com/1.html')
# Check whether the URL is present; the `in` operator dispatches to
# __contains__, so the dunder method does not need to be called by hand.
print('https://www.xxxx.com/1.html' in bloom)
# 可变布隆过滤器
# -*- coding: utf-8 -*-
# ---------------------------------------
# 布隆过滤器升级版, 可以根据传入的预期数据量和误判率自动计算布隆大小, 最大256MB
# ---------------------------------------
import math
from hashlib import md5
from scrapy import Request
from scrapy.utils.request import request_fingerprint
from crawler.db.redis_client import redis_cli
from crawler.utils.exception import BloomException
class SimpleHash(object):
    """Seeded polynomial hash that maps a value to an offset in a bit array.

    cap is the size of the bit array; hash() returns offsets in [0, cap).
    seed is the multiplier that makes one hash function differ from another.
    """

    def __init__(self, cap, seed):
        """Store the bit-array size and the seed.

        Raises:
            ValueError: if cap is smaller than 1 (no valid offset would exist).
        """
        if cap < 1:
            raise ValueError("cap must be a positive integer, got {!r}".format(cap))
        self.cap = cap
        self.seed = seed

    def hash(self, value):
        """Return the bit offset of value, always within [0, cap).

        value may be str or bytes. Iterating bytes yields ints on Python 3,
        so those are used directly instead of being passed to ord().

        The recurrence is ret = ret * (seed + 1) + code, reduced modulo cap.
        For a power-of-two cap this equals the former (cap - 1) & ret mask;
        for any other cap the mask could only reach a few offsets, whereas
        the modulo spreads values over the whole range.
        NOTE(review): offsets therefore change for filters whose cap is not a
        power of two; bits already stored for such filters will not match.
        """
        cap = self.cap
        factor = self.seed + 1
        ret = 0
        for item in value:
            code = item if isinstance(item, int) else ord(item)
            # Reducing at every step keeps the accumulator small; the final
            # result is the same as reducing once at the end.
            ret = (ret * factor + code) % cap
        return ret
def calculation_bloom_filter(n, p):
    """Compute the bit-array size and the number of Redis blocks for a Bloom filter.

    Uses the sizing formula m = -(n * ln p) / (ln 2)^2, following
    https://www.jianshu.com/p/c3ed818f9531 (the original author notes it
    seemed somewhat more accurate than an older calculation_bloom_filter_old
    helper, which is not shown here).

    The matching hash-function count would be k = (m / n) * ln 2; it is NOT
    computed or returned by this function.

    Args:
        n: expected number of elements; must be positive.
        p: target false-positive rate; must satisfy 0 < p < 1.

    Returns:
        A tuple (m, block_num): m is the required number of bits rounded up,
        block_num is how many 512 MB blocks are needed to hold them.

    Raises:
        ValueError: if n or p is outside the ranges above.
    """
    if n <= 0:
        raise ValueError("n must be positive, got {!r}".format(n))
    if not 0 < p < 1:
        raise ValueError("p must be between 0 and 1 (exclusive), got {!r}".format(p))

    ln2 = math.log(2)
    m = -(n * math.log(p)) / ln2 ** 2
    mem = math.ceil(m / 8 / 1024 / 1024)  # bits -> bytes -> KB -> MB, rounded up
    # NOTE(review): blocks are counted at 512 MB each, while
    # BloomFilter.__init__ caps each block at 1 << 31 bits (256 MB) -- confirm
    # which size is intended before relying on the requested error rate.
    block_num = math.ceil(mem / 512)
    return math.ceil(m), block_num
class BloomFilter(object):
    """Redis-backed Bloom filter sized from an expected capacity and error rate.

    Bits are stored under Redis keys named key + block index. The block index
    comes from the first two hex digits of the fingerprint, and one SimpleHash
    per seed chooses the bit offsets inside that block.
    """

    def __init__(self, key='common_bloom', server=redis_cli, capacity=100000000, error_rate=0.00001):
        """Size the filter and build its hash functions.

        Args:
            key: prefix of the Redis keys that hold the bits.
            server: Redis client; must provide pipeline().
            capacity: expected number of elements.
            error_rate: target false-positive rate.
        """
        self.bit_size, block_num = calculation_bloom_filter(capacity, error_rate)
        # A Redis string can hold at most 512 MB; each block is capped at
        # 1 << 31 bits, i.e. 256 MB.
        # NOTE(review): calculation_bloom_filter counts blocks at 512 MB each,
        # so a capped filter holds fewer bits than the formula asked for --
        # confirm this is intended.
        self.bit_size = min(self.bit_size, 1 << 31)
        # The number of hash functions is fixed by this list; it is not derived
        # from capacity or error_rate.
        self.seeds = [5, 7, 11, 13, 31, 37, 61]
        self.server = server
        self.key = key
        self.block_num = block_num
        self.hashfunc = [SimpleHash(self.bit_size, seed) for seed in self.seeds]

    def get_md5_info(self, str_input):
        """Return the hexadecimal MD5 digest of str_input.

        str input is encoded first, because hashlib only accepts bytes-like
        data; anything else is passed to md5 unchanged.
        """
        if isinstance(str_input, str):
            str_input = str_input.encode()
        return md5(str_input).hexdigest()

    def check_value(self, value, b_hash=False):
        """Validate value and turn it into a fingerprint.

        Args:
            value: str or bytes to fingerprint.
            b_hash: True when value is already a hash and must not be hashed
                again.

        Returns:
            The MD5 hex digest (str) when b_hash is False, otherwise the
            value as bytes.

        Raises:
            BloomException: if value is neither str nor bytes.
        """
        if isinstance(value, str):
            value = value.encode()
        if not isinstance(value, bytes):
            raise BloomException("传入的布隆去重的参数: {} 非字符串".format(str(value)))
        if not b_hash:
            value = md5(value).hexdigest()
        return value

    def _locate(self, value, bhash):
        """Return (redis key name, list of bit offsets) for value."""
        fingerprint = self.check_value(value, bhash)
        if isinstance(fingerprint, bytes):
            # check_value returns bytes for pre-hashed input. Hashing works on
            # characters, so convert to text; latin-1 maps every byte to the
            # code point of the same number and cannot fail.
            fingerprint = fingerprint.decode('latin-1')
        # The first two hex digits of the fingerprint select the block.
        name = self.key + str(int(fingerprint[0:2], 16) % self.block_num)
        offsets = [f.hash(fingerprint) for f in self.hashfunc]
        return name, offsets

    def is_contains(self, value, bhash=False):
        """Return True if every bit for value is set, False otherwise.

        A falsy value (empty string, None, ...) is reported as not contained.
        With bhash=True the value is used as the fingerprint directly, so it
        must start with two hexadecimal digits.
        """
        if not value:
            return False
        name, offsets = self._locate(value, bhash)
        with self.server.pipeline() as pipe:
            for loc in offsets:
                pipe.getbit(name, loc)
            bits = pipe.execute()
        return all(bit != 0 for bit in bits)

    def insert(self, value, bhash=False):
        """Set every bit for value.

        Returns:
            None for a falsy value (nothing is written), True otherwise.
        """
        if not value:
            return None
        name, offsets = self._locate(value, bhash)
        with self.server.pipeline() as pipe:
            for loc in offsets:
                pipe.setbit(name, loc, 1)
            pipe.execute()
        return True

    def _seen_or_insert(self, fingerprint):
        """Return True if fingerprint is already present; otherwise insert it and return False."""
        if self.is_contains(fingerprint, True):
            return True
        self.insert(fingerprint, True)
        return False

    def is_contains_url(self, url):
        """Check url against the filter and record it when it is new.

        Builds a scrapy Request for url and uses request_fingerprint() of that
        request as the pre-hashed value.

        Returns:
            True if the fingerprint was already present, False if it was not
            (in which case it has now been inserted).
        """
        request = Request(url)
        fp = request_fingerprint(request)
        return self._seen_or_insert(fp)

    def is_contains_info(self, str_input):
        """Check str_input against the filter and record it when it is new.

        int input is converted with str(). The value is treated as already
        hashed (no MD5 is applied), so its first two characters must be
        hexadecimal digits; get_md5_info() produces a suitable value.

        Returns:
            True if the value was already present, False if it was not (in
            which case it has now been inserted).
        """
        if isinstance(str_input, int):
            str_input = str(str_input)
        return self._seen_or_insert(str_input)
if __name__ == '__main__':
    # Manual smoke test; needs the Redis server behind the default client.
    demo_filter = BloomFilter("temp_b")
    demo_filter.insert("http://www.baidu.com")
    # Probes, in order: a URL that was never inserted, the inserted URL, and a
    # list (neither str nor bytes, so check_value raises BloomException).
    for probe in ("http://www.baidu.com1", "http://www.baidu.com", ['test']):
        print(demo_filter.is_contains(probe))