Bloom Filter布隆过滤器原理和实现(2)

本篇参考 leveldb 的实现,给出一个简化版的布隆过滤器。

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>

using namespace std;

// Marks an intentional fall-through from one switch case into the next.
// This fallback is only defined when no FALLTHROUGH_INTENDED macro exists
// yet; it expands to an empty statement and therefore has no runtime effect.
#ifndef FALLTHROUGH_INTENDED
#define FALLTHROUGH_INTENDED \
  do {                       \
  } while (0)
#endif

// Detects the byte order of the host at run time.
// Returns 1 on a little-endian machine and 0 on a big-endian one.
int LittleEndian() {
    const int probe = 1;
    // On a little-endian host the least significant byte (value 1) is stored
    // at the lowest address, so the first byte of `probe` tells the order.
    const char* first_byte = reinterpret_cast<const char*>(&probe);
    return first_byte[0];
}

// Decodes a 32-bit unsigned integer stored in little-endian byte order.
//
// ptr must point to at least four readable bytes. The bytes are combined
// explicitly, so the result is the same on little- and big-endian hosts.
// A native-order copy guarded by `if (LittleEndian)` is deliberately not
// used: that condition tests the function name (always true) rather than the
// function's result, which would return byte-swapped values on big-endian
// hosts.
inline uint32_t DecodeFixed32(const char* ptr) {
    const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);

    // buffer[0] is the least significant byte, buffer[3] the most significant.
    return (static_cast<uint32_t>(buffer[0])) |
           (static_cast<uint32_t>(buffer[1]) << 8) |
           (static_cast<uint32_t>(buffer[2]) << 16) |
           (static_cast<uint32_t>(buffer[3]) << 24);
}

// Computes a 32-bit hash of the n bytes starting at data, mixed with seed.
//
// The scheme is similar to murmur hash: the input is consumed as 32-bit
// words decoded by DecodeFixed32, then the 1-3 leftover bytes are folded in.
// data must point to at least n readable bytes.
uint32_t Hash(const char* data, size_t n, uint32_t seed) {
  const uint32_t m = 0xc6a4a793;  // multiplicative mixing constant
  const uint32_t r = 24;          // shift applied when mixing the tail bytes
  const char* limit = data + n;
  // The product is intentionally reduced modulo 2^32.
  uint32_t h = seed ^ static_cast<uint32_t>(n * m);

  // Pick up four bytes at a time. The remaining length is compared instead
  // of forming `data + 4`, which would point beyond the end of the buffer
  // (undefined behaviour) when fewer than four bytes are left.
  while (limit - data >= 4) {
    uint32_t w = DecodeFixed32(data);
    data += 4;
    h += w;
    h *= m;
    h ^= (h >> 16);
  }

  // Pick up remaining bytes. Each case falls through so that every leftover
  // byte is added before the final mixing step; zero leftover bytes skip it.
  switch (limit - data) {
    case 3:
      h += static_cast<uint8_t>(data[2]) << 16;
      FALLTHROUGH_INTENDED;
    case 2:
      h += static_cast<uint8_t>(data[1]) << 8;
      FALLTHROUGH_INTENDED;
    case 1:
      h += static_cast<uint8_t>(data[0]);
      h *= m;
      h ^= (h >> r);
      break;
  }
  return h;
}

// Fixed-size bit array backed by a byte buffer.
// Bit i is stored in byte i / 8 at bit position i % 8; all bits start cleared.
class Bitmap {
public:
    // Creates a bitmap for `size` bits. One byte more than size / 8 is
    // allocated, so bit index `size` itself is also addressable.
    Bitmap(size_t size) : size_(size) {
        bits.resize((size_ >> 3) + 1, 0);
    }

    // Sets bit `val`.
    // Throws std::out_of_range (via std::string::at) when the bit lies
    // outside the allocated buffer instead of writing out of bounds.
    void bitmapSet(size_t val) {
        char& cell = bits.at(val >> 3);  // >> 3 is division by 8
        // Work on unsigned values so the top bit is handled without relying
        // on the signedness of plain char.
        cell = static_cast<char>(static_cast<unsigned char>(cell) | maskFor(val));
    }

    // Returns true when bit `val` is set.
    // Throws std::out_of_range (via std::string::at) when the bit lies
    // outside the allocated buffer.
    bool bitmapGet(size_t val) const {
        return (static_cast<unsigned char>(bits.at(val >> 3)) & maskFor(val)) != 0;
    }

private:
    // Mask selecting bit `val % 8` inside its byte.
    static unsigned char maskFor(size_t val) {
        return static_cast<unsigned char>(1u << (val & 7));
    }

    size_t size_;      // number of bits requested at construction
    std::string bits;  // byte buffer holding the bits
};

class BloomFilter {
private:
    static uint32_t BloomHash(const std::string& key) {
        return Hash(key.data(), key.size(), 0xbc9f1d34);
    }
    
    enum { defaultSize = 100000000 * 16 };  //16亿

public:
    BloomFilter() : k_(8) {
        bitmap_ = new Bitmap(defaultSize);
    }
    ~BloomFilter() {
        delete bitmap_;
    }
    void Add(const string& s) {
        uint32_t h = BloomHash(s);
        const uint32_t delta = (h >> 17) | (h << 15); 
        for (size_t i = 0; i < k_; ++i) {
            uint32_t bitpos = h % defaultSize;
            bitmap_->bitmapSet(bitpos);
            h += delta;
        }
    }
    bool Contain(const string& s) {
        bool ret = true;
        uint32_t h = BloomHash(s);
        const uint32_t delta = (h >> 17) | (h << 15); 
        for (size_t i = 0; i < k_; ++i) {
            uint32_t bitpos = h % defaultSize;
            ret = ret && bitmap_->bitmapGet(bitpos);
            h += delta;
        }
        return ret;
    }

private:
    int k_;  // hash的个数
    Bitmap* bitmap_;
};

// Smoke test: adds one key to a default-constructed filter, then prints the
// lookup result for that key and for a key that was never added, so the
// outcome can actually be inspected.
void bloomFilterTest() {
    const std::string email = "1293173298@qq.com";
    BloomFilter bf;
    bf.Add(email);
    const bool ret1 = bf.Contain(email);       // expected: true
    const bool ret2 = bf.Contain("even.com");  // expected: false (unless a false positive occurs)

    std::cout << "Contain(\"" << email << "\") = " << (ret1 ? "true" : "false") << '\n'
              << "Contain(\"even.com\") = " << (ret2 ? "true" : "false") << '\n';
}

// Program entry point: runs the Bloom filter smoke test.
int main() {
    bloomFilterTest();

#ifdef _WIN32
    // Keep the console window open when the program is started outside a
    // terminal. `pause` is a Windows shell built-in, so the call is skipped
    // on other platforms where it would only produce a shell error.
    system("pause");
#endif
    return 0;
}
posted @ 2019-12-31 08:21  evenleo  阅读(372)  评论(0编辑  收藏  举报