Bloom_filter算法(zz)
(转自: http://www.felix021.com/blog/read.php?1882)
布隆过滤器的详细介绍和典型用途,可参见
Wikipedia:http://en.wikipedia.org/wiki/Bloom_filter
谷歌黑板报(数学之美):http://www.google.cn/ggblog/googlechinablog/2007/07/bloom-filter_7469.html
下面是一个简单的布隆过滤器的 C/C++ 实现及其使用例程。该实现每个过滤器只使用一个哈希函数:例程中使用 sdbmhash 字符串哈希,另外提供了 jshash 作为可替换的哈希函数。
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4
/* Hash functions defined later in this listing; either one can be passed to
 * bloom_init() as the filter's hash callback. */
unsigned int jshash(const char *s, unsigned size);
unsigned int sdbmhash(const char *s, unsigned size);

/* ------------- bloom types and funcs --------------- */

/* masks[k] selects bit k (0 = least significant) inside one byte of the bit
 * array. Internal linkage: the table is only meant for this file. */
static const unsigned char masks[8] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};

/* Signature of a hash callback: hashes `size` bytes starting at `buffer`. */
typedef unsigned (*hash_func_ptr)(const char *buffer, unsigned size);

/*
 * State of one single-hash Bloom filter.
 * NOTE(review): identifiers containing a double underscore are reserved for
 * the implementation; the tag is kept unchanged because other code in this
 * listing refers to it by name.
 */
struct __bloom_filter
{
    unsigned n;          /* number of addressable bits; hash values are reduced modulo n */
    unsigned size;       /* length of `bits` in bytes (n bits rounded up to whole bytes) */
    unsigned char *bits; /* bit array, allocated by bloom_init() */
    hash_func_ptr hash;  /* hash callback used by insert and check */
};

/* Handle type used by the whole API; it is a pointer to the filter state. */
typedef struct __bloom_filter* bloom_filter;

/* Creates a filter with n bits using `hash`; returns NULL on failure. */
bloom_filter bloom_init (unsigned n, hash_func_ptr hash);

/* Records the `size` bytes at `data`; returns 1 on success, 0 on failure. */
int bloom_insert(bloom_filter b, void *data, unsigned size);

/* Returns non-zero if the bit for the `size` bytes at `data` is set, else 0. */
int bloom_check(bloom_filter b, void *data, unsigned size);

/* Releases the bit array and the filter itself; a NULL handle is ignored. */
void bloom_destroy(bloom_filter b);
/* ------------- end of bloom types and funcs --------------- */
26
27 int main()
28 {
29 const int size = 655371;
30 bloom_filter b1 = bloom_init(size, sdbmhash);
31 for (int i = 0; i < size / 2; i += 2)
32 {
33 if (!bloom_insert(b1, &i, sizeof(i)))
34 {
35 fprintf(stderr, "err insert %d\n", i);
36 exit(1);
37 }
38 }
39 printf("insert ok\n");
40
41 int cnt = 0;
42 for (int i = 0; i < size / 2; i++)
43 {
44 if (bloom_check(b1, &i, sizeof(i)))
45 {
46 if (i & 1)
47 {
48 //printf("i = %d should not be checked, tolerable.\n", i);
49 cnt++;
50 }
51 }
52 else
53 {
54 if (!(i & 1))
55 {
56 printf("i = %d should be checked! BUG!\n", i);
57 }
58 }
59 }
60 printf("cnt = %d\n", cnt);
61 return 0;
62 }
63
64 bloom_filter bloom_init (unsigned n, hash_func_ptr hash)
65 {
66 bloom_filter b = (bloom_filter)malloc(sizeof(__bloom_filter));
67 if (b == NULL)
68 {
69 fprintf(stderr, "bloom_init: err malloc bloom_filter\n");
70 return NULL;
71 }
72
73 b->n = n;
74 b->size = (n + 7) / 8;
75 b->hash = hash;
76
77 b->bits = (unsigned char *)malloc(b->size);
78 memset(b->bits, 0, b->size);
79 if (b->bits == NULL)
80 {
81 fprintf(stderr, "bloom_init: err malloc bits\n");
82 return NULL;
83 }
84 return b;
85 }
86
87 int bloom_insert(bloom_filter b, void *data, unsigned size)
88 {
89 unsigned h = b->hash((const char *)data, size) % (b->n);
90 unsigned idx = h / 8;
91 if (idx >= b->size)
92 {
93 fprintf(stderr, "bloom_insert: hash value overflow\n");
94 return 0;
95 }
96 b->bits[idx] |= masks[h % 8];
97 //printf("h = %2d, idx = %2d, bit = %2d\n", h, idx, h % 8);
98 return 1;
99 }
100
101 int bloom_check(bloom_filter b, void *data, unsigned size)
102 {
103 unsigned h = b->hash((const char *)data, size) % (b->n);
104 unsigned idx = h / 8;
105 if (idx >= b->size)
106 {
107 fprintf(stderr, "bloom_insert: hash value overflow\n");
108 exit(1);
109 }
110 return !!(b->bits[idx] & masks[h % 8]);
111 }
112
113 void bloom_destroy(bloom_filter b)
114 {
115 if (b != NULL)
116 {
117 if (b->bits != NULL)
118 free(b->bits);
119 free(b);
120 }
121 }
122
123 //-----------------------------------------------
124
/*
 * Shift/add/xor hash over the `size` bytes starting at s, seeded with
 * 1315423911.
 *
 * The state is kept in an unsigned int so that the left shift wraps instead
 * of overflowing a signed int (undefined behaviour) and the right shift is
 * always a logical one. Each byte is read as unsigned char so the result does
 * not depend on whether plain char is signed.
 *
 * Returns the final state with the top bit cleared (a value in
 * [0, 0x7fffffff], assuming a 32-bit unsigned int). For size == 0 the seed
 * itself is returned and s is not dereferenced.
 */
unsigned int jshash(const char *s, unsigned size)
{
    unsigned int hash = 1315423911u;
    for (unsigned i = 0; i < size; i++)
    {
        hash ^= (hash << 5) + (unsigned char)s[i] + (hash >> 2);
    }
    return hash & 0x7fffffffu;
}
136
/*
 * sdbm-style hash over the `size` bytes starting at s: for every byte,
 * hash = hash * 65599 + byte, written as shifts and a subtraction.
 *
 * The state is kept in an unsigned int so the arithmetic wraps instead of
 * overflowing a signed int (undefined behaviour). Each byte is read as
 * unsigned char so the result does not depend on whether plain char is
 * signed.
 *
 * Returns the final state with the top bit cleared (a value in
 * [0, 0x7fffffff], assuming a 32-bit unsigned int). For size == 0 the result
 * is 0 and s is not dereferenced.
 */
unsigned int sdbmhash(const char *s, unsigned size)
{
    unsigned int hash = 0;
    for (unsigned i = 0; i < size; i++)
    {
        hash = (hash << 6) + (hash << 16) - hash + (unsigned char)s[i];
    }
    return hash & 0x7fffffffu;
}
148
2 #include <stdlib.h>
3 #include <string.h>
4
/* Hash functions defined later in this listing; either one can be passed to
 * bloom_init() as the filter's hash callback. */
unsigned int jshash(const char *s, unsigned size);
unsigned int sdbmhash(const char *s, unsigned size);

/* ------------- bloom types and funcs --------------- */

/* masks[k] selects bit k (0 = least significant) inside one byte of the bit
 * array. Internal linkage: the table is only meant for this file. */
static const unsigned char masks[8] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};

/* Signature of a hash callback: hashes `size` bytes starting at `buffer`. */
typedef unsigned (*hash_func_ptr)(const char *buffer, unsigned size);

/*
 * State of one single-hash Bloom filter.
 * NOTE(review): identifiers containing a double underscore are reserved for
 * the implementation; the tag is kept unchanged because other code in this
 * listing refers to it by name.
 */
struct __bloom_filter
{
    unsigned n;          /* number of addressable bits; hash values are reduced modulo n */
    unsigned size;       /* length of `bits` in bytes (n bits rounded up to whole bytes) */
    unsigned char *bits; /* bit array, allocated by bloom_init() */
    hash_func_ptr hash;  /* hash callback used by insert and check */
};

/* Handle type used by the whole API; it is a pointer to the filter state. */
typedef struct __bloom_filter* bloom_filter;

/* Creates a filter with n bits using `hash`; returns NULL on failure. */
bloom_filter bloom_init (unsigned n, hash_func_ptr hash);

/* Records the `size` bytes at `data`; returns 1 on success, 0 on failure. */
int bloom_insert(bloom_filter b, void *data, unsigned size);

/* Returns non-zero if the bit for the `size` bytes at `data` is set, else 0. */
int bloom_check(bloom_filter b, void *data, unsigned size);

/* Releases the bit array and the filter itself; a NULL handle is ignored. */
void bloom_destroy(bloom_filter b);
/* ------------- end of bloom types and funcs --------------- */
26
27 int main()
28 {
29 const int size = 655371;
30 bloom_filter b1 = bloom_init(size, sdbmhash);
31 for (int i = 0; i < size / 2; i += 2)
32 {
33 if (!bloom_insert(b1, &i, sizeof(i)))
34 {
35 fprintf(stderr, "err insert %d\n", i);
36 exit(1);
37 }
38 }
39 printf("insert ok\n");
40
41 int cnt = 0;
42 for (int i = 0; i < size / 2; i++)
43 {
44 if (bloom_check(b1, &i, sizeof(i)))
45 {
46 if (i & 1)
47 {
48 //printf("i = %d should not be checked, tolerable.\n", i);
49 cnt++;
50 }
51 }
52 else
53 {
54 if (!(i & 1))
55 {
56 printf("i = %d should be checked! BUG!\n", i);
57 }
58 }
59 }
60 printf("cnt = %d\n", cnt);
61 return 0;
62 }
63
64 bloom_filter bloom_init (unsigned n, hash_func_ptr hash)
65 {
66 bloom_filter b = (bloom_filter)malloc(sizeof(__bloom_filter));
67 if (b == NULL)
68 {
69 fprintf(stderr, "bloom_init: err malloc bloom_filter\n");
70 return NULL;
71 }
72
73 b->n = n;
74 b->size = (n + 7) / 8;
75 b->hash = hash;
76
77 b->bits = (unsigned char *)malloc(b->size);
78 memset(b->bits, 0, b->size);
79 if (b->bits == NULL)
80 {
81 fprintf(stderr, "bloom_init: err malloc bits\n");
82 return NULL;
83 }
84 return b;
85 }
86
87 int bloom_insert(bloom_filter b, void *data, unsigned size)
88 {
89 unsigned h = b->hash((const char *)data, size) % (b->n);
90 unsigned idx = h / 8;
91 if (idx >= b->size)
92 {
93 fprintf(stderr, "bloom_insert: hash value overflow\n");
94 return 0;
95 }
96 b->bits[idx] |= masks[h % 8];
97 //printf("h = %2d, idx = %2d, bit = %2d\n", h, idx, h % 8);
98 return 1;
99 }
100
101 int bloom_check(bloom_filter b, void *data, unsigned size)
102 {
103 unsigned h = b->hash((const char *)data, size) % (b->n);
104 unsigned idx = h / 8;
105 if (idx >= b->size)
106 {
107 fprintf(stderr, "bloom_insert: hash value overflow\n");
108 exit(1);
109 }
110 return !!(b->bits[idx] & masks[h % 8]);
111 }
112
113 void bloom_destroy(bloom_filter b)
114 {
115 if (b != NULL)
116 {
117 if (b->bits != NULL)
118 free(b->bits);
119 free(b);
120 }
121 }
122
123 //-----------------------------------------------
124
/*
 * Shift/add/xor hash over the `size` bytes starting at s, seeded with
 * 1315423911.
 *
 * The state is kept in an unsigned int so that the left shift wraps instead
 * of overflowing a signed int (undefined behaviour) and the right shift is
 * always a logical one. Each byte is read as unsigned char so the result does
 * not depend on whether plain char is signed.
 *
 * Returns the final state with the top bit cleared (a value in
 * [0, 0x7fffffff], assuming a 32-bit unsigned int). For size == 0 the seed
 * itself is returned and s is not dereferenced.
 */
unsigned int jshash(const char *s, unsigned size)
{
    unsigned int hash = 1315423911u;
    for (unsigned i = 0; i < size; i++)
    {
        hash ^= (hash << 5) + (unsigned char)s[i] + (hash >> 2);
    }
    return hash & 0x7fffffffu;
}
136
/*
 * sdbm-style hash over the `size` bytes starting at s: for every byte,
 * hash = hash * 65599 + byte, written as shifts and a subtraction.
 *
 * The state is kept in an unsigned int so the arithmetic wraps instead of
 * overflowing a signed int (undefined behaviour). Each byte is read as
 * unsigned char so the result does not depend on whether plain char is
 * signed.
 *
 * Returns the final state with the top bit cleared (a value in
 * [0, 0x7fffffff], assuming a 32-bit unsigned int). For size == 0 the result
 * is 0 and s is not dereferenced.
 */
unsigned int sdbmhash(const char *s, unsigned size)
{
    unsigned int hash = 0;
    for (unsigned i = 0; i < size; i++)
    {
        hash = (hash << 6) + (hash << 16) - hash + (unsigned char)s[i];
    }
    return hash & 0x7fffffffu;
}
148