【数据结构与算法】bitmap
bitmap
位图法,用每个bit位存储状态(如0/1),用于判断某个数据是否存在。适用于数据量很大,但状态不多的情况。
数据的取值范围为[min,max],创建长度为max-min+1的bit数组bitmap;若num存在则bitmap[num-min]=1,否则为0。
用char数组存储,每个元素占1字节(8个bit),可以记录8个数的状态;例如某个字节的值为0x0f(二进制00001111),表示它对应的8个数中有4个存在、4个不存在。
(1)bitmap原理及实现
#pragma warning(disable : 4996 4800) #include <stdlib.h> #include <stdio.h> #include <memory.h> #include "math.h" #include <mutex> class Bitmap { //位图bitmap类 private: char* M; long long int N; //比特图所存放的空间M[],容量为N*sizeof(char)*8比特 std::mutex mtx; protected: void init(long long int n) //初始化位图空间 { M = new char[N = (n + 7) / 8]; //申请内存 memset(M, 0, N); //初始化内存块 } public: Bitmap(long long int n = 8) { init(n); } //按指定或默认规模创建比特图(为测试暂时选用较小的默认值) Bitmap(char* file, long long int n = 8) //按指定或默认规模,从指定文件中读取比特图 { init(n); FILE* fp = fopen(file, "r"); fread(M, sizeof(char), N, fp); fclose(fp); } ~Bitmap() { delete[] M; M = NULL; } //析构时释放比特图空间 void set(long long int k) //置位第k个标志位 { expand(k); //拓容 //mtx.lock(); M[k >> 3] |= (0x80 >> (k & 0x07)); //M[第k个标志位所在的字节(k/8 取整)] |= (第k个标志位在所在字节中的位数(取余)) //mtx.unlock(); } void clear(long long int k) { expand(k); //拓容 M[k >> 3] &= ~(0x80 >> (k & 0x07)); //M[第k个标志位所在的字节(k/8 取整)] &= ~(第k个标志位在所在字节中的位数(取余)) } bool test(long long int k) {//取出指定字节中的指定位 expand(k); //拓容 return M[k >> 3] & (0x80 >> (k & 0x07)); //M[第k个标志位所在的字节(k/8 取整)] &(第k个标志位在所在字节中的位的值) } void dump(char* file) //将位图整体导出至指定的文件,以便对此后的新位图批量初始化 { FILE* fp = fopen(file, "w"); fwrite(M, sizeof(char), N, fp); fclose(fp); } char* bits2string(long long int n) { //将前n位转换为字符串—— expand(n - 1); //此时可能被访问的最高位为bitmap[n - 1] char* s = new char[n + (long long)1]; s[n] = '\0'; //字符串所占空间,由上层调用者负责释放 for (long long int i = 0; i < n; i++) s[i] = test(i) ? '1' : '0'; return s; //返回字符串位置 } void expand(long long int k) { //若被访问的bitmap[k]已出界,则需扩容 if (k < 8 * N) return; //仍在界内,无需扩容 long long int oldN = N; char* oldM = M; init((long long)2 * k); //与向量类似,加倍策略 memcpy_s(M, N, oldM, oldN); delete[] oldM; //原数据转移至新空间 } void print(long long int n) //逐位打印以检验位图内容,非必需接口 { expand(n); for (unsigned int i = 0; i < n; i++) printf(test(i) ? 
"1" : "0"); } static bool isPrime(int n) { //判断某个数是否为素数 if (n <= 3) { return n > 1; } // 不在6的倍数两侧的一定不是质数 if (n % 6 != 1 && n % 6 != 5) { return false; } int s = (int)sqrt(n); for (int i = 5; i <= s; i += 6) { if (n % i == 0 || n % (i + 2) == 0) { return false; } } return true; } //返回耗费内存MB unsigned int size() { return (unsigned int)(N / (1024 * 1024)); } };
(2)std::vector<bool>
std::bitset的长度必须在编译期确定,无法按运行时的数据范围创建,因此不适合用来实现这里的bitmap。
而vector<bool>在C++中并不是逐个存储bool的普通vector,而是被标准库特化为按比特压缩存储(每个元素只占1 bit),极大地节省了空间,访问效率也很高,并且长度可以在运行时指定。
(3)应用场景
-
有400万个数字,数字分布在范围1000w~1500w内,要求排序且复杂度为O(N),查找数字且复杂度为O(1)
#include <iostream> #include <stdlib.h> #include <vector> #include <random> #include <string> #define MIN 10000000 //数据下限 #define MAX 15000000 //数据上限(右闭区间) #define COUNT 4000000 //数据个数 int main() { using namespace std; std::mt19937 gen((unsigned int)time(NULL)); //gen是一个使用rd()作种子初始化的标准梅森旋转算法的随机数发生器 std::uniform_int_distribution<> distrib(MIN, MAX); //随机数范围[MIN,MAX] vector<bool> bitmap(MAX - MIN + 1, false); //全部置0 printf("capacity=%d,size=%d\n", bitmap.capacity(), bitmap.size()); for (int i = 0; i < COUNT; i++) { int index = distrib(gen) - MIN; //原始数据要减去MIN映射到[0,MAX-MIN]的区间上 bitmap[index] = true; } while (true) { cout << "input:" << endl; int a; cin >> a; if (a<MIN || a>MAX) { printf("%d is not existed..\n",a); continue; } if (bitmap[a - MIN]) printf("%d is existed..\n",a); else printf("%d is not existed..\n",a); } /*int count = 0; for(auto item:bitmap) { if (++count == 100) { cout << endl; count = 0; } if (item) printf("1"); else printf("0"); } cout << endl; printf("capacity=%d,size=%d\n", bitmap.capacity(), bitmap.size());*/ return 0; } #include <iostream> #include <stdlib.h> #include <vector> #include <random> #include <string> #include "Bitmap.h" #define MIN 10000000 #define MAX 15000000 #define COUNT 4000000 int main() { using namespace std; std::mt19937 gen((unsigned int)time(NULL));//gen是一个使用rd()作种子初始化的标准梅森旋转算法的随机数发生器 std::uniform_int_distribution<> distrib(MIN, MAX); Bitmap bitmap(MAX - MIN + 1); for (int i = 0; i < COUNT; i++) { int index = distrib(gen) - MIN; bitmap.set(index); } for (int i = 0; i < (MAX - MIN + 1); i++) { if (bitmap.test(i)) { cout << "min=" << i + MIN << endl; break; } } for (int i = MAX - MIN; i >= 0; i--) { if (bitmap.test(i)) { cout << "max=" << i + MIN << endl; break; } } //bitmap.print(COUNT); while (true) { cout << "input:" << endl; int a; cin >> a; if (a<MIN || a>MAX) { printf("%d is not existed..\n",a); continue; } if (bitmap.test(a-MIN)) printf("%d is existed..\n",a); else printf("%d is not existed..\n",a); } return 0; } -
40亿个非负整数(范围[0,4294967295])中找中位数和找出现两次的数
#include <iostream> #include <thread> #include <string> #include <random> #include <vector> #include "Bitmap.h" using namespace std; #define MIN 0 #define MAX 4294967295 #define COUNT 400 Bitmap bitmap1(MAX - MIN + (long long)1); Bitmap bitmap2(MAX - MIN + (long long)1); void FullFillBitmap(long long int start, long long int end) { std::mt19937 gen((unsigned int)time(NULL)); std::uniform_int_distribution<unsigned int> distrib(MIN, MAX); for (long long int i = start; i < end; i++) { long long int index = distrib(gen) - MIN; if (bitmap1.test(index)) { bitmap2.set(index); continue; } bitmap1.set(index); } } int main() { auto start = chrono::steady_clock::now(); //获取当前时间点 /*std::mt19937 gen((unsigned int)time(NULL)); std::uniform_int_distribution<unsigned int> distrib(MIN, MAX); for (long long int i = 0; i < COUNT; i++) { long long int index = distrib(gen) - MIN; bitmap.set(index); }*/ thread th1(FullFillBitmap, 0, COUNT/10); thread th2(FullFillBitmap, COUNT / 10, COUNT / 10 * 2); thread th3(FullFillBitmap, COUNT / 10 * 2, COUNT / 10 * 3); thread th4(FullFillBitmap, COUNT / 10 * 3, COUNT / 10 * 4); thread th5(FullFillBitmap, COUNT / 10 * 4, COUNT / 10 * 5); thread th6(FullFillBitmap, COUNT / 10 * 5, COUNT / 10 * 6); thread th7(FullFillBitmap, COUNT / 10 * 6, COUNT / 10 * 7); thread th8(FullFillBitmap, COUNT / 10 * 7, COUNT / 10 * 8); thread th9(FullFillBitmap, COUNT / 10 * 8, COUNT / 10 * 9); thread th10(FullFillBitmap, COUNT / 10 * 9, COUNT); th1.join(); th2.join(); th3.join(); th4.join(); th5.join(); th6.join(); th7.join(); th8.join(); th9.join(); th10.join(); auto end1 = chrono::steady_clock::now(); //获取当前时间点 auto time_diff1 = end1 - start; //计算时间段 auto duration1 = chrono::duration_cast<chrono::milliseconds>(time_diff1); //将时间段转成ms单位 cout << "fullfill cost : " << duration1.count() << "ms" << endl; cout << "bitmap1 size=" << bitmap1.size() << "MB" << endl; cout << "bitmap2 size=" << bitmap2.size() << "MB" << endl; vector<long long> repeats; for (long long i = MIN; i 
< (MAX - MIN + 1); i++) { if (bitmap1.test(i) && bitmap2.test(i)) { repeats.push_back(i); printf("%lld\n", i); } } cout << "重复两次的数字有" << repeats.size() << "个" << endl; auto end2 = chrono::steady_clock::now(); //获取当前时间点 auto time_diff2 = end2 - start; //计算时间段 auto duration2 = chrono::duration_cast<chrono::milliseconds>(time_diff2); //将时间段转成ms单位 cout << "query repeats cost : " << duration2.count() << "ms" << endl; return 0; }