【数据结构与算法】bitmap

bitmap


位图法,用每个bit位存储状态(如0/1),用于判断某个数据是否存在。适用于数据量很大,但状态不多的情况。
设数据的取值范围为[min,max],创建长度为max-min+1的bit数组bitmap;若num存在,则bitmap[num-min]=1,否则为0。
用char数组存储,每个元素占1字节,可以表示8个数的状态;例如某个字节的值为0x0f(二进制00001111),就表示它对应的8个数中有4个存在、4个不存在。

(1)bitmap原理及实现

#pragma warning(disable : 4996 4800)
#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include "math.h"
#include <mutex>
/// Growable bitmap: one bit per non-negative index, stored most-significant-bit
/// first inside a heap-allocated byte array (index k lives in byte k / 8 under
/// mask 0x80 >> (k % 8)).
///
/// Accessing an index beyond the current capacity grows the array (doubling
/// strategy); negative indices are rejected instead of touching memory in
/// front of the buffer. None of the methods lock `mtx`, so concurrent callers
/// have to serialize access themselves.
class Bitmap {
private:
    char* M = nullptr;       // backing storage, N bytes
    long long int N = 0;     // size of M in bytes; capacity is N * 8 bits
    std::mutex mtx;          // not taken by any method of this class

    // Mask selecting bit k inside its byte (bit 0 of a byte is the MSB).
    static int bitMask(long long int k) { return 0x80 >> static_cast<int>(k & 0x07); }

protected:
    /// Allocates zero-filled storage for at least n bits (minimum one byte).
    /// The previous buffer, if any, is NOT released here; callers own that.
    void init(long long int n)
    {
        if (n < 1) n = 1;                          // never create a zero-length or negative-length buffer
        const long long int bytes = (n + 7) / 8;   // round up to whole bytes
        char* block = new char[bytes]();           // value-initialised, i.e. all bits 0
        M = block;                                 // commit only after the allocation succeeded
        N = bytes;
    }

public:
    /// Creates a bitmap able to hold n bits (small default, handy for tests).
    Bitmap(long long int n = 8) { init(n); }

    /// Creates a bitmap of n bits and loads its bytes from `file`, as written
    /// by dump(). A missing/unreadable file or a short read leaves the
    /// remaining bits at 0.
    Bitmap(const char* file, long long int n = 8)
    {
        init(n);
        if (file == nullptr) return;
        FILE* fp = fopen(file, "rb");              // binary mode: the content is raw bytes, not text
        if (fp == nullptr) return;
        const size_t got = fread(M, sizeof(char), static_cast<size_t>(N), fp);
        (void)got;                                 // bytes not read stay zero
        fclose(fp);
    }

    // The object owns M through a raw pointer, so copying must not be allowed.
    Bitmap(const Bitmap&) = delete;
    Bitmap& operator=(const Bitmap&) = delete;

    /// Releases the bit storage.
    ~Bitmap() { delete[] M; M = nullptr; }

    /// Sets bit k to 1, growing the bitmap if needed. Negative k is ignored.
    void set(long long int k)
    {
        if (k < 0) return;
        expand(k);
        M[k >> 3] = static_cast<char>(M[k >> 3] | bitMask(k));
    }

    /// Sets bit k to 0, growing the bitmap if needed. Negative k is ignored.
    void clear(long long int k)
    {
        if (k < 0) return;
        expand(k);
        M[k >> 3] = static_cast<char>(M[k >> 3] & ~bitMask(k));
    }

    /// Returns whether bit k is 1, growing the bitmap if needed.
    /// Negative k is reported as not set.
    bool test(long long int k)
    {
        if (k < 0) return false;
        expand(k);
        return (M[k >> 3] & bitMask(k)) != 0;
    }

    /// Writes the whole byte array to `file` so that a later bitmap can be
    /// initialised from it. Does nothing if the file cannot be opened.
    void dump(const char* file)
    {
        if (file == nullptr) return;
        FILE* fp = fopen(file, "wb");              // binary mode, see the file constructor
        if (fp == nullptr) return;
        fwrite(M, sizeof(char), static_cast<size_t>(N), fp);
        fclose(fp);
    }

    /// Renders the first n bits as a NUL-terminated string of '0'/'1'.
    /// The returned buffer is allocated with new[]; the caller must delete[] it.
    /// A negative n is treated as 0.
    char* bits2string(long long int n)
    {
        if (n < 0) n = 0;
        expand(n - 1);                             // highest bit that will be read is n - 1
        char* text = new char[n + 1];
        for (long long int i = 0; i < n; ++i)
            text[i] = test(i) ? '1' : '0';
        text[n] = '\0';
        return text;
    }

    /// Makes sure bit k is addressable. When k is out of range the storage is
    /// re-allocated with room for 2 * k bits and the old bytes are copied over.
    void expand(long long int k)
    {
        if (k < 0 || k / 8 < N) return;            // invalid index, or still inside the buffer
        const long long int oldN = N;
        char* oldM = M;
        init(2 * k);                               // doubling strategy, as for a vector
        memcpy(M, oldM, static_cast<size_t>(oldN)); // new buffer is always larger than the old one
        delete[] oldM;
    }

    /// Prints the first n bits to stdout as '0'/'1' characters (debug helper).
    void print(long long int n)
    {
        expand(n - 1);
        for (long long int i = 0; i < n; ++i)      // same width as n, so the loop cannot wrap around
            putchar(test(i) ? '1' : '0');
    }

    /// Returns true when n is a prime number.
    static bool isPrime(int n)
    {
        if (n <= 3)
            return n > 1;
        // Every prime above 3 is adjacent to a multiple of 6.
        if (n % 6 != 1 && n % 6 != 5)
            return false;
        // Trial division by 6m - 1 and 6m + 1 while the divisor squared is <= n.
        for (int i = 5; i <= n / i; i += 6) {
            if (n % i == 0 || n % (i + 2) == 0)
                return false;
        }
        return true;
    }

    /// Returns the size of the bit storage in whole megabytes (rounded down).
    unsigned int size()
    {
        return static_cast<unsigned int>(N / (1024 * 1024));
    }
};

(2)std::vector<bool>

std::bitset的大小必须在编译期确定,不能按运行期的数据范围分配或扩容,因此不适合用来实现这里的bitmap。

vector<bool>在cpp中不是存储bool的vector,而是被标准库特化为了比特,极大节省了空间且效率极高。

(3)应用场景

  • 有400万个数字,数字分布在范围1000w~1500w内,要求排序且复杂度为O(N),查找数字且复杂度为O(1)

    #include <iostream>
    #include <stdlib.h>
    #include <vector>
    #include <random>
    #include <string>
    #define MIN 10000000 //数据下限
    #define MAX 15000000 //数据上限(右闭区间)
    #define COUNT 4000000 //数据个数
    // Demo: marks COUNT random values from [MIN, MAX] in a std::vector<bool>
    // used as a bitmap, then answers membership queries read from stdin until
    // the input ends or stops being a number.
    int main()
    {
        using namespace std;
        std::mt19937 gen((unsigned int)time(NULL)); // Mersenne Twister seeded from the current time
        std::uniform_int_distribution<> distrib(MIN, MAX); // random values in [MIN, MAX]
        vector<bool> bitmap(MAX - MIN + 1, false); // one flag per possible value, all cleared
        // capacity() and size() return size_t, which needs %zu rather than %d
        printf("capacity=%zu,size=%zu\n", bitmap.capacity(), bitmap.size());
        for (int i = 0; i < COUNT; i++)
        {
            int index = distrib(gen) - MIN; // shift the value into [0, MAX - MIN]
            bitmap[index] = true;
        }
        while (true)
        {
            cout << "input:" << endl;
            int a;
            if (!(cin >> a))
                break; // EOF or invalid input: leave the loop instead of spinning forever
            if (a < MIN || a > MAX)
            {
                printf("%d is not existed..\n", a);
                continue;
            }
            if (bitmap[a - MIN])
                printf("%d is existed..\n", a);
            else
                printf("%d is not existed..\n", a);
        }
        /* Optional dump of the whole bitmap, 100 bits per line:
        int count = 0;
        for (auto item : bitmap)
        {
            if (++count == 100)
            {
                cout << endl;
                count = 0;
            }
            if (item)
                printf("1");
            else
                printf("0");
        }
        cout << endl;
        printf("capacity=%zu,size=%zu\n", bitmap.capacity(), bitmap.size());*/
        return 0;
    }
    #include <iostream>
    #include <stdlib.h>
    #include <vector>
    #include <random>
    #include <string>
    #include "Bitmap.h"
    #define MIN 10000000
    #define MAX 15000000
    #define COUNT 4000000
    // Demo: same membership lookup as the vector<bool> version, but backed by
    // the Bitmap class from "Bitmap.h". Also reports the smallest and largest
    // value that was generated, then serves queries from stdin until the
    // input ends or stops being a number.
    int main()
    {
        using namespace std;
        std::mt19937 gen((unsigned int)time(NULL)); // Mersenne Twister seeded from the current time
        std::uniform_int_distribution<> distrib(MIN, MAX); // random values in [MIN, MAX]
        Bitmap bitmap(MAX - MIN + 1);
        for (int i = 0; i < COUNT; i++)
        {
            int index = distrib(gen) - MIN; // shift the value into [0, MAX - MIN]
            bitmap.set(index);
        }
        // First set bit from the low end is the minimum value.
        for (int i = 0; i < (MAX - MIN + 1); i++)
        {
            if (bitmap.test(i))
            {
                cout << "min=" << i + MIN << endl;
                break;
            }
        }
        // First set bit from the high end is the maximum value.
        for (int i = MAX - MIN; i >= 0; i--)
        {
            if (bitmap.test(i))
            {
                cout << "max=" << i + MIN << endl;
                break;
            }
        }
        //bitmap.print(COUNT);
        while (true)
        {
            cout << "input:" << endl;
            int a;
            if (!(cin >> a))
                break; // EOF or invalid input: leave the loop instead of spinning forever
            if (a < MIN || a > MAX)
            {
                printf("%d is not existed..\n", a);
                continue;
            }
            if (bitmap.test(a - MIN))
                printf("%d is existed..\n", a);
            else
                printf("%d is not existed..\n", a);
        }
        return 0;
    }
  • 40亿个非负整数(范围[0,4294967295])中找中位数和找出现两次的数(下面的示例代码只演示查找重复出现的数;为便于测试,COUNT取400)

#include <chrono>
#include <ctime>
#include <iostream>
#include <mutex>
#include <random>
#include <string>
#include <thread>
#include <vector>
#include "Bitmap.h"
using namespace std;
#define MIN 0
#define MAX 4294967295
#define COUNT 400
// Two process-wide bitmaps with one bit per value in [MIN, MAX].
// bitmap1 records values seen at least once, bitmap2 values seen again.
// The size is computed in long long so that MAX - MIN + 1 cannot overflow.
Bitmap bitmap1(static_cast<long long>(MAX) - MIN + 1);
Bitmap bitmap2(static_cast<long long>(MAX) - MIN + 1);
// Worker routine: draws (end - start) random values from [MIN, MAX] and
// records them in the global bitmaps. The first sighting of a value sets its
// bit in bitmap1; any later sighting sets its bit in bitmap2.
//   start, end - half-open range of draw numbers handled by this call; only
//                the count matters for the result, `start` is also mixed into
//                the seed so that workers do not share a sequence.
void FullFillBitmap(long long int start, long long int end)
{
    // One lock for all workers: "test bitmap1, then set bitmap1 or bitmap2"
    // has to happen as a unit, and the bitmaps are written from every thread.
    // NOTE(review): assumes Bitmap::test/set do no locking of their own -
    // confirm against Bitmap.h.
    static std::mutex fillMutex;

    // Seeding only from time(NULL) would give every thread started within the
    // same second the very same sequence, so mix in entropy and the range start.
    std::random_device entropy;
    std::seed_seq seed{ entropy(),
                        static_cast<unsigned int>(time(NULL)),
                        static_cast<unsigned int>(start) };
    std::mt19937 gen(seed);
    std::uniform_int_distribution<unsigned int> distrib(MIN, MAX);

    for (long long int i = start; i < end; i++)
    {
        // Widen before subtracting so that value - MIN is computed in long long.
        const long long int index = static_cast<long long int>(distrib(gen)) - MIN;
        std::lock_guard<std::mutex> guard(fillMutex);
        if (bitmap1.test(index))
            bitmap2.set(index);   // seen before: remember it as a repeat
        else
            bitmap1.set(index);   // first sighting
    }
}
int main()
{
auto start = chrono::steady_clock::now(); //获取当前时间点
/*std::mt19937 gen((unsigned int)time(NULL));
std::uniform_int_distribution<unsigned int> distrib(MIN, MAX);
for (long long int i = 0; i < COUNT; i++)
{
long long int index = distrib(gen) - MIN;
bitmap.set(index);
}*/
thread th1(FullFillBitmap, 0, COUNT/10);
thread th2(FullFillBitmap, COUNT / 10, COUNT / 10 * 2);
thread th3(FullFillBitmap, COUNT / 10 * 2, COUNT / 10 * 3);
thread th4(FullFillBitmap, COUNT / 10 * 3, COUNT / 10 * 4);
thread th5(FullFillBitmap, COUNT / 10 * 4, COUNT / 10 * 5);
thread th6(FullFillBitmap, COUNT / 10 * 5, COUNT / 10 * 6);
thread th7(FullFillBitmap, COUNT / 10 * 6, COUNT / 10 * 7);
thread th8(FullFillBitmap, COUNT / 10 * 7, COUNT / 10 * 8);
thread th9(FullFillBitmap, COUNT / 10 * 8, COUNT / 10 * 9);
thread th10(FullFillBitmap, COUNT / 10 * 9, COUNT);
th1.join();
th2.join();
th3.join();
th4.join();
th5.join();
th6.join();
th7.join();
th8.join();
th9.join();
th10.join();
auto end1 = chrono::steady_clock::now(); //获取当前时间点
auto time_diff1 = end1 - start; //计算时间段
auto duration1 = chrono::duration_cast<chrono::milliseconds>(time_diff1); //将时间段转成ms单位
cout << "fullfill cost : " << duration1.count() << "ms" << endl;
cout << "bitmap1 size=" << bitmap1.size() << "MB" << endl;
cout << "bitmap2 size=" << bitmap2.size() << "MB" << endl;
vector<long long> repeats;
for (long long i = MIN; i < (MAX - MIN + 1); i++)
{
if (bitmap1.test(i) && bitmap2.test(i))
{
repeats.push_back(i);
printf("%lld\n", i);
}
}
cout << "重复两次的数字有" << repeats.size() << "个" << endl;
auto end2 = chrono::steady_clock::now(); //获取当前时间点
auto time_diff2 = end2 - start; //计算时间段
auto duration2 = chrono::duration_cast<chrono::milliseconds>(time_diff2); //将时间段转成ms单位
cout << "query repeats cost : " << duration2.count() << "ms" << endl;
return 0;
}
posted @   徘徊彼岸花  阅读(75)  评论(0编辑  收藏  举报
点击右上角即可分享
微信分享提示