bitmap与桶方式对1000万数据进行排序(转+自己实现理解)
1. 1000万数据的产生,随机数方式
#include <iostream>
#include <time.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
using namespace std;
// Number of integers to generate. The output file holds a shuffled
// permutation of the values 0 .. size-1.
const int size = 10000000;

// Work buffer for the permutation, kept at namespace scope so this large
// array is not placed on the stack. The bound is written as `::size` because
// the file-scope `using namespace std;` also makes std::size visible from
// C++17 onward, which would make the unqualified name ambiguous.
int num[::size];

// Returns a pseudo-random index in [0, size).
// Two rand() draws are combined because a single draw may cover as few as
// 15 bits. The arithmetic is done in unsigned 64-bit so that the product
// cannot overflow a signed int (and thus cannot yield a negative index)
// on platforms where RAND_MAX is large.
static int random_index()
{
    const unsigned long long base = static_cast<unsigned long long>(RAND_MAX) + 1ULL;
    const unsigned long long hi = static_cast<unsigned long long>(rand());
    const unsigned long long lo = static_cast<unsigned long long>(rand());
    return static_cast<int>((hi * base + lo) % static_cast<unsigned long long>(::size));
}

// Writes a shuffled permutation of 0 .. size-1 to "data.txt", with the values
// separated by single spaces.
// Returns 0 on success, 1 if the output file cannot be opened.
int main()
{
    FILE *fp = fopen("data.txt", "w");
    if (!fp)
    {
        // Explicit check instead of assert(): assert disappears under NDEBUG.
        perror("data.txt");
        return 1;
    }

    // Identity permutation over the valid indices 0 .. size-1. The earlier
    // loop ran n = 1 .. size, which stored one element past the end of the
    // array while num[0] stayed 0; the set of values written to the file is
    // the same, without the out-of-bounds store.
    for (int n = 0; n < ::size; n++)
        num[n] = n;

    srand((unsigned)time(NULL));

    // Shuffle by performing `size` swaps of two randomly chosen slots.
    for (int n = 0; n < ::size; n++)
    {
        const int i = random_index();
        const int j = random_index();
        std::swap(num[i], num[j]);
    }

    for (int n = 0; n < ::size; n++)
        fprintf(fp, "%d ", num[n]);

    fclose(fp);
    return 0;
}
使用 bitset 进行排序
//位图方式解决海量数据排序,数据不能有重复
//使用 C++ stl的 bitset
#include <iostream>
#include <bitset>
#include <assert.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
using namespace std;
const int max_each_scan = 5000000;
int main()
{
clock_t begin = clock();
bitset<max_each_scan> bit_map;
bit_map.reset();
// open the file with the unsorted data
FILE *fp_unsort_file = fopen("data.txt", "r");
assert(fp_unsort_file);
int num;
// the first time scan to sort the data between 0 - 4999999
while (fscanf(fp_unsort_file, "%d ", &num) != EOF)
{
if (num < max_each_scan)
//有这个数字,将bit_map的对应的位设置为1
bit_map.set(num, 1);
}
FILE *fp_sort_file = fopen("sort.txt", "w");
assert(fp_sort_file);
int i;
// write the sorted data into file
for (i = 0; i < max_each_scan; i++)
{
if (bit_map[i] == 1)
fprintf(fp_sort_file, "%d ", i);
}
// the second time scan to sort the data between 5000000 - 9999999
int result = fseek(fp_unsort_file, 0, SEEK_SET);
if (result)
cout << "fseek failed!" << endl;
else
{
bit_map.reset();
while (fscanf(fp_unsort_file, "%d ", &num) != EOF)
{
if (num >= max_each_scan && num < 10000000)
{
num -= max_each_scan;
bit_map.set(num, 1);
}
}
for (i = 0; i < max_each_scan; i++)
{
if (bit_map[i] == 1)
fprintf(fp_sort_file, "%d ", i + max_each_scan);
}
}
clock_t end = clock();
cout<<"用位图的方法,耗时:"<<endl;
cout << (end - begin) / CLK_TCK << "s" << endl;
fclose(fp_sort_file);
fclose(fp_unsort_file);
return 0;
}
位图排序的实现示例
#include <iostream>
#include <memory.h>
#define BYTESIZE 8
using namespace std;
// Sets bit number `posi` in the bitmap that starts at `p`.
//
// Bit k is stored in byte k / BYTESIZE, at bit offset k % BYTESIZE counted
// from the least significant bit.
//
// p    - start of the bitmap; the caller must provide at least
//        posi / BYTESIZE + 1 bytes, because this function has no way to
//        check the buffer length.
// posi - zero-based bit position. Negative positions are ignored, since they
//        would otherwise produce a negative shift count.
void setBit(char *p,int posi)
{
    if (!p || posi < 0)
        return;
    // Index the target byte directly instead of stepping the pointer one
    // byte at a time.
    char &target = p[posi / BYTESIZE];
    target = static_cast<char>(target | (0x01 << (posi % BYTESIZE)));
}
// Demonstrates bitmap sorting on a small array of distinct non-negative
// integers: each value sets one bit, then the set bits are printed in
// ascending order, separated by spaces.
int main()
{
    int num[] = {3,2,5,7,12,24,9,8,6};
    const int numCount = sizeof(num) / sizeof(num[0]);

    // The bitmap must be able to hold the largest value. A fixed 2-byte
    // buffer only covers bits 0..15, so the value 24 would be written
    // outside the allocation; size the buffer from the data instead.
    int maxValue = 0;
    for(int i = 0;i < numCount;i ++)
    {
        if(num[i] > maxValue)
            maxValue = num[i];
    }
    const int BufferLen = maxValue / BYTESIZE + 1;

    char *pBuffer = new char[BufferLen];
    memset(pBuffer,0,BufferLen);
    for(int i = 0;i < numCount;i ++)
        setBit(pBuffer,num[i]);

    // Print the sorted result, one byte of the bitmap at a time. The buffer
    // is indexed rather than advanced so the original pointer is still
    // available for delete[] below.
    for(int i = 0;i < BufferLen;i ++)
    {
        for(int j = 0;j < BYTESIZE;j ++)
        {
            if( (pBuffer[i]&(0x01<<j)) != 0)
                std::cout << i * BYTESIZE + j << " ";
        }
    }

    delete [] pBuffer;
    return 0;
}
归并排序方式实现
//copyright@ 纯净的天空 && yansha
//5、July,updated,2010.05.28。
#include <iostream>
#include <ctime>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//#include "ExternSort.h"
using namespace std;
//使用多路归并进行外排序的类
//ExternSort.h
/** 大数据量的排序
* 多路归并排序
* 以千万级整数从小到大排序为例
* 一个比较简单的例子,没有建立内存缓冲区*/
#ifndef EXTERN_SORT_H
#define EXTERN_SORT_H
#include <cassert>
// External multi-way merge sort for a text file of whitespace-separated
// decimal integers.
//
// Phase 1 (memory_sort) reads the input in chunks of at most `count`
// integers, sorts each chunk in memory and writes it to "temp<N>.txt" in the
// current directory. Phase 2 (merge_sort) merges all temporary files into the
// output file in ascending order. The temporary files are left on disk.
class ExternSort
{
public:
    // Runs both phases and prints the elapsed wall-clock time in seconds.
    void sort()
    {
        time_t start = time(NULL);
        // Sort the input chunk by chunk and write each chunk to a temp file.
        int file_count = memory_sort();
        // Merge the temp files into the output file.
        merge_sort(file_count);
        time_t end = time(NULL);
        // time() counts seconds, so the difference is reported directly;
        // scaling by CLOCKS_PER_SEC only applies to clock() ticks.
        printf("total time:%f s\n", difftime(end, start));
    }

    // input_file: path of the file to sort
    // out_file:   path of the sorted output file
    // count:      maximum number of integers sorted in memory at once
    // Both paths are copied, so the caller's strings need not outlive the
    // object.
    ExternSort(const char *input_file, const char * out_file, int count)
    {
        m_count = count;
        m_in_file = new char[strlen(input_file) + 1];
        strcpy(m_in_file, input_file);
        m_out_file = new char[strlen(out_file) + 1];
        strcpy(m_out_file, out_file);
    }

    virtual ~ExternSort()
    {
        delete [] m_in_file;
        delete [] m_out_file;
    }

private:
    // Not copyable: the object owns m_in_file / m_out_file, and a member-wise
    // copy would free both buffers twice. Declared but intentionally left
    // undefined.
    ExternSort(const ExternSort &);
    ExternSort &operator=(const ExternSort &);

    int m_count;        // chunk length (number of integers per in-memory sort)
    char *m_in_file;    // owned copy of the input path
    char *m_out_file;   // owned copy of the output path

protected:
    // Reads up to n integers from f into a and returns how many were read.
    // Stops at end of file and also at the first token that is not an
    // integer, so malformed input cannot cause an endless loop.
    int read_data(FILE* f, int a[], int n)
    {
        int i = 0;
        while(i < n && fscanf(f, "%d", &a[i]) == 1)
            i++;
        printf("read:%d integer\n", i);
        return i;
    }

    // Writes the first n elements of a to f, each followed by a space.
    void write_data(FILE* f, int a[], int n)
    {
        for(int i = 0; i < n; ++i)
            fprintf(f, "%d ", a[i]);
    }

    // Returns the name of temporary file number `index` ("temp<index>.txt").
    // The buffer is allocated with new[]; the caller must release it with
    // delete[].
    char* temp_filename(int index)
    {
        char *tempfile = new char[100];
        sprintf(tempfile, "temp%d.txt", index);
        return tempfile;
    }

    // qsort comparator for int, ascending. Uses comparisons rather than
    // subtraction so that operands of large magnitude cannot overflow.
    static int cmp_int(const void *a, const void *b)
    {
        const int lhs = *static_cast<const int*>(a);
        const int rhs = *static_cast<const int*>(b);
        return (lhs > rhs) - (lhs < rhs);
    }

    // Phase 1: splits the input into sorted temporary files.
    // Returns the number of temporary files written (0 if the input cannot be
    // opened or the chunk size is not positive). If a temporary file cannot
    // be created, an error is printed and splitting stops, so only the chunks
    // written so far are counted.
    int memory_sort()
    {
        if(m_count <= 0)
            return 0;
        FILE* fin = fopen(m_in_file, "rt");
        if(fin == NULL)
        {
            perror(m_in_file);
            return 0;
        }
        int n = 0, file_count = 0;
        int *array = new int[m_count];
        // Sort every chunk of up to m_count integers and write it out.
        while(( n = read_data(fin, array, m_count)) > 0)
        {
            qsort(array, n, sizeof(int), cmp_int);
            char *fileName = temp_filename(file_count);
            FILE *tempFile = fopen(fileName, "w");
            if(tempFile == NULL)
            {
                perror(fileName);
                delete [] fileName;
                break;
            }
            delete [] fileName;   // allocated with new[] in temp_filename
            ++file_count;
            write_data(tempFile, array, n);
            fclose(tempFile);
        }
        delete [] array;
        fclose(fin);
        return file_count;
    }

    // Phase 2: merges temp0.txt .. temp<file_count-1>.txt into the output
    // file. Each temporary file is expected to be sorted ascending; a file
    // that cannot be opened is reported and treated as empty.
    void merge_sort(int file_count)
    {
        if(file_count <= 0)
            return;
        FILE *fout = fopen(m_out_file, "wt");
        if(fout == NULL)
        {
            perror(m_out_file);
            return;
        }
        FILE* *farray = new FILE*[file_count];
        int *data = new int[file_count];        // current head value per file
        bool *hasNext = new bool[file_count];   // false once a file is drained
        int i;
        for(i = 0; i < file_count; ++i)
        {
            char* fileName = temp_filename(i);
            farray[i] = fopen(fileName, "rt");
            if(farray[i] == NULL)
                perror(fileName);
            delete [] fileName;
            data[i] = 0;
            // Prime data[i] with the first value of each file.
            hasNext[i] = farray[i] != NULL && fscanf(farray[i], "%d", &data[i]) == 1;
        }
        while(true)
        {
            // Find the first file that still has a value ...
            int j = 0;
            while (j < file_count && !hasNext[j])
                j++;
            if (j >= file_count)
                break;   // every file is drained: merge finished
            // ... then the file holding the smallest head value. The
            // comparison is against data[j] of a live file, never against the
            // leftover value of an already drained file.
            for(i = j + 1; i < file_count; ++i)
            {
                if(hasNext[i] && data[i] < data[j])
                    j = i;
            }
            fprintf(fout, "%d ", data[j]);
            // Advance the file the value was taken from.
            if(fscanf(farray[j], "%d", &data[j]) != 1)
                hasNext[j] = false;
        }
        delete [] hasNext;
        delete [] data;
        for(i = 0; i < file_count; ++i)
        {
            if(farray[i] != NULL)
                fclose(farray[i]);
        }
        delete [] farray;
        fclose(fout);
    }
};
#endif
//测试主函数文件
/** 大文件排序*
数据不能一次性全部装入内存*
排序文件里有多个整数,
整数之间用空格隔开*/
const unsigned int count = 10000000;
// 文件里数据的行数
const unsigned int number_to_sort = 1000000;
//在内存中一次排序的数量
const char *unsort_file = "data.txt";
//原始未排序的文件名
const char *sort_file = "sort_data.txt";
//已排序的文件名
void init_data(unsigned int num);
//随机生成数据文件
int main(int argc, char* *argv)
{
srand(time(NULL));
init_data(count);
ExternSort extSort(unsort_file, sort_file, number_to_sort);
extSort.sort();
return 0;
}
// Writes `num` pseudo-random integers (rand() values), each followed by a
// space, to the file named by unsort_file, replacing any previous content.
// If the file cannot be opened, an error is printed and nothing is written.
void init_data(unsigned int num)
{
    FILE* f = fopen(unsort_file, "wt");
    if(f == NULL)
    {
        perror(unsort_file);
        return;
    }
    // Unsigned counter to match the type of `num`.
    for(unsigned int i = 0; i < num; ++i)
        fprintf(f, "%d ", rand());
    fclose(f);
}
还有一种是桶排序方式实现的
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <ctime>
#define LOW 18 //桶大小
#define FILE_NUM 39 //桶对应的文件数
#define MEM_SIZE 256*1024
using namespace::std;
int memory[MEM_SIZE]; //1M
//对ifp中的数据进行排序,结果输出到ofp中 ,i是正在处理的桶的编号
// Counting-sorts the contents of one bucket file.
//
// ifp - bucket file, positioned at its start; only the low 18 bits of every
//       value read are used
// ofp - output file; receives the bucket's values in ascending order, each
//       followed by a space
// i   - bucket number; it supplies the high bits (i << LOW) of every value
//       written
//
// Uses the global `memory` array as the counter table, one counter per
// possible low-18-bit value.
void sort(FILE*ifp, FILE *ofp, int i)
{
    // Clear the whole counter table, whatever the size of int is.
    memset(memory,0,sizeof(memory));

    // The high bits come from the bucket number alone. They must not be taken
    // from the values read: a bucket file that stores only the low 18 bits
    // would always yield 0 there.
    const int high=i<<LOW;

    int d;
    while(fscanf(ifp, "%d", &d)==1)
    {
        ++memory[d&0x3ffff];   // count occurrences of each low-18-bit value
    }

    // Emit every value as often as it was counted, in ascending order.
    for (int low=0; low<MEM_SIZE; ++low)
    {
        for (int repeat=memory[low]; repeat>0; --repeat)
        {
            fprintf(ofp,"%d ",low|high);
        }
    }
}
int main()
{
FILE *fp_tmp[FILE_NUM];
FILE *fp_data;
if(NULL==(fp_data=fopen("data.txt","r"))) //打开测试数据
exit(0);
int d;
int i;
time_t start = time(NULL); //开始计时
for (i=0; i<FILE_NUM; ++i) //创建桶对应的FILE_NUM个文件
{
char buf[64]="tmp_";
char buf_int[4];
itoa(i, buf_int, 10);
strcat(buf,buf_int);
strcat(buf,".txt");
if((fp_tmp[i]=fopen(buf,"w+"))==NULL)
exit(0);
}
while(fscanf(fp_data,"%d",&d)==1) //读入数据存放到各个桶中
{
int i = d >> LOW; //不管这个数多大,右移18位啊,都变成0了
fprintf(fp_tmp[d >> LOW], "%d ",d&0x3ffff);
}
for (i=0; i<FILE_NUM; ++i) //初始化文件指针
{
rewind(fp_tmp[i]);
}
FILE * out_fp;
if(NULL==(out_fp=fopen("out.txt","w"))) //out.txt用于保存排序后的数据
exit(0);
for (i=0; i<FILE_NUM; ++i)
{
sort(fp_tmp[i],out_fp,i); //分别对每个桶进行排序
}
for (i=0; i<FILE_NUM; ++i) //关闭文件
{
fclose(fp_tmp[i]);
}
time_t end = time(NULL); //停止计时
printf("total time:%f/n", (end - start) * 1000.0/ CLOCKS_PER_SEC);
return 0;
}
一个不会敲代码的程序员