bitmap与桶方式对1000万数据进行排序(转+自己实现理解)

1.  1000万数据的产生,随机数方式

#include <iostream>
#include
<time.h>
#include
<assert.h>
#include
<stdio.h>
#include
<stdlib.h>

// NOTE: the original `using namespace std;` is intentionally dropped here.
// In C++17 it makes the global `size` below ambiguous with std::size.

// Number of distinct integers written to data.txt (the values 0 .. size-1).
const int size = 10000000;

// Working array for the permutation. Kept at namespace scope because an
// array of this many ints is far too large for the stack.
int num[size];

// Returns a pseudo-random index in [0, size).
// Two rand() draws are combined because RAND_MAX may be as small as 32767.
// The arithmetic is done in unsigned long long: rand() * RAND_MAX overflows
// int (undefined behaviour, possibly a negative index) when RAND_MAX is
// 2^31 - 1.
static int random_index()
{
    const unsigned long long wide =
        static_cast<unsigned long long>(rand()) * RAND_MAX + rand();
    return static_cast<int>(wide % size);
}

// Generates the test input: writes a random permutation of 0 .. size-1 to
// "data.txt", values separated by single spaces.
// Returns 0 on success, 1 if the file cannot be created.
int main()
{
    FILE *fp = fopen("data.txt", "w");
    if (fp == NULL)
    {
        perror("data.txt");
        return 1;
    }

    // Fill every slot that is later written out. The previous loop ran
    // n = 1 .. size, which stored to num[size] (one past the end) and left
    // num[0] at its zero-initialised value.
    for (int n = 0; n < size; n++)
        num[n] = n;

    // Shuffle by swapping `size` randomly chosen pairs.
    srand((unsigned)time(NULL));
    for (int n = 0; n < size; n++)
    {
        const int i = random_index();
        const int j = random_index();
        std::swap(num[i], num[j]);
    }

    for (int n = 0; n < size; n++)
        fprintf(fp, "%d ", num[n]);
    fclose(fp);

    return 0;
}

  使用 bitset 进行排序

//位图方式解决海量数据排序,数据不能有重复

//使用 C++ stl的 bitset
#include <iostream>
#include
<bitset>
#include
<assert.h>
#include
<time.h>
#include
<stdio.h>
#include
<stdlib.h>
using namespace std;
// Number of distinct values one bitmap pass covers; main() sizes its bitset
// with this and handles the range 0..9999999 in two passes of this width.
const int max_each_scan = 5000000;
int main()
{
clock_t begin
= clock();
bitset
<max_each_scan> bit_map;
bit_map.reset();
// open the file with the unsorted data
FILE *fp_unsort_file = fopen("data.txt", "r");
assert(fp_unsort_file);
int num;

// the first time scan to sort the data between 0 - 4999999
while (fscanf(fp_unsort_file, "%d ", &num) != EOF)
{
if (num < max_each_scan)
//有这个数字,将bit_map的对应的位设置为1
bit_map.set(num, 1);
}
FILE
*fp_sort_file = fopen("sort.txt", "w");
assert(fp_sort_file);
int i;
// write the sorted data into file
for (i = 0; i < max_each_scan; i++)
{
if (bit_map[i] == 1)
fprintf(fp_sort_file,
"%d ", i);
}

// the second time scan to sort the data between 5000000 - 9999999
int result = fseek(fp_unsort_file, 0, SEEK_SET);
if (result)
cout
<< "fseek failed!" << endl;
else
{
bit_map.reset();
while (fscanf(fp_unsort_file, "%d ", &num) != EOF)
{
if (num >= max_each_scan && num < 10000000)
{
num
-= max_each_scan;
bit_map.
set(num, 1);
}
}
for (i = 0; i < max_each_scan; i++)
{
if (bit_map[i] == 1)
fprintf(fp_sort_file,
"%d ", i + max_each_scan);
}
}
clock_t end
= clock();

cout
<<"用位图的方法,耗时:"<<endl;
cout
<< (end - begin) / CLK_TCK << "s" << endl;
fclose(fp_sort_file);
fclose(fp_unsort_file);
return 0;
}

  位图排序的实现示例

#include <iostream>
#include
<memory.h>
#define BYTESIZE 8 // number of bits stored per char of the bitmap
using namespace std;

// Sets bit number `posi` in the bitmap starting at `p`.
// Bit k lives in byte k / BYTESIZE, at bit position k % BYTESIZE (LSB first).
//   p    - start of the bitmap; the caller must provide at least
//          posi / BYTESIZE + 1 bytes.
//   posi - zero-based bit index. Negative indices are ignored: they would
//          otherwise shift by a negative amount (undefined behaviour) or set
//          an unrelated bit.
void setBit(char *p,int posi)
{
    if (!p || posi < 0)
        return;

    // Index the target byte directly instead of stepping to it one byte at
    // a time.
    char *byte = p + posi / BYTESIZE;
    *byte = static_cast<char>(*byte | (0x01 << (posi % BYTESIZE)));
}
// Demo of bitmap sorting: marks each (distinct, non-negative) input value as
// one bit, then prints the indices of the set bits in ascending order,
// separated by spaces.
int main()
{
    int num[] = {3,2,5,7,12,24,9,8,6};
    const int numCount = sizeof(num) / sizeof(num[0]);

    // Size the bitmap from the largest value. The former fixed 2-byte buffer
    // only had room for bits 0..15, so setting bit 24 wrote past the end of
    // the allocation.
    int largest = 0;
    for (int i = 0; i < numCount; i++)
    {
        if (num[i] > largest)
            largest = num[i];
    }
    const int BufferLen = largest / BYTESIZE + 1;

    char *pBuffer = new char[BufferLen];
    memset(pBuffer, 0, BufferLen);

    for (int i = 0; i < numCount; i++)
        setBit(pBuffer, num[i]);

    // Print the sorted result, one byte of the bitmap at a time. The buffer
    // is indexed rather than advanced so the original pointer is still
    // available for delete[].
    for (int i = 0; i < BufferLen; i++)
    {
        for (int j = 0; j < BYTESIZE; j++)
        {
            if ((pBuffer[i] & (0x01 << j)) != 0)
                cout << i * BYTESIZE + j << " ";
        }
    }

    delete [] pBuffer;
    return 0;
}

  归并排序方式实现

//copyright@ 纯净的天空 && yansha
//5、July,updated,2010.05.28。
#include <iostream>
#include
<ctime>
#include
<fstream>
#include
<stdio.h>
#include
<stdlib.h>
#include
<string.h>
//#include "ExternSort.h"
using namespace std;
//使用多路归并进行外排序的类
//ExternSort.h
/*
* 大数据量的排序
* 多路归并排序
* 以千万级整数从小到大排序为例
* 一个比较简单的例子,没有建立内存缓冲区
*/
#ifndef EXTERN_SORT_H
#define EXTERN_SORT_H
#include
<cassert>
// External sort of a text file of whitespace-separated ints, ascending.
//
// Phase 1 (memory_sort): read the input in chunks of at most `count`
// integers, sort each chunk in memory and write it to temp<k>.txt.
// Phase 2 (merge_sort): k-way merge of the temp files into the output file.
//
// No buffering beyond stdio's own is done, and the temp files are left in
// the working directory after the sort.
class ExternSort
{
public:
    // Runs both phases and prints the elapsed wall-clock time in seconds.
    void sort()
    {
        const time_t start = time(NULL);
        // Split the input into sorted runs, one temp file per run.
        const int file_count = memory_sort();
        // Merge the runs into the output file.
        merge_sort(file_count);
        const time_t end = time(NULL);
        // time() counts seconds, so difftime() is the right conversion;
        // scaling by CLOCKS_PER_SEC (a clock() unit) was meaningless.
        printf("total time:%f\n", difftime(end, start));
    }

    // input_file: path of the file to sort
    // out_file:   path of the file that receives the sorted integers
    // count:      maximum number of integers sorted in memory at once
    ExternSort(const char *input_file, const char * out_file, int count)
    {
        m_count = count;
        m_in_file = new char[strlen(input_file) + 1];
        strcpy(m_in_file, input_file);
        m_out_file = new char[strlen(out_file) + 1];
        strcpy(m_out_file, out_file);
    }

    virtual ~ExternSort()
    {
        delete [] m_in_file;
        delete [] m_out_file;
    }

private:
    // Declared but not defined: the class owns raw buffers, so an implicit
    // copy would lead to a double delete[].
    ExternSort(const ExternSort &);
    ExternSort &operator=(const ExternSort &);

    int m_count;       // chunk size, in integers
    char *m_in_file;   // input path (owned copy)
    char *m_out_file;  // output path (owned copy)

protected:
    // Reads up to n integers from f into a. Returns how many were read.
    int read_data(FILE* f, int a[], int n)
    {
        int i = 0;
        // "== 1" (not "!= EOF") so that a non-numeric token ends the read
        // instead of looping forever on a 0 return.
        while (i < n && fscanf(f, "%d", &a[i]) == 1)
            i++;
        printf("read:%d integer\n", i);
        return i;
    }

    // Writes the first n integers of a to f, each followed by a space.
    void write_data(FILE* f, int a[], int n)
    {
        for (int i = 0; i < n; ++i)
            fprintf(f, "%d ", a[i]);
    }

    // Returns the name of temp file number `index` ("temp<index>.txt") in a
    // buffer allocated with new[]; the caller must release it with delete[].
    char* temp_filename(int index)
    {
        const size_t capacity = 100;
        char *tempfile = new char[capacity];
        snprintf(tempfile, capacity, "temp%d.txt", index);
        return tempfile;
    }

    // qsort comparator for ints, ascending. Uses comparisons rather than
    // subtraction, which overflows for operands of opposite sign.
    static int cmp_int(const void *a, const void *b)
    {
        const int lhs = *static_cast<const int *>(a);
        const int rhs = *static_cast<const int *>(b);
        return (lhs > rhs) - (lhs < rhs);
    }

    // Phase 1. Returns the number of temp files written (0 if the input is
    // empty, cannot be opened, or the chunk size is not positive).
    int memory_sort()
    {
        if (m_count <= 0)
        {
            fprintf(stderr, "ExternSort: chunk size must be positive\n");
            return 0;
        }
        FILE *fin = fopen(m_in_file, "r");
        if (fin == NULL)
        {
            perror(m_in_file);
            return 0;
        }

        int file_count = 0;
        int *buffer = new int[m_count];
        int n = 0;
        // Every chunk of up to m_count integers is sorted in memory and
        // written to its own temp file.
        while ((n = read_data(fin, buffer, m_count)) > 0)
        {
            qsort(buffer, n, sizeof(int), cmp_int);

            char *fileName = temp_filename(file_count);
            FILE *tempFile = fopen(fileName, "w");
            if (tempFile == NULL)
            {
                perror(fileName);
                delete [] fileName;
                break;
            }
            // The name comes from new[], so it must be released with
            // delete[] (not free()).
            delete [] fileName;
            ++file_count;

            write_data(tempFile, buffer, n);
            fclose(tempFile);
        }
        delete [] buffer;
        fclose(fin);
        return file_count;
    }

    // Phase 2. Merges temp0.txt .. temp<file_count-1>.txt into the output
    // file. Does nothing when file_count <= 0.
    void merge_sort(int file_count)
    {
        if (file_count <= 0)
            return;

        FILE *fout = fopen(m_out_file, "w");
        if (fout == NULL)
        {
            perror(m_out_file);
            return;
        }

        FILE **farray = new FILE*[file_count];
        int *data = new int[file_count];       // current head value of each run
        bool *hasNext = new bool[file_count];  // false once a run is exhausted

        int i;
        for (i = 0; i < file_count; ++i)
        {
            char *fileName = temp_filename(i);
            farray[i] = fopen(fileName, "r");
            if (farray[i] == NULL)
                perror(fileName);
            delete [] fileName;

            // Prime each run with its first value; an unreadable or empty
            // run simply takes no part in the merge.
            data[i] = 0;
            hasNext[i] = (farray[i] != NULL)
                         && (fscanf(farray[i], "%d", &data[i]) == 1);
        }

        while (true)
        {
            // Find the first run that still has a value.
            int j = 0;
            while (j < file_count && !hasNext[j])
                j++;
            if (j >= file_count)
                break;  // all runs exhausted

            // Start the minimum from that live run. Seeding it from data[0]
            // emitted a stale value once run 0 was exhausted.
            int smallest = data[j];
            for (i = j + 1; i < file_count; ++i)
            {
                if (hasNext[i] && data[i] < smallest)
                {
                    smallest = data[i];
                    j = i;
                }
            }

            fprintf(fout, "%d ", smallest);

            // Advance the run the minimum came from.
            if (fscanf(farray[j], "%d", &data[j]) != 1)
                hasNext[j] = false;
        }

        delete [] hasNext;
        delete [] data;
        for (i = 0; i < file_count; ++i)
        {
            if (farray[i] != NULL)
                fclose(farray[i]);
        }
        delete [] farray;
        fclose(fout);
    }
};
#endif
// Test driver for the external sort.
/*
 * Sorting a large file whose data cannot all be loaded into memory at once.
 * The file to sort contains many integers separated by spaces.
 */
// Number of integers main() asks init_data() to write to the unsorted file.
const unsigned int count = 10000000;
// Number of integers sorted in memory at a time (passed to ExternSort).
const unsigned int number_to_sort = 1000000;
// Name of the original, unsorted file.
const char *unsort_file = "data.txt";
// Name of the sorted output file.
const char *sort_file = "sort_data.txt";
// Fills the unsorted file with random integers.
void init_data(unsigned int num);
//随机生成数据文件
int main(int argc, char* *argv)
{
srand(time(NULL));
init_data(count);
ExternSort extSort(unsort_file, sort_file, number_to_sort);
extSort.sort();

return 0;
}
// Writes `num` pseudo-random integers (rand() values), each followed by a
// space, to the file named by unsort_file, replacing any previous contents.
// Terminates the program with EXIT_FAILURE if the file cannot be created;
// the unchecked fopen used to hand a null FILE* to fprintf in that case.
void init_data(unsigned int num)
{
    FILE *f = fopen(unsort_file, "wt");
    if (f == NULL)
    {
        perror(unsort_file);
        exit(EXIT_FAILURE);
    }

    // Unsigned counter to match `num` and avoid a signed/unsigned comparison.
    for (unsigned int i = 0; i < num; ++i)
        fprintf(f, "%d ", rand());

    fclose(f);
}

  还有一种是桶排序方式实现的

#include <stdio.h>
#include
<stdlib.h>
#include
<string.h>
#include
<iostream>
#include
<ctime>
#define LOW 18 // number of low-order bits kept inside a bucket; bucket index = value >> LOW
#define FILE_NUM 39 // number of bucket files: covers values below 39 << 18 = 10223616

// Number of distinct low-bit patterns per bucket (2^LOW). Parenthesised so
// the macro stays a single operand in any expression (e.g. x % MEM_SIZE).
#define MEM_SIZE (256*1024)
using namespace::std;

int memory[MEM_SIZE]; // one counter per possible low-18-bit value


// Counting sort of one bucket.
//   ifp - bucket file positioned at its start, containing whitespace-separated
//         integers; only their low LOW bits are used.
//   ofp - receives the sorted values, each followed by a space.
//   i   - index of the bucket being processed; supplies the high bits
//         (i << LOW) of every value written.
void sort(FILE*ifp, FILE *ofp, int i)
{
    // sizeof instead of a hard-coded 1024*1024, which silently assumed
    // 4-byte ints.
    memset(memory, 0, sizeof(memory));

    // The high bits come from the bucket index only. They used to be
    // overwritten with the high bits of the first value read, which are
    // always zero because the bucket files store just the low 18 bits.
    const int high = i << LOW;
    const int low_mask = (1 << LOW) - 1;

    int d;
    while (fscanf(ifp, "%d", &d) == 1)
    {
        ++memory[d & low_mask]; // count occurrences of each low-bit pattern
    }

    // Emit every low-bit pattern as often as it was seen, in ascending order.
    for (int low = 0; low < MEM_SIZE; ++low)
    {
        for (int remaining = memory[low]; remaining > 0; --remaining)
        {
            fprintf(ofp, "%d ", low | high);
        }
    }
}

int main()
{
FILE
*fp_tmp[FILE_NUM];
FILE
*fp_data;

if(NULL==(fp_data=fopen("data.txt","r"))) //打开测试数据
exit(0);
int d;
int i;

time_t start
= time(NULL); //开始计时

for (i=0; i<FILE_NUM; ++i) //创建桶对应的FILE_NUM个文件
{
char buf[64]="tmp_";
char buf_int[4];
itoa(i, buf_int,
10);
strcat(buf,buf_int);
strcat(buf,
".txt");

if((fp_tmp[i]=fopen(buf,"w+"))==NULL)
exit(
0);

}

while(fscanf(fp_data,"%d",&d)==1) //读入数据存放到各个桶中
{
int i = d >> LOW; //不管这个数多大,右移18位啊,都变成0了

fprintf(fp_tmp[d
>> LOW], "%d ",d&0x3ffff);
}
for (i=0; i<FILE_NUM; ++i) //初始化文件指针
{
rewind(fp_tmp[i]);

}

FILE
* out_fp;
if(NULL==(out_fp=fopen("out.txt","w"))) //out.txt用于保存排序后的数据
exit(0);

for (i=0; i<FILE_NUM; ++i)
{

sort(fp_tmp[i],out_fp,i);
//分别对每个桶进行排序

}



for (i=0; i<FILE_NUM; ++i) //关闭文件
{
fclose(fp_tmp[i]);

}
time_t end
= time(NULL); //停止计时

printf(
"total time:%f/n", (end - start) * 1000.0/ CLOCKS_PER_SEC);

return 0;
}

  

posted @ 2011-08-20 21:50  wtx  阅读(2526)  评论(1编辑  收藏  举报