随笔- 96 文章- 24 评论- 28 阅读- 183万

bitmap与桶方式对1000万数据进行排序（转+自己实现理解）

1. 1000万数据的产生，随机数方式

#include <iostream>
#include <time.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

using namespace std;
// Number of integers written to data.txt (ten million).
const int size = 10000000;
// Work buffer holding the values while they are shuffled. The scope operator
// is used on `size` throughout because this program has `using namespace std;`
// and C++17 standard headers also declare std::size.
int num[::size];

// Returns a pseudo-random index in [0, size).
// Two rand() results are combined because a single call may cover as little
// as 0..32767. The arithmetic is done in unsigned long long so the product
// cannot overflow (and go negative) when RAND_MAX is as large as INT_MAX.
static int random_index()
{
    const unsigned long long span = static_cast<unsigned long long>(RAND_MAX) + 1ULL;
    const unsigned long long combined =
        static_cast<unsigned long long>(rand()) * span
        + static_cast<unsigned long long>(rand());
    return static_cast<int>(combined % static_cast<unsigned long long>(::size));
}

// Writes a shuffled sequence of the distinct integers 0 .. size-1 to
// data.txt, separated by single spaces.
// Returns 0 on success, 1 if data.txt cannot be created.
int main()
{
    FILE *fp = fopen("data.txt", "w");
    if (fp == NULL)
    {
        perror("data.txt");
        return 1;
    }

    // Fill every slot that is later written to the file. Indexing from 1 to
    // size inclusive would store one element past the end of the array and
    // leave num[0] at its zero initial value.
    for (int n = 0; n < ::size; n++)
        num[n] = n;

    // Shuffle by swapping `size` randomly chosen pairs.
    srand(static_cast<unsigned>(time(NULL)));
    for (int n = 0; n < ::size; n++)
    {
        const int i = random_index();
        const int j = random_index();
        std::swap(num[i], num[j]);
    }

    for (int n = 0; n < ::size; n++)
        fprintf(fp, "%d ", num[n]);
    fclose(fp);

    return 0;
}

　　使用 bitset 进行排序

//位图方式解决海量数据排序，数据不能有重复

//使用 C++ stl的 bitset
#include <iostream>
#include <bitset>
#include <assert.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
using namespace std;
// Number of distinct values one pass can track (one bit per value).
const int max_each_scan = 5000000;

// Sorts the distinct non-negative integers stored in data.txt and writes them
// in ascending order to sort.txt, using a bitmap that covers max_each_scan
// values at a time. The input is scanned once per pass; pass k handles the
// values in [k * max_each_scan, (k + 1) * max_each_scan).
// Values outside [0, 2 * max_each_scan) cannot be represented and are skipped
// with a warning. Duplicate values collapse into one (a bit is either set or
// not). Returns 0 on success, 1 if a file cannot be opened.
int main()
{
    const int pass_count = 2;
    const long long value_limit = static_cast<long long>(pass_count) * max_each_scan;

    clock_t begin = clock();

    // static: the bitmap is about 610 KB, which is better kept off the stack.
    static std::bitset<max_each_scan> bit_map;

    // open the file with the unsorted data
    FILE *fp_unsort_file = fopen("data.txt", "r");
    if (fp_unsort_file == NULL)
    {
        perror("data.txt");
        return 1;
    }
    FILE *fp_sort_file = fopen("sort.txt", "w");
    if (fp_sort_file == NULL)
    {
        perror("sort.txt");
        fclose(fp_unsort_file);
        return 1;
    }

    long out_of_range = 0;
    for (int pass = 0; pass < pass_count; ++pass)
    {
        const int base = pass * max_each_scan;

        // Every pass after the first re-reads the input from the beginning.
        if (pass > 0 && fseek(fp_unsort_file, 0, SEEK_SET) != 0)
        {
            std::cout << "fseek failed!" << std::endl;
            break;
        }

        bit_map.reset();
        int num;
        // "== 1" (not "!= EOF") so that a non-numeric token ends the scan
        // instead of being retried forever.
        while (fscanf(fp_unsort_file, "%d", &num) == 1)
        {
            if (num < 0 || num >= value_limit)
            {
                // bitset::set throws std::out_of_range for an invalid
                // position, so such values must never reach it.
                if (pass == 0)
                    ++out_of_range;
                continue;
            }
            if (num >= base && num - base < max_each_scan)
                bit_map.set(static_cast<size_t>(num - base));
        }

        // write the sorted data of this pass into the output file
        for (int i = 0; i < max_each_scan; i++)
        {
            if (bit_map[i])
                fprintf(fp_sort_file, "%d ", base + i);
        }
    }
    clock_t end = clock();

    if (out_of_range > 0)
        fprintf(stderr, "warning: skipped %ld value(s) outside [0, %lld)\n",
                out_of_range, value_limit);

    std::cout<<"用位图的方法，耗时："<<std::endl;
    // CLOCKS_PER_SEC is the standard name (CLK_TCK is obsolete); divide as
    // double so fractions of a second are not truncated.
    std::cout << static_cast<double>(end - begin) / CLOCKS_PER_SEC << "s" << std::endl;
    fclose(fp_sort_file);
    fclose(fp_unsort_file);
    return 0;
}

　　位图排序的实现示例

#include <iostream>
#include <memory.h>
#define BYTESIZE 8
using namespace std;
// Turns on bit number `posi` in the bitmap that starts at `p`.
// Bits are numbered from 0; bit k lives in byte k / BYTESIZE at position
// k % BYTESIZE, counted from the least significant bit. The caller must
// provide a buffer with at least posi / BYTESIZE + 1 bytes.
void setBit(char *p,int posi)
{
    // Step forward to the byte that holds the requested bit.
    const int bytesToSkip = posi / BYTESIZE;
    if (bytesToSkip > 0)
        p += bytesToSkip;

    // OR the single-bit mask into that byte.
    *p |= (0x01 << (posi % BYTESIZE));
}
// Demonstrates bitmap sorting on a small array of distinct non-negative
// integers: every value sets its own bit, then the bitmap is read back from
// the lowest bit to the highest, which prints the values in ascending order.
int main()
{
    int num[] = {3,2,5,7,12,24,9,8,6};
    const int numCount = sizeof(num) / sizeof(num[0]);

    // The bitmap needs one bit per value in 0..max, so size it from the
    // largest element; a fixed 2-byte buffer only covers 0..15 and setting
    // bit 24 would write past its end.
    int maxValue = num[0];
    for(int i = 1;i < numCount;i ++)
    {
        if(num[i] > maxValue)
            maxValue = num[i];
    }
    const int BufferLen = maxValue / BYTESIZE + 1;
    char *pBuffer = new char[BufferLen];
    memset(pBuffer,0,BufferLen);

    for(int i = 0;i < numCount;i ++)
       setBit(pBuffer,num[i]);

    // Print the sorted result, one byte of the bitmap at a time. The buffer
    // is indexed rather than advanced so the pointer can still be released.
    for(int i = 0;i < BufferLen;i ++)
    {
        for(int j = 0;j < BYTESIZE;j ++)
        {
            if( (pBuffer[i]&(0x01<<j)) == (0x01<<j))
            std::cout << i * BYTESIZE + j << " ";
        }
    }
    delete [] pBuffer;
    return 0;
}

　　归并排序方式实现

//copyright@ 纯净的天空 && yansha
//5、July，updated，2010.05.28。
#include <iostream>
#include <ctime>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//#include "ExternSort.h"
using namespace std;
//使用多路归并进行外排序的类
//ExternSort.h
/** 大数据量的排序
* 多路归并排序
* 以千万级整数从小到大排序为例
* 一个比较简单的例子，没有建立内存缓冲区*/
#ifndef EXTERN_SORT_H
#define EXTERN_SORT_H
#include <cassert>
// External (multi-way merge) sort for a text file of whitespace-separated
// integers that is too large to sort in memory at once.
//
// The input is read in chunks of at most `count` integers; each chunk is
// sorted in memory and written to its own temporary file (temp0.txt,
// temp1.txt, ... in the current directory). The temporary files are then
// merged into the output file in ascending order. The temporary files are
// left on disk afterwards.
class ExternSort
{
public:
    // Runs the complete sort (chunking, then merging) and prints the elapsed
    // wall-clock time in seconds.
    void sort()
    {
        time_t start = time(NULL);
        // Sort the input chunk by chunk into temporary files.
        int file_count = memory_sort();
        // Merge the temporary files into the output file.
        merge_sort(file_count);
        time_t end = time(NULL);
        // time() counts seconds, so the difference is already in seconds.
        printf("total time:%f\n", difftime(end, start));
    }

    // input_file: path of the file to sort
    // out_file:   path of the file that receives the sorted integers
    // count:      maximum number of integers sorted in memory at a time
    ExternSort(const char *input_file, const char * out_file, int count)
    {
        m_count = count;
        m_in_file = new char[strlen(input_file) + 1];
        strcpy(m_in_file, input_file);
        m_out_file = new char[strlen(out_file) + 1];
        strcpy(m_out_file, out_file);
    }

    virtual ~ExternSort()
    {
        delete [] m_in_file;
        delete [] m_out_file;
    }

private:
    // Not copyable: the object owns m_in_file / m_out_file, and a memberwise
    // copy would release the same buffers twice. Declared, never defined.
    ExternSort(const ExternSort&);
    ExternSort& operator=(const ExternSort&);

    int m_count;        // chunk length (number of integers per in-memory sort)
    char *m_in_file;    // path of the input file
    char *m_out_file;   // path of the output file

protected:
    // Reads up to n integers from f into a. Stops at end of file or at the
    // first token that is not an integer. Returns how many were stored.
    int read_data(FILE* f, int a[], int n)
    {
        int i = 0;
        // "== 1" rather than "!= EOF": a non-numeric token makes fscanf
        // return 0 without consuming input, which must end the loop.
        while(i < n && fscanf(f, "%d", &a[i]) == 1)
            i++;
        printf("read:%d integer\n", i);
        return i;
    }

    // Writes the first n integers of a to f, each followed by a space.
    void write_data(FILE* f, int a[], int n)
    {
        for(int i = 0; i < n; ++i)
            fprintf(f, "%d ", a[i]);
    }

    // Returns the name of temporary file number `index` in a buffer
    // allocated with new[]; the caller releases it with delete [].
    char* temp_filename(int index)
    {
        char *tempfile = new char[100];
        sprintf(tempfile, "temp%d.txt", index);
        return tempfile;
    }

    // qsort comparator for ascending ints. Compares instead of subtracting,
    // because a - b overflows for operands of large opposite magnitude.
    static int cmp_int(const void *a, const void *b)
    {
        const int lhs = *static_cast<const int*>(a);
        const int rhs = *static_cast<const int*>(b);
        return (lhs > rhs) - (lhs < rhs);
    }

    // Splits the input into sorted temporary files of at most m_count
    // integers each. Returns the number of temporary files written
    // (0 if the input cannot be opened or the chunk size is not positive).
    int memory_sort()
    {
        if(m_count <= 0)
        {
            fprintf(stderr, "ExternSort: chunk size must be positive\n");
            return 0;
        }
        FILE* fin = fopen(m_in_file, "rt");
        if(fin == NULL)
        {
            perror(m_in_file);
            return 0;
        }
        int n = 0, file_count = 0;
        int *array = new int[m_count];
        // Sort every chunk of up to m_count integers and store it in its own
        // temporary file.
        while(( n = read_data(fin, array, m_count)) > 0)
        {
            // The in-memory sort relies on the C library's qsort.
            qsort(array, n, sizeof(int), cmp_int);
            char *fileName = temp_filename(file_count);
            FILE *tempFile = fopen(fileName, "w");
            if(tempFile == NULL)
            {
                perror(fileName);
                delete [] fileName;
                break;
            }
            delete [] fileName;   // allocated with new[] in temp_filename
            ++file_count;
            write_data(tempFile, array, n);
            fclose(tempFile);
        }
        delete [] array;
        fclose(fin);
        return file_count;
    }

    // Merges temporary files 0 .. file_count-1 (each already sorted) into
    // the output file in ascending order.
    void merge_sort(int file_count)
    {
        if(file_count <= 0)
            return;
        FILE *fout = fopen(m_out_file, "wt");
        if(fout == NULL)
        {
            perror(m_out_file);
            return;
        }
        FILE* *farray = new FILE*[file_count];
        int *data = new int[file_count];        // current head value of each file
        bool *hasNext = new bool[file_count];   // false once a file is exhausted
        int i;
        for(i = 0; i < file_count; ++i)
        {
            char* fileName = temp_filename(i);
            farray[i] = fopen(fileName, "rt");
            if(farray[i] == NULL)
                perror(fileName);
            delete [] fileName;
            data[i] = 0;
            // Load the first value of every file; a file that cannot be
            // opened or is empty simply takes no part in the merge.
            hasNext[i] = (farray[i] != NULL)
                         && (fscanf(farray[i], "%d", &data[i]) == 1);
        }
        while(true)
        {
            // Find the first file that still has a value.
            int j = 0;
            while (j < file_count && !hasNext[j])
                j++;
            if (j >= file_count)
                break;  // every file is exhausted: the merge is complete

            // Start from that file's head. Seeding from data[0] would use a
            // stale value once file 0 has been read to the end.
            int min = data[j];
            for(i = j + 1; i < file_count; ++i)
            {
                if(hasNext[i] && data[i] < min)
                {
                    min = data[i];
                    j = i;
                }
            }
            // Advance the file the minimum came from.
            if(fscanf(farray[j], "%d", &data[j]) != 1)
                hasNext[j] = false;
            fprintf(fout, "%d ", min);
        }
        delete [] hasNext;
        delete [] data;
        for(i = 0; i < file_count; ++i)
        {
            if(farray[i] != NULL)
                fclose(farray[i]);
        }
        delete [] farray;
        fclose(fout);
    }
};
#endif
// Test driver for ExternSort.
/** Sorting a large file:
 * the data cannot be loaded into memory all at once.
 * The file to sort holds many integers
 * separated by spaces. */
// Number of integers main() asks init_data() to write to the data file.
const unsigned int count = 10000000;
// Number of integers sorted in memory at a time (chunk size given to ExternSort).
const unsigned int number_to_sort = 1000000;
// Name of the original, unsorted file.
const char *unsort_file = "data.txt";
// Name of the sorted output file.
const char *sort_file = "sort_data.txt";
// Fills the unsorted data file with `num` random integers.
void init_data(unsigned int num);
int main(int argc, char* *argv)
{
    srand(time(NULL));
    init_data(count);
    ExternSort extSort(unsort_file, sort_file, number_to_sort);
    extSort.sort();

    return 0;
}
// Writes `num` values returned by rand(), each followed by a space, to the
// file named by unsort_file, replacing any previous content.
// Reports the error and writes nothing if the file cannot be opened.
void init_data(unsigned int num)
{
    FILE* f = fopen(unsort_file, "wt");
    if(f == NULL)
    {
        perror(unsort_file);
        return;
    }
    // Unsigned counter so the comparison with `num` is not mixed-sign.
    for(unsigned int i = 0; i < num; ++i)
        fprintf(f, "%d ", rand());
    fclose(f);
}

　　还有一种是桶排序方式实现的

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <ctime>
#define LOW 18           // number of low-order bits sorted inside one bucket
#define FILE_NUM 39       // number of bucket files

// One counter per possible value of the low LOW bits (256 * 1024 entries).
// Parenthesized and derived from LOW so the table always matches the mask.
#define MEM_SIZE (1 << LOW)
using namespace::std;

int memory[MEM_SIZE];     // counting table, 1 MB with 4-byte int


// Counting-sorts one bucket: reads every integer from ifp and appends the
// bucket's values to ofp in ascending order, each followed by a space.
// i is the number of the bucket being processed; it supplies the high bits
// (i << LOW) of every value written. Only the low LOW bits of the values
// read are used, so the bucket file may hold either full values or just
// their low bits.
void sort(FILE*ifp, FILE *ofp, int i)
{
    memset(memory, 0, sizeof(memory));

    const int low_mask = MEM_SIZE - 1;
    // The high bits come from the bucket number. They must not be taken from
    // the values read, which may carry no high bits at all.
    const int high = i << LOW;

    int d;
    while(fscanf(ifp, "%d", &d)==1)
    {
        ++memory[d & low_mask];   // count occurrences of each low part
    }

    for (int low = 0; low < MEM_SIZE; ++low)
    {
        // Emit each value as many times as it occurred.
        for (int repeat = memory[low]; repeat > 0; --repeat)
        {
            fprintf(ofp, "%d ", low | high);
        }
    }
}

// Bucket-sorts the non-negative integers in data.txt into out.txt.
// Every value is routed by its high bits (value >> LOW) into one of FILE_NUM
// temporary bucket files (tmp_0.txt, tmp_1.txt, ...); the buckets are then
// sorted one after another, in bucket order, into out.txt.
// Values whose bucket number falls outside 0..FILE_NUM-1 are skipped with a
// warning. Exits with a failure status if a file cannot be opened.
int main()
{
   FILE *fp_tmp[FILE_NUM];
   FILE *fp_data;

   if(NULL==(fp_data=fopen("data.txt","r"))) // open the test data
   {
       perror("data.txt");
       exit(EXIT_FAILURE);
   }
   int d;
   int i;

   time_t start = time(NULL);    // start timing

   for (i=0; i<FILE_NUM; ++i)      // create one file per bucket
   {
       char buf[64];
       // sprintf instead of the non-standard itoa; the name is at most a
       // dozen characters, far below the buffer size.
       sprintf(buf, "tmp_%d.txt", i);

       if((fp_tmp[i]=fopen(buf,"w+"))==NULL)
       {
           perror(buf);
           exit(EXIT_FAILURE);
       }
   }

   long skipped = 0;
   while(fscanf(fp_data,"%d",&d)==1)     // distribute the input over the buckets
   {
       // Check the sign first so the shift is only applied to non-negative
       // values, then make sure the bucket number is a valid array index.
       if (d < 0 || (d >> LOW) >= FILE_NUM)
       {
           ++skipped;
           continue;
       }
       // Store the complete value: the high bits identify the bucket and
       // must stay recoverable when the bucket file is read back.
       fprintf(fp_tmp[d >> LOW], "%d ", d);
   }
   fclose(fp_data);
   if (skipped > 0)
       fprintf(stderr, "warning: skipped %ld value(s) that fit no bucket\n", skipped);

   for (i=0; i<FILE_NUM; ++i)         // reposition every bucket file at its start
   {
       rewind(fp_tmp[i]);
   }

   FILE * out_fp;
   if(NULL==(out_fp=fopen("out.txt","w")))        // out.txt receives the sorted data
   {
       perror("out.txt");
       exit(EXIT_FAILURE);
   }

   for (i=0; i<FILE_NUM; ++i)
   {
       sort(fp_tmp[i],out_fp,i);        // sort each bucket in turn
   }

   for (i=0; i<FILE_NUM; ++i)      // close the bucket files
   {
       fclose(fp_tmp[i]);
   }
   fclose(out_fp);
   time_t end = time(NULL);         // stop timing

   // time() counts seconds, so the difference is already in seconds.
   printf("total time:%f\n", difftime(end, start));

   return 0;
}