100亿个数取前1w名

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>

static unsigned int BUF_PAGES;          /* 缓冲区有多少个page */
static unsigned int PAGE_SIZE;          /* page的大小 */
static unsigned int BUF_SIZE;           /* 缓冲区的大小, BUF_SIZE = BUF_PAGES*PAGE_SIZE */

static int *buffer;                                     /* 输入缓冲区 */
static int *heap;                                       /* 最小堆 */

long get_time_usecs();
void init_heap(int n);
void adjust(int n, int i);

int main(int argc, char **argv)
{
        unsigned int            K, idx, length;
        int                                     fd, i, bytes, element;

        long start_usecs = get_time_usecs();

        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                printf("can't open file %s\n",argv[1]);
                exit(0);
        }

        PAGE_SIZE = 4096;                                                       /* page = 4KB */
        BUF_PAGES = 512;
        BUF_SIZE = PAGE_SIZE*BUF_PAGES;                         /* 4KB*512 = 2M */
        buffer = (int *)malloc(BUF_SIZE);
        if (buffer == NULL) exit(0);

        K = atoi(argv[2]);
        heap = (int *)malloc(sizeof(int)*(K+1));
        if (heap == NULL) {
                free(buffer);
                exit(0);
        }

        bytes = read(fd, heap+1, K*sizeof(int));
        if (bytes < K*sizeof(int)) {
                printf("data size is too small\n");
                exit(0);
        }
        init_heap(K);

        idx = length = 0;
        for (;;) {
                if (idx == length) {    // 输入缓冲区用完
                        bytes = read(fd, buffer, BUF_SIZE);
                        if (bytes == 0) break;
                        length = bytes/4;
                        idx = 0;
                }
                //从buffer取出一个数,若比最小堆堆顶元素大,则替换之
                element = buffer[idx++];
                if (element > heap[1]) {
                        heap[1] = element;
                        adjust(K, 1);
                }
        }

        long end_usecs = get_time_usecs();

        printf("the top %d numbers are: \n", K);
        for (i = 1; i <= K; i++) {
                printf("%d ", heap[i]);
                if (i % 6 == 0) {
                        printf("\n");
                }
        }
        printf("\n");

        free(buffer);
        free(heap);
        close(fd);

        double secs = (double)(end_usecs - start_usecs) / (double)1000000;
        printf("program tooks %.02f seconds.\n", secs);

        return 0;
}

void init_heap(int n)
{
        int             i;

        for (i = n/2; i > 0; i--) {
                adjust(n, i);
        }
}

/* 节点i的左子树和右子树已是最小堆, 此函数的
 * 作用是把以节点i为根的树调整为最小堆 */
void adjust(int n, int i)
{
        heap[0] = heap[i];
        i = 2 * i; //左子节点
        while (i <= n) {
                if (i < n && heap[i+1] < heap[i]) {
                        i++;  //左右节点小的那个进行比较
                }
                if (heap[i] >= heap[0]) {
                        break;
                }
                heap[i / 2] = heap[i]; //左右节点都比根节点大 将较小的节点上移
                i = i * 2; //锁定到下一层节点
        }
        heap[i / 2] = heap[0]; //找到合适位置 进行更换
}

long get_time_usecs()
{
        struct timeval time;
        struct timezone tz;
        memset(&tz, '\0', sizeof(struct timezone));
        gettimeofday(&time, &tz);
        long usecs = time.tv_sec*1000000 + time.tv_usec;

        return usecs;
}

 编译:  g++ -o main main.cpp  

  产生测试数据: dd if=/dev/urandom of=random.dat bs=1M count=1024  

  运行: ./main random.dat 100

  算法复杂度:  nlog10000   

posted on 2014-11-11 13:30  kangbry  阅读(178)  评论(0编辑  收藏  举报

导航