外部排序
外部排序针对的是待排序文件的大小超过可用内存的情况。例如要对一个4GB的文件进行排序,而可用内存只有1GB,显然没法把整个文件直接装入内存进行排序。所以我们将这4GB的文件分割为4个1GB的文件分别进行排序。外部排序的一般步骤为:
- 1. 对分割后的小文件进行内部排序,将排好序的数据写入到4个不同文件中。
- 2. 使用4路归并排序的方法对4个小文件进行排序。(这里仅仅使用长度为4的最小优先级队列)
- 3. 先从4个文件中各读入一个数放入最小优先级队列中,则这个最小优先级队列的起始元素就是4GB文件中最小的一个数。
- 4. 每次从优先级队列中取出起始元素,写入到输出文件中,然后从该元素所属的文件中读取下一个元素插入到最小优先级队列中。
- 5. 重复第四步,直到所有小文件读取完毕。
在下面的代码中用最小堆的方式实现了最小优先级队列。代码只对已经排好序的4个文件(1.txt, 2.txt, 3.txt, 4.txt,每行一个整数)进行归并,最后得到的输出文件(11.04_data.out)即为排好序的数据文件。
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * K-way merge of already-sorted text files (one integer per line) using a
 * binary min-heap as the priority queue.
 *
 * INT_MAX (from <limits.h>) doubles as the "this input is exhausted" marker:
 * readnum() returns it at end of file and the merge loop stops once the
 * smallest key in the heap is INT_MAX.  Consequently a literal INT_MAX in
 * the input is indistinguishable from end of input.
 */

/* One heap slot: the latest number read from an input and its stream. */
struct entry_s {
    int num;
    FILE *fp;
};

/* Array-backed binary min-heap keyed on entry_s.num. */
struct prior_queue_s {
    struct entry_s *heap; /* storage for `size` entries             */
    int size;             /* capacity of `heap`                     */
    int tail;             /* index of the last used slot, -1 if empty */
};

typedef struct prior_queue_s prior_queue_t;
typedef struct entry_s entry_t;

/*
 * Allocate storage for `size` entries and mark the queue empty.
 * Every slot starts as { INT_MAX, NULL }.  Aborts if allocation fails.
 */
void prior_queue_init(prior_queue_t *pq, int size)
{
    int i;

    assert(size > 0);
    pq->heap = malloc(sizeof *pq->heap * (size_t)size);
    if (pq->heap == NULL) {
        /* Explicit check: an assert() would disappear under NDEBUG. */
        fputs("prior_queue_init: out of memory\n", stderr);
        abort();
    }
    pq->size = size;
    pq->tail = -1;

    for (i = 0; i < size; i++) {
        pq->heap[i].num = INT_MAX;
        pq->heap[i].fp = NULL;
    }
}

/* Release the heap storage and leave the queue empty with zero capacity. */
void prior_queue_destroy(prior_queue_t *pq)
{
    free(pq->heap);
    pq->heap = NULL;
    pq->size = 0;
    pq->tail = -1;
}

/* Move the entry at index i down until neither child has a smaller key. */
static void shift_down(prior_queue_t *pq, int i)
{
    entry_t *heap = pq->heap;

    for (;;) {
        int l = 2 * i + 1;
        int r = 2 * i + 2;
        int small = i;

        if (l <= pq->tail && heap[l].num < heap[small].num) {
            small = l;
        }
        if (r <= pq->tail && heap[r].num < heap[small].num) {
            small = r;
        }
        if (small == i) {
            break;
        }

        entry_t tmp = heap[i];
        heap[i] = heap[small];
        heap[small] = tmp;
        i = small;
    }
}

/* Move the entry at index i up until its parent's key is not larger. */
static void shift_up(prior_queue_t *pq, int i)
{
    entry_t *heap = pq->heap;

    while (i > 0) {
        /*
         * Children of node p live at 2p+1 and 2p+2 (see shift_down), so the
         * parent of i is (i - 1) / 2.  Using i / 2 picks the wrong node for
         * every even index.
         */
        int p = (i - 1) / 2;

        if (heap[p].num <= heap[i].num) {
            break;
        }

        entry_t tmp = heap[i];
        heap[i] = heap[p];
        heap[p] = tmp;
        i = p;
    }
}

/*
 * Return (without removing) the entry in slot 0, i.e. the smallest key of a
 * non-empty queue.  No emptiness check is performed.
 */
entry_t prior_queue_minimum(prior_queue_t *pq)
{
    return pq->heap[0];
}

/* Remove and return the entry with the smallest key.  Queue must be non-empty. */
entry_t prior_queue_extract_min(prior_queue_t *pq)
{
    assert(pq->tail >= 0);

    entry_t ret = pq->heap[0];
    pq->heap[0] = pq->heap[pq->tail];
    pq->tail--;
    shift_down(pq, 0);
    return ret;
}

/* Append x and restore heap order.  Asserts that capacity is not exceeded. */
void prior_queue_insert(prior_queue_t *pq, entry_t x)
{
    pq->tail++;
    assert(pq->tail < pq->size);
    pq->heap[pq->tail] = x;
    shift_up(pq, pq->tail);
}

/*
 * Replace the entry at index i with k and restore heap order, moving the
 * entry up when the key decreased and down otherwise.
 */
void prior_queue_change_key(prior_queue_t *pq, int i, entry_t k)
{
    entry_t *heap = pq->heap;

    assert(i >= 0 && i <= pq->tail);

    if (heap[i].num > k.num) {
        heap[i] = k;
        shift_up(pq, i);
    } else {
        heap[i] = k;
        shift_down(pq, i);
    }
}

/* Debug helper: print capacity, tail index and the keys in array order. */
void prior_queue_print(prior_queue_t *pq)
{
    int i;

    printf("size: %d\n", pq->size);
    printf("tail: %d\n", pq->tail);
    printf("data: ");
    for (i = 0; i <= pq->tail; i++) {
        printf("%-4d", pq->heap[i].num);
    }
    printf("\n");
}

/*
 * Read one line (at most 127 characters) from fp and parse it as a decimal
 * integer.  Returns INT_MAX when nothing could be read (end of file or read
 * error).  A line without digits yields 0; values outside the int range are
 * clamped instead of invoking undefined behaviour.
 */
int readnum(FILE *fp)
{
    char line[128] = {0};
    long value;

    if (fgets(line, sizeof line, fp) == NULL) {
        return INT_MAX;
    }

    value = strtol(line, NULL, 10);
    if (value > INT_MAX) {
        value = INT_MAX;
    } else if (value < INT_MIN) {
        value = INT_MIN;
    }
    return (int)value;
}

/* Write num to fp as decimal text followed by a newline. */
void writenum(FILE *fp, int num)
{
    fprintf(fp, "%d\n", num);
}

#define NFILES 4

/*
 * Define EXTSORT_NO_MAIN before this point to compile only the queue and I/O
 * helpers, e.g. to link them into a test program that has its own main().
 */
#ifndef EXTSORT_NO_MAIN
/*
 * Merge 1.txt .. NFILES.txt (each already sorted, one integer per line) into
 * 11.04_data.out.  Returns EXIT_SUCCESS, or EXIT_FAILURE if a file cannot be
 * opened or the output cannot be written.
 */
int main(void)
{
    FILE *inputs[NFILES] = {0};
    FILE *outfile = NULL;
    prior_queue_t pq;
    entry_t head;
    entry_t next;
    int status = EXIT_FAILURE;
    int i;

    prior_queue_init(&pq, NFILES);

    /* Seed the heap with the first number of every input file. */
    for (i = 0; i < NFILES; i++) {
        char file[128];

        snprintf(file, sizeof file, "%d.txt", i + 1);
        inputs[i] = fopen(file, "r");
        if (inputs[i] == NULL) {
            perror(file);
            goto cleanup;
        }

        next.num = readnum(inputs[i]);
        next.fp = inputs[i];
        prior_queue_insert(&pq, next);
    }

    outfile = fopen("11.04_data.out", "w");
    if (outfile == NULL) {
        perror("11.04_data.out");
        goto cleanup;
    }

    /*
     * Emit the smallest pending number, then replace it in place with the
     * next number from the same file.  An exhausted file contributes
     * INT_MAX, which sinks to the bottom; once the root itself is INT_MAX
     * every input has been consumed.
     */
    head = prior_queue_minimum(&pq);
    while (head.num < INT_MAX) {
        writenum(outfile, head.num);

        next.num = readnum(head.fp);
        next.fp = head.fp;
        prior_queue_change_key(&pq, 0, next);

        head = prior_queue_minimum(&pq);
    }

    if (ferror(outfile)) {
        fputs("11.04_data.out: write error\n", stderr);
        goto cleanup;
    }
    status = EXIT_SUCCESS;

cleanup:
    /* fclose() flushes buffered output, so its result matters here. */
    if (outfile != NULL && fclose(outfile) != 0) {
        perror("11.04_data.out");
        status = EXIT_FAILURE;
    }
    for (i = 0; i < NFILES; i++) {
        if (inputs[i] != NULL) {
            fclose(inputs[i]);
        }
    }
    prior_queue_destroy(&pq);
    return status;
}
#endif /* EXTSORT_NO_MAIN */