外部二路归并排序的小尝试
了解了外部排序的入门知识后,打算简单实践一下。(虽然不是什么原理很难的东西,省略写出焦油坑然后调试半天的若干过程……)
默认元素数据类型int,使用fstream输入输出,测试在本机上跑1e8的数据集
全部读入主存用时约35.8秒,最高内存占用接近350M
在1e4大小的模拟主存跑用时约497.3秒,最高内存占用接近11M
耗时约为前者的14倍,内存占用约为前者的3%,大概适合在嵌入式设备大吞吐量读写场景应用?
(虽然经过测试,可能存在小问题,逃)
1.外部二路归并排序代码
#include <bits/stdc++.h>
#include <cmath>
typedef unsigned long long ull;
const int MAXMM = 1e4; //max main memory
// Simulated main memory: init() never holds more than MAXMM ints here at once.
int mm[MAXMM];
// Termination flag for the merge passes; written by way2_merge().
int ok = false;
// Total number of elements being sorted; assigned in main() from init()'s result.
int total;

// Run-generation phase of the external merge sort.
//
// Reads whitespace-separated ints from "a1.txt" in chunks of at most MAXMM,
// sorts each chunk inside mm, and writes the sorted runs alternately to
// "b1.txt" and "b2.txt" (the first run goes to "b1.txt"). Both output files
// are created/truncated even when there is nothing to read.
//
// Returns the number of integers read (0 if "a1.txt" is missing or empty).
int init()
{
    std::ifstream fin("a1.txt");
    std::ofstream fout1("b1.txt"), fout2("b2.txt");
    std::ofstream *fout = &fout1;
    int cnt = 0;
    while(fin)
    {
        // Fill the buffer; the size test comes first so that no value is
        // extracted from the stream once the buffer is already full.
        int n = 0;
        for(int t = 0; n < MAXMM && fin >> t; ++n)
        {
            mm[n] = t;
        }
        cnt += n;
        std::sort(mm, mm + n);
        for(int j = 0; j < n; ++j)
        {
            *fout << mm[j] << ' ';
        }
        // Switch to the other run file for the next chunk.
        fout = (fout == &fout1) ? &fout2 : &fout1;
    }
    return cnt;
}
// Merges one run from each of two inputs into fout.
//
// in1, in2 : input iterators, advanced in place so the next call continues
//            where this one stopped.
// ieof     : end-of-input sentinel compared against in1 / in2.
// fout     : destination; every value is followed by a single space.
// lim      : maximum number of elements taken from each input in this call.
//
// When the two front elements compare equal, the one from in2 is written first.
//
// Side effects: sets the global ok to whether one input alone supplied `total`
// elements in this call, and prints the per-input counts to stdout. Neither
// happens when both inputs are already exhausted on entry.
//
// Returns 0 if both inputs were exhausted on entry (nothing written), else 1.
template<typename it>
int way2_merge(it &in1, it &in2, it &ieof, std::ofstream &fout, int lim = MAXMM)
{
    if(in1 == ieof && in2 == ieof) return 0;
    int p1 = 0, p2 = 0;
    // Both runs still have elements: emit the smaller front element.
    while(p1 < lim && in1 != ieof && p2 < lim && in2 != ieof)
    {
        if(*in1 < *in2)
        {
            fout << *in1 << ' ';
            ++in1;
            ++p1;
        }
        else
        {
            fout << *in2 << ' ';
            ++in2;
            ++p2;
        }
    }
    // At most one of the two loops below does any work: drain the leftover run.
    for(; p1 < lim && in1 != ieof; ++in1, ++p1)
    {
        fout << *in1 << ' ';
    }
    for(; p2 < lim && in2 != ieof; ++in2, ++p2)
    {
        fout << *in2 << ' ';
    }
    ok = (p1 == total) || (p2 == total);
    printf("p1: %d, p2: %d\n", p1, p2);
    return 1;
}
// Driver of the external two-way merge sort.
//
// init() splits "a1.txt" into sorted runs in "b1.txt"/"b2.txt". Each pass then
// merges pairs of runs from the two input files alternately into the two
// output files, swaps the roles of the file pairs, and doubles the run length,
// until way2_merge() reports through ok that a single call covered all elements.
int main()
{
    int cnt = init(); total = cnt;
    // With no elements way2_merge() returns before it ever sets ok, so the
    // pass loop below would never terminate. There is nothing to merge anyway.
    if(cnt == 0) return 0;
    typedef const char *cstring;
    cstring in1 = "b1.txt", in2 = "b2.txt", out1 = "a1.txt", out2 = "a2.txt";
    const int int_max = std::numeric_limits<int>::max();
    // Per-input run length for the current pass; starts at MAXMM and doubles.
    int lim = MAXMM;
    for(int ex = 0; !ok; ex++)
    {
        // The streams live only for one pass, so the outputs are flushed and
        // closed before they are reopened as inputs of the next pass.
        std::ifstream fin1(in1), fin2(in2);
        std::ofstream fout1(out1), fout2(out2);
        std::istream_iterator<int> _in1(fin1), _in2(fin2), ieof;
        int result;
        do{
            printf("pass %d:\n", ex);
            result = way2_merge(_in1, _in2, ieof, fout1, lim);
            result &= way2_merge(_in1, _in2, ieof, fout2, lim);
        }
        while(result);
        std::swap(in1, out1);
        std::swap(in2, out2);
        // Double the run length without overflowing int; total is an int, so
        // a limit of int_max already covers any run that can occur.
        lim = (lim > int_max / 2) ? int_max : lim * 2;
    }
    return 0;
}
2.全部读入主存代码
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
// In-memory buffer; its size is the maximum number of values that can be sorted.
int arr[int(1e8)];

// Baseline: loads the integers of "a.txt" into memory, sorts them and writes
// them to "b.txt", each value followed by a space.
int main()
{
    std::ifstream fin("a.txt");
    std::ofstream fout("b.txt");
    const int capacity = sizeof(arr) / sizeof(arr[0]);
    // Count what was actually read so that a shorter (or missing) input is
    // not padded with zero entries in the sorted output.
    int n = 0;
    while(n < capacity && fin >> arr[n])
    {
        ++n;
    }
    std::sort(arr, arr + n);
    for(int i = 0; i < n; ++i)
    {
        // The separator keeps the output parseable as a list of integers.
        fout << arr[i] << ' ';
    }
    return 0;
}
3.造数据用的
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
// Number of random integers to generate.
const int MAXN = 1e8;

// Test-data generator: writes MAXN space-separated rand() values to "a1.txt".
// "a2.txt", "b1.txt" and "b2.txt" are opened for output as well and receive no
// data, which leaves them empty.
int main()
{
    srand(time(0));
    std::ofstream data("a1.txt");
    std::ofstream scratch_a2("a2.txt");
    std::ofstream scratch_b1("b1.txt");
    std::ofstream scratch_b2("b2.txt");
    int remaining = MAXN;
    while(remaining-- > 0)
    {
        data << rand() << ' ';
    }
    return 0;
}
4. 测试正确性用的
#include <bits/stdc++.h>
// Default number of elements expected by check().
const int MAXN = 1e8;

// Verifies that `filename` starts with a non-decreasing sequence of exactly
// `total` integers.
//
// Counts values up to the first one that is smaller than its predecessor (or
// up to the end of readable input), prints that count next to `total`, and
// returns whether the two are equal. A missing or empty file counts as 0.
bool check(const char *filename, int total = MAXN)
{
    std::ifstream fin(filename);
    int prev = 0;
    int cnt = 0;
    // Only count the first value if it was really read.
    if(fin >> prev)
    {
        cnt = 1;
        int cur;
        while(fin >> cur && prev <= cur)
        {
            ++cnt;
            prev = cur;
        }
    }
    printf("cnt: %d, total: %d\n", cnt, total);
    return cnt == total;
}
// Checks that the two files contain the same multiset of integers.
//
// Every value read from `sorted` adds one to its counter and every value read
// from `src` subtracts one; the files match when all counters end at zero.
// Counters are keyed by value, so any int (negative or large) is handled.
//
// Returns false if either file cannot be opened.
bool identify(const char *sorted, const char *src)
{
    std::ifstream fin1(sorted), fin2(src);
    if(!fin1 || !fin2) return false;
    std::map<int, long long> balance;
    int tmp;
    while(fin1 >> tmp)
        ++balance[tmp];
    while(fin2 >> tmp)
        --balance[tmp];
    for(std::map<int, long long>::const_iterator p = balance.begin(); p != balance.end(); ++p)
        if(p->second != 0) return false;
    return true;
}
// Checker driver: reports whether "a1.txt" or, failing that, "b1.txt" passes
// check(), and whether "a1.txt" holds the same values as "a.txt".
int main()
{
    const char *const verdict[2] = {"Failure", "Success"};
    // "b1.txt" is only examined when "a1.txt" does not pass.
    bool sorted_ok = check("a1.txt");
    if(!sorted_ok)
    {
        sorted_ok = check("b1.txt");
    }
    printf("cnt: %s\n", verdict[sorted_ok ? 1 : 0]);
    const bool same_values = identify("a1.txt", "a.txt");
    printf("identification: %s\n", verdict[same_values ? 1 : 0]);
    return 0;
}