[846] 外排序

最近要排序一个比较大的文件,大概5G左右。文件中每行由两部分组成:“id DocNo”。其中id是一个数字,DocNo是一个字符串。我需要把这个大文件按照DocNo进行排序(字典序)。

 

要把这个文件加入内存是不太可能的,所以必须把它分解为小文件,然后把小文件加入到内存中进行排序。然后再把小文件组合起来。这有点类似于归并排序,不过需要注意的是,这里的归并是对文件的归并。

 

分文件也有几种策略。第一种可以将这个5G的文件分解成5个1G的小文件,然后分别对这个5个小文件进行排序,最后对这5个文件进行归并。归并也比较简单,可以直接对这个5个文件进行归并,也可以两两进行归并。另一种分解策略是将文件分成两个,然后对于小文件继续进行分解,知道能够装入内存。也就是说,这是一种二分的策略。

 

最后,我采用的是二分的策略,代码很简单,不过需要注意的是细节。没个细节都要考虑到。开始我就有一个小细节没有考虑到,最后导致程序跑了一个晚上后结果还是错的。不多说了,下面直接上代码(利用递归):

 

代码
<!--

Code highlighting produced by Actipro CodeHighlighter (freeware)
http://www.CodeHighlighter.com/

-->  1 using System;
  2 using System.Collections.Generic;
  3 using System.Linq;
  4 using System.Text;
  5 
  6 namespace 外排序ClueB_ID_DOCNO
  7 {
  8     class DocNoId:IComparable<DocNoId>
  9     {
 10         private string docNo;
 11         private int id;
 12 
 13         public DocNoId(string docNo, int id)
 14         {
 15             this.docNo = docNo;
 16             this.id = id;
 17         }
 18 
 19         public int CompareTo(DocNoId other)
 20         {
 21             return docNo.CompareTo(other.docNo);
 22         }
 23 
 24         public override string ToString()
 25         {
 26             return id.ToString() + " " + docNo;
 27         }
 28     }
 29 }
 30 
 31 
 32 using System;
 33 using System.Collections.Generic;
 34 using System.Linq;
 35 using System.Text;
 36 using System.IO;
 37 
 38 namespace 外排序ClueB_ID_DOCNO
 39 {
 40     class Program
 41     {
 42         static string workdir = @"D:研究数据集web09-bst";
 43         static long innerSize = 1024 * 1024 * 1024// 内排序大小
 44         static void Main(string[] args)
 45         {
 46             //using (StreamReader sr = new StreamReader(workdir + "ClueB-ID-DOCNO.txt"))
 47             //using (StreamWriter sw = new StreamWriter(workdir + "test.txt"))
 48             //{
 49             //    for (int i = 0; i < 10000; i++)
 50             //    {
 51             //        sw.WriteLine(sr.ReadLine());
 52             //    }
 53             //    sw.Flush();
 54             //}
 55             MergeSort("ClueB-ID-DOCNO.txt");
 56         }
 57 
 58         static void MergeSort(string fileName)
 59         {
 60             if (new FileInfo(workdir + fileName).Length <= innerSize) // 内存排序
 61             {
 62                 var lines = File.ReadLines(workdir + fileName);
 63                 List<DocNoId> docNoIds = new List<DocNoId>();
 64                 foreach (string line in lines)
 65                 {
 66                     string[] ls = line.Split(' ');
 67                     docNoIds.Add(new DocNoId(ls[1], Convert.ToInt32(ls[0])));
 68                 }
 69                 docNoIds.Sort();
 70                 using (StreamWriter sw = new StreamWriter(workdir + fileName))
 71                 {
 72                     foreach (DocNoId docNoId in docNoIds)
 73                     {
 74                         sw.WriteLine(docNoId);
 75                     }
 76                     sw.Flush();
 77                 }
 78             }
 79             else // 硬盘排序
 80             {
 81                 // 分解文件
 82                 string file1 = fileName + "1";
 83                 string file2 = fileName + "2";
 84                 using (StreamReader sr = new StreamReader(workdir + fileName))
 85                 using (StreamWriter sw1 = new StreamWriter(workdir + file1))
 86                 using (StreamWriter sw2 = new StreamWriter(workdir + file2))
 87                 {
 88                     int i = 0;
 89                     while (!sr.EndOfStream)
 90                     {
 91                         string line = sr.ReadLine();
 92                         if (i++ % 2 == 0)
 93                             sw1.WriteLine(line);
 94                         else
 95                             sw2.WriteLine(line);
 96                     }
 97                     sw1.Flush();
 98                     sw2.Flush();
 99                 }
100                 // 对分解出的文件,分别进行排序
101                 MergeSort(file1);
102                 MergeSort(file2);
103                 // 把两个文件进行合并
104                 using (StreamReader sr1 = new StreamReader(workdir + file1))
105                 using (StreamReader sr2 = new StreamReader(workdir + file2))
106                 using (StreamWriter sw = new StreamWriter(workdir + fileName))
107                 {
108                     string line1 = sr1.ReadLine();
109                     string[] ls1 = line1.Split(' ');
110                     string line2 = sr2.ReadLine();
111                     string[] ls2 = line2.Split(' ');
112                     while (!sr1.EndOfStream || !sr2.EndOfStream)
113                     {
114 
115                             if (ls1[1].CompareTo(ls2[1]) <= 0)
116                             {
117                                 sw.WriteLine(line1);
118                                 if (sr1.EndOfStream)
119                                 {
120                                     sw.WriteLine(line2);
121                                     while (!sr2.EndOfStream)
122                                     {
123                                         sw.WriteLine(sr2.ReadLine());
124                                     }
125                                 }
126                                 else
127                                 {
128                                     line1 = sr1.ReadLine();
129                                     ls1 = line1.Split(' ');
130                                     if (sr1.EndOfStream && sr2.EndOfStream)
131                                     {
132                                         if (ls1[1].CompareTo(ls2[1]) <= 0)
133                                         {
134                                             sw.WriteLine(line1);
135                                             sw.WriteLine(line2);
136                                         }
137                                         else
138                                         {
139                                             sw.WriteLine(line2);
140                                             sw.WriteLine(line1);
141                                         }
142                                     }
143                                 }
144                             }
145                             else
146                             {
147                                 sw.WriteLine(line2);
148                                 if (sr2.EndOfStream)
149                                 {
150                                     sw.WriteLine(line1);
151                                     while (!sr1.EndOfStream)
152                                     {
153                                         sw.WriteLine(sr1.ReadLine());
154                                     }
155                                 }
156                                 else
157                                 {
158                                     line2 = sr2.ReadLine();
159                                     ls2 = line2.Split(' ');
160                                     if (sr1.EndOfStream && sr2.EndOfStream)
161                                     {
162                                         if (ls1[1].CompareTo(ls2[1]) <= 0)
163                                         {
164                                             sw.WriteLine(line1);
165                                             sw.WriteLine(line2);
166                                         }
167                                         else
168                                         {
169                                             sw.WriteLine(line2);
170                                             sw.WriteLine(line1);
171                                         }
172                                     }
173                                 }
174                             }
175                     }
176                     sw.Flush();
177                 }
178                 // 删除临时文件
179                 File.Delete(workdir + file1);
180                 File.Delete(workdir + file2);
181             }
182         }
183     }
184 }
185 


posted on 2010-08-28 10:45  小橋流水  阅读(209)  评论(0编辑  收藏  举报

导航