大数据排序的实现代码,理论上支持几百亿条数据应该没问题。
先说说思路:
1, 把一个bigdata文件拆分成N个小文件,小文件容量小于当前机器的内存
2,对小文件进行排序处理
3,对小文件进行归并排序:代码中我采用两两归并(1 and 1)的方式,每次把两个已排序的文件归并成一个新的有序文件,直到全部归并完成
简单说说我这里的归并算法(代码中的sortBySmallFile):假设文件A有n个元素,文件B有m个元素。
归并时先取出A的第一个元素A0,依次与B0、B1……比较:当A0>Bi时,把Bi写入新文件;直到A0<=Bi时,把A0写入结果文件,然后取出A1重复上述过程;其中一个文件读完后,把另一个文件剩余的元素依次写入结果文件。
OK,上代码
package com.ben.file; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Random; /** * * @author Hejinbin * QQ 277803242 * email qing878@gmail.com */ public class BigDataSort { public final static String SMALL_FILE_PATH = "D://temp//BigData//"; public final static int BIG_NUM_LINE = 1000000; public final static String ORING_FILE_PATH = "D://temp//BigData//bigData.txt"; public final static int SMALL_FILE_LINE = 100000; //1M for 1 small file private File tempFiles[]; public BigDataSort() throws IOException { createBigsortNums(); beSmallFileAndSort(); unitFileToSort(); } private void createBigsortNums() throws IOException { BufferedWriter writer = new BufferedWriter(new FileWriter(ORING_FILE_PATH)); Random random = new Random(); for (int i = 0; i < BIG_NUM_LINE; i++) { writer.write(String.valueOf(random.nextInt(100000000))); writer.newLine();// add a new line . 
in order to show easy by file } writer.close(); } private void beSmallFileAndSort() throws IOException { BufferedReader bigDataFile = new BufferedReader(new FileReader(ORING_FILE_PATH)); List<Integer> smallLine = null; tempFiles = new File[BIG_NUM_LINE / SMALL_FILE_LINE]; for (int i = 0; i < tempFiles.length; i++) { tempFiles[i] = new File(SMALL_FILE_PATH + "sortTempFile" + i + ".temp"); BufferedWriter smallWtite = new BufferedWriter(new FileWriter(tempFiles[i])); smallLine = new ArrayList<Integer>(); for (int j = 0; j < SMALL_FILE_LINE; j++) smallLine.add(Integer.parseInt(bigDataFile.readLine())); Collections.sort(smallLine); for (Object num : smallLine.toArray()) smallWtite.write(num + "\n"); smallWtite.close(); } } private void unitFileToSort() throws IOException { File tempFile = null; for(int i=1;i<tempFiles.length;i++){ tempFile=sortBySmallFile(tempFiles[0],tempFiles[i]); tempFiles[0].delete(); tempFiles[0]=tempFile; } tempFile.renameTo(new File(ORING_FILE_PATH+"sortResult.txt")); } public static File sortBySmallFile(File fromFile, File toFile) throws IOException { BufferedReader fromRd = new BufferedReader(new FileReader(fromFile)); BufferedReader toTempRd = new BufferedReader(new FileReader(toFile)); File newSortFile = new File(SMALL_FILE_PATH + fromFile.getName() + ".temp"); BufferedWriter newSortFileWt = new BufferedWriter(new FileWriter(newSortFile)); int index = -1; int toPoint = -1; while (fromRd.ready()) { index = Integer.parseInt(fromRd.readLine()); if (index < toPoint) { newSortFileWt.write(String.valueOf(index)); newSortFileWt.newLine(); continue; } else { if (toPoint != -1) { newSortFileWt.write(String.valueOf(toPoint)); newSortFileWt.newLine(); } } while (toTempRd.ready()) { toPoint = Integer.parseInt(toTempRd.readLine()); if (toPoint < index) { newSortFileWt.write(String.valueOf(toPoint)); newSortFileWt.newLine(); } else { newSortFileWt.write(String.valueOf(index)); newSortFileWt.newLine(); break; } } } 
newSortFileWt.write(String.valueOf(index>toPoint?index:toPoint)); newSortFileWt.newLine(); newSortFileWt.close(); fromRd.close(); toTempRd.close(); toFile.delete(); return newSortFile; } public static void main(String[] args) throws IOException { BigDataSort bigDataSort = new BigDataSort(); } }
注释还没写,过几天加上。代码中可能还有些许BUG,欢迎讨论。