1亿个数引发的思考(一)
1) 如何生成1亿随机数
答:生成有序数组,遍历时生成随机下标进行交换
2)存储1亿个数字文件多大?
答:约1G
3)1亿代表的是一类什么问题?
答:内存不够用的问题,因为内存不能无限扩张,数据却可以
4)内存不够用时怎么办?
答:借助外存,并充分利用内存
5)外存文件需要一开始就切割成小块,然后分别排序,再归并吗?
答:不一定,用RandomAccessFile流的形式加载文件,加载一部分排序一部分并输出为小文件,直到大文件加载完毕。
然后将所有小文件RandomAccessFile流的形式加载,进行N路归并,即所有小文件中最小的输出到大文件,并从小文件移除。
重复这个过程直到处理完毕所有小文件。
6)堆排序和归并排序设计思想有关联吗?
答:???
7)实现一个归并排序
具体实现
import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import cn.hutool.core.util.RandomUtil; public class MergeTest { // 1)生成大数组 // 2) 填充小数组,快排后转存 // 3) N路归并 // 4) 寻找重复 public static void main(String[] args) throws IOException { //generate1yi(); int piece = 0; int cap = 1000000; SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); { System.out.println("拆分开始:" + sdf.format(new Date())); // 拆分 BufferedReader br = new BufferedReader(new FileReader(new File("D://out/1yi.txt"))); String line = null; int size = 0; List<Integer> list = new ArrayList<>(); while ((line = br.readLine()) != null) { list.add(Integer.valueOf(line)); size++; if (size == cap) { size = 0; split(list, piece++); list = new ArrayList<>(); } } if (size != 0) { split(list, piece++); } br.close(); System.out.println("归并开始:" + sdf.format(new Date())); } { // N路归并 Set<LineQueue> queueSet = new HashSet<>(); for (int i = 0; i < piece; i++) { queueSet.add(new LineQueue("D://out/" + i + ".txt", "utf-8", 10000)); } BufferedWriter bw = new BufferedWriter(new FileWriter(new File("D://out/1yi_ordered.txt"))); for (;;) { LineQueue max = null; Integer i = null; List<LineQueue> removed = new ArrayList<>(); for (LineQueue q : queueSet) { Integer next = q.next(false); if (next == null) { // 一个基于文件行的阻塞管理,消费完毕后移除 removed.add(q); continue; } if (i == null || next < i) { i = next; max = q; } } if (removed.size() > 0) { for (LineQueue q : removed) { try { q.close(); } catch (Exception e) { e.printStackTrace(); } queueSet.remove(q); } removed.clear(); } if (max == null) { break; } Integer next = max.next(true); max = null; i = 0; bw.append(next + "\r\n"); // 逢1一千万打印日志 if (next % 10000000 == 0) { 
System.out.println("归并进度:" + sdf.format(new Date()) + " >>>>" + next); } } bw.close(); } // 比较相邻是否相同 Integer last = null; Map<Integer, Integer> occur = new HashMap<>(); System.out.println("遍历开始:" + sdf.format(new Date())); BufferedReader br = new BufferedReader(new FileReader(new File("D://out/1yi_ordered.txt"))); String line = null; while ((line = br.readLine()) != null) { Integer curr = Integer.valueOf(line); if (last == null) { last = curr; continue; } if (curr.equals(last)) { Integer integer = occur.get(curr); if (integer == null) { occur.put(curr, 2); } else { occur.put(curr, integer + 1); } } last = curr; } br.close(); System.out.println("处理结束:" + sdf.format(new Date())); System.out.println("重复数字:" + sdf.format(new Date()) + " >>>>" + occur); } public static void split(List<Integer> list, int piece) throws IOException { BufferedWriter bw = new BufferedWriter(new FileWriter(new File("D://out/" + piece + ".txt"))); int[] arr = new int[list.size()]; for (int i = 0; i < list.size(); i++) { arr[i] = list.get(i); } quickSort(arr, "asc"); for (int i : arr) { bw.append(i + "\r\n"); } bw.flush(); bw.close(); } public static void quickSort(int arr[], String sortType) { quickSort(arr, 0, arr.length - 1, sortType); } // start vs end // low vs high // left vs right // 充分利用数组空间 (不管如何交换,数据都在) // 递归 private static void quickSort(int[] arr, int l, int h, String t) { if (l >= h) { return; } int s = l; int e = h; boolean asc = "asc".equalsIgnoreCase(t); int b = arr[s]; while (s < e) { for (; s < e; e--) { if (asc ? b > arr[e] : b < arr[e]) { exchange(arr, s, arr[e], e, b); break; } } for (; s < e; s++) { if (asc ? 
b < arr[s] : b > arr[s]) { exchange(arr, e, arr[s], s, b); break; } } } quickSort(arr, l, s - 1, t); quickSort(arr, s + 1, h, t); } static void exchange(int arr[], int a, int av, int b, int bv) { arr[a] = av; arr[b] = bv; } private static void generate1yi() throws IOException { int max = 10000 * 10000; int i1 = RandomUtil.randomInt(0, max); int i2 = RandomUtil.randomInt(0, max); int[] randomInts = RandomUtil.randomInts(max); File file = new File("D://out//1yi.txt"); FileWriter fileWriter = new FileWriter(file); for (int i : randomInts) { fileWriter.append(i + "\r\n"); if (i == i1) { fileWriter.append(666 + "\r\n"); } else if (i == i2) { fileWriter.append(888 + "\r\n"); } } fileWriter.close(); } }
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

/**
 * Streams the integers of a text file (one per line) through a bounded queue that is filled
 * by a background reader thread.
 *
 * <p>The reader blocks once {@code capacity} values are buffered, so memory use stays bounded
 * no matter how large the file is. The class is written for a single consumer thread calling
 * {@link #next(boolean)}.
 */
public class LineQueue implements AutoCloseable {

    /** How long the consumer sleeps between checks while the buffer is empty. */
    private static final long POLL_INTERVAL_MILLIS = 10L;

    private final String file;
    private final Thread t;
    private final BlockingQueue<Integer> q;
    /** Set once the reader thread has stopped; volatile so the consumer is guaranteed to see it. */
    private volatile boolean f = false;
    /** Error that ended the reader thread early, or {@code null}. Written before {@link #f}. */
    private volatile Throwable failure;

    /**
     * Opens {@code file} and immediately starts reading it in the background.
     *
     * @param file     path of the file to read
     * @param encoding charset name used to decode the file
     * @param capacity maximum number of values buffered ahead of the consumer
     */
    public LineQueue(String file, String encoding, int capacity) {
        this.file = file;
        q = new ArrayBlockingQueue<Integer>(capacity);
        t = new Thread(() -> readAll(file, encoding));
        // A reader blocked on a full queue must not keep the JVM alive if close() is forgotten.
        t.setDaemon(true);
        t.start();
    }

    /** Reader thread body: pushes every line into the queue, then marks the queue as finished. */
    private void readAll(String path, String encoding) {
        try (BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), encoding))) {
            String l;
            while ((l = bufferedReader.readLine()) != null) {
                q.put(Integer.valueOf(l));
            }
        } catch (InterruptedException e) {
            // close() asked the reader to stop; this is a normal shutdown, not a failure.
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            failure = e;
        } finally {
            // Always signal completion, otherwise a failed read would leave next() waiting forever.
            f = true;
        }
    }

    /**
     * Returns the next value, waiting until one is available or the file is exhausted.
     *
     * @param remove {@code true} to consume the value, {@code false} to only peek at it
     * @return the next value, or {@code null} once every value has been consumed
     * @throws IllegalStateException if the reader thread failed (missing file, malformed line,
     *                               I/O error) and no buffered values are left
     */
    public Integer next(boolean remove) {
        boolean interrupted = false;
        try {
            while (q.isEmpty() && !f) {
                try {
                    Thread.sleep(POLL_INTERVAL_MILLIS);
                } catch (InterruptedException e) {
                    // Keep waiting as before, but remember the interrupt for the caller.
                    interrupted = true;
                }
            }
            // poll() instead of take(): take() would block forever on an exhausted queue.
            Integer head = remove ? q.poll() : q.peek();
            if (head == null && failure != null) {
                throw new IllegalStateException("Failed to read " + file, failure);
            }
            return head;
        } finally {
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Stops the reader thread and discards buffered values.
     * Safe to call more than once and after the file has been fully consumed.
     */
    @Override
    public void close() throws Exception {
        // Interrupt unconditionally: the reader may still be running while the queue is empty.
        t.interrupt();
        q.clear();
    }
}
输出结果:
拆分开始:2023-04-27 19:54:13 归并开始:2023-04-27 19:54:45 归并进度:2023-04-27 19:54:45 >>>>0 归并进度:2023-04-27 19:55:53 >>>>10000000 归并进度:2023-04-27 19:57:00 >>>>20000000 归并进度:2023-04-27 19:58:07 >>>>30000000 归并进度:2023-04-27 19:59:15 >>>>40000000 归并进度:2023-04-27 20:00:21 >>>>50000000 归并进度:2023-04-27 20:01:24 >>>>60000000 归并进度:2023-04-27 20:02:28 >>>>70000000 归并进度:2023-04-27 20:03:32 >>>>80000000 归并进度:2023-04-27 20:04:36 >>>>90000000 遍历开始:2023-04-27 20:05:38 处理结束:2023-04-27 20:05:43 重复数字:2023-04-27 20:05:49 >>>>{888=2, 666=2}
结论:
该方法验证通过
拆分耗时大约半分钟(按输出日志 19:54:13 → 19:54:45,约32秒)
归并耗时大约11分钟
获取重复数字大约6秒
亮点:LineQueue (借鉴自己很早之前做文件行处理时并发任务工具)
整体处理数量级是1亿,内存占用理论可控(没输出内存占用情况)
0-1亿数据存储到文本文件磁盘占用 943 MB (988,888,900 字节)
思考:内存中二路归并排序算法?
思考:BitSet
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· 阿里巴巴 QwQ-32B真的超越了 DeepSeek R-1吗?
· 【译】Visual Studio 中新的强大生产力特性
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 【设计模式】告别冗长if-else语句:使用策略模式优化代码结构