大文件去重

若文件中每行存储一个字符串(如下图所示),要求对这些行进行去重

可将数据存入HashSet,如下,但如果文件很大,大于虚拟机内存的话,会报异常java.lang.OutOfMemoryError: Java heap space

        // Collects every distinct non-blank line of the file in memory. This only works while
        // all distinct lines fit in the JVM heap.
        HashSet<String> set = new HashSet<String>();
        File file = new File("E:\\aa.txt");
        // try-with-resources closes the reader even if readLine() fails part-way through.
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String tempString;
            while ((tempString = reader.readLine()) != null) {
                tempString = tempString.trim();
                // Compare by content: `tempString != ""` compares object references, so blank
                // lines read from the file are not reliably filtered out.
                if (!tempString.isEmpty()) {
                    System.out.println(tempString);
                    set.add(tempString);
                }
            }
        }

可尝试用分批读取,用Hash取模方法将大文件拆分成若干小文件,再将若干个小文件的数据存入HashSet,最后汇总结果

首先插入测试数据aa.txt

//多线程插入测试数据
    /**
     * Generates the test file {@code E:\aa.txt} by running nine {@code SetClass} workers on a
     * fixed thread pool; all workers write to one shared {@link PrintWriter}.
     *
     * <p>The method blocks until every worker has counted down the latch. If the calling thread
     * is interrupted while waiting, the interrupt flag is restored and the writer is closed even
     * though workers may still be running, so the file may be incomplete in that case.
     *
     * @throws FileNotFoundException if {@code E:\aa.txt} cannot be opened for writing
     */
    public  void set() throws FileNotFoundException {
        final int workerCount = 9;
        File file = new File("E:\\aa.txt");
        ExecutorService executorService = Executors.newFixedThreadPool(workerCount);
        // The writer is opened as a resource so it is flushed and closed on every exit path.
        try (PrintWriter pws = new PrintWriter(file)) {
            CountDownLatch latch = new CountDownLatch(workerCount);
            for (int i = 0; i < workerCount; i++) {
                executorService.execute(
                        new SetClass("name+" + UUID.randomUUID().toString(), latch, file, pws));
            }
            latch.await(); // blocks until the latch count reaches 0
        } catch (InterruptedException e) {
            // Restore the flag so callers further up the stack can still see the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } finally {
            // Always release the pool, including when the writer could not be opened.
            executorService.shutdown();
        }
    }

/**
 * Worker that writes 100000 random UUID lines to a shared {@link PrintWriter} and then counts
 * down the supplied latch exactly once.
 *
 * <p>Progress is printed using the name of the thread that actually executes {@link #run()},
 * which is not necessarily the name passed to the constructor (for example when the instance is
 * handed to an executor as a plain {@link Runnable}).
 */
public class SetClass extends Thread{
        private final CountDownLatch countDownLatch;
        // Kept only because the constructor accepts it; run() never reads this field.
        private final File file;
        private final PrintWriter pws;

        /**
         * @param name            thread name passed to {@link Thread#Thread(String)}
         * @param countDownLatch1 latch to count down when this worker finishes
         * @param file            target file (stored but not used by {@link #run()})
         * @param pws             shared writer that receives the generated lines
         */
        public SetClass(String name,  CountDownLatch countDownLatch1,File file,PrintWriter pws){
            super(name);
            this.countDownLatch = countDownLatch1;
            this.file = file;
            this.pws=pws;
        }

        @Override
        public void run() {
            try {
                for(int i=0;i<100000;i++){
                    pws.println(UUID.randomUUID().toString());
                    System.out.println(Thread.currentThread().getName()+":"+i);
                }
            } finally {
                // Count down even if the loop throws; otherwise a thread blocked in
                // latch.await() would wait forever for this worker.
                countDownLatch.countDown();
            }
        }
}

 对大文件进行拆分:利用Hash取模,将内容相同(重复)的行存入同一个小文件

/**
     * Splits a text file into {@code splitSize} bucket files by hashing each line.
     *
     * <p>Every line is trimmed; lines that are empty after trimming are skipped. A line goes to
     * bucket {@code Math.abs(line.hashCode() % splitSize)}, so equal lines always end up in the
     * same bucket file. The buckets are written as {@code 0.txt .. (splitSize-1).txt} inside a
     * {@code test} folder next to the target file (or relative to the working directory when the
     * target path has no parent). Existing bucket files are overwritten.
     *
     * <p>I/O failures are printed to standard error and not rethrown; the returned array is
     * still fully populated, but the bucket contents may then be incomplete.
     *
     * @param targetFile path of the file to de-duplicate
     * @param splitSize number of bucket files to create
     * @return the bucket files, indexed by bucket number
     */
    public static File[] splitFile(String targetFile,int splitSize){
        File file = new File(targetFile);
        PrintWriter[] pws = new PrintWriter[splitSize];
        File[] littleFiles = new File[splitSize];
        // getParentFile() may be null for a bare file name; File(null, "test") then resolves to
        // a relative "test" folder instead of the literal path "null/test".
        File tempFolder = new File(file.getParentFile(), "test");
        if(!tempFolder.exists()){
            tempFolder.mkdirs();
        }
        String bucketDir = tempFolder.getAbsolutePath();
        // Populate the result first so callers never see null entries, even after a failure.
        for(int i=0;i<splitSize;i++){
            littleFiles[i] = new File(bucketDir, i + ".txt");
        }
        try {
            for(int i=0;i<splitSize;i++){
                // PrintWriter(File) truncates an existing file, so no explicit delete is needed.
                pws[i] = new PrintWriter(littleFiles[i]);
            }
            try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
                String line;
                // readLine() streams the file line by line instead of loading it all at once.
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    // Content comparison: `!= ""` compares references and lets empty lines through.
                    if(!line.isEmpty()){
                        // |hash % splitSize| < splitSize, so Math.abs cannot overflow here.
                        int index = Math.abs(line.hashCode() % splitSize);
                        pws[index].println(line);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            for(int i=0;i<splitSize;i++){
                if(pws[i] != null){
                    pws[i].close();
                }
            }
        }
        return littleFiles;
    }

对小文件进行去重并合并结果

/**
     * De-duplicates each bucket file and appends the unique lines to one output file.
     *
     * <p>Buckets are processed one at a time: the non-empty lines of a bucket are collected in a
     * set, written to the output, and the set is cleared before the next bucket is read. Only
     * one bucket's distinct lines are held in memory at once. The output file is created, or
     * overwritten if it already exists. The order of lines in the output is unspecified.
     *
     * <p>All bucket files are deleted when the method finishes, whether or not it succeeded.
     * I/O failures are printed to standard error and not rethrown.
     *
     * @param littleFiles bucket files produced by the split step; must contain at least
     *                    {@code splitSize} entries
     * @param distinctFilePath path of the de-duplicated output file
     * @param splitSize number of bucket files to process (a count, not a file size)
     */
    public static void distinct(File[] littleFiles,String distinctFilePath,int splitSize){
        File distinctedFile = new File(distinctFilePath);
        // PrintWriter(File) creates the file or truncates an existing one.
        try (PrintWriter pw = new PrintWriter(distinctedFile)) {
            Set<String> unicSet = new HashSet<String>();
            for(int i=0;i<splitSize;i++){
                if(!littleFiles[i].exists()){
                    continue;
                }
                System.out.println("开始对小文件:" + littleFiles[i].getName() + "去重");
                // Each reader is closed as soon as its bucket is consumed, so open file handles
                // do not accumulate across buckets.
                try (BufferedReader br = new BufferedReader(new FileReader(littleFiles[i]))) {
                    String line;
                    while((line = br.readLine())!=null){
                        // Content comparison: `line != ""` compares references and is always true.
                        if(!line.isEmpty()){
                            unicSet.add(line);
                        }
                    }
                }
                for(String s:unicSet){
                    pw.println(s);
                }
                // Reuse the same set for the next bucket.
                unicSet.clear();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Remove the temporary bucket files once merging is finished.
            for(int i=0;i<splitSize;i++){
                if(littleFiles[i].exists()){
                    littleFiles[i].delete();
                }
            }
        }
    }

 

posted @ 2024-09-24 23:03  DaDa~  阅读(16)  评论(0编辑  收藏  举报