Java: deduplicating data across two CSV files
1. pom.xml configuration
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>
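Commons IO supplies the FileUtils.lineIterator used in the implementation below, which streams a CSV file line by line instead of loading the whole file into memory at once.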
2. Implementation
package com.tangxin.kafka.service;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

import java.io.*;
import java.math.BigDecimal;
import java.util.*;

/**
 * Deduplicate ids across two CSV files.
 */
public class CSVDeduplication {

    private static final String CSV_PATH = "I:\\";

    /**
     * Read the first column (the id) of every data row in the CSV at the given path.
     */
    public static List<String> ids(String path) {
        List<String> result = new ArrayList<>();
        File csv = new File(path); // CSV file path
        LineIterator it = null;
        try {
            it = FileUtils.lineIterator(csv); // stream line by line, never holding the whole file
            while (it.hasNext()) {
                String line = it.nextLine();
                if (line.trim().contains("id")) { // skip the header row
                    continue;
                }
                String[] arr = line.split(",");
                String id = arr[0].replaceAll("\"", "").trim(); // strip surrounding quotes
                result.add(id);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            LineIterator.closeQuietly(it);
        }
        return result;
    }

    public static void main(String[] args) throws Exception {
        String path1 = CSV_PATH + "100w.csv";
        String path2 = CSV_PATH + "300w.csv";

        // Load each file's ids into a Set, dropping blank entries.
        Set<String> idSet1 = new HashSet<>();
        for (String id : ids(path1)) {
            if (id != null && !id.isEmpty()) {
                idSet1.add(id);
            }
        }
        Set<String> idSet2 = new HashSet<>();
        for (String id : ids(path2)) {
            if (id != null && !id.isEmpty()) {
                idSet2.add(id);
            }
        }

        System.out.println("100w file ids = " + idSet1.size());
        System.out.println("300w file ids = " + idSet2.size());

        BigDecimal b1 = new BigDecimal(idSet1.size());
        BigDecimal b2 = new BigDecimal(idSet2.size());
        BigDecimal b3 = b1.add(b2);
        System.out.println("100w + 300w ids = " + b3);

        // Union: a HashSet silently drops ids it already contains.
        Set<String> ids3 = new HashSet<>(idSet1);
        ids3.addAll(idSet2);
        System.out.println("100w + 300w ids deduplicated = " + ids3.size());

        // Ids present in both files are counted twice in the sum but once in the union.
        int duplicated = idSet1.size() + idSet2.size() - ids3.size();
        System.out.println("100w + 300w ids duplicated = " + duplicated);

        // Peel off the first 50k ids as the initial push batch.
        Set<String> firstBatch = splitHeadData(ids3, 50000);
        System.out.println("50k-user push batch: " + firstBatch.size());
        createCSV(new ArrayList<>(firstBatch), "5w.csv");

        System.out.println("Remaining ids to push: " + ids3.size());
        System.out.println("============ remaining ids, paged 500k per file =============");
        List<List<String>> pageListTotal = pageList(ids3, 500000);
        for (int i = 0; i < pageListTotal.size(); i++) {
            createCSV(pageListTotal.get(i), "50w" + i + ".csv");
        }
    }

    /**
     * Remove the first {@code size} ids from {@code mySet} and return them.
     */
    public static Set<String> splitHeadData(Set<String> mySet, int size) {
        Set<String> result = new HashSet<>();
        Iterator<String> iterator = mySet.iterator();
        int count = 0;
        while (iterator.hasNext() && count < size) {
            result.add(iterator.next());
            count++;
        }
        mySet.removeAll(result);
        return result;
    }

    /**
     * Split the id set into pages of at most {@code pageSize} ids each.
     */
    public static List<List<String>> pageList(Set<String> totalSet, int pageSize) {
        List<List<String>> allIdList = new ArrayList<>();
        List<String> idList = new ArrayList<>();
        for (String id : totalSet) {
            if (idList.size() == pageSize) { // page is full; start a new one
                allIdList.add(idList);
                idList = new ArrayList<>();
            }
            idList.add(id);
        }
        if (!idList.isEmpty()) {
            allIdList.add(idList);
        }
        return allIdList;
    }

    /**
     * Write the ids to a single-column CSV file with an "id" header.
     */
    public static void createCSV(List<String> list, String fileName) {
        List<Object> headList = Arrays.<Object>asList("id"); // header row
        File csvFile = new File(CSV_PATH + fileName);
        BufferedWriter csvWriter = null;
        try {
            File parent = csvFile.getParentFile();
            if (parent != null && !parent.exists()) {
                parent.mkdirs();
            }
            csvFile.createNewFile();
            // GB2312 so the "," separator is read correctly (e.g. by Excel on zh-CN systems)
            csvWriter = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(csvFile), "GB2312"), 1024);
            writeRow(headList, csvWriter); // header
            for (String id : list) {       // data rows, one id per row
                writeRow(Collections.<Object>singletonList(id), csvWriter);
            }
            csvWriter.flush();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (csvWriter != null) {
                try {
                    csvWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    private static void writeRow(List<Object> row, BufferedWriter csvWriter) throws IOException {
        for (Object data : row) {
            csvWriter.write("\"" + data + "\","); // quote each field; each row ends with a trailing comma
        }
        csvWriter.newLine();
    }
}
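Running main produces 5w.csv with the first 50,000 deduplicated ids, followed by 50w0.csv, 50w1.csv, and so on holding the remaining ids in pages of 500,000, all written under I:\.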
3. The initial approach and the later approach
3.1 The initial approach
Read 1.csv (roughly 1 million rows) and 2.csv (roughly 3 million rows), then compare every id from the first file against every id from the second to find the ones that already exist: two nested for loops, 1,000,000 × 3,000,000 = 3 trillion comparisons. It sat there making no progress, so I gave up.
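For reference, a minimal sketch of that abandoned nested-loop attempt, reusing the ids(...) reader from section 2 (the class name here is just for illustration):

import java.util.ArrayList;
import java.util.List;

// Assumes the same package as CSVDeduplication so its static ids(...) helper is visible.
public class NaiveCompare {
    public static void main(String[] args) {
        List<String> ids1 = CSVDeduplication.ids("I:\\100w.csv"); // ~1M ids
        List<String> ids2 = CSVDeduplication.ids("I:\\300w.csv"); // ~3M ids
        List<String> duplicates = new ArrayList<>();
        for (String a : ids1) {        // 1,000,000 outer iterations...
            for (String b : ids2) {    // ...times 3,000,000 inner iterations
                if (a.equals(b)) {     // roughly 3 trillion comparisons in the worst case
                    duplicates.add(a);
                    break;
                }
            }
        }
        System.out.println("duplicates = " + duplicates.size());
    }
}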
I then tried to speed it up with multiple threads, paging the 3 million rows into chunks of 500,000 each, but that still ran far too slowly.
3.2 The later approach
The code is above. The overall idea is to deduplicate with Java's Set collections, since a single linear pass in Java is fast; the sketch after the VM options below distills the idea. Note that you must configure JVM parameters for the run, otherwise it will throw an OutOfMemoryError:
VM options:
-Xms1g -Xmx1g -XX:SurvivorRatio=2 -XX:+UseParallelGC
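Here -Xms1g and -Xmx1g fix the heap at 1 GB, -XX:SurvivorRatio=2 sets the eden-to-survivor-space ratio, and -XX:+UseParallelGC selects the parallel throughput collector.

Stripped of the file handling, the core of the approach fits in a few lines; retainAll is one way to get the overlapping ids directly. A sketch, assuming the ids(...) helper from section 2:

import java.util.HashSet;
import java.util.Set;

// Assumes the same package as CSVDeduplication so its static ids(...) helper is visible.
public class SetDedupSketch {
    public static void main(String[] args) {
        Set<String> set1 = new HashSet<>(CSVDeduplication.ids("I:\\100w.csv"));
        Set<String> set2 = new HashSet<>(CSVDeduplication.ids("I:\\300w.csv"));

        // Union: each add() is O(1) on average, so this is two linear passes.
        Set<String> union = new HashSet<>(set1);
        union.addAll(set2);

        // Overlap: retainAll keeps only the ids present in both sets.
        Set<String> overlap = new HashSet<>(set1);
        overlap.retainAll(set2);

        System.out.println("deduplicated = " + union.size());
        System.out.println("duplicated   = " + overlap.size());
    }
}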