《java提高数据导入效率优化思路》
写在前边的实现需求:
1.总共10万个电话号码;
2.电话号码中有重复和错误;
3.查找出正确的号码(不重复);
一、优化前的实现方式:
1.先用正则过滤一遍10万条数据,找出错误的;
2.用List.contains验证重复数据,用List.add添加不重复数据;
3.最终从List中取出正确的数据。
/**
 * Baseline ("before optimisation") benchmark.
 *
 * <p>Generates {@link #_capacity} numeric strings containing many duplicates and
 * removes the duplicates by checking every candidate against the result list
 * with {@code ArrayList.contains}. Each {@code contains} call is a linear scan,
 * so the whole pass is quadratic; that is the behaviour this listing is meant
 * to demonstrate, and it is intentionally left in place.
 */
public class appMain {
    /** Number of generated values: 100,000, matching the requirement and the reported run. */
    final static int _capacity = 100000;
    final static Random rand = new Random(System.currentTimeMillis() + _capacity);
    /** Generated input data (with duplicates). */
    static ArrayList<String> list = new ArrayList<String>(_capacity);
    /** Distinct values, in order of first appearance in {@link #list}. */
    static ArrayList<String> newlist = new ArrayList<String>(_capacity);

    /**
     * Fills {@link #list} with test data, prints the generation time in
     * milliseconds and runs {@link #test1()}.
     *
     * @param args unused
     */
    public static void main(String[] args) throws InterruptedException {
        long ts = System.currentTimeMillis();
        // Values fall in [0, _capacity / 3), so duplicates are guaranteed.
        int modVal = _capacity / 3;
        for (int i = 0; i < _capacity; i++) {
            // Re-seeding with i makes each value depend only on i, so every run
            // produces the same data set regardless of the initial seed.
            rand.setSeed(i);
            list.add(Integer.toString(Math.abs(rand.nextInt() % modVal)));
        }
        ts = System.currentTimeMillis() - ts;
        System.out.println("生成时间 :" + ts);

        test1();
    }

    /**
     * De-duplicates {@link #list} into {@link #newlist} by checking each value
     * against everything collected so far, then prints the elapsed time in
     * milliseconds, the number of duplicates skipped and the number of distinct
     * values.
     */
    static void test1() {
        newlist.clear();
        int repetition = 0;
        long ts = System.currentTimeMillis();
        for (String s : list) {
            // Linear scan of newlist for every element: the slow part being measured.
            if (!newlist.contains(s)) {
                newlist.add(s);
            } else {
                repetition++;
            }
        }
        ts = System.currentTimeMillis() - ts;
        System.out.println("------ 插入检查方法 -------");
        System.out.println("查找时间 :" + ts);
        System.out.println("重复 :" + repetition);
        System.out.println("正确 :" + newlist.size());
    }
}
优化前执行结果:
/* 条件:capacity = 100000 结果: 生成时间 :33 ------ 插入检查方法 ------- 查找时间 :6612 重复 :76871 正确 :23129 ------ 排序检查方法 ------- 查找时间 :91 重复 :76871 正确 :23129 */
使用以上方式做导入的话,数据量一旦超过5万条,程序马上出现假死状态,显然不可取,所以有了下边的优化。
二、优化后的实现方式:
1.先对10万数据排序;
2.对比前后两条数据(这个我之后会详细说明为什么这么做);
3.筛选出正确数据。
/**
 * Optimised benchmark.
 *
 * <p>Generates {@link #_capacity} numeric strings containing many duplicates and
 * removes the duplicates by sorting the data first and then comparing each
 * element only with its predecessor.
 */
public class appMain {
    /** Number of generated values. */
    final static int _capacity = 1000000;
    final static Random rand = new Random(System.currentTimeMillis() + _capacity);
    /** Generated input data (with duplicates); sorted in place by {@link #test2()}. */
    static ArrayList<String> list = new ArrayList<String>(_capacity);
    /** Distinct values, in sorted (lexicographic) order after {@link #test2()}. */
    static ArrayList<String> newlist = new ArrayList<String>(_capacity);

    /**
     * Fills {@link #list} with test data, prints the generation time in
     * milliseconds and runs {@link #test2()}.
     *
     * @param args unused
     */
    public static void main(String[] args) throws InterruptedException {
        long ts = System.currentTimeMillis();
        // Values fall in [0, _capacity / 3), so duplicates are guaranteed.
        int modVal = _capacity / 3;
        for (int i = 0; i < _capacity; i++) {
            // Re-seeding with i makes each value depend only on i, so every run
            // produces the same data set regardless of the initial seed.
            rand.setSeed(i);
            list.add(Integer.toString(Math.abs(rand.nextInt() % modVal)));
        }
        ts = System.currentTimeMillis() - ts;
        System.out.println("生成时间 :" + ts);

        test2();
    }

    /**
     * De-duplicates {@link #list} into {@link #newlist} by sorting and comparing
     * neighbours, then prints the elapsed time in milliseconds, the number of
     * duplicates skipped and the number of distinct values.
     *
     * <p>Side effect: {@link #list} is sorted in place. An empty list yields an
     * empty result instead of an {@code IndexOutOfBoundsException}.
     */
    static void test2() {
        newlist.clear();
        int repetition = 0;
        long ts = System.currentTimeMillis();

        // After sorting, equal strings are adjacent, so one pass is enough.
        Collections.sort(list);
        if (!list.isEmpty()) {
            // str is the value of the run currently being scanned.
            String str = list.get(0);
            int max = list.size();
            for (int i = 1; i < max; i++) {
                String current = list.get(i);
                if (str.equals(current)) {
                    repetition++;
                    continue;
                }
                // The run of equal values ended: emit it once and start the next run.
                newlist.add(str);
                str = current;
            }
            // The last run is never followed by a different value, so emit it here.
            newlist.add(str);
        }

        ts = System.currentTimeMillis() - ts;
        System.out.println("------ 排序检查方法 -------");
        System.out.println("查找时间 :" + ts);
        System.out.println("重复 :" + repetition);
        System.out.println("正确 :" + newlist.size());
    }
}
优化后执行结果:
/* 条件:capacity = 1000000 结果: 生成时间 :392 ------ 插入检查方法 ------- 查找时间 :1033818 重复 :703036 正确 :296964 ------ 排序检查方法 ------- 查找时间 :1367 重复 :703036 正确 :296964 */
当数据量达到10万条的时候,两种方法的查找时间已经相差约70倍(6612毫秒 对 91毫秒);当数据量达到100万时,test1()耗时约17分钟(1033818毫秒),基本等同于卡死,而test2()依然能在1.4秒左右(1367毫秒)反馈结果,差距扩大到约750倍。
下边来简单解剖下源码:
// Excerpt of the de-duplication loop. The leading numbers are the listing's own
// line numbers, which the walkthrough that follows refers to as "Line N".
1 Collections.sort(list); // sort so that equal values end up next to each other
2 String str = list.get(0); // str holds the value currently being compared against
3 int max = list.size();
4 for (int i = 1; i < max; i++) {
5     if (str.equals(list.get(i))) { // same value as str: count it as a duplicate and skip it
6         repetition++;
7         continue;
8     }
9     newlist.add(str); // the value changed: add the previous value to the result once
10     str = list.get(i); // continue comparing against the new value
11 }
Line 1:排序,假如list排序后的结果是[1,2,2,3,3,3,4,4,4,4,5,5,5,5,5]
Line 2:初始str = 1;
从Line 4开始进入循环:
Line 5:判断str是否和当前selector值相等(暂且把list.get(i)看作一个指针),如果相等则跳过以下步骤进入下一个循环
Line 9:将str(此时str=1)加入newlist末尾
Line10:将当前selector值赋给str,此时str=2,进入下一个循环
...
这种语言解释我个人觉得特别麻烦,我还是写段代码让程序告诉你它怎么执行的。
/**
 * Step-by-step trace of the "sort, then compare neighbours" de-duplication.
 *
 * <p>Builds the already-sorted list [1, 2, 2, 3, 3, 3, ...] and prints, for every
 * loop iteration, the position being inspected and the result list so far.
 */
public class appList {
    /** Input data: the number i repeated i times, for i = 1..5. */
    static ArrayList<String> list = new ArrayList<String>();
    /** Distinct values collected so far. */
    static ArrayList<String> newlist = new ArrayList<String>();

    /**
     * Builds the sample data and runs the traced de-duplication loop.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        for (int i = 1; i < 5 + 1; i++) {
            for (int j = 0; j < i; j++) {
                list.add(Integer.toString(i));
            }
        }
        System.out.println("list初始值 " + list.toString());
        // prints [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]

        // str is the value of the run currently being scanned.
        String str = list.get(0);
        int max = list.size();
        for (int i = 1; i < max; i++) {
            Print(i);
            if (str.equals(list.get(i))) {
                // Duplicate of str: nothing is added, only the current state is shown.
                PrintNew();
                continue;
            }
            // The value changed: the previous value is added exactly once.
            newlist.add(str);
            System.out.println("add\t" + str);
            str = list.get(i);
            PrintNew();
        }

        // The last run is never followed by a different value, so add it here.
        newlist.add(str);
        System.out.println("add\t" + str);
        PrintNew();

        System.out.println("newlist值 " + newlist.toString());
        // prints [1, 2, 3, 4, 5]
    }

    /** Prints the current content of {@link #newlist} on one line, followed by a blank line. */
    static void PrintNew() {
        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("newlist\t");
        for (String value : newlist) {
            stringBuilder.append(value);
            stringBuilder.append(",");
        }
        System.out.println(stringBuilder.toString());
        System.out.println();
    }

    /**
     * Prints {@link #list} on one line with the element at {@code pos} wrapped in
     * square brackets.
     *
     * @param pos index of the element currently being inspected
     */
    static void Print(int pos) {
        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("list\t");
        for (int i = 0; i < list.size(); i++) {
            if (i == pos) {
                stringBuilder.append("[");
                stringBuilder.append(list.get(i));
                stringBuilder.append("],");
            } else {
                stringBuilder.append(list.get(i));
                stringBuilder.append(",");
            }
        }
        System.out.println(stringBuilder.toString());
    }
}
执行结果:
list初始值 [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]
list 1,[2],2,3,3,3,4,4,4,4,5,5,5,5,5,
add 1
newlist 1,

list 1,2,[2],3,3,3,4,4,4,4,5,5,5,5,5,
newlist 1,

list 1,2,2,[3],3,3,4,4,4,4,5,5,5,5,5,
add 2
newlist 1,2,

list 1,2,2,3,[3],3,4,4,4,4,5,5,5,5,5,
newlist 1,2,

list 1,2,2,3,3,[3],4,4,4,4,5,5,5,5,5,
newlist 1,2,

list 1,2,2,3,3,3,[4],4,4,4,5,5,5,5,5,
add 3
newlist 1,2,3,

list 1,2,2,3,3,3,4,[4],4,4,5,5,5,5,5,
newlist 1,2,3,

list 1,2,2,3,3,3,4,4,[4],4,5,5,5,5,5,
newlist 1,2,3,

list 1,2,2,3,3,3,4,4,4,[4],5,5,5,5,5,
newlist 1,2,3,

list 1,2,2,3,3,3,4,4,4,4,[5],5,5,5,5,
add 4
newlist 1,2,3,4,

list 1,2,2,3,3,3,4,4,4,4,5,[5],5,5,5,
newlist 1,2,3,4,

list 1,2,2,3,3,3,4,4,4,4,5,5,[5],5,5,
newlist 1,2,3,4,

list 1,2,2,3,3,3,4,4,4,4,5,5,5,[5],5,
newlist 1,2,3,4,

list 1,2,2,3,3,3,4,4,4,4,5,5,5,5,[5],
newlist 1,2,3,4,

add 5
newlist 1,2,3,4,5,

newlist值 [1, 2, 3, 4, 5]