解决CDATA多层嵌套
之前在网上看到的解决CDATA嵌套问题的办法,是直接用"]]]]><![CDATA[>"替换"]]>"。但是如果多个CDATA是并列的,或者是并列之后再嵌套,这种办法就处理不了了。
只是单纯的替换,解决不了项目的实际问题,所以写了这个工具方法,在解析之前先把CDATA格式化。
/**
 * Rewrites nested and sibling CDATA sections so that the text can be handed to an XML parser.
 *
 * <p>Every outermost CDATA section is kept as one section; each {@code ]]>} terminator that
 * occurs inside it is replaced by the escape sequence {@code ]]]]><![CDATA[>}. Text outside
 * CDATA sections is copied through unchanged.
 *
 * <p>Example: {@code <![CDATA[f<![CDATA[f2]]>]]>} becomes
 * {@code <![CDATA[f<![CDATA[f2]]]]><![CDATA[>]]>}.
 */
public class CDATAUtil {

    private static final String CDATA_PREFIX = "<![CDATA[";
    private static final String CDATA_END = "]]>";
    private static final String REPLACE_CDATA = "]]]]><![CDATA[>";

    /**
     * Escapes the inner terminators of every outermost CDATA section in {@code retStr}.
     *
     * @param retStr the raw text, possibly containing nested or sibling CDATA sections
     * @return the text with inner {@code ]]>} terminators escaped; equal to the input when it
     *         contains no complete CDATA section
     * @throws NullPointerException if {@code retStr} is {@code null}
     */
    public static String executeCDATA(String retStr) {
        // Start/end offsets of every matched CDATA section, at any nesting depth.
        Map<Integer, Integer> sections =
                getCDATAStartEnd(retStr, new HashMap<Integer, Integer>());
        // Only the outermost sections are rewritten; nested ones are handled as their content.
        Map<Integer, Integer> outermost = getZone(sections, new HashMap<Integer, Integer>());

        List<Integer> starts = new ArrayList<Integer>(outermost.keySet());
        Collections.sort(starts);

        StringBuilder out = new StringBuilder(retStr.length());
        int cursor = 0;
        for (Integer key : starts) {
            int start = key;
            int end = outermost.get(key);
            // Text to the left of this outermost section.
            out.append(retStr, cursor, start);
            String inner =
                    retStr.substring(start + CDATA_PREFIX.length(), end - CDATA_END.length());
            // Literal replace: the terminator is plain text, not a regular expression.
            out.append(CDATA_PREFIX)
                    .append(inner.replace(CDATA_END, REPLACE_CDATA))
                    .append(CDATA_END);
            cursor = end;
        }
        // Text to the right of the last outermost section.
        out.append(retStr, cursor, retStr.length());
        return out.toString();
    }

    /**
     * Selects the intervals of {@code map} that are not strictly nested inside an interval with
     * a smaller start, and adds them to {@code retMap}.
     *
     * <p>For example, the intervals (1,5), (2,3), (3,4), (5,6), (7,8) yield (1,5), (5,6), (7,8).
     * The input {@code map} is left unmodified.
     *
     * @param map interval start mapped to interval end
     * @param retMap receives the selected intervals; existing entries are kept
     * @return {@code retMap}
     */
    public static Map<Integer, Integer> getZone(Map<Integer, Integer> map,
            Map<Integer, Integer> retMap) {
        List<Integer> starts = new ArrayList<Integer>(map.keySet());
        Collections.sort(starts);

        // Largest end among the intervals accepted so far. Starts are visited in ascending
        // order, so an interval is nested exactly when its end lies below this value.
        int furthestEnd = Integer.MIN_VALUE;
        for (Integer key : starts) {
            int end = map.get(key);
            if (end < furthestEnd) {
                continue;
            }
            retMap.put(key, end);
            furthestEnd = end;
        }
        return retMap;
    }

    /**
     * Records, for every matched CDATA section in {@code str}, the offset of its
     * {@code <![CDATA[} and the offset just past its {@code ]]>}.
     *
     * <p>Each terminator is paired with the nearest preceding opener that is still unmatched.
     * A terminator with no unmatched opener before it, and an opener that is never terminated,
     * are ignored.
     *
     * @param str the text to scan
     * @param startEnd receives one entry per matched section (start offset mapped to end offset)
     * @return {@code startEnd}
     */
    public static Map<Integer, Integer> getCDATAStartEnd(String str,
            Map<Integer, Integer> startEnd) {
        // Offsets of openers that have not been matched yet; the last element is the innermost.
        List<Integer> open = new ArrayList<Integer>();
        int length = str.length();
        int i = 0;
        while (i < length) {
            if (str.startsWith(CDATA_PREFIX, i)) {
                open.add(i);
                i += CDATA_PREFIX.length();
            } else if (str.startsWith(CDATA_END, i)) {
                if (!open.isEmpty()) {
                    int start = open.remove(open.size() - 1);
                    startEnd.put(start, i + CDATA_END.length());
                }
                i += CDATA_END.length();
            } else {
                i++;
            }
        }
        return startEnd;
    }
}
用法:
1 SAXReader saxReader = new SAXReader();
saxReader.setErrorHandler(new XMLErrorHandler()); //向XML阅读器注册一个实例 2 //处理CDATA嵌套的情况 3 msg = CDATAUtil.executeCDATA(msg); 4 InputStream is = new ByteArrayInputStream(msg.getBytes("GBK")); 5 InputStreamReader in = new InputStreamReader(is, "GBK"); 6 Dom4JReader dom4jReader = new Dom4JReader(saxReader.read(in)); 7 in.close(); 8 is.close(); 9 in = null; 10 is = null;