解决CDATA多层嵌套

之前看到网上解决CDATA嵌套问题的办法,就是直接把"]]>"替换成"]]]]><![CDATA[>"。但是如果多个CDATA是并列的,或者是并列之后再嵌套,对整段文本做这种全局替换会把最外层CDATA的结束符也一起替换掉,结果仍然不对。

只是单纯的替换,解决不了项目的实际问题,写了这个工具方法,在解析之前把CDATA格式化了。

  1 public class CDATAUtil {
  2 
  3     private static final String CDATA_PREFIX = "<![CDATA[";
  4     private static final String CDATA_END = "]]>";
  5     private static final String REPLACE_CDATA = "]]]]><![CDATA[>";
  6     
  7     /*@Test
  8     public void testCDATA() {
  9         String s = "<PlainText><![CDATA[a<![CDATA[b1<![CDATA[<![CDATA[b3]]>b2]]>]]><![CDATA[d]]>]]><![CDATA[e]]></PlainText><Content><![CDATA[f<![CDATA[f2]]>]]><![CDATA[g]]></Content>";
 10         String correctStr = "<PlainText><![CDATA[a<![CDATA[b1<![CDATA[<![CDATA[b3]]]]><![CDATA[>b2]]]]><![CDATA[>]]]]><![CDATA[><![CDATA[d]]]]><![CDATA[>]]><![CDATA[e]]></PlainText><Content><![CDATA[f<![CDATA[f2]]]]><![CDATA[>]]><![CDATA[g]]></Content>";
 11         System.out.println(executeCDATA(s));
 12         System.out.println(executeCDATA(s).equals(correctStr));
 13     }*/
 14     
 15     public static String executeCDATA(String retStr) {
 16         int length = retStr.length();
 17         Map<Integer, Integer> map = new HashMap<Integer, Integer>();
 18         //获取CDATA开始坐标与结束坐标
 19         map = getCDATAStartEnd(retStr, map);
 20         Map<Integer, Integer> retMap = new HashMap<Integer, Integer>();
 21         //获得每个区间的最外围区间
 22         retMap = getZone(map, retMap);
 23         Set<Map.Entry<Integer, Integer>> entrySet = retMap.entrySet();
 24         ArrayList<Map.Entry<Integer, Integer>> l = new ArrayList<Map.Entry<Integer, Integer>>(
 25                 entrySet);
 26         Collections.sort(l, new Comparator<Map.Entry<Integer, Integer>>() {
 27             public int compare(Map.Entry<Integer, Integer> o1,
 28                     Map.Entry<Integer, Integer> o2) {
 29                 return (o1.getKey() - o2.getKey());
 30             }
 31         });
 32         Iterator<Map.Entry<Integer, Integer>> i = l.iterator();
 33         List<String> strs = new ArrayList<String>();
 34         //从字符串的首字节开始
 35         int startIndex = 0;
 36         while (i.hasNext()) {
 37             Map.Entry<Integer, Integer> entry = i.next();
 38             //对最外围区间的所有CDATA结束符替换
 39             String _str = CDATA_PREFIX
 40                     + retStr.substring(entry.getKey() + CDATA_PREFIX.length(),
 41                             entry.getValue() - CDATA_END.length()).replaceAll(
 42                             CDATA_END, REPLACE_CDATA) + CDATA_END;
 43             //最外围区间左边的文本
 44             strs.add(retStr.substring(startIndex, entry.getKey()));
 45             strs.add(_str);
 46             startIndex = entry.getValue();
 47         }
 48         StringBuffer _s = new StringBuffer("");
 49         for (String s : strs) {
 50             _s.append(s);
 51         }
 52         //最外围区间右边的文本
 53         _s.append(retStr.substring(startIndex, length));
 54         return _s.toString();
 55     }
 56     
 57     //获得Map的并集区间:如有数轴(0,10),现在有区间(map的值)(1,5),(2,3),(3,4),(5,6),(7,8);retMap的值则返回区间:(1,5),(5,6),(7,8)
 58     public static Map<Integer, Integer> getZone(Map<Integer, Integer> map, Map<Integer, Integer> retMap) {
 59         // 第一个CDATA的起始位置
 60         int start = 0;
 61         int end = 0;
 62         if (map.size() != 0) {
 63             Set<Map.Entry<Integer, Integer>> entrySet = map.entrySet();
 64             ArrayList<Map.Entry<Integer, Integer>> l = new ArrayList<Map.Entry<Integer, Integer>>(entrySet);
 65             Collections.sort(l, new Comparator<Map.Entry<Integer, Integer>>() {
 66                 public int compare(Map.Entry<Integer, Integer> o1,
 67                         Map.Entry<Integer, Integer> o2) {
 68                     return (o1.getKey() - o2.getKey());
 69                 }
 70             });
 71             start = l.get(0).getKey();
 72             end = l.get(0).getValue();
 73             entrySet.remove(l.get(0));
 74             retMap.put(start, end);
 75             // 在此区间嵌套的所有CDATA坐标区间都删除
 76             Iterator<Map.Entry<Integer, Integer>> j = entrySet.iterator();
 77             while (j.hasNext()) {
 78                 Map.Entry<Integer, Integer> jentry = j.next();
 79                 if (jentry.getKey() > start && jentry.getValue() < end) {
 80                     j.remove();
 81                 }
 82             }
 83             return getZone(map, retMap);
 84         }
 85         return retMap;
 86     }
 87 
 88     //记录每个CDATA的起始位置和结束位置<startIndex,endIndex+CDATA_END.length()>等价于<'<![CDATA[', ']]>'>
 89     public static Map<Integer, Integer> getCDATAStartEnd(String str, Map<Integer, Integer> startEnd) {
 90         int endIndex = str.indexOf(CDATA_END, 0);
 91         //返回的就是<![CDATA的"<"位置
 92         int startIndex = str.lastIndexOf(CDATA_PREFIX, endIndex);
 93         if(endIndex == -1) {
 94             return startEnd;
 95         }
 96         String _str1 = str.substring(0, startIndex);
 97         String _str2 = str.substring(endIndex + CDATA_END.length());
 98         StringBuffer _strAppend = new StringBuffer();
 99         for(int i=0; i<endIndex + CDATA_END.length() - startIndex; i++) {
100             //只是一个占位符,数据里面含有|,也没问题
101             _strAppend.append("|");
102         }
103         startEnd.put(startIndex, endIndex + CDATA_END.length());
104         str = _str1 + _strAppend.toString() + _str2;
105         return getCDATAStartEnd(str, startEnd);
106     }
107 }

用法:

 1 SAXReader saxReader = new SAXReader();                         
  saxReader.setErrorHandler(new XMLErrorHandler()); //向XML阅读器注册一个实例 2 //处理CDATA嵌套的情况 3 msg = CDATAUtil.executeCDATA(msg); 4 InputStream is = new ByteArrayInputStream(msg.getBytes("GBK")); 5 InputStreamReader in = new InputStreamReader(is, "GBK"); 6 Dom4JReader dom4jReader = new Dom4JReader(saxReader.read(in)); 7 in.close(); 8 is.close(); 9 in = null; 10 is = null;

 

 

posted @ 2013-10-25 11:28  lizebin0918  Views(3562)  Comments(0)  Edit  收藏  举报