java 字典树

我一直在做搜索,分词用的是 IK。但 IK 只能按照它自带的字典来分词,不太满足我的场景,而且没办法给每个分词打上它原始的属性标签。于是就想自己写一个字典树,按最长匹配规则取分词,并封装自己的标签属性。我不确定这样做对不对,也不确定自己写的算不算真正的字典树——反正是自己封装的,哈哈。本人是小白,不知道里面有没有 bug,欢迎指正。我自己灌了 4000 万条数据进去(每条长度大约在 10 个字符以内),内存占用 16 GB 左右,没有报错,识别也正常。

 

字典树(Trie)是一种前缀搜索树,可以用来做敏感词匹配,也可以做智能提示。但词条长度不宜太长,否则容易占用过多内存;可以使用双数组字典树(DAT,Double-Array Trie)来解决内存占用问题,不过使用 DAT 需要提前对数据排序。另外,它不是平衡树,可能出现某一条分支非常庞大、而另一条分支只有几条数据的问题。

// Trie 节点的数据结构
package com.tire;

import java.util.List;
import java.util.Map;

/**
 * One node of the character trie: holds a single character, links to its
 * neighbours, and optional payload lists.
 *
 * <p>A node can reference its children in two ways: a single {@link #next}
 * link, or a {@link #subTire} map keyed by character. All fields are public
 * and are read and written directly by the code that builds the trie.
 */
public class TireNode {

    /** Character stored in this node. */
    public char c;

    /** Continuation flag; this class never sets it, the trie-building code does. */
    public boolean hasNext;

    /** Previous (parent) node, or {@code null}. */
    public TireNode pre;

    /** Single following node, or {@code null}. */
    public TireNode next;

    /** Child nodes keyed by their character, or {@code null}. */
    public Map<Character, TireNode> subTire;

    /** Payload: position values, or {@code null} when nothing has been attached. */
    public List<Object> pos;

    /** Payload: type values, or {@code null} when nothing has been attached. */
    public List<Object> type;

    /** Payload: city-code values, or {@code null} when nothing has been attached. */
    public List<Object> cityCode;

    /** Creates an empty node; every field keeps its Java default value. */
    public TireNode() {
    }

    /**
     * Creates a node holding {@code c} with no neighbours.
     *
     * @param c character stored in the node
     */
    public TireNode(char c) {
        this.c = c;
    }

    /**
     * Creates a node holding {@code c} and linked to the given neighbours.
     *
     * @param c    character stored in the node
     * @param pre  previous node, may be {@code null}
     * @param next following node, may be {@code null}
     */
    public TireNode(char c, TireNode pre, TireNode next) {
        this.c = c;
        this.pre = pre;
        this.next = next;
    }

    /**
     * Assigns {@code subTire} to the child map of the <em>argument</em>
     * {@code node}; the newly constructed instance itself is left with
     * default field values.
     *
     * <p>NOTE(review): writing to the argument rather than to {@code this}
     * looks unintended - confirm before relying on this constructor.
     *
     * @param node    node whose {@code subTire} field is overwritten; must not be {@code null}
     * @param subTire child map to store on {@code node}
     */
    public TireNode(TireNode node, Map<Character, TireNode> subTire) {
        node.subTire = subTire;
    }
}
// Trie 的操作方法
package com.tire;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

import com.tianditu.util.Util;

/**
 * Character trie that attaches payload (type, city code, position) to the
 * last node of every inserted word.
 *
 * <p>Root nodes live in {@link #dic}, keyed by the first character of a word.
 * Below a root, a node references its children either through the single
 * {@code next} link (unbranched chain) or through the {@code subTire} map
 * (branch point). Invariant maintained by this class: a node that has
 * {@code next} never has {@code subTire}. {@code hasNext} is kept
 * {@code true} exactly for nodes that have at least one child.
 *
 * <p>No synchronization is performed anywhere in this class.
 *
 * @author wangnanhui
 */
public class TireTree {

    public int DEFAULT_SIZE = 500;

    /** Number of root entries in {@link #dic}; refreshed by {@link #setSize()}. */
    public int size = DEFAULT_SIZE;

    /** Root nodes keyed by the first character of the stored words. */
    public Map<Character, TireNode> dic = new HashMap<Character, TireNode>(size);

    /**
     * Inserts one word with its payload. {@code null} or empty input is ignored.
     *
     * @param chs      characters of the word
     * @param type     data type
     * @param cityCode administrative division code
     * @param pos      position value
     */
    private void putDateToDic(char[] chs, Object type, Object cityCode, Object pos) {
        if (chs == null || chs.length == 0) {
            return; // nothing to index; avoids reading chs[0] on an empty array
        }
        TireNode root = dic.get(chs[0]);
        if (root == null) {
            putNoExistToDic(chs, type, cityCode, pos);
        } else {
            // The root already covers chs[0]; continue with the second character.
            putData(chs, 1, root, type, cityCode, pos);
        }
        setSize();
    }

    /**
     * Inserts {@code chs} below an already existing node.
     *
     * @param chs     characters that follow {@code subRoot}
     * @param subRoot node the characters are appended under
     */
    void putExistToDic(char[] chs, TireNode subRoot, Object type, Object cityCode, Object pos) {
        putData(chs, 0, subRoot, type, cityCode, pos);
    }

    /**
     * Inserts the characters {@code chs[i..]} below {@code node}, creating nodes
     * where the path does not exist yet, and appends the payload to the node
     * that ends the word. Does nothing when {@code chs} or {@code node} is
     * {@code null}.
     *
     * @param chs  characters to insert
     * @param i    index of the first character to insert, {@code 0..chs.length}
     * @param node node to insert under
     */
    void putData(char[] chs, int i, TireNode node, Object type, Object cityCode, Object pos) {
        if (chs == null || node == null) {
            return;
        }
        TireNode current = node;
        int index = i;
        while (index < chs.length) {
            current.hasNext = true; // the word continues past this node
            TireNode child = findChild(current, chs[index]);
            if (child == null) {
                // The remaining characters become a fresh chain; its last node gets the payload.
                addChild(current, buildChain(chs, index, current, null, type, cityCode, pos));
                return;
            }
            current = child;
            index++;
        }
        // Every character matched an existing node: the word ends here.
        addPayload(current, type, cityCode, pos);
    }

    /**
     * Creates the chain for {@code chs} and stores it in {@link #dic} under its
     * first character, replacing any root already stored for that character.
     *
     * @throws IllegalArgumentException if {@code chs} is {@code null} or empty
     */
    void putNoExistToDic(char[] chs, Object type, Object cityCode, Object pos) {
        TireNode head = createNewTire(chs, null, null, type, cityCode, pos);
        dic.put(head.c, head);
    }

    /**
     * Unimplemented placeholder; always returns {@code null}.
     *
     * <p>Original design note: if the dictionary contains "abc", the word to
     * store is "abcd", and the sub-nodes of "abc" already hold other words such
     * as "abce" and "abcf", then "d" is simply put into the sub-node map; or
     * the sub-node map is taken, "abc" is discarded, the map is attached to a
     * new "abc" and "d" is put into it.
     *
     * @return always {@code null}
     */
    TireNode put(char ch, Map<Character, TireNode> subNode) {
        return null;
    }

    /**
     * Unimplemented placeholder for a plain insertion; always returns {@code null}.
     *
     * @return always {@code null}
     */
    TireNode put(char[] chs, TireNode pre, TireNode next) {
        return null;
    }

    /**
     * Looks up the node reached by following {@code matchWords}.
     *
     * @param matchWords text to match
     * @param useFull    {@code true}: return {@code null} unless every character
     *                   is matched; {@code false}: return the deepest matched node
     * @return the node found, or {@code null}
     */
    public TireNode get(String matchWords, boolean useFull) {
        if (Util.nullValue(matchWords)) {
            return null;
        }
        return get(matchWords.toCharArray(), useFull);
    }

    /**
     * Looks up the node reached by following {@code chs} from the roots.
     *
     * <p>The returned node is not guaranteed to end a stored word; check its
     * payload lists for that.
     *
     * @param chs     characters to match
     * @param useFull see {@link #get(String, boolean)}
     * @return the node found, or {@code null} when {@code chs} is {@code null}
     *         or empty or its first character is not a root
     */
    TireNode get(char[] chs, boolean useFull) {
        if (chs == null || chs.length == 0) {
            return null;
        }
        TireNode root = dic.get(chs[0]);
        if (root == null) {
            return null;
        }
        return get(chs, root, 1, useFull);
    }

    /** Same as {@link #get(String, boolean)}. */
    public TireNode getTire(String matchWords, boolean useFull) {
        if (Util.nullValue(matchWords)) {
            return null;
        }
        return this.get(matchWords.toCharArray(), useFull);
    }

    /** Same as {@link #getTire(String, boolean)} with {@code useFull = false}. */
    public TireNode getTire(String matchWords) {
        if (Util.nullValue(matchWords)) {
            return null;
        }
        return this.get(matchWords.toCharArray(), false);
    }

    /**
     * Rebuilds the text that leads to {@code node} by walking the {@code pre}
     * links up to the root.
     *
     * @param node node to spell out, may be {@code null}
     * @return the characters from the root down to {@code node}; an empty
     *         string for {@code null}
     */
    public String getTireName(TireNode node) {
        int length = 0;
        for (TireNode n = node; n != null; n = n.pre) {
            length++;
        }
        // Fill from the back so the characters come out in root-to-node order.
        char[] buffer = new char[length];
        for (TireNode n = node; n != null; n = n.pre) {
            buffer[--length] = n.c;
        }
        return new String(buffer);
    }

    /**
     * Collects the words stored at {@code node} and everywhere below it.
     *
     * @param node   start node; {@code null} adds nothing
     * @param list   list to append to; a new list is created when {@code null}
     * @param prefix ignored; every word is rebuilt with {@link #getTireName(TireNode)}
     * @return the list the words were appended to
     */
    private List<String> getAllTireName(TireNode node, List<String> list, String prefix) {
        List<String> names = (list != null) ? list : new ArrayList<String>();
        TireNode current = node;
        while (current != null) {
            if (current.cityCode != null) { // payload present: a word ends at this node
                names.add(getTireName(current));
            }
            if (current.next != null) {
                current = current.next; // unbranched chain: keep walking without recursion
            } else {
                if (current.subTire != null) {
                    for (TireNode child : current.subTire.values()) {
                        getAllTireName(child, names, null);
                    }
                }
                current = null;
            }
        }
        return names;
    }

    /**
     * Collects the words stored at {@code node} and everywhere below it.
     *
     * @param node start node; {@code null} adds nothing
     * @param list list to append to; a new list is created when {@code null}
     * @return the list the words were appended to
     */
    public List<String> getAllTireName(TireNode node, List<String> list) {
        return getAllTireName(node, list, null);
    }

    /**
     * Follows {@code chs[pos..]} downwards starting at the children of {@code node}.
     *
     * @param chs     characters to match; {@code null} returns {@code node}
     * @param node    node to start from; {@code null} returns {@code null}
     * @param pos     index of the first character to match
     * @param useFull {@code true}: return {@code null} on the first character that
     *                cannot be matched; {@code false}: return the deepest node reached
     * @return the node found, or {@code null}
     */
    TireNode get(char[] chs, TireNode node, int pos, boolean useFull) {
        if (chs == null || node == null) {
            return node;
        }
        TireNode current = node;
        for (int index = pos; index < chs.length; index++) {
            TireNode child = findChild(current, chs[index]);
            if (child == null) {
                return useFull ? null : current;
            }
            current = child;
        }
        return current;
    }

    /** Refreshes {@link #size} from the number of root entries in {@link #dic}. */
    public void setSize() {
        this.size = dic.size();
    }

    /**
     * Inserts {@code words} with its payload; input for which
     * {@code Util.nullValue} returns {@code true} is ignored.
     *
     * @param words    text to index
     * @param type     data type
     * @param cityCode administrative division code
     * @param pos      position value
     */
    public void put(String words, Object type, Object cityCode, Object pos) {
        if (Util.nullValue(words)) {
            return;
        }
        putDateToDic(words.toCharArray(), type, cityCode, pos);
    }

    /**
     * Builds a chain of nodes, one per character of {@code chs}, and attaches
     * the payload to the last node.
     *
     * @param chs  characters of the chain
     * @param pre  {@code pre} link of the first node, may be {@code null}
     * @param next {@code next} link of the first node; only kept when
     *             {@code chs} has exactly one character, otherwise the second
     *             node of the chain takes its place
     * @return the first node of the chain
     * @throws IllegalArgumentException if {@code chs} is {@code null} or empty
     */
    public TireNode createNewTire(char[] chs, TireNode pre, TireNode next, Object type,
            Object cityCode, Object pos) {
        if (chs == null || chs.length == 0) {
            throw new IllegalArgumentException("chs must contain at least one character");
        }
        return buildChain(chs, 0, pre, next, type, cityCode, pos);
    }

    /** Builds the chain for {@code chs[from..]}; the payload goes on its last node. */
    private TireNode buildChain(char[] chs, int from, TireNode pre, TireNode next, Object type,
            Object cityCode, Object pos) {
        TireNode head = new TireNode(chs[from], pre, next);
        TireNode tail = head;
        for (int k = from + 1; k < chs.length; k++) {
            TireNode node = new TireNode(chs[k], tail, null);
            tail.next = node;
            tail.hasNext = true;
            tail = node;
        }
        tail.hasNext = tail.next != null;
        addPayload(tail, type, cityCode, pos);
        return head;
    }

    /** Returns the child of {@code node} holding {@code ch}, or {@code null}. */
    private static TireNode findChild(TireNode node, char ch) {
        if (node.next != null && node.next.c == ch) {
            return node.next;
        }
        return node.subTire != null ? node.subTire.get(ch) : null;
    }

    /**
     * Attaches {@code child} below {@code node}: as {@code next} when the node
     * has no children yet, otherwise in {@code subTire}, moving an existing
     * {@code next} into the map first so both forms are never used together.
     */
    private static void addChild(TireNode node, TireNode child) {
        if (node.next == null && node.subTire == null) {
            node.next = child;
        } else {
            if (node.subTire == null) {
                node.subTire = new HashMap<Character, TireNode>();
            }
            if (node.next != null) {
                node.subTire.put(node.next.c, node.next);
                node.next = null;
            }
            node.subTire.put(child.c, child);
        }
        node.hasNext = true;
    }

    /** Appends one payload triple to {@code node}, creating the lists on first use. */
    private static void addPayload(TireNode node, Object type, Object cityCode, Object pos) {
        if (node.type == null) {
            node.type = new ArrayList<Object>();
        }
        if (node.cityCode == null) {
            node.cityCode = new ArrayList<Object>();
        }
        if (node.pos == null) {
            node.pos = new ArrayList<Object>();
        }
        node.type.add(type);
        node.cityCode.add(cityCode);
        node.pos.add(pos);
    }

    /**
     * Demo: loads every document of a local Lucene index into the trie, adds
     * two sample sentences and prints the result of two lookups.
     */
    public static void main(String[] args) throws IOException {
        TireTree t = new TireTree();
        // Close the reader and the directory even when loading fails.
        try (FSDirectory directory = FSDirectory.open(new File("D:/index/AdminIndex"));
                IndexReader reader = IndexReader.open(directory)) {
            int maxDoc = reader.maxDoc();
            for (int i = 0; i < maxDoc; i++) {
                Document doc = reader.document(i);
                String name = doc.get("name");
                String type = doc.get("type");
                String cityCode = doc.get("totalcity");
                if (!Util.nullValue(name)) {
                    t.putDateToDic(name.toCharArray(), type, cityCode, i);
                }
            }
        }

        t.putDateToDic("今天天气很好适合出去玩".toCharArray(), "123", "123", 1);
        t.putDateToDic("今天天气很不好不适合出去玩".toCharArray(), "123", "123", 1);
        TireNode node = t.get("覃塘镇".toCharArray(), true);
        System.out.println(t.getTireName(node));
        node = t.get("今天天气".toCharArray(), false);
        List<String> list = new ArrayList<>();
        t.getAllTireName(node, list, null);
        System.out.println(list);

        // More words that share prefixes or branch can be inserted here with
        // putDateToDic(chars, type, cityCode, pos) for manual experiments.
        System.out.println();
    }
}

  

 

posted @ 2017-12-20 09:50  王南辉  阅读(388)  评论(1)  编辑  收藏  举报