java trie树 压缩空间版本
最近一直在加强自己对数据结构和算法的理解,这不,最近看到了一个不错的帖子,特此收藏。
收藏自:http://www.hankcs.com/program/java/双数组trie树doublearraytriejava实现.html
双数组Trie树(DoubleArrayTrie)是一种空间复杂度低的Trie树,应用于字符区间大的语言(如中文、日文等)分词领域。
双数组Trie(Double-Array Trie)结构由日本人Jun-ichi Aoe于1989年提出,是Trie结构的压缩形式,仅用两个线性数组来表示Trie树,该结构有效结合了数字搜索树(Digital Search Tree)检索时间高效的特点和链式表示的Trie空间结构紧凑的特点。双数组Trie的本质是一个确定有限状态自动机(DFA),每个节点代表自动机的一个状态,根据变量不同,进行状态转移,当到达结束状态或无法转移时,完成一次查询操作。在双数组所有键中包含的字符之间的联系都是通过简单的数学加法运算表示,不仅提高了检索速度,而且省去了链式结构中使用的大量指针,节省了存储空间。
——《基于双数组Trie树算法的字典改进和实现》
trie改进代码实现:
package com.trie; /** * DoubleArrayTrie: Java implementation of Darts (Double-ARray Trie System) * * <p> * Copyright(C) 2001-2007 Taku Kudo <taku@chasen.org><br /> * Copyright(C) 2009 MURAWAKI Yugo <murawaki@nlp.kuee.kyoto-u.ac.jp> * Copyright(C) 2012 KOMIYA Atsushi <komiya.atsushi@gmail.com> * </p> * * <p> * The contents of this file may be used under the terms of either of the GNU * Lesser General Public License Version 2.1 or later (the "LGPL"), or the BSD * License (the "BSD"). * </p> */ import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class DoubleArrayTrie { private final static int BUF_SIZE = 16384; private final static int UNIT_SIZE = 8; // size of int + int private static class Node { int code; int depth; int left; int right; }; private int check[]; private int base[]; private boolean used[]; private int size; private int allocSize; private List<String> key; private int keySize; private int length[]; private int value[]; private int progress; private int nextCheckPos; // boolean no_delete_; int error_; // int (*progressfunc_) (size_t, size_t); // inline _resize expanded private int resize(int newSize) { int[] base2 = new int[newSize]; int[] check2 = new int[newSize]; boolean used2[] = new boolean[newSize]; if (allocSize > 0) { System.arraycopy(base, 0, base2, 0, allocSize); System.arraycopy(check, 0, check2, 0, allocSize); System.arraycopy(used2, 0, used2, 0, allocSize); } base = base2; check = check2; used = used2; return allocSize = newSize; } private int fetch(Node parent, List<Node> siblings) { if (error_ < 0) return 0; int prev = 0; for (int i = parent.left; i < parent.right; i++) { if ((length != null ? 
length[i] : key.get(i).length()) < parent.depth) continue; String tmp = key.get(i); int cur = 0; if ((length != null ? length[i] : tmp.length()) != parent.depth) cur = (int) tmp.charAt(parent.depth) + 1; if (prev > cur) { error_ = -3; return 0; } if (cur != prev || siblings.size() == 0) { Node tmp_node = new Node(); tmp_node.depth = parent.depth + 1; tmp_node.code = cur; tmp_node.left = i; if (siblings.size() != 0) siblings.get(siblings.size() - 1).right = i; siblings.add(tmp_node); } prev = cur; } if (siblings.size() != 0) siblings.get(siblings.size() - 1).right = parent.right; return siblings.size(); } private int insert(List<Node> siblings) { if (error_ < 0) return 0; int begin = 0; int pos = ((siblings.get(0).code + 1 > nextCheckPos) ? siblings.get(0).code + 1 : nextCheckPos) - 1; int nonzero_num = 0; int first = 0; if (allocSize <= pos) resize(pos + 1); outer: while (true) { pos++; if (allocSize <= pos) resize(pos + 1); if (check[pos] != 0) { nonzero_num++; continue; } else if (first == 0) { nextCheckPos = pos; first = 1; } begin = pos - siblings.get(0).code; if (allocSize <= (begin + siblings.get(siblings.size() - 1).code)) { // progress can be zero double l = (1.05 > 1.0 * keySize / (progress + 1)) ? 1.05 : 1.0 * keySize / (progress + 1); resize((int) (allocSize * l)); } if (used[begin]) continue; for (int i = 1; i < siblings.size(); i++) if (check[begin + siblings.get(i).code] != 0) continue outer; break; } // -- Simple heuristics -- // if the percentage of non-empty contents in check between the // index // 'next_check_pos' and 'check' is greater than some constant value // (e.g. 0.9), // new 'next_check_pos' index is written by 'check'. if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95) nextCheckPos = pos; used[begin] = true; size = (size > begin + siblings.get(siblings.size() - 1).code + 1) ? 
size : begin + siblings.get(siblings.size() - 1).code + 1; for (int i = 0; i < siblings.size(); i++) check[begin + siblings.get(i).code] = begin; for (int i = 0; i < siblings.size(); i++) { List<Node> new_siblings = new ArrayList<Node>(); if (fetch(siblings.get(i), new_siblings) == 0) { base[begin + siblings.get(i).code] = (value != null) ? (-value[siblings .get(i).left] - 1) : (-siblings.get(i).left - 1); if (value != null && (-value[siblings.get(i).left] - 1) >= 0) { error_ = -2; return 0; } progress++; // if (progress_func_) (*progress_func_) (progress, // keySize); } else { int h = insert(new_siblings); base[begin + siblings.get(i).code] = h; } } return begin; } public DoubleArrayTrie() { check = null; base = null; used = null; size = 0; allocSize = 0; // no_delete_ = false; error_ = 0; } // no deconstructor // set_result omitted // the search methods returns (the list of) the value(s) instead // of (the list of) the pair(s) of value(s) and length(s) // set_array omitted // array omitted void clear() { // if (! 
no_delete_) check = null; base = null; used = null; allocSize = 0; size = 0; // no_delete_ = false; } public int getUnitSize() { return UNIT_SIZE; } public int getSize() { return size; } public int getTotalSize() { return size * UNIT_SIZE; } public int getNonzeroSize() { int result = 0; for (int i = 0; i < size; i++) if (check[i] != 0) result++; return result; } public int build(List<String> key) { return build(key, null, null, key.size()); } public int build(List<String> _key, int _length[], int _value[], int _keySize) { if (_keySize > _key.size() || _key == null) return 0; // progress_func_ = progress_func; key = _key; length = _length; keySize = _keySize; value = _value; progress = 0; resize(65536 * 32); base[0] = 1; nextCheckPos = 0; Node root_node = new Node(); root_node.left = 0; root_node.right = keySize; root_node.depth = 0; List<Node> siblings = new ArrayList<Node>(); fetch(root_node, siblings); insert(siblings); // size += (1 << 8 * 2) + 1; // ??? // if (size >= allocSize) resize (size); used = null; key = null; return error_; } public void open(String fileName) throws IOException { File file = new File(fileName); size = (int) file.length() / UNIT_SIZE; check = new int[size]; base = new int[size]; DataInputStream is = null; try { is = new DataInputStream(new BufferedInputStream( new FileInputStream(file), BUF_SIZE)); for (int i = 0; i < size; i++) { base[i] = is.readInt(); check[i] = is.readInt(); } } finally { if (is != null) is.close(); } } public void save(String fileName) throws IOException { DataOutputStream out = null; try { out = new DataOutputStream(new BufferedOutputStream( new FileOutputStream(fileName))); for (int i = 0; i < size; i++) { out.writeInt(base[i]); out.writeInt(check[i]); } out.close(); } finally { if (out != null) out.close(); } } public int exactMatchSearch(String key) { return exactMatchSearch(key, 0, 0, 0); } public int exactMatchSearch(String key, int pos, int len, int nodePos) { if (len <= 0) len = key.length(); if (nodePos <= 
0) nodePos = 0; int result = -1; char[] keyChars = key.toCharArray(); int b = base[nodePos]; int p; for (int i = pos; i < len; i++) { p = b + (int) (keyChars[i]) + 1; if (b == check[p]) b = base[p]; else return result; } p = b; int n = base[p]; if (b == check[p] && n < 0) { result = -n - 1; } return result; } public List<Integer> commonPrefixSearch(String key) { return commonPrefixSearch(key, 0, 0, 0); } public List<Integer> commonPrefixSearch(String key, int pos, int len, int nodePos) { if (len <= 0) len = key.length(); if (nodePos <= 0) nodePos = 0; List<Integer> result = new ArrayList<Integer>(); char[] keyChars = key.toCharArray(); int b = base[nodePos]; int n; int p; for (int i = pos; i < len; i++) { p = b; n = base[p]; if (b == check[p] && n < 0) { result.add(-n - 1); } p = b + (int) (keyChars[i]) + 1; if (b == check[p]) b = base[p]; else return result; } p = b; n = base[p]; if (b == check[p] && n < 0) { result.add(-n - 1); } return result; } // debug public void dump() { for (int i = 0; i < size; i++) { System.err.println("i: " + i + " [" + base[i] + ", " + check[i] + "]"); } } }
测试代码:
package com.trie; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; public class TestTrie { public static void sort() throws IOException { List<String> words = new ArrayList<String>(); String line; BufferedReader reader = new BufferedReader(new FileReader("small.dic")); while ((line = reader.readLine()) != null) { words.add(line); } reader.close(); // 这个字典如果要加入新词必须按字典序,参考下面的代码 Collections.sort(words); BufferedWriter writer = new BufferedWriter(new FileWriter("small.dic", false)); for (String w : words) { writer.write(w); writer.newLine(); } writer.flush(); writer.close(); } public static void main(String[] args) throws IOException { sort(); BufferedReader reader = new BufferedReader(new FileReader("small.dic")); String line; List<String> words = new ArrayList<String>(); Set<Character> charset = new HashSet<Character>(); while ((line = reader.readLine()) != null) { words.add(line); // 制作一份码表debug for (char c : line.toCharArray()) { charset.add(c); } } reader.close(); System.out.println("字典词条:" + words.size()); { String infoCharsetValue = ""; String infoCharsetCode = ""; for (Character c : charset) { infoCharsetValue += c.charValue() + " "; infoCharsetCode += (int)c.charValue() + " "; } infoCharsetValue += '\n'; infoCharsetCode += '\n'; System.out.print(infoCharsetValue); System.out.print(infoCharsetCode); } DoubleArrayTrie dat = new DoubleArrayTrie(); System.out.println("是否错误: " + dat.build(words)); System.out.println(dat); //List<Integer> integerList = dat.commonPrefixSearch("一举成名天下知"); List<Integer> integerList = dat.commonPrefixSearch("华夏民族"); for (int index : integerList) { System.out.println(words.get(index)); } } }
small.dic文本如下:
一举
一举一动
一举成名
一举成名天下知
万能
万能胶
中华人民共和国
华夏民族
夏天的花草真绿
我是一个中国人
花香四溢