Trie
import java.util.TreeMap;

/**
 * A trie (prefix tree) that stores strings one {@code char} per edge.
 *
 * <p>Children of a node are kept in a {@link TreeMap}, so they are visited in
 * ascending character order. All methods that take a {@code String} dereference
 * it immediately and therefore throw {@link NullPointerException} for
 * {@code null} input.
 */
public class Trie {

    /**
     * A single trie node. Declared {@code static} so that nodes do not each
     * carry a hidden reference to the enclosing {@code Trie} instance.
     */
    private static class Node {

        /** True when the path from the root to this node spells a stored word. */
        public boolean isWord;

        /** Child nodes, keyed by the next character. */
        public TreeMap<Character, Node> next;

        public Node(boolean isWord) {
            this.isWord = isWord;
            next = new TreeMap<>();
        }

        public Node() {
            this(false);
        }
    }

    /** Root node; it represents the empty string. */
    public Node root;

    /** Number of distinct words stored. */
    private int size;

    /** Creates an empty trie. */
    public Trie() {
        root = new Node();
        size = 0;
    }

    /**
     * Returns the number of distinct words stored in the trie.
     *
     * @return the word count
     */
    public int getSize() {
        return size;
    }

    /**
     * Adds {@code word} to the trie. Adding a word that is already present
     * leaves the size unchanged.
     *
     * @param word the word to add; the empty string is accepted and marks the root
     */
    public void add(String word) {
        Node cur = root;
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            // Look the child up once and reuse it instead of querying the map twice.
            Node child = cur.next.get(c);
            if (child == null) {
                child = new Node();
                cur.next.put(c, child);
            }
            cur = child;
        }
        if (!cur.isWord) {
            cur.isWord = true;
            size++;
        }
    }

    /**
     * Reports whether {@code word} was previously added as a complete word.
     *
     * @param word the word to look up
     * @return {@code true} if the exact word is stored
     */
    public boolean contains(String word) {
        Node end = find(word);
        return end != null && end.isWord;
    }

    /**
     * Reports whether the trie has a path spelling {@code prefix}.
     * The empty prefix always yields {@code true}, even for an empty trie.
     *
     * @param prefix the prefix to look up
     * @return {@code true} if every character of {@code prefix} can be followed from the root
     */
    public boolean isPrefix(String prefix) {
        return find(prefix) != null;
    }

    /**
     * Reports whether some stored word matches {@code word}, where each
     * {@code '.'} in {@code word} stands for exactly one arbitrary character.
     *
     * @param word the pattern to match
     * @return {@code true} if at least one stored word matches the pattern
     */
    public boolean match(String word) {
        return match(root, word, 0);
    }

    /**
     * Recursive pattern match starting at {@code node} and position
     * {@code index} of {@code word}. A {@code '.'} matches any single child
     * character; every other character must match exactly.
     *
     * @param node  the node to continue matching from
     * @param word  the pattern
     * @param index the position in {@code word} of the next character to match
     * @return {@code true} if the remainder of the pattern leads to a node marked as a word
     */
    public boolean match(Node node, String word, int index) {
        if (index == word.length()) {
            return node.isWord;
        }

        char c = word.charAt(index);
        if (c != '.') {
            Node child = node.next.get(c);
            return child != null && match(child, word, index + 1);
        }

        // Wildcard: try every child, in ascending key order.
        for (Node child : node.next.values()) {
            if (match(child, word, index + 1)) {
                return true;
            }
        }
        return false;
    }

    /** Follows {@code s} from the root; returns the node reached, or null if the path is missing. */
    private Node find(String s) {
        Node cur = root;
        for (int i = 0; i < s.length(); i++) {
            cur = cur.next.get(s.charAt(i));
            if (cur == null) {
                return null;
            }
        }
        return cur;
    }
}
import java.util.ArrayList; public class Main { public static void main(String[] args) { System.out.println("Pride and Prejudice"); ArrayList<String> words = new ArrayList<>(); if(FileOperation.readFile("pride-and-prejudice.txt", words)) { long startTime = System.nanoTime(); Trie trie = new Trie(); for(String word : words) { trie.add(word); } for(String word : words) { trie.contains(word); } long endTime = System.nanoTime(); // 纳秒向秒的转换 double time = (endTime - startTime) / 1000000000.0; System.out.println("Total different words: " + trie.getSize()); System.out.println("Trie: " + time + " s"); } } }
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Scanner;

/** File-related helper operations. */
public class FileOperation {

    /**
     * Reads the file named {@code filename} as UTF-8 and appends every word it
     * contains to {@code words}, lower-cased.
     *
     * <p>A "word" here is a maximal run of characters for which
     * {@link Character#isLetter(char)} is true. This tokenization is
     * deliberately simple and ignores many real text-processing concerns; it
     * is intended for demo use only.
     *
     * @param filename path of the file to read
     * @param words    list that receives the words, in order of appearance
     * @return {@code true} if the file was read (even if it was empty);
     *         {@code false} if an argument is {@code null}, the file does not
     *         exist, or it cannot be opened
     */
    public static boolean readFile(String filename, ArrayList<String> words) {

        if (filename == null || words == null) {
            System.out.println("filename is null or words is null");
            return false;
        }

        File file = new File(filename);
        if (!file.exists()) {
            return false;
        }

        // Read the whole file. try-with-resources closes the scanner, and with
        // it the underlying file stream, on every exit path.
        String contents;
        try (Scanner scanner =
                 new Scanner(new BufferedInputStream(new FileInputStream(file)), "UTF-8")) {
            scanner.useLocale(Locale.ENGLISH);
            if (!scanner.hasNextLine()) {
                // Empty file: nothing to tokenize, but the read itself succeeded.
                return true;
            }
            // "\\A" anchors at the start of input, so next() returns everything.
            contents = scanner.useDelimiter("\\A").next();
        } catch (IOException ioe) {
            System.out.println("Cannot open " + filename);
            return false;
        }

        // Simple tokenization: walk from one letter run to the next.
        int start = firstCharacterIndex(contents, 0);
        while (start < contents.length()) {
            int end = start + 1;
            while (end < contents.length() && Character.isLetter(contents.charAt(end))) {
                end++;
            }
            // A fixed locale keeps lower-casing independent of the JVM default
            // (for example, Turkish would map 'I' to a dotless 'ı').
            words.add(contents.substring(start, end).toLowerCase(Locale.ENGLISH));
            start = firstCharacterIndex(contents, end);
        }

        return true;
    }

    /**
     * Returns the index of the first letter in {@code s} at or after
     * {@code start}, or {@code s.length()} if there is none.
     */
    private static int firstCharacterIndex(String s, int start) {
        for (int i = start; i < s.length(); i++) {
            if (Character.isLetter(s.charAt(i))) {
                return i;
            }
        }
        return s.length();
    }
}