后缀树(Suffix Tree)的文本匹配算法

后缀树(Suffix Tree)是一种特殊的Trie,它的用途非常广泛,其中一个主要的应用是文本匹配。与KMP等算法一样,它也是空间换时间的一个典范。利用Suffix Tree做文本匹配与其他模式匹配算法(比如KMP和Boyer-Moore算法)的主要区别是,后缀树文本匹配算法是对文本T做预处理,而KMP等算法是对模式串P做预处理。因此后缀树常用于文本静态,而模式串动态的场合;而KMP等算法常用于文本动态,模式串静态的场合。设T的长度为n,P的长度为m,一般情况下m<n。在预处理阶段,构建Suffix Tree的复杂度为O(n)(采用Ukkonen等线性时间构建算法时),而KMP和Boyer-Moore的预处理复杂度为O(m)。可是预处理结束后,KMP等算法的匹配复杂度为O(n),而后缀树匹配算法的复杂度只有O(m),这是令人惊叹的效率!


本文后缀树用蛮力法构建(逐个插入文本的所有后缀,构建时间为O(n^2),而不是上文提到的线性时间),跟构建前缀树Patricia Trie类似。后缀树用Patricia Trie压缩存储的好处是,Patricia Trie存储空间只与单词的个数相关(因为有了压缩),而普通的Trie的存储空间与单词的总长度相关(因为没有压缩)。一个文本text的所有后缀总长度为n + (n-1) + ... + 1 = n(n+1)/2,如果用普通的Trie存储后缀树,所需空间为O(n^2);而用Patricia Trie压缩之后的为O(n),这里n为后缀的个数。没有使用压缩存储的后缀树叫做Suffix Trie,而不是Suffix Tree。一般情况下,使用压缩方式存储后缀树是最基本的要求。


在下面的实现中,利用Patricia Trie来构造后缀树,每一个结点除了存储Patricia Trie的key值之外,还存储了该结点key值在文本text中出现的最小下标值minStartIndex,这样便于匹配时输出成功匹配的位置。另外,出于实际应用考虑,后缀树在叶子结点中不必要存储value。除了没有delete操作(文本是静态的,不需要修改)之外,建树操作(insert)和查询匹配(find)操作跟Patricia Trie的实现差别不大。

实现:

import java.util.LinkedList;
import java.util.List;
 
/**
 * 
 * Suffix-Tree String Pattern Matching(Building tree using brute-force)
 *  
 * Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)
 * Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php) 
 * 
 * @author ljs
 * 2011-06-27
 *
 */
/**
 * Compressed suffix tree (Patricia-trie style) over a fixed text, built by
 * brute force: every suffix of the text is inserted one after another.
 *
 * <p>Each node stores the edge label leading to it ({@code key}) and the
 * smallest text position at which that label was seen on this path
 * ({@code minStartIndex}). {@link #find(String)} uses that position to report
 * where a matched pattern starts in the text.
 *
 * <p>Invariant relied on by both lookup and insertion: the children of every
 * node are kept sorted by the first character of their key, and no two
 * siblings share a first character.
 */
public class SuffixTree {

    /**
     * A node of the tree. It is a static nested class so that nodes do not
     * carry a hidden reference to the enclosing {@code SuffixTree}.
     */
    private static final class SuffixNode {
        /** Edge label from the parent to this node; empty only for the root. */
        private String key;
        /** Children, sorted by the first character of their key. */
        private List<SuffixNode> children = new LinkedList<SuffixNode>();
        /** True when an inserted key ends exactly at this node (printed as "#"). */
        private boolean terminal;
        /** Smallest text index at which this node's key starts; -1 for the root. */
        private int minStartIndex;

        /** Creates the root node: empty key, no position. */
        SuffixNode() {
            this("", -1, false);
        }

        SuffixNode(String key, int minStartIndex, boolean terminal) {
            this.key = key;
            this.minStartIndex = minStartIndex;
            this.terminal = terminal;
        }

        @Override
        public String toString() {
            return this.key + "[" + this.minStartIndex + "]" + (this.terminal ? "#" : "")
                    + "(" + children.size() + ")";
        }
    }

    private SuffixNode root;
    private final String text;
    /** Set once {@link #buildSuffixTree()} has inserted every suffix of the text. */
    private boolean built;

    /**
     * Creates an empty tree for {@code text}. Nothing is indexed until
     * {@link #buildSuffixTree()} is called.
     *
     * @param text the text to index; it is dereferenced by
     *             {@link #buildSuffixTree()}, so it must not be null by then
     */
    public SuffixTree(String text) {
        this.text = text;
    }

    /**
     * Looks up {@code pattern} among the inserted suffixes.
     *
     * <p>The walk is iterative and compares against {@code pattern} in place,
     * so no per-level substring is allocated and deep trees cannot overflow
     * the call stack.
     *
     * @param pattern the string to search for
     * @return the start index of the matched substring, or -1 if the pattern
     *         is null, empty, or not found, or if nothing has been inserted yet
     */
    public int find(String pattern) {
        if (pattern == null || pattern.length() == 0 || root == null) {
            return -1;
        }

        SuffixNode node = root;
        int depth = 0; // number of pattern characters already matched above 'node'
        while (true) {
            SuffixNode edge = childStartingWith(node, pattern.charAt(depth));
            if (edge == null) {
                return -1;
            }

            int remaining = pattern.length() - depth;
            int overlap = Math.min(edge.key.length(), remaining);
            if (!edge.key.regionMatches(0, pattern, depth, overlap)) {
                // The pattern diverges from the label in the middle of this edge.
                return -1;
            }
            if (remaining <= edge.key.length()) {
                // The pattern ends on this edge. minStartIndex is where this
                // edge's label starts in the text, so step back over the
                // characters matched on the way down.
                return edge.minStartIndex - depth;
            }

            depth += edge.key.length();
            node = edge;
        }
    }

    /**
     * Returns the child of {@code parent} whose key begins with {@code first},
     * or null if there is none. Stops early once a larger first character is
     * seen, because siblings are sorted.
     */
    private static SuffixNode childStartingWith(SuffixNode parent, char first) {
        for (SuffixNode child : parent.children) {
            char lead = child.key.charAt(0);
            if (lead == first) {
                return child;
            }
            if (lead > first) {
                return null;
            }
        }
        return null;
    }

    /**
     * Splits the edge leading to {@code node} after {@code at} characters.
     * The tail of the label, the terminal flag and the existing children move
     * to a new lower node; {@code node} keeps the head of the label, becomes
     * non-terminal and gets the lower node as its only child.
     *
     * @return the newly created lower node
     */
    private static SuffixNode splitEdge(SuffixNode node, int at) {
        SuffixNode lower = new SuffixNode(
                node.key.substring(at), node.minStartIndex + at, node.terminal);
        lower.children = node.children; // the subtree moves down unchanged

        node.key = node.key.substring(0, at);
        node.terminal = false;
        node.children = new LinkedList<SuffixNode>();
        node.children.add(lower);
        return lower;
    }

    /**
     * Inserts one suffix into the tree, creating the root on first use.
     * A null or empty suffix is ignored.
     *
     * @param suffix     the suffix to insert
     * @param startIndex index in the text at which {@code suffix} starts
     * @throws IllegalArgumentException if exactly this key was inserted before
     * @throws Exception declared only so existing callers keep compiling; no
     *                   checked exception is actually thrown
     */
    public void insert(String suffix, int startIndex) throws Exception {
        if (suffix == null || suffix.length() == 0) {
            return;
        }
        if (root == null) {
            root = new SuffixNode();
        }

        SuffixNode node = root;
        int offset = 0; // characters of 'suffix' already consumed above 'node'
        while (true) {
            char lead = suffix.charAt(offset);
            int textPos = startIndex + offset; // text index of suffix.charAt(offset)

            // Find the child sharing the first character, or the slot at which
            // a new child must go to keep the siblings sorted.
            int slot = 0;
            SuffixNode edge = null;
            for (SuffixNode child : node.children) {
                char c = child.key.charAt(0);
                if (c == lead) {
                    edge = child;
                }
                if (c >= lead) {
                    break;
                }
                slot++;
            }
            if (edge == null) {
                // No sibling shares the first character: add a new leaf.
                node.children.add(slot, new SuffixNode(suffix.substring(offset), textPos, true));
                return;
            }

            int remaining = suffix.length() - offset;
            int limit = Math.min(edge.key.length(), remaining);
            int common = 1; // the first character is already known to match
            while (common < limit && suffix.charAt(offset + common) == edge.key.charAt(common)) {
                common++;
            }

            if (common < limit) {
                // Mismatch inside the edge, e.g. edge "abc", key "abd":
                // split into "ab" -> {"c" (old subtree), "d#" (new leaf)}.
                SuffixNode lower = splitEdge(edge, common);
                edge.minStartIndex = Math.min(edge.minStartIndex, textPos);
                SuffixNode leaf = new SuffixNode(
                        suffix.substring(offset + common), textPos + common, true);
                if (leaf.key.charAt(0) < lower.key.charAt(0)) {
                    edge.children.add(0, leaf);
                } else {
                    edge.children.add(leaf);
                }
                return;
            }

            if (remaining < edge.key.length()) {
                // The key ends inside the edge, e.g. edge "abc", key "ab":
                // split into "ab#" -> "c" (old subtree).
                splitEdge(edge, common);
                edge.terminal = true;
                edge.minStartIndex = Math.min(edge.minStartIndex, textPos);
                return;
            }

            if (remaining == edge.key.length()) {
                // The key ends exactly at this node.
                if (edge.terminal) {
                    throw new IllegalArgumentException("Duplicate Key is found when insertion!");
                }
                edge.terminal = true;
                edge.minStartIndex = Math.min(edge.minStartIndex, textPos);
                return;
            }

            // The key continues past this edge: record the position and descend.
            edge.minStartIndex = Math.min(edge.minStartIndex, textPos);
            offset += edge.key.length();
            node = edge;
        }
    }

    /**
     * Builds the tree by inserting every suffix of the text, longest first.
     * Calling it again after a successful build does nothing; re-inserting the
     * same suffixes would otherwise fail as duplicates.
     *
     * @throws IllegalArgumentException if a suffix of the text was already
     *                                  inserted manually via {@link #insert}
     * @throws Exception declared only so existing callers keep compiling
     */
    public void buildSuffixTree() throws Exception {
        if (built) {
            return;
        }
        for (int i = 0; i < text.length(); i++) {
            this.insert(text.substring(i), i);
        }
        built = true;
    }

    /**
     * Prints the tree to standard output, one node per line (test helper).
     * Prints nothing if no suffix has been inserted yet.
     */
    public void printTree() {
        if (this.root == null) {
            return; // nothing built yet; avoid dereferencing a null root
        }
        this.print(0, this.root);
    }

    /**
     * Prints {@code node} and its subtree. Each line is {@code level} spaces,
     * "|", {@code level} dashes, then "key[minStartIndex]" with a trailing "#"
     * for terminal nodes.
     */
    private void print(int level, SuffixNode node) {
        StringBuilder line = new StringBuilder();
        for (int i = 0; i < level; i++) {
            line.append(' ');
        }
        line.append('|');
        for (int i = 0; i < level; i++) {
            line.append('-');
        }
        line.append(node.key).append('[').append(node.minStartIndex).append(']');
        if (node.terminal) {
            line.append('#');
        }
        System.out.println(line);
        for (SuffixNode child : node.children) {
            print(level + 1, child);
        }
    }

    /**
     * Runs {@link #find(String)} for {@code pattern} and prints the result.
     */
    public void testFind(String pattern) {
        int index = this.find(pattern);
        if (index != -1) {
            System.out.format("Found pattern \"%s\" at: %s%n", pattern, index);
        } else {
            System.out.format("Found no such pattern: \"%s\"%n", pattern);
        }
    }

    /** Builds a tree for {@code text} and reports where {@code pattern} occurs. */
    private static void demo(String text, String pattern) throws Exception {
        System.out.println("****************************");
        SuffixTree tree = new SuffixTree(text);
        tree.buildSuffixTree();
        tree.testFind(pattern);
    }

    public static void main(String[] args) throws Exception {
        // test suffix-tree
        System.out.println("****************************");
        SuffixTree strie = new SuffixTree("minimize");
        strie.buildSuffixTree();
        strie.printTree();

        System.out.println("****************************");
        strie = new SuffixTree("mississippi");
        strie.buildSuffixTree();
        strie.printTree();

        String[] patterns = {
            "iss", "ip", "pi", "miss", "tt", "si", "ssi", "sissippi", "ssippi"
        };
        for (String pattern : patterns) {
            strie.testFind(pattern);
        }

        demo("After a long text, here's a needle ZZZZZ", "ZZZZZ");
        demo("The quick brown fox jumps over the lazy dog.", "lazy");
        demo("Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna...", "tempor");
        demo("GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA", "GCAGAGAG");
    }
}

测试输出:

****************************
|[-1]
 |-e[7]#
 |-i[1]
  |--mize[4]#
  |--nimize[2]#
  |--ze[6]#
 |-mi[0]
  |--nimize[2]#
  |--ze[6]#
 |-nimize[2]#
 |-ze[6]#
****************************
|[-1]
 |-i[1]#
  |--ppi[8]#
  |--ssi[2]
   |---ppi[8]#
   |---ssippi[5]#
 |-mississippi[0]#
 |-p[8]
  |--i[10]#
  |--pi[9]#
 |-s[2]
  |--i[4]
   |---ppi[8]#
   |---ssippi[5]#
  |--si[3]
   |---ppi[8]#
   |---ssippi[5]#
Found pattern "iss" at: 1
Found pattern "ip" at: 7
Found pattern "pi" at: 9
Found pattern "miss" at: 0
Found no such pattern: "tt"
Found pattern "si" at: 3
Found pattern "ssi" at: 2
Found pattern "sissippi" at: 3
Found pattern "ssippi" at: 5
****************************
Found pattern "ZZZZZ" at: 35
****************************
Found pattern "lazy" at: 35
****************************
Found pattern "tempor" at: 73
****************************
Found pattern "GCAGAGAG" at: 23

posted @ 2011-06-30 16:23  ljsspace  阅读(2281)  评论(0编辑  收藏  举报