正向最大长度匹配与逆向最大长度匹配

package com.smart.basic;


/**
 * A ternary search trie keyed by the characters of the inserted words.
 *
 * <p>Every node holds one character and three links: {@code loNode} for
 * characters that compare lower, {@code hiNode} for characters that compare
 * higher, and {@code eqNode} for the next character of the key.
 */
public class TernarySearchTrie {
    /** A single node of the trie. */
    public final class TSTNode {
        /**
         * Value attached to the node, e.g. the word text, part of speech or
         * frequency. Stays {@code null} until a caller assigns it.
         */
        public String data = null;
        /** Child for characters lower than {@link #splitchar}. */
        protected TSTNode loNode;
        /** Child for the next character of the key once {@link #splitchar} matched. */
        protected TSTNode eqNode;
        /** Child for characters higher than {@link #splitchar}. */
        protected TSTNode hiNode;

        /** The character this node represents. */
        protected char splitchar;

        /**
         * Creates a node for one character.
         *
         * @param splitchar the character represented by this node
         */
        protected TSTNode(char splitchar) {
            this.splitchar = splitchar;
        }

        @Override
        public String toString() {
            return "splitchar:" + splitchar;
        }
    }

    /** Root of the trie; {@code null} until the first word is added. */
    protected TSTNode rootNode;

    /**
     * Looks up a word.
     *
     * @param word the word to look up
     * @return the node reached by the last character of {@code word}, or
     *         {@code null} if {@code word} is null, empty, or its characters
     *         cannot all be followed through the trie
     */
    protected TSTNode getNode(String word) {
        if (word == null || word.length() == 0) {
            return null;
        }
        final int len = word.length();
        int charIndex = 0; // position in word of the character being compared
        TSTNode currentNode = rootNode;
        while (currentNode != null) {
            int charComp = word.charAt(charIndex) - currentNode.splitchar;
            if (charComp == 0) {
                charIndex++;
                if (charIndex == len) {
                    // Every character matched; this node ends the word.
                    return currentNode;
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                currentNode = currentNode.loNode;
            } else {
                currentNode = currentNode.hiNode;
            }
        }
        // Ran off the trie before consuming the whole word.
        return null;
    }

    /**
     * Adds a word to the trie, creating any missing nodes along its path.
     * Adding a word that is already present returns the existing end node.
     *
     * @param word the word to add; must be non-null and non-empty
     * @return the node reached by the last character of {@code word}
     * @throws NullPointerException if {@code word} is null
     * @throws IllegalArgumentException if {@code word} is empty
     */
    protected TSTNode addWord(String word) {
        if (null == word) {
            throw new NullPointerException("空指针异常");
        }
        final int len = word.length();
        if (len == 0) {
            // Reject up front instead of failing inside charAt(0) with an
            // index error; the trie is left untouched.
            throw new IllegalArgumentException("word must not be empty");
        }
        if (null == rootNode) {
            rootNode = new TSTNode(word.charAt(0));
        }
        int charIndex = 0;
        TSTNode currentNode = rootNode;
        while (true) {
            char current = word.charAt(charIndex);
            int charComp = current - currentNode.splitchar;
            if (charComp == 0) {
                charIndex++;
                if (charIndex == len) {
                    return currentNode;
                }
                if (null == currentNode.eqNode) {
                    currentNode.eqNode = new TSTNode(word.charAt(charIndex));
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                if (null == currentNode.loNode) {
                    currentNode.loNode = new TSTNode(current);
                }
                currentNode = currentNode.loNode;
            } else {
                if (null == currentNode.hiNode) {
                    currentNode.hiNode = new TSTNode(current);
                }
                currentNode = currentNode.hiNode;
            }
        }
    }
}
package com.smart.basic;

import java.util.ArrayList;
import java.util.List;

/**
 * Forward maximum-length matching segmenter built on {@link TernarySearchTrie}.
 *
 * <p>Runs of ASCII letters and runs of ASCII digits are emitted as single
 * tokens without consulting the dictionary; everything else is matched
 * against the words stored in the trie.
 */
public class TSTMaxMatch extends TernarySearchTrie {

    /** Returns true for {@code a-z} and {@code A-Z}. */
    private static boolean isAsciiLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /** Returns true for {@code 0-9}. */
    private static boolean isAsciiDigit(char c) {
        return c >= '0' && c <= '9';
    }

    /**
     * Scans a run of ASCII letters.
     *
     * @param start index to start scanning from
     * @param sentence text being scanned
     * @return index just past the run; equals {@code start} if there is no run
     */
    private int matchEnglish(int start, String sentence) {
        int i = start;
        while (i < sentence.length() && isAsciiLetter(sentence.charAt(i))) {
            i++;
        }
        return i;
    }

    /**
     * Scans a run of ASCII digits.
     *
     * @param start index to start scanning from
     * @param sentence text being scanned
     * @return index just past the run; equals {@code start} if there is no run
     */
    private int matchNum(int start, String sentence) {
        int i = start;
        while (i < sentence.length() && isAsciiDigit(sentence.charAt(i))) {
            i++;
        }
        return i;
    }

    /**
     * Splits a sentence into tokens from left to right.
     *
     * <p>At each position the result of {@link #maxMatch(String, int)} is
     * taken; when it is empty the single character at that position becomes
     * a token. The position advances by the length of the returned token, so
     * dictionary payloads are expected to be as long as the key they were
     * stored under.
     *
     * @param sentence text to segment; {@code null} yields an empty list
     * @return the tokens in sentence order
     */
    public List<String> tag(String sentence) {
        List<String> words = new ArrayList<String>();
        if (sentence == null) {
            return words;
        }
        int i = 0;
        while (i < sentence.length()) {
            String w = maxMatch(sentence, i);
            if (!"".equals(w)) {
                words.add(w);
                i += w.length();
            } else {
                // Nothing matched here: emit the character on its own.
                words.add(sentence.substring(i, i + 1));
                i++;
            }
        }
        return words;
    }

    /**
     * Forward maximum-length match starting at {@code offset}.
     *
     * <p>Tries, in order: a run of ASCII letters, a run of ASCII digits, and
     * finally the dictionary. For the dictionary the trie is followed as far
     * as the sentence allows and the {@code data} of the last node on that
     * path whose {@code data} is non-null is returned.
     *
     * @param sentence text to match against
     * @param offset index in {@code sentence} where the match must start
     * @return the matched token, or {@code ""} if nothing matches, if
     *         {@code sentence} is null, or if {@code offset} lies outside
     *         the sentence
     */
    public String maxMatch(String sentence, int offset) {
        if (sentence == null || offset < 0 || offset >= sentence.length()) {
            return "";
        }

        // Letter and digit runs do not depend on the dictionary, so they are
        // recognised even when no word has been added yet.
        int endIndex = matchEnglish(offset, sentence);
        if (endIndex != offset) {
            return sentence.substring(offset, endIndex);
        }

        endIndex = matchNum(offset, sentence);
        if (endIndex != offset) {
            return sentence.substring(offset, endIndex);
        }

        String ret = "";
        TSTNode currentNode = rootNode; // null root simply skips the loop
        int charIndex = offset;
        while (currentNode != null) {
            int charComp = sentence.charAt(charIndex) - currentNode.splitchar;
            if (charComp == 0) {
                charIndex++;
                if (currentNode.data != null) {
                    // Longer matches overwrite shorter ones as we descend.
                    ret = currentNode.data;
                }
                if (charIndex == sentence.length()) {
                    return ret;
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                currentNode = currentNode.loNode;
            } else {
                currentNode = currentNode.hiNode;
            }
        }
        return ret;
    }

    /** Small demonstration: segments a sample sentence and prints the tokens. */
    public static void main(String[] args) {
        TSTMaxMatch tree = new TSTMaxMatch();
        tree.addWord("大学生").data = "大学生";
        tree.addWord("大学").data = "大学";
        tree.addWord("活动中心").data = "活动中心";

        List<String> ret = tree.tag("大学生活动中心");
        for (String word : ret) {
            System.out.println(word);
        }
    }

}
package com.smart.basic;

import java.util.ArrayList;
import java.util.List;

/**
 * Backward maximum-length matching segmenter built on {@link TernarySearchTrie}.
 *
 * <p>The sentence is scanned from its last character towards its first, so
 * dictionary words must be inserted with their characters reversed; the
 * {@code data} of the end node carries the word in reading order (see
 * {@link #main(String[])}).
 */
public class TSTBackMaxMatch extends TernarySearchTrie {

    /** Returns true for {@code a-z}, {@code A-Z} and the apostrophe. */
    private static boolean isEnglishChar(char ch) {
        return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '\'';
    }

    /** Returns true for {@code 0-9}. */
    private static boolean isAsciiDigit(char ch) {
        return ch >= '0' && ch <= '9';
    }

    /**
     * Matches a run of English characters ending at {@code offset}.
     *
     * @param sen sentence characters
     * @param offset index of the last character of the candidate run
     * @return the run in reading order, or {@code ""} if {@code sen[offset]}
     *         is not an English character
     */
    private String matchEnglish(char[] sen, int offset) {
        int i = offset;
        while (i >= 0 && isEnglishChar(sen[i])) {
            i--;
        }
        return subCharArray(sen, i + 1, offset + 1);
    }

    /**
     * Matches a run of digits ending at {@code offset}.
     *
     * @param sen sentence characters
     * @param offset index of the last character of the candidate run
     * @return the run in reading order, or {@code ""} if {@code sen[offset]}
     *         is not a digit
     */
    private String matchNum(char[] sen, int offset) {
        int i = offset;
        while (i >= 0 && isAsciiDigit(sen[i])) {
            i--;
        }
        return subCharArray(sen, i + 1, offset + 1);
    }

    /**
     * Copies part of a character array into a string.
     *
     * @param sen source characters
     * @param start first index to copy (inclusive)
     * @param end index to stop at (exclusive)
     * @return the characters in {@code [start, end)}; {@code ""} when
     *         {@code start == end}
     */
    private String subCharArray(char[] sen, int start, int end) {
        return new String(sen, start, end - start);
    }

    /**
     * Backward maximum-length match ending at {@code offset}.
     *
     * <p>Tries, in order: a run of English characters, a run of digits, and
     * finally the dictionary, walking the trie while moving left through the
     * sentence. The {@code data} of the last node on that path whose
     * {@code data} is non-null wins. If the dictionary yields nothing, the
     * single character at {@code offset} is returned.
     *
     * @param sentence sentence characters
     * @param offset index of the last character of the candidate token
     * @return the matched token, or {@code null} if {@code sentence} is null
     *         or {@code offset} lies outside it
     */
    private String matchLongBackward(char[] sentence, int offset) {
        if (sentence == null || offset < 0 || offset >= sentence.length) {
            return null;
        }

        // Letter and digit runs do not depend on the dictionary, so they are
        // recognised even when no word has been added yet.
        String eng = matchEnglish(sentence, offset);
        if (!"".equals(eng)) {
            return eng;
        }

        String num = matchNum(sentence, offset);
        if (!"".equals(num)) {
            return num;
        }

        String ret = null;
        int charIndex = offset;
        TSTNode currentNode = rootNode; // null root simply skips the loop
        while (currentNode != null && charIndex >= 0) {
            int charComp = sentence[charIndex] - currentNode.splitchar;
            if (charComp == 0) {
                if (currentNode.data != null) {
                    // Longer matches overwrite shorter ones as we descend.
                    ret = currentNode.data;
                }
                charIndex--;
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                currentNode = currentNode.loNode;
            } else {
                currentNode = currentNode.hiNode;
            }
        }

        if (ret == null || ret.length() == 0) {
            // No usable dictionary entry: fall back to the single character
            // so the caller always makes progress.
            return subCharArray(sentence, offset, offset + 1);
        }
        return ret;
    }

    /**
     * Splits a sentence into tokens, scanning from the end to the start.
     *
     * <p>Tokens are appended in the order they are found, i.e. the last
     * token of the sentence comes first in the returned list. The position
     * moves left by the length of each returned token, so dictionary payloads
     * are expected to be as long as the key they were stored under.
     *
     * @param sentence text to segment; {@code null} yields an empty list
     * @return the tokens in right-to-left order
     */
    public List<String> tag(String sentence) {
        List<String> list = new ArrayList<String>();
        if (sentence == null) {
            return list;
        }
        char[] sen = sentence.toCharArray();
        int offset = sen.length - 1;
        while (offset >= 0) {
            String word = matchLongBackward(sen, offset);
            if (word == null || word.length() == 0) {
                // Defensive: never stall on a position that produced nothing.
                offset--;
                continue;
            }
            list.add(word);
            offset -= word.length();
        }
        return list;
    }

    /** Small demonstration: segments a sample sentence and prints the tokens. */
    public static void main(String[] args) {
        TSTBackMaxMatch tree = new TSTBackMaxMatch();
        // Keys are stored reversed; data keeps the reading-order form.
        tree.addWord("心中").data = "中心";
        tree.addWord("动活").data = "活动";
        tree.addWord("生学大").data = "大学生";
        tree.addWord("学大").data = "大学";

        List<String> words = tree.tag("大学生123活动abc中心");
        for (String string : words) {
            System.out.println(string);
        }
    }

}

posted on 2012-10-16 23:18  雨渐渐  阅读(966)  评论(0)  编辑  收藏  举报

导航