![](https://pic002.cnblogs.com/images/2012/316046/2012101623163097.png)
package com.smart.basic;
/**
 * Ternary search trie: every node holds one character and three links.
 *
 * <p>While walking a word, the current character is compared with the node's
 * {@code splitchar}: a smaller character follows {@code loNode}, a greater one
 * follows {@code hiNode}, and an equal one advances to the next character of
 * the word through {@code eqNode}.
 */
public class TernarySearchTrie {

    /**
     * A single node of the trie.
     */
    public final class TSTNode {
        /**
         * Payload of the node (per the original note: the word text, part of
         * speech, frequency, etc.). Stays {@code null} until a caller assigns it.
         */
        public String data = null;

        /** Child followed when the compared character is smaller than {@code splitchar}. */
        protected TSTNode loNode;

        /** Child followed when the compared character equals {@code splitchar}. */
        protected TSTNode eqNode;

        /** Child followed when the compared character is greater than {@code splitchar}. */
        protected TSTNode hiNode;

        /** The character this node represents. */
        protected char splitchar;

        /**
         * Creates a node without children or payload.
         *
         * @param splitchar the character this node represents
         */
        protected TSTNode(char splitchar) {
            this.splitchar = splitchar;
        }

        @Override
        public String toString() {
            return "splitchar:" + splitchar;
        }
    }

    /** Root of the trie; {@code null} until the first word is added. */
    protected TSTNode rootNode;

    /**
     * Looks up the node at which {@code word} ends.
     *
     * <p>The node's {@code data} is not inspected, so a key that is merely a
     * prefix of a stored word also yields a (payload-less) node.
     *
     * @param word the word to look up
     * @return the node reached after the last character of {@code word}, or
     *         {@code null} when {@code word} is {@code null}, empty, or its
     *         character path is not present in the trie
     */
    protected TSTNode getNode(String word) {
        if (word == null || word.length() == 0) {
            return null;
        }
        final int len = word.length();
        TSTNode currentNode = rootNode; // position reached so far in the trie
        int charIndex = 0;              // index of the character being compared
        while (currentNode != null) {
            int charComp = word.charAt(charIndex) - currentNode.splitchar;
            if (charComp == 0) {
                charIndex++;
                if (charIndex == len) {
                    // Every character matched: this is the word's end node.
                    return currentNode;
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                currentNode = currentNode.loNode;
            } else {
                currentNode = currentNode.hiNode;
            }
        }
        // Ran off the trie before consuming the whole word.
        return null;
    }

    /**
     * Adds {@code word} to the trie, creating any missing nodes on its path.
     *
     * <p>The returned node's {@code data} is not modified here; callers attach
     * the payload themselves. Adding a word that is already present simply
     * returns its existing end node.
     *
     * @param word the word to add; must be non-null and non-empty
     * @return the node at which {@code word} ends
     * @throws NullPointerException     if {@code word} is {@code null}
     * @throws IllegalArgumentException if {@code word} is empty
     */
    protected TSTNode addWord(String word) {
        if (null == word) {
            throw new NullPointerException("空指针异常");
        }
        if (word.length() == 0) {
            // An empty word has no character to place in a node; fail with a
            // clear message before the trie is touched instead of letting
            // charAt(0) raise an index error.
            throw new IllegalArgumentException("word must not be empty");
        }
        if (null == rootNode) {
            rootNode = new TSTNode(word.charAt(0));
        }
        final int len = word.length();
        int charIndex = 0;
        TSTNode currentNode = rootNode;
        while (true) {
            char current = word.charAt(charIndex);
            int charComp = current - currentNode.splitchar;
            if (charComp == 0) {
                charIndex++;
                if (charIndex == len) {
                    return currentNode;
                }
                if (null == currentNode.eqNode) {
                    currentNode.eqNode = new TSTNode(word.charAt(charIndex));
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                if (null == currentNode.loNode) {
                    currentNode.loNode = new TSTNode(current);
                }
                currentNode = currentNode.loNode;
            } else {
                if (null == currentNode.hiNode) {
                    currentNode.hiNode = new TSTNode(current);
                }
                currentNode = currentNode.hiNode;
            }
        }
    }
}
package com.smart.basic;
import java.util.ArrayList;
import java.util.List;
/**
 * Forward (left-to-right) maximum-length matching segmenter built on the
 * inherited ternary search trie.
 *
 * <p>Runs of ASCII letters and runs of ASCII digits are emitted as single
 * tokens; everything else is matched against the dictionary, preferring the
 * longest entry that starts at the current position.
 */
public class TSTMaxMatch extends TernarySearchTrie {

    /**
     * Scans a run of ASCII letters.
     *
     * @param start    index at which scanning begins
     * @param sentence text being scanned
     * @return index of the first character at or after {@code start} that is
     *         not an ASCII letter ({@code start} itself when there is no run)
     */
    private int matchEnglish(int start, String sentence) {
        int end = start;
        while (end < sentence.length()) {
            char c = sentence.charAt(end);
            boolean letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
            if (!letter) {
                break;
            }
            end++;
        }
        return end;
    }

    /**
     * Scans a run of ASCII digits.
     *
     * @param start    index at which scanning begins
     * @param sentence text being scanned
     * @return index of the first character at or after {@code start} that is
     *         not an ASCII digit ({@code start} itself when there is no run)
     */
    private int matchNum(int start, String sentence) {
        int end = start;
        while (end < sentence.length()) {
            char c = sentence.charAt(end);
            if (c < '0' || c > '9') {
                break;
            }
            end++;
        }
        return end;
    }

    /**
     * Splits {@code sentence} into tokens from left to right.
     *
     * <p>At each position the result of {@link #maxMatch(String, int)} is
     * used; when it finds nothing, the single character at that position
     * becomes a token. The position then advances by the token's length.
     *
     * @param sentence text to segment; {@code null} is treated as empty
     * @return tokens in sentence order (empty list for {@code null} or empty input)
     */
    public List<String> tag(String sentence) {
        List<String> words = new ArrayList<String>();
        if (sentence == null) {
            // maxMatch already tolerates null; do the same here rather than
            // failing on sentence.length().
            return words;
        }
        int i = 0;
        while (i < sentence.length()) {
            String w = maxMatch(sentence, i);
            if ("".equals(w)) {
                // Nothing matched: emit the character on its own so that no
                // input is lost and the scan always advances.
                w = sentence.substring(i, i + 1);
            }
            words.add(w);
            i += w.length();
        }
        return words;
    }

    /**
     * Forward maximum-length match starting at {@code offset}.
     *
     * <p>A run of ASCII letters, or else a run of ASCII digits, starting at
     * {@code offset} is returned verbatim. Otherwise the trie is walked along
     * the sentence and the {@code data} of the deepest node that carries one
     * is returned.
     *
     * @param sentence text to match against
     * @param offset   index in {@code sentence} at which matching starts
     * @return the matched token, or {@code ""} when nothing matches, when
     *         {@code sentence} is {@code null} or empty, when the dictionary
     *         is empty, or when {@code offset} lies outside the sentence
     */
    public String maxMatch(String sentence, int offset) {
        if (sentence == null || rootNode == null || "".equals(sentence)) {
            return "";
        }
        if (offset < 0 || offset >= sentence.length()) {
            // Out-of-range offsets used to reach charAt() and throw; report
            // "no match" like the other degenerate inputs do.
            return "";
        }
        int endIndex = matchEnglish(offset, sentence);
        if (endIndex != offset) {
            return sentence.substring(offset, endIndex);
        }
        endIndex = matchNum(offset, sentence);
        if (endIndex != offset) {
            return sentence.substring(offset, endIndex);
        }
        String ret = "";
        TSTNode currentNode = rootNode;
        int charIndex = offset;
        while (currentNode != null) {
            int charComp = sentence.charAt(charIndex) - currentNode.splitchar;
            if (charComp == 0) {
                charIndex++;
                if (currentNode.data != null) {
                    // Remember the longest dictionary entry seen so far.
                    ret = currentNode.data;
                }
                if (charIndex == sentence.length()) {
                    return ret;
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                currentNode = currentNode.loNode;
            } else {
                currentNode = currentNode.hiNode;
            }
        }
        return ret;
    }

    /**
     * Small demonstration: segments a sample sentence and prints each token.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        TSTMaxMatch tree = new TSTMaxMatch();
        tree.addWord("大学生").data = "大学生";
        tree.addWord("大学").data = "大学";
        tree.addWord("活动中心").data = "活动中心";
        List<String> ret = tree.tag("大学生活动中心");
        for (int i = 0; i < ret.size(); i++) {
            System.out.println(ret.get(i));
        }
    }
}
package com.smart.basic;
import java.util.ArrayList;
import java.util.List;
/**
 * Backward (right-to-left) maximum-length matching segmenter built on the
 * inherited ternary search trie.
 *
 * <p>The trie is walked while moving towards the start of the sentence, so
 * dictionary keys have to be added in reversed character order with the
 * readable form stored in the node's {@code data} (see {@link #main}).
 * {@link #tag(String)} steps back by the length of the returned {@code data},
 * so it assumes {@code data} is as long as the key it was stored under.
 */
public class TSTBackMaxMatch extends TernarySearchTrie {

    /**
     * Matches a run of ASCII letters and apostrophes that ends at {@code offset}.
     *
     * @param sen    characters of the sentence
     * @param offset index of the last character of the candidate run
     * @return the run ending at {@code offset}, or {@code ""} when the
     *         character at {@code offset} does not belong to such a run
     */
    private String matchEnglish(char[] sen, int offset) {
        int i = offset;
        while (i >= 0) {
            char ch = sen[i];
            boolean english = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '\'';
            if (!english) {
                break;
            }
            i--;
        }
        return subCharArray(sen, i + 1, offset + 1);
    }

    /**
     * Matches a run of ASCII digits that ends at {@code offset}.
     *
     * @param sen    characters of the sentence
     * @param offset index of the last character of the candidate run
     * @return the run ending at {@code offset}, or {@code ""} when the
     *         character at {@code offset} is not a digit
     */
    private String matchNum(char[] sen, int offset) {
        int i = offset;
        while (i >= 0) {
            char ch = sen[i];
            if (ch < '0' || ch > '9') {
                break;
            }
            i--;
        }
        return subCharArray(sen, i + 1, offset + 1);
    }

    /**
     * Copies a slice of a character array into a string.
     *
     * @param sen   source characters
     * @param start first index to copy (inclusive)
     * @param end   index after the last character to copy (exclusive)
     * @return the characters in {@code [start, end)}; {@code ""} when
     *         {@code start == end}
     */
    private String subCharArray(char[] sen, int start, int end) {
        char[] chs = new char[end - start];
        if (start != end) {
            System.arraycopy(sen, start, chs, 0, end - start);
        }
        return String.valueOf(chs);
    }

    /**
     * Backward maximum-length match for the token that ends at {@code offset}.
     *
     * <p>A run of letters/apostrophes, or else a run of digits, ending at
     * {@code offset} is returned verbatim. Otherwise the trie is walked while
     * moving towards the start of the sentence and the {@code data} of the
     * deepest node that carries one is returned; when no node on the path
     * carries data, the single character at {@code offset} is returned.
     *
     * @param sentence characters of the sentence
     * @param offset   index of the last character of the token to match
     * @return the matched token, or {@code null} when the dictionary is empty
     *         or {@code sentence} is {@code null} or empty
     */
    private String matchLongBackward(char[] sentence, int offset) {
        if (rootNode == null || sentence == null || sentence.length == 0) {
            return null;
        }
        String eng = matchEnglish(sentence, offset);
        if (!"".equals(eng)) {
            return eng;
        }
        String num = matchNum(sentence, offset);
        if (!"".equals(num)) {
            return num;
        }
        String ret = null;
        int charIndex = offset;
        TSTNode currentNode = rootNode;
        // Stop when the trie path ends or the start of the sentence is passed.
        while (currentNode != null && charIndex >= 0) {
            int charComp = sentence[charIndex] - currentNode.splitchar;
            if (charComp == 0) {
                charIndex--;
                if (currentNode.data != null) {
                    // Remember the longest dictionary entry seen so far.
                    ret = currentNode.data;
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                currentNode = currentNode.loNode;
            } else {
                currentNode = currentNode.hiNode;
            }
        }
        if (ret == null) {
            // No dictionary entry ends here: fall back to the single character.
            return subCharArray(sentence, offset, offset + 1);
        }
        return ret;
    }

    /**
     * Splits {@code sentence} into tokens, scanning from its end to its start.
     *
     * <p>Tokens are appended in the order they are found, i.e. the last token
     * of the sentence comes first in the returned list.
     *
     * @param sentence text to segment; {@code null} is treated as empty
     * @return tokens in right-to-left order (empty list for {@code null} or
     *         empty input)
     */
    public List<String> tag(String sentence) {
        ArrayList<String> list = new ArrayList<String>();
        if (sentence == null) {
            return list;
        }
        char[] sen = sentence.toCharArray();
        int offset = sen.length - 1;
        while (offset >= 0) {
            String word = matchLongBackward(sen, offset);
            if (word == null || word.length() == 0) {
                // No usable match (empty dictionary, or a node whose data is
                // ""): emit the character itself. This keeps every input
                // character in the output and guarantees the offset moves,
                // so the loop cannot spin forever on a zero-length token.
                word = String.valueOf(sen[offset]);
            }
            list.add(word);
            offset -= word.length();
        }
        return list;
    }

    /**
     * Small demonstration: registers reversed keys with their readable form
     * as data, segments a sample sentence and prints each token.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        TSTBackMaxMatch tree = new TSTBackMaxMatch();
        tree.addWord("心中").data = "中心";
        tree.addWord("动活").data = "活动";
        tree.addWord("生学大").data = "大学生";
        tree.addWord("学大").data = "大学";
        List<String> words = tree.tag("大学生123活动abc中心");
        for (String string : words) {
            System.out.println(string);
        }
    }
}