未登录词识别
未登录词识别:识别不在词典中的词,主要包括两类——新词(如“杀马特”)和命名实体(如“奥克兰”)。
主要解决方案:基于规则合词,然后通过百度验证。
Start Char Char 1-2-Combine #[图 n][里 f][市场 n][站 n]
Start Char Char Char 1-3-Combine #
Start Char Char Char Char 1-4-Combine #
Start Char Char Char Char Char 1-5-Combine #
Start Char Char Char Char Char Char 1-6-Combine #
Start Direction Char 1-2-Combine #东澳站 南势站
Start Char Word 1-2-Combine #[台 j][中港 nz][站 n]
Word Char Keyword 0-1-Combine #[梨园 nz][寮 g][站 v][白沙 nz][屯 ng][站 n]
Char Char Keyword 0-1-Combine #[商水县 ns][黄 a][寨 ng][站 n]
NumPrefix Num 0-1-Seq #地五医院
Num NumSuffix 0-1-Seq #93/号/酒家
Num Num 0-1-Combine #
Num Num Num 0-2-Combine #
Num Num Num Num 0-3-Combine #
Num Num Num Num Num 0-4-Combine #
Num Num Num Num Num Num 0-5-Combine #
Num Num Num Num Num Num Num 0-6-Combine #
Num Num Num Num Num Num Num Num 0-7-Combine #
Num Num Num Num Num Num Num Num Num 0-8-Combine #
Num Num Num Num Num Num Num Num Num Num 0-9-Combine #
Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter 0-10-Combine #
Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter 0-9-Combine #
Letter Letter Letter Letter Letter Letter Letter Letter Letter 0-8-Combine #
Letter Letter Letter Letter Letter Letter Letter Letter 0-7-Combine #
Letter Letter Letter Letter Letter Letter Letter 0-6-Combine #
Letter Letter Letter Letter Letter Letter 0-5-Combine #
Letter Letter Letter Letter Letter 0-4-Combine #
Letter Letter Letter Letter 0-3-Combine #
Letter Letter Letter 0-2-Combine #
Letter Letter 0-1-Combine #
Num NumSuffix Keyword 0-1-Seq #海口1号场BLACKSTONE球场
Num Char Char Keyword 0-2-Combine #八里岔中学
Char Num Char Keyword 0-2-Combine #八里岔中学
Char Char Num Keyword 0-2-Combine #八里岔中学
package cn.tianditu.mt.common;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Rule-based token combiner.
 *
 * <p>Rules are read from grammar files. Each rule maps a sequence of
 * {@link SegMarkType} marks to a {@link CombinRule}; the sequences are stored in
 * a ternary search tree rooted at {@link #rootNode}. {@link #tag(List)} scans a
 * token list, finds the longest rule matching at each position and merges the
 * token spans named by that rule.
 */
public class Grammar {
    protected static Log logger = LogFactory.getLog(Grammar.class);

    /** Node of the ternary search tree; one node per mark of a rule sequence. */
    public final class TSTNode {
        /** Rule attached to the sequence ending at this node, or null if no sequence ends here. */
        public CombinRule data = null;
        /** Subtree for marks that compare lower than {@link #splitchar}. */
        protected TSTNode loNode;
        /** Subtree for the next position of sequences that match {@link #splitchar} here. */
        protected TSTNode eqNode;
        /** Subtree for marks that compare higher than {@link #splitchar}. */
        protected TSTNode hiNode;
        /** Mark this node compares against. */
        protected SegMarkType splitchar;

        public TSTNode(SegMarkType type) {
            this.splitchar = type;
        }
    }

    /** Root of the rule tree; null until the first sequence is added. */
    public TSTNode rootNode;

    /**
     * Inserts a mark sequence into the tree, creating nodes as needed.
     *
     * @param word the mark sequence to insert
     * @return the node at which the sequence ends; callers attach the rule to its {@code data}
     * @throws NullPointerException if {@code word} is null
     * @throws IllegalArgumentException if {@code word} is empty
     */
    public TSTNode add(List<SegMarkType> word) {
        if (null == word) {
            throw new NullPointerException("空指针异常");
        }
        if (word.isEmpty()) {
            // Previously surfaced as an IndexOutOfBoundsException from word.get(0).
            throw new IllegalArgumentException("mark sequence must not be empty");
        }
        if (null == rootNode) {
            rootNode = new TSTNode(word.get(0));
        }
        int position = 0;
        TSTNode current = rootNode;
        while (true) {
            SegMarkType mark = word.get(position);
            int cmp = mark.compareTo(current.splitchar);
            if (cmp == 0) {
                position++;
                if (position == word.size()) {
                    return current;
                }
                if (null == current.eqNode) {
                    current.eqNode = new TSTNode(word.get(position));
                }
                current = current.eqNode;
            } else if (cmp < 0) {
                if (null == current.loNode) {
                    current.loNode = new TSTNode(mark);
                }
                current = current.loNode;
            } else {
                if (null == current.hiNode) {
                    current.hiNode = new TSTNode(mark);
                }
                current = current.hiNode;
            }
        }
    }

    /**
     * Looks up the node at which the given mark sequence ends.
     *
     * @param word the mark sequence to look up
     * @return the final node of the sequence, or null if {@code word} is null,
     *         empty, or not present in the tree
     */
    protected TSTNode getNode(List<SegMarkType> word) {
        if (null == word) {
            return null;
        }
        int len = word.size();
        if (len == 0) {
            return null;
        }
        TSTNode current = rootNode; // current position in the tree
        int position = 0;           // index of the mark being compared
        SegMarkType mark = word.get(position);
        while (current != null) {
            int cmp = mark.compareTo(current.splitchar);
            if (cmp == 0) {
                // Equal: advance to the next mark of the key.
                position++;
                if (position == len) {
                    return current; // found
                }
                mark = word.get(position);
                current = current.eqNode;
            } else if (cmp < 0) {
                current = current.loNode; // smaller: go left
            } else {
                current = current.hiNode; // greater: go right
            }
        }
        return null; // not found
    }

    /**
     * Finds the longest rule whose mark sequence matches the token types
     * starting at {@code offset}.
     *
     * @param tokens the token list to match against
     * @param offset index of the first token to match
     * @return the match (rule node plus the index just past the last matched
     *         token), or null if nothing matches, the tree is empty,
     *         {@code tokens} is null, or {@code offset} is outside the list
     */
    public MatchRet matchLong(List<WordInfo> tokens, int offset) {
        if (tokens == null || rootNode == null) {
            return null;
        }
        if (offset < 0 || offset >= tokens.size()) {
            // Nothing can match outside the list; avoids an IndexOutOfBoundsException below.
            return null;
        }
        MatchRet best = null;
        TSTNode current = rootNode;
        int index = offset;
        while (current != null) {
            int cmp = tokens.get(index).getType().compareTo(current.splitchar);
            if (cmp == 0) {
                index++;
                if (current.data != null) {
                    // A rule ends here; keep it, a longer one may still follow.
                    best = new MatchRet(current, index);
                }
                if (index == tokens.size()) {
                    return best;
                }
                current = current.eqNode;
            } else if (cmp < 0) {
                current = current.loNode;
            } else {
                current = current.hiNode;
            }
        }
        return best;
    }

    /**
     * Merges token spans according to the given combine instructions.
     * Several merges per call are supported and each merged token keeps its
     * source tokens (passed to the {@link WordInfo} constructor).
     *
     * @param tokens the original token list
     * @param rules  spans to merge, expressed as absolute token indexes; may be null
     * @return {@code tokens} itself when {@code rules} is null, otherwise a new list
     */
    private List<WordInfo> combineByRules(List<WordInfo> tokens, List<Combin> rules) {
        if (rules == null) {
            return tokens;
        }
        List<WordInfo> merged = new ArrayList<WordInfo>();
        int i = 0;
        // The bound is re-checked after every merge. The previous version fell
        // through to tokens.get(i) after a merge and threw when the merged span
        // ended on the last token.
        while (i < tokens.size()) {
            Combin rule = findRuleStartingAt(rules, i, tokens.size());
            if (rule == null) {
                merged.add(tokens.get(i));
                i++;
                continue;
            }
            int end = rule.getEnd();
            List<WordInfo> sub = tokens.subList(i, end + 1); // subList is end-exclusive
            StringBuilder buff = new StringBuilder();
            for (WordInfo wordInfo : sub) {
                buff.append(wordInfo.getCn());
            }
            merged.add(new WordInfo(buff.toString(), null, rule.getType(), sub));
            i = end + 1;
        }
        return merged;
    }

    /**
     * Returns the first instruction that starts at {@code position} and whose
     * span lies inside a list of {@code size} tokens, or null if there is none.
     * Spans that run backwards or past the end of the list are ignored.
     */
    private Combin findRuleStartingAt(List<Combin> rules, int position, int size) {
        for (Combin candidate : rules) {
            if (candidate.getStart() == position
                    && candidate.getEnd() >= position
                    && candidate.getEnd() < size) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Applies each combine instruction in place, in list order. Only a single
     * level of merging is supported; nested or repeated merges are not.
     *
     * <p>NOTE(review): the indexes of later instructions are not adjusted after
     * an earlier merge shrinks the list - confirm before reusing this method.
     *
     * @param tokens the token list to modify
     * @param rules  spans to merge
     */
    @SuppressWarnings("unused")
    private void CombineOnce(LinkedList<WordInfo> tokens, List<Combin> rules) {
        for (Combin com : rules) {
            int start = com.getStart();
            int end = com.getEnd();
            SegMarkType type = com.getType();
            StringBuilder buff = new StringBuilder();
            for (int i = start; i <= end; i++) {
                buff.append(tokens.get(i).getCn());
            }
            int count = end - start + 1;
            for (int i = 0; i < count; i++) {
                tokens.remove(start);
            }
            tokens.add(start, new WordInfo(buff.toString(), null, type));
        }
    }

    /**
     * Scans the tokens left to right, applies the longest matching rule at each
     * position and returns the token list with the matched spans merged.
     *
     * @param tokens the tokens to process
     * @return the merged token list, or null if {@code tokens} is null or no
     *         rule has been loaded
     */
    public List<WordInfo> tag(List<WordInfo> tokens) {
        if (tokens == null || rootNode == null) {
            return null;
        }
        List<Combin> rules = new ArrayList<Combin>();
        int i = 0;
        while (i < tokens.size()) {
            MatchRet ret = matchLong(tokens, i);
            if (null == ret) {
                i++;
                continue;
            }
            CombinRule rule = ret.getNode().data; // rule stored on the matched tree node
            // Index of the first token covered by the rule's mark sequence.
            int base = ret.getIndex() - rule.getLen();
            for (Combin com : rule.getPosition()) {
                // Translate rule-relative positions into absolute token indexes.
                rules.add(new Combin(base + com.getStart(), base + com.getEnd(), com.getType()));
            }
            i = ret.getIndex();
        }
        return combineByRules(tokens, rules);
    }

    /**
     * Builds the grammar from the two grammar files named by {@code config}:
     * the basic grammar file first, then the regular one.
     *
     * @param config supplies the grammar file names
     */
    public Grammar(Config config) {
        loadGrammar(config.getBasicGramFileName());
        loadGrammar(config.getGramFileName());
    }

    /**
     * Loads rules from a grammar file and adds them to the tree.
     *
     * <p>Each line has tab-separated fields: the space-separated mark sequence,
     * then the combine spec (see {@link #FormRule(String, int)}). Further fields
     * are ignored. Blank lines are skipped; malformed lines are logged and
     * skipped so that one bad line does not discard the rest of the file.
     * A missing or unreadable file is logged and loading stops.
     *
     * @param gramFileName path of the grammar file
     */
    public void loadGrammar(String gramFileName) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(gramFileName));
            String line;
            int lineNo = 0;
            while ((line = reader.readLine()) != null) {
                lineNo++;
                if (line.trim().length() == 0) {
                    continue;
                }
                try {
                    String[] arr = line.split("\t");
                    if (arr.length < 2) {
                        throw new IllegalArgumentException(
                                "expected <mark sequence><TAB><combine spec>");
                    }
                    List<SegMarkType> seq = FormSeq(arr[0]);
                    CombinRule rule = FormRule(arr[1], seq.size());
                    TSTNode node = this.add(seq);
                    node.data = rule;
                } catch (NullPointerException e) {
                    logSkippedLine(gramFileName, lineNo, line, e);
                } catch (IllegalArgumentException e) {
                    // Also covers NumberFormatException and unknown enum names.
                    logSkippedLine(gramFileName, lineNo, line, e);
                }
            }
        } catch (FileNotFoundException e) {
            logger.error("Grammar file not found: " + gramFileName, e);
        } catch (IOException e) {
            logger.error("Failed to read grammar file: " + gramFileName, e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    logger.warn("Failed to close grammar file: " + gramFileName, e);
                }
            }
        }
    }

    /** Logs a grammar line that could not be parsed, together with its cause. */
    private void logSkippedLine(String gramFileName, int lineNo, String line, RuntimeException e) {
        logger.warn("Skipping malformed grammar rule at " + gramFileName + ":" + lineNo
                + " [" + line + "]", e);
    }

    /**
     * Parses a combine spec of the form {@code start-end-Type}, with several
     * specs separated by {@code #}.
     *
     * @param line the combine spec text
     * @param size length of the mark sequence the rule belongs to
     * @return the parsed rule
     * @throws IllegalArgumentException if a spec has fewer than three fields, a
     *         position is not an integer, or the type is not a {@link SegMarkType} name
     */
    private CombinRule FormRule(String line, int size) {
        List<Combin> rec = new ArrayList<Combin>();
        for (String spec : line.split("#")) {
            String[] fields = spec.split("-");
            if (fields.length < 3) {
                throw new IllegalArgumentException(
                        "Malformed combine spec '" + spec + "', expected <start>-<end>-<type>");
            }
            int start = Integer.parseInt(fields[0].trim());
            int end = Integer.parseInt(fields[1].trim());
            SegMarkType type = Enum.valueOf(SegMarkType.class, fields[2].trim());
            rec.add(new Combin(start, end, type));
        }
        return new CombinRule(rec, size);
    }

    /**
     * Parses a space-separated list of {@link SegMarkType} names.
     * Empty entries caused by repeated spaces are ignored.
     *
     * @param string the mark names
     * @return the marks in order
     * @throws IllegalArgumentException if a name is not a {@link SegMarkType} constant
     */
    private List<SegMarkType> FormSeq(String string) {
        List<SegMarkType> list = new ArrayList<SegMarkType>();
        for (String name : string.split(" ")) {
            if (name.length() == 0) {
                continue;
            }
            list.add(Enum.valueOf(SegMarkType.class, name));
        }
        return list;
    }
}