动态规划分词(结巴分词算法)
看了好几次结巴的算法, 总也记不住, 还是得自己写一遍才能真正明白.
其实也不难, 就是动态规划算法: 先把所有可能的分词路径都找出来, 然后选出概率最大的那条路径作为分词结果.
每个路径的概率=该路径上所有词的概率乘积, 取log之后就变成各个词的log概率之和; 每个词的log概率=log(freq/total), 其中freq是该词的词频, total是所有词的总词频.
/**
 * Segments {@code str} into terms by choosing the highest-scoring path through
 * the graph of dictionary matches, computed with dynamic programming.
 *
 * <p>The score of a path is the sum of {@code TRIE.weight(...)} over its terms
 * (presumably each weight is {@code log(freq / total)}; confirm against TRIE).
 *
 * <p>Example input: 研究生命的起源
 *
 * @param str the text to segment; must not be null
 * @return the terms in order of appearance; they are contiguous substrings that
 *         together cover {@code str}. Empty when {@code str} is empty.
 */
private List<String> seg(String str) {
    char[] chars = str.toCharArray();
    int n = chars.length;

    // Step 1: for every start offset, collect the end offsets of candidate terms.
    // For the example input the edges look like this:
    // 0 [3, 1, 2]
    // 1 [2]
    // 2 [4, 3]
    // 3 [4]
    // 4 [5]
    // 5 [7, 6]
    // 6 [7]
    IntArray[] paths = new IntArray[n];
    for (int i = 0; i < n; i++) {
        IntArray path = new IntArray(1);
        // TRIE.maxMatch is not visible here. If it can report 0 (or less) for a
        // character that is not in the dictionary, an unclamped value would add
        // the self-edge i -> i and the final walk would never advance. Clamping
        // to 1 falls back to a single-character term in that case.
        int max = Math.max(1, TRIE.maxMatch(chars, i, n - i));
        path.add(i + max);
        // The longest match is already recorded; also record every shorter prefix
        // that is a dictionary entry.
        for (int j = 1; j < max; j++) {
            if (TRIE.contains(chars, i, j)) {
                path.add(i + j);
            }
        }
        paths[i] = path;
    }

    // Step 2: bottom-up dynamic programming. Working from the end of the text,
    // compute the best score reachable from each offset and remember which edge
    // achieves it. For the example input nexts looks like: [2, 2, 4, 4, 5, 7, 7]
    // maxScores[n] keeps its default of 0: nothing is left to segment there.
    float[] maxScores = new float[n + 1];
    int[] nexts = new int[n];
    for (int i = n - 1; i >= 0; i--) {
        IntArray candidates = paths[i];
        float maxScore = Float.NEGATIVE_INFINITY;
        // Start from the first candidate (always > i) rather than 0, so that a
        // row whose scores are all -Infinity or NaN still moves forward.
        int next = candidates.get(0);
        for (int j = 0; j < candidates.size(); j++) {
            int possibleNext = candidates.get(j);
            float score = TRIE.weight(chars, i, possibleNext - i) + maxScores[possibleNext];
            if (score > maxScore) {
                maxScore = score;
                next = possibleNext;
            }
        }
        maxScores[i] = maxScore;
        nexts[i] = next;
    }

    // Step 3: follow the recorded edges from offset 0 and cut out each term.
    List<String> terms = new ArrayList<>(4);
    int current = 0;
    while (current < n) {
        int next = nexts[current];
        terms.add(str.substring(current, next));
        current = next;
    }
    return terms;
}