pyhanlp: Chinese Word Segmentation with Hidden Markov Models (HMM)
Contents
1. First- and second-order HMM Chinese word segmentation (hmm_cws.py)
2. Training a bigram model (ngram_segment.py)
3. Standardized evaluation (eval_bigram_cws.py)
4. The Microsoft Research Asia (MSR) corpus (msr.py)
5. Downloading and staging corpus files (test_utility.py)
6. Loading the corpus (demo_corpus_loader.py)
1. First- and second-order HMM Chinese word segmentation (hmm_cws.py)
# - First- and second-order HMM Chinese word segmentation -
# Import everything from pyhanlp
from pyhanlp import *
from eval_bigram_cws import CWSEvaluator
# Paths to the Microsoft Research Asia (MSR) corpus files: dictionary, training, model, test, output, gold
from msr import msr_dict, msr_train, msr_model, msr_test, msr_output, msr_gold
# First-order hidden Markov model
FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')
# Second-order hidden Markov model
SecondOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.SecondOrderHiddenMarkovModel')
# HMM-based segmenter
HMMSegmenter = JClass('com.hankcs.hanlp.model.hmm.HMMSegmenter')
# Train an HMM segmenter (first- or second-order) on a segmented corpus
def train(corpus, model):
    segmenter = HMMSegmenter(model)
    segmenter.train(corpus)
    print(segmenter.segment('圣诞节快到了'))
    return segmenter.toSegment()  # wrap the trained model as a Segment for later use
# Guard so that importing this file elsewhere only pulls in the definitions above
if __name__ == '__main__':
    # Train and demo the first-order HMM
    segment = train(msr_train, FirstOrderHiddenMarkovModel())
    # Train and demo the second-order HMM
    segment = train(msr_train, SecondOrderHiddenMarkovModel())
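Note that hmm_cws.py imports CWSEvaluator but the __main__ block above never scores the models. A minimal sketch of how the two trained segmenters could be evaluated on MSR, using the Segment returned by train:
# Sketch: append inside the __main__ block above to score both HMM orders on MSR.
for model in (FirstOrderHiddenMarkovModel(), SecondOrderHiddenMarkovModel()):
    segment = train(msr_train, model)
    # CWSEvaluator.evaluate(segment, testFile, outputFile, goldFile, dictFile)
    print(CWSEvaluator.evaluate(segment, msr_test, msr_output, msr_gold, msr_dict))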
2. Training a bigram model (ngram_segment.py)
# - Training a bigram model -
from jpype import JString
from pyhanlp import *
from demo_corpus_loader import my_cws_corpus
from msr import msr_model
from test_utility import test_data_path
NatureDictionaryMaker = SafeJClass('com.hankcs.hanlp.corpus.dictionary.NatureDictionaryMaker')
CorpusLoader = SafeJClass('com.hankcs.hanlp.corpus.document.CorpusLoader')
WordNet = JClass('com.hankcs.hanlp.seg.common.WordNet')
Vertex = JClass('com.hankcs.hanlp.seg.common.Vertex')
ViterbiSegment = JClass('com.hankcs.hanlp.seg.Viterbi.ViterbiSegment')
DijkstraSegment = JClass('com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment')
CoreDictionary = LazyLoadingJClass('com.hankcs.hanlp.dictionary.CoreDictionary')
Nature = JClass('com.hankcs.hanlp.corpus.tag.Nature')
def train_bigram(corpus_path, model_path):
    sents = CorpusLoader.convert2SentenceList(corpus_path)
    for sent in sents:
        for word in sent:
            if word.label is None:
                word.setLabel("n")  # give unlabeled words a placeholder POS tag
    maker = NatureDictionaryMaker()
    maker.compute(sents)
    maker.saveTxtTo(model_path)  # e.g. tests/data/my_cws_model.txt
def load_bigram(model_path, verbose=True, ret_viterbi=True):
    HanLP.Config.CoreDictionaryPath = model_path + ".txt"  # unigram
    HanLP.Config.BiGramDictionaryPath = model_path + ".ngram.txt"  # bigram
    # The following block only keeps the new tag set compatible; skip it if not interested
    HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath = model_path + ".tr.txt"  # POS transition matrix; irrelevant for segmentation
    if model_path != msr_model:
        with open(HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath) as src:
            for tag in src.readline().strip().split(',')[1:]:
                Nature.create(tag)
    CoreBiGramTableDictionary = SafeJClass('com.hankcs.hanlp.dictionary.CoreBiGramTableDictionary')
    CoreDictionary.getTermFrequency("商品")  # touch the dictionary to trigger lazy loading
    # End of compatibility block
    if verbose:
        print(CoreDictionary.getTermFrequency("商品"))
        print(CoreBiGramTableDictionary.getBiFrequency("商品", "和"))
        sent = '商品和服务'
        # sent = '货币和服务'
        wordnet = generate_wordnet(sent, CoreDictionary.trie)
        print(wordnet)
        print(viterbi(wordnet))
    segmenter = ViterbiSegment() if ret_viterbi else DijkstraSegment()
    return segmenter.enableAllNamedEntityRecognize(False).enableCustomDictionary(False)
def generate_wordnet(sent, trie):
    """
    Build the word net for a sentence.
    :param sent: the sentence
    :param trie: the (unigram) dictionary trie
    :return: the word net
    """
    searcher = trie.getSearcher(JString(sent), 0)
    wordnet = WordNet(sent)
    while searcher.next():
        wordnet.add(searcher.begin + 1,
                    Vertex(sent[searcher.begin:searcher.begin + searcher.length], searcher.value, searcher.index))
    # Atomic segmentation: fill the gaps so the graph stays connected
    vertexes = wordnet.getVertexes()
    i = 0
    while i < len(vertexes):
        if len(vertexes[i]) == 0:  # blank row
            j = i + 1
            for j in range(i + 1, len(vertexes) - 1):  # find the first non-blank row j
                if len(vertexes[j]):
                    break
            wordnet.add(i, Vertex.newPunctuationInstance(sent[i - 1: j - 1]))  # fill the blank rows in [i, j)
            i = j
        else:
            i += len(vertexes[i][-1].realWord)
    return wordnet
# Viterbi-style dynamic-programming shortest-path search over the word net
def viterbi(wordnet):
    nodes = wordnet.getVertexes()
    # Forward pass
    for i in range(0, len(nodes) - 1):
        for node in nodes[i]:
            for to in nodes[i + len(node.realWord)]:
                to.updateFrom(node)  # update the node's distance and its predecessor pointer `from`
    # Backward pass
    path = []  # the shortest path
    f = nodes[len(nodes) - 1].getFirst()  # trace back from the end node
    while f:
        path.insert(0, f)
        f = f.getFrom()  # follow the predecessor pointer `from`
    return [v.realWord for v in path]
if __name__ == '__main__':
    corpus_path = my_cws_corpus()
    model_path = os.path.join(test_data_path(), 'my_cws_model')
    train_bigram(corpus_path, model_path)
    load_bigram(model_path)
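The Segment returned by load_bigram can be used directly; a minimal usage sketch, appended to the __main__ block above:
# Sketch: segment a sentence with the freshly trained bigram model.
segment = load_bigram(model_path, verbose=False)
print(segment.seg('商品和服务'))  # with the toy corpus this should recover 商品 / 和 / 服务
Both ViterbiSegment and DijkstraSegment search the same word net for the lowest-cost path; they differ only in the shortest-path algorithm used.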
3. Standardized evaluation (eval_bigram_cws.py)
# - Standardized evaluation -
from pyhanlp import *
from msr import msr_dict, msr_train, msr_model, msr_test, msr_output, msr_gold
from ngram_segment import train_bigram, load_bigram
CWSEvaluator = SafeJClass('com.hankcs.hanlp.seg.common.CWSEvaluator')
if __name__ == '__main__':
    train_bigram(msr_train, msr_model)  # train
    segment = load_bigram(msr_model)  # load
    result = CWSEvaluator.evaluate(segment, msr_test, msr_output, msr_gold, msr_dict)  # predict and score
    print(result)
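CWSEvaluator reports precision (P), recall (R), and F1 over words. As a simplified pure-Python illustration of the metric itself (not HanLP's implementation), two segmentations of the same sentence can be compared as sets of character spans:
# Sketch (illustrative only): score one predicted segmentation against the gold one.
def to_regions(words):
    # Convert a word sequence into a set of (start, end) character spans.
    regions, start = set(), 0
    for w in words:
        regions.add((start, start + len(w)))
        start += len(w)
    return regions

gold = to_regions(['商品', '和', '服务'])
pred = to_regions(['商品', '和服', '务'])
tp = len(gold & pred)                  # words recovered exactly
p, r = tp / len(pred), tp / len(gold)  # precision, recall
f1 = 2 * p * r / (p + r)
print('P=%.2f R=%.2f F1=%.2f' % (p, r, f1))  # P=0.33 R=0.33 F1=0.33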
4. The Microsoft Research Asia (MSR) corpus (msr.py)
# - Microsoft Research Asia (MSR) corpus paths -
import os
from test_utility import ensure_data, test_data_path
sighan05 = ensure_data('icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')
msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8')
msr_train = os.path.join(sighan05, 'training', 'msr_training.utf8')
msr_model = os.path.join(test_data_path(), 'msr_cws')
msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8')
msr_output = os.path.join(sighan05, 'testing', 'msr_bigram_output.txt')
msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')
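msr.py only resolves file paths; on first import, ensure_data (section 5) downloads and extracts the SIGHAN bakeoff 2005 archive. An optional sanity check that the files are in place:
# Sketch: confirm the MSR corpus files exist after the download.
import os
from msr import msr_dict, msr_train, msr_test, msr_gold

for path in (msr_dict, msr_train, msr_test, msr_gold):
    print(os.path.basename(path), os.path.isfile(path))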
5. Downloading and staging corpus files (test_utility.py)
# - Download and stage corpus files -
import zipfile
import os
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH
def test_data_path():
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    if not os.path.isdir(data_path):
        os.mkdir(data_path)
    return data_path
def ensure_data(data_name, data_url):
    root_path = test_data_path()
    dest_path = os.path.join(root_path, data_name)
    if os.path.exists(dest_path):
        return dest_path
    if data_url.endswith('.zip'):
        dest_path += '.zip'
    download(data_url, dest_path)
    if data_url.endswith('.zip'):
        with zipfile.ZipFile(dest_path, "r") as archive:
            archive.extractall(root_path)
        remove_file(dest_path)
        dest_path = dest_path[:-len('.zip')]
    return dest_path
6. Loading the corpus (demo_corpus_loader.py)
# - Load the corpus -
from pyhanlp import *
from test_utility import test_data_path
CorpusLoader = SafeJClass('com.hankcs.hanlp.corpus.document.CorpusLoader')
def my_cws_corpus():
    data_root = test_data_path()
    corpus_path = os.path.join(data_root, 'my_cws_corpus.txt')
    if not os.path.isfile(corpus_path):
        with open(corpus_path, 'w') as out:
            out.write('''商品 和 服务
商品 和服 物美价廉
服务 和 货币''')
    return corpus_path
def load_cws_corpus(corpus_path):
    return CorpusLoader.convert2SentenceList(corpus_path)

if __name__ == '__main__':
    corpus_path = my_cws_corpus()
    sents = load_cws_corpus(corpus_path)
    for sent in sents:
        print(sent)
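Printing a Sentence shows its words; individual words can also be inspected through HanLP's IWord accessors. A small sketch, appended to demo_corpus_loader.py:
# Sketch: drill into individual words of the loaded corpus.
for sent in load_cws_corpus(my_cws_corpus()):
    for word in sent:
        print(word.getValue(), word.getLabel())  # surface form; the label is empty here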
Remarks
When a string literal makes a line of code too long, it can be split across editor lines without changing its value, e.g. via Python's implicit concatenation of adjacent string literals (sketched below); note that pressing Enter inside a triple-quoted string inserts a real newline instead.
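A minimal sketch of that technique:
# Adjacent string literals separated only by whitespace or a newline are
# concatenated at compile time, so the source wraps but the value does not.
sent = ('商品和服务,'
        '商品和货币')
print(sent)  # one line: 商品和服务,商品和货币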
The complete source code can be downloaded from the 图灵社区 (Turing Community) companion page for the book cited below; save each snippet above as its corresponding .py file.
References
《自然语言处理入门》 (Introduction to Natural Language Processing), 何晗 (@hankcs)