Dictionary post-processing for LTP word segmentation: forced segmentation
Problem: when using the LTP segmentation module, some words that are in the dictionary still get split apart, so the output tokens do not follow the dictionary.
- The approach here builds on https://blog.csdn.net/KanShiMeKan/article/details/80974977 and improves it, mainly by using regular expressions to find the dictionary words in the sentence and then merging the corresponding tokens after LTP has done its segmentation.
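For reference, the dictionary file used below is assumed to be a plain UTF-8 text file with one entry per line; each line is later compiled as a regular expression. A hypothetical example (both entries are made up for illustration):

冀州市
中山路12号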
import os
import re

from pyltp import Segmentor


class ForceSegmentor(object):
    def __init__(self):
        self.forcelist = []  # dictionary entries with newline characters stripped
    def find_in_dict(self, filepath, sentence):
        """
        Collect the dictionary words that appear in the sentence.
        :param filepath: path of the dictionary file
        :param sentence: text to be segmented
        :return: list of dictionary words found in the sentence
        """
        compilelist = []
        result_re = []
        with open(filepath, 'r', encoding="utf-8") as file:
            for line in file:  # read the dictionary line by line
                line = line.replace('\n', '')
                if line:  # skip empty lines
                    compilelist.append(re.compile(line))
        de_sentence = sentence
        for compilestr in compilelist:  # try each dictionary pattern against the whole sentence
            result = compilestr.search(de_sentence)
            if result:
                # the sentence contains this dictionary word
                result_re.append(result.group())
        return result_re
    def merge(self, sentence, filepath, words):
        """
        Main routine: force dictionary words back into single tokens.
        :param sentence: sentence being processed
        :param filepath: path of the dictionary file
        :param words: raw LTP segmentation result, as a list
        :return: new segmentation result
        """
        found = self.find_in_dict(filepath, sentence)  # dictionary words that occur in the sentence
        result = []
        indexs_starts = []    # start index (in words) of each matched dictionary word
        indexs_distance = []  # number of tokens each matched dictionary word spans
        found_words = []
        for found_word in found:
            if found_word:  # handle the dictionary words one at a time
                indexs_start = []
                # number of tokens between the first and last merged token
                index_distance = 0
                index_start = -1
                strm = ''
                for i, word in enumerate(words):
                    wl = len(word)  # length of the current token
                    if index_start == -1 and word == found_word[0:wl]:
                        # the token matches the beginning of the dictionary word
                        index_start = i
                        strm += word
                    elif index_start != -1:
                        strm += word
                        if strm == found_word:
                            # the concatenated tokens now equal the dictionary word
                            indexs_start.append(index_start)
                            index_distance = i - index_start + 1
                            index_start = -1
                            strm = ''
                        elif strm not in found_word:
                            # the concatenation has diverged from the dictionary word; restart matching
                            index_start = -1
                            strm = ''
                if indexs_start:  # record the word only if it was actually assembled from the tokens
                    indexs_starts.append(indexs_start[0])
                    indexs_distance.append(index_distance)
                    found_words.append(found_word)
        j = 0
        while j < len(words):
            word = words[j]  # walk through the original tokens
            if j in indexs_starts:
                # j is the start of a matched dictionary word: emit the whole word
                num = indexs_starts.index(j)
                result.append(found_words[num])
                j += indexs_distance[num]  # skip the tokens the dictionary word covers
            else:
                result.append(word)
                j += 1
        return result
def run(text):
    """
    :param text: text to be segmented
    :return:
    """
    LTP_DIR = r"D:\ltp_data_v3.4.0"  # path of the LTP model directory
    segmentor = Segmentor()  # start the segmentation module
    # load the model together with an external lexicon
    segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"), os.path.join(LTP_DIR, "lexicon.txt"))
    segmentor_res = segmentor.segment(text)
    words = list(segmentor_res)  # default LTP segmentation result
    forceSegmentor = ForceSegmentor()
    filepath = r"C:\Users\lenovo\PycharmProjects\pythonProjectketi\address"  # path of the forced-segmentation dictionary
    words1 = forceSegmentor.merge(text, filepath, words)  # result after forced segmentation
    print(words1)
if __name__ == '__main__':
    sentence = input("请输入要分词的内容:")
    run(sentence)
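As a rough sketch of the post-processing step on its own (the token list and the dictionary content below are hypothetical: assume the file at filepath contains only the entry 冀州市 and that the raw LTP output splits it across two tokens):

forceSegmentor = ForceSegmentor()
words = ['他', '来自', '冀州', '市']  # hypothetical raw LTP segmentation
print(forceSegmentor.merge('他来自冀州市', filepath, words))
# expected output: ['他', '来自', '冀州市']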