Post-processing LTP segmentation with a user dictionary: forced segmentation

Problem: when using the LTP segmentation module, some words from the user dictionary still get split apart, so the output does not follow the dictionary. How can we force it to?

  • The approach below builds on https://blog.csdn.net/KanShiMeKan/article/details/80974977 and improves it, mainly by using regular expressions to post-process the LTP output; a minimal sketch of the idea follows.
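A minimal sketch of the core mechanism, using made-up lexicon entries and a made-up sentence: each dictionary entry is compiled as a regular expression and searched for in the raw sentence, and every hit is a phrase the final segmentation must keep intact.

import re

# Made-up lexicon entries and sentence, for illustration only.
lexicon_entries = ["外高桥保税区", "自由贸易试验区"]
sentence = "外高桥保税区是自由贸易试验区的一部分"

patterns = [re.compile(entry) for entry in lexicon_entries]
hits = []
for pattern in patterns:
    match = pattern.search(sentence)
    if match:
        hits.append(match.group())
print(hits)  # ['外高桥保税区', '自由贸易试验区']

The full implementation: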
import os
import re
from pyltp import Segmentor


class ForceSegmentor(object):
    def __init__(self):
        self.forcelist = []  # lexicon entries with newlines stripped (not used below)

    def find_in_dict(self, filepath, sentence):
        """
        Load the dictionary and find which entries occur in the sentence.
        :param filepath: path to the dictionary file (one entry per line)
        :param sentence: the raw text to be segmented
        :return: list of dictionary words found in the sentence
        """
        compilelist = []
        result_re = []
        with open(filepath, 'r', encoding="utf-8") as file:
            for line in file:  # read the dictionary line by line
                line = line.replace('\n', '')
                if line:  # skip empty lines: an empty pattern would match everything
                    compilelist.append(re.compile(line))
        de_sentence = sentence
        for compilestr in compilelist:  # try each entry against the whole sentence
            result = compilestr.search(de_sentence)
            if result:
                # the sentence contains this dictionary word
                result_re.append(result.group())
        return result_re

    def merge(self, sentence, filepath, words):
        """
        Main entry point for forced segmentation.
        :param sentence: the raw sentence
        :param filepath: path to the dictionary file
        :param words: the original LTP segmentation result, as a list
        :return: the new segmentation result
        """
        found = self.find_in_dict(filepath, sentence)  # dictionary words present in the sentence
        result = []
        indexs_starts = []
        indexs_distance = []
        found_words = []
        for found_word in found:  # handle each matched dictionary word in turn
            if not found_word:
                continue
            indexs_start = []
            # number of LTP tokens the merged word spans
            index_distance = 0
            index_start = -1
            strm = ''
            for i, word in enumerate(words):
                wl = len(word)  # length of the current token
                if index_start == -1 and word == found_word[0:wl]:
                    # this token is a prefix of the dictionary word: start matching
                    index_start = i
                    strm += word
                elif index_start != -1:
                    strm += word
                    if strm == found_word:
                        # full match: remember where it starts and how many tokens it covers
                        indexs_start.append(index_start)
                        index_distance = i - index_start + 1
                        index_start = -1
                        strm = ''
                    elif strm not in found_word:
                        # the accumulated tokens diverged from the dictionary word: restart
                        index_start = -1
                        strm = ''
            if indexs_start:  # record only entries that actually matched a token span
                indexs_starts.append(indexs_start[0])
                indexs_distance.append(index_distance)
                found_words.append(found_word)

        j = 0
        while j < len(words):
            word = words[j]
            if j in indexs_starts:  # this token starts a span that must be merged
                num = indexs_starts.index(j)
                result.append(found_words[num])  # replace the whole span with the dictionary word
                j += indexs_distance[num]
            else:
                result.append(word)  # keep the original LTP token
                j += 1
        return result
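
A quick sanity check of merge() that needs no LTP model; the dictionary entry, sentence, and token list are made up, and the snippet is kept in comments so the script body stays unchanged:

# import tempfile
# with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False,
#                                  encoding='utf-8') as f:
#     f.write("外高桥保税区\n")
# fs = ForceSegmentor()
# print(fs.merge("外高桥保税区位于上海", f.name, ["外高桥", "保税区", "位于", "上海"]))
# expected output: ['外高桥保税区', '位于', '上海']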



def run(text):
    """
    :param text: the text to be segmented
    :return: None (prints the result)
    """
    LTP_DIR = r"D:\ltp_data_v3.4.0"  # path to the LTP model directory
    segmentor = Segmentor()  # create the segmenter
    segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"),
                                os.path.join(LTP_DIR, "lexicon.txt"))  # load the model together with an external lexicon
    segmentor_res = segmentor.segment(text)
    words = list(segmentor_res)  # the default LTP segmentation result
    segmentor.release()  # free the model
    forceSegmentor = ForceSegmentor()
    filepath = r"C:\Users\lenovo\PycharmProjects\pythonProjectketi\address"  # path to the forced-segmentation dictionary
    words1 = forceSegmentor.merge(text, filepath, words)  # result after forced segmentation
    print(words1)

if __name__ == '__main__':
    sentence = input("Enter the text to segment: ")
    run(sentence)
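
One caveat: find_in_dict compiles each dictionary line with re.compile, so entries are treated as regular expression patterns. That enables pattern-style entries, but a literal entry containing metacharacters would silently match the wrong thing. A minimal sketch, assuming entries should match literally (the entry string is hypothetical):

import re

entry = "IBM(中国)"  # hypothetical literal entry containing regex metacharacters
print(bool(re.compile(entry).search("IBM(中国)公司")))             # False: the parentheses form a group
print(bool(re.compile(re.escape(entry)).search("IBM(中国)公司")))  # True: parentheses matched literally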
    



