正向最大匹配算法

# -*- coding: utf-8 -*-
# MM (forward maximum matching)
# Chinese word segmentation implemented with the forward maximum matching algorithm.
# Dictionary words: appended to by init() and scanned by if_contain().
dic = []
# Longest prefix length, in characters, that spl() tries against the dictionary.
MAX_LENGTH = 5
 
def init():
    """
    Load the segmentation dictionary from ``test.txt`` into ``dic``.

    For every line, only the text before the first comma is kept. Surrounding
    whitespace (including the line terminator) is stripped, so a line that
    has no comma is not stored with a trailing newline that could never match
    a candidate word. Lines that yield an empty word are skipped.

    NOTE(review): the file is read as UTF-8 (a leading BOM is tolerated);
    this assumes ``test.txt`` is saved in that encoding -- confirm.

    :return: None; words are appended to the module-level ``dic``.
    :raises OSError: if ``test.txt`` cannot be opened.
    :raises UnicodeDecodeError: if ``test.txt`` is not valid UTF-8.
    """
    # ``with`` guarantees the file is closed, and the handle is not called
    # ``input`` so the builtin of that name is not shadowed.
    with open("test.txt", encoding="utf-8-sig") as dict_file:
        for line in dict_file:
            word = line.split(',')[0].strip()
            if word:
                dic.append(word)
 
def if_contain(words):
    """
    Report whether a candidate word is present in the dictionary.

    :param words: candidate string to look up in ``dic``.
    :return: True if some entry of ``dic`` equals ``words``, otherwise False.
    """
    # The membership operator performs the same equality scan as an explicit
    # loop with a flag, and stops at the first match.
    return words in dic

def spl(sentence):
    """
    Segment a sentence with the forward maximum matching algorithm.

    Working from the front of the remaining text, the longest prefix (at most
    ``MAX_LENGTH`` characters) is tried first and shortened one character at
    a time until ``if_contain`` accepts it. The accepted prefix becomes the
    next word and scanning resumes right after it. When no prefix is in the
    dictionary, the single leading character is emitted as a word and
    scanning continues, so one unknown character does not prevent the rest
    of the sentence from being segmented.

    :param sentence: text to segment.
    :return: the words in order, each followed by ``'/'`` (a non-empty result
        therefore ends with ``'/'``); ``''`` for empty input.
    """
    words = []

    while sentence:
        # Never request a prefix longer than the text that is left; otherwise
        # the same short slice would be looked up several times.
        longest = min(MAX_LENGTH, len(sentence))
        for size in range(longest, 0, -1):
            candidate = sentence[:size]
            if if_contain(candidate):
                break
        else:
            # No dictionary word starts at this position: fall back to a
            # one-character word so the loop always makes progress.
            candidate = sentence[0]
        words.append(candidate)
        sentence = sentence[len(candidate):]

    return ''.join(word + '/' for word in words)

def main():
    """
    Interactive entry point: load the dictionary, then segment typed lines.

    Calls ``init()`` once, then repeatedly prompts with ``>``, passes the
    entered text to ``spl`` and prints the segmentation. The loop ends when
    an empty line is entered or when standard input is exhausted.

    :return: None
    """
    init()
    while True:
        try:
            input_str = input(">")
        except EOFError:
            # input() raises EOFError when stdin ends (Ctrl-D, or piped input
            # running out); treat that like an empty line instead of crashing.
            break
        if not input_str:
            break
        result = spl(input_str)
        print("分词结果为:")
        print(result)
 
 
# Start the interactive loop only when run as a script, not when imported.
if __name__ == "__main__":
    main()

 

posted @ 2021-05-19 05:47  祈欢  阅读(491)  评论(0)  编辑  收藏  举报