随笔都是学习笔记
随笔仅供参考,为避免笔记中可能出现的错误误导他人,请勿转载。
posts - 398,comments - 0,views - 13万
复制代码
from hmm_wb.prob_start import P as p_start
from hmm_wb.prob_trans import P as p_trans
from hmm_wb.prob_emit import P as p_emit


# Viterbi algorithm
def vtb(n, o, s, ps, pe, pt):
    """Find, for every final state, the best-scoring state sequence for ``o[0..n]``.

    Scores are combined by addition, so ``ps``, ``pe`` and ``pt`` are
    presumably log-probabilities (confirm against the model tables).

    The computation is iterative and keeps one back-pointer per state per
    step, so long observation sequences neither hit Python's recursion limit
    nor copy a full path list at every step.

    Args:
        n: Index of the last observation to consume; pass ``len(o) - 1`` to
            decode the whole sequence.
        o: Observation sequence, indexed by position.
        s: Iterable of hidden states.
        ps: Start scores, ``ps[state]``.
        pe: Emission scores, ``pe[state][observation]``.
        pt: Transition scores, ``pt[previous_state][state]``.

    Returns:
        A ``(ret, path)`` tuple. ``ret[x]`` is the best score of any state
        sequence for ``o[0..n]`` that ends in state ``x``; ``path[x]`` is that
        sequence as a list of states.

    Raises:
        ValueError: If ``n`` is negative (there is nothing to decode).
        KeyError: If a state or observation is missing from ``ps``, ``pe``
            or ``pt``.
    """
    if n < 0:
        raise ValueError("n must be a non-negative observation index")

    states = tuple(s)

    # Step 0: start score plus emission of the first observation.
    score = {x: ps[x] + pe[x][o[0]] for x in states}

    # back[t - 1][x] is the best predecessor of state x at step t.
    back = []
    for t in range(1, n + 1):
        new_score = {}
        pointers = {}
        for x in states:
            # Tuples compare by score first; equal scores fall back to
            # comparing the predecessor state itself.
            new_score[x], pointers[x] = max(
                (score[lx] + pt[lx][x] + pe[x][o[t]], lx) for lx in states
            )
        score = new_score
        back.append(pointers)

    # Rebuild each path by walking the back-pointers from the last step.
    path = {}
    for x in states:
        seq = [x]
        for pointers in reversed(back):
            seq.append(pointers[seq[-1]])
        seq.reverse()
        path[x] = seq
    return score, path  # best scores and the matching paths


# Brute-force hidden Markov model evaluation
def hmm(n, o, s, ps, pe, pt):
    """Score every possible hidden-state sequence for observations ``o[0..n]``.

    Values are combined by multiplication: start value, then one transition
    and one emission value per further observation.

    Args:
        n: Index of the last observation to consume.
        o: Observation sequence, indexed by position.
        s: Iterable of hidden states (strings; the last state of a sequence
            is recovered by splitting its key on "-").
        ps: Start values, ``ps[state]``.
        pe: Emission values, ``pe[state][observation]``.
        pt: Transition values, ``pt[previous_state][state]``.

    Returns:
        A dict mapping each state sequence, written as its states joined
        with "-", to the product of its values. Every recursion level
        multiplies the number of entries by the number of states.
    """
    if n == 0:
        return {x: ps[x] * pe[x][o[n]] for x in s}

    # Extend every sequence for o[0..n-1] by each possible next state.
    shorter = hmm(n - 1, o, s, ps, pe, pt)
    extended = {}
    for prefix, prob in shorter.items():
        last = prefix.rpartition("-")[2]
        for x in s:
            extended[prefix + "-" + x] = prob * pt[last][x] * pe[x][o[n]]
    return extended


def fenci(path, obs):
    """Cut ``obs`` into words according to a B/M/E/S tag sequence.

    Tags are paired with characters position by position:

    * ``'B'`` starts a new multi-character word,
    * ``'M'`` continues the current word,
    * ``'E'`` ends the current word and emits it,
    * ``'S'`` emits the character as a single-character word.

    Characters of a word that is never closed by ``'E'`` are not dropped:
    the unfinished word is emitted as soon as the next word starts
    (``'B'`` or ``'S'``) or when the input ends, so words always come out in
    text order. Positions carrying any other tag are ignored.

    Args:
        path: Iterable of tags, one per character of ``obs``.
        obs: The text (or sequence of string pieces) being segmented. If
            ``path`` and ``obs`` differ in length, the extra items of the
            longer one are ignored.

    Returns:
        The list of words in the order they appear in ``obs``.
    """
    words = []
    pending = []  # characters of the word currently being built

    for tag, char in zip(path, obs):
        if tag == 'B':
            if pending:
                # Previous word never saw its 'E'; keep its text.
                words.append("".join(pending))
            pending = [char]
        elif tag == 'M':
            pending.append(char)
        elif tag == 'E':
            pending.append(char)
            words.append("".join(pending))
            pending = []
        elif tag == 'S':
            if pending:
                words.append("".join(pending))
                pending = []
            words.append(char)

    if pending:
        # Tag sequence ended in the middle of a word.
        words.append("".join(pending))
    return words


def get_stop(path="data/stopword.txt", encoding="gbk"):
    """Load a stop-word list from a text file, one entry per line.

    Args:
        path: Location of the stop-word file. Defaults to
            ``"data/stopword.txt"``, resolved against the current working
            directory.
        encoding: Text encoding of the file. Defaults to ``"gbk"``.

    Returns:
        A list with one string per line of the file, in file order, with
        leading and trailing whitespace stripped. Blank lines yield empty
        strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(path, 'r', encoding=encoding) as f:
        # Iterate the file directly; no need to materialise readlines() first.
        return [line.strip() for line in f]

# obs: list of segmented words; stop: collection of stop words
def rm_stop(obs, stop):
    """Return the items of ``obs`` that are not in ``stop``.

    Args:
        obs: Iterable of words produced by segmentation.
        stop: Container of stop words; membership is tested with ``in``.

    Returns:
        A new list holding the kept words in their original order.
    """
    return [word for word in obs if word not in stop]


if __name__ == '__main__':
    # Demo: segment one sentence with the B/M/E/S model and drop stop words.
    stats = "BMES"
    obs = "今天我来到北京清华大学"

    # Best score and best state sequence for every possible final state.
    r, mpath = vtb(len(obs) - 1, obs, stats, p_start, p_emit, p_trans)
    # Pick the final state with the highest score; comparing (score, state)
    # tuples means equal scores are resolved by the state name.
    _, mk = max(zip(r.values(), r.keys()))
    path = mpath[mk]  # overall best state sequence

    fc = fenci(path, obs)
    stp = get_stop()  # load the stop-word list
    # Named ``words`` rather than ``str`` so the builtin is not rebound.
    words = rm_stop(fc, stp)  # remove stop words
    print(r)
    print(path)
    print(words)
复制代码

运行结果:

1
2
3
{'B': -69.1432668906028, 'M': -68.42252839082724, 'E': -68.04528930719714, 'S': -71.23304310663175}
['B', 'E', 'S', 'S', 'S', 'B', 'E', 'B', 'M', 'M', 'E']
['北京', '清华大学']

  

 
posted on   时间完全不够用啊  阅读(63)  评论(0)  编辑  收藏  举报
(评论功能已被禁用)
相关博文:
阅读排行:
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
· 上周热点回顾(2.24-3.2)
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5

点击右上角即可分享
微信分享提示