Spell Correction System
Spell correction
vocab = set([line.rstrip() for line in open('/content/drive/My Drive/data/vocab_data/vocab.txt')])
First we need to generate the set of all candidate corrections.
def generate_candidates(word):
    """
    word: the given input (a misspelled word)
    Returns all valid candidate corrections.
    """
    # Generate all words at edit distance 1 using three operations:
    # 1. insert  2. delete  3. replace
    # e.g. for "appl" -- replace: bppl, cppl, ..., abpl, ...
    #                    insert:  bappl, cappl, abppl, acppl, ...
    #                    delete:  ppl, apl, app, ...
    # Assume an alphabet of the 26 lowercase letters.
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # insert operation
    inserts = [L + c + R for L, R in splits for c in letters]
    # delete operation
    deletes = [L + R[1:] for L, R in splits if R]
    # replace operation
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    # All generated candidate words
    candidates = set(inserts + deletes + replaces)
    # Filter out words that do not exist in the vocabulary
    return [word for word in candidates if word in vocab]
generate_candidates("apple")
Reading the corpus
import nltk
nltk.download('reuters')
nltk.download('punkt')
from nltk.corpus import reuters
categories = reuters.categories()
corpus = reuters.sents(categories=categories)
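A quick sanity check of what the corpus looks like (reuters.sents returns each sentence as a list of tokens):

print(len(corpus))    # number of sentences in the corpus
print(corpus[0][:8])  # the first few tokens of the first sentence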
Building the language model: bigrams
term_count = {}
bigram_count = {}
for doc in corpus:
    doc = ['<s>'] + doc
    # bigram: tokens [i, i+1]
    for i in range(0, len(doc) - 1):
        term = doc[i]
        bigram = doc[i:i + 2]
        if term in term_count:
            term_count[term] += 1
        else:
            term_count[term] = 1
        # Join with a space so e.g. ('ab', 'c') and ('a', 'bc') map to different keys
        bigram = ' '.join(bigram)
        if bigram in bigram_count:
            bigram_count[bigram] += 1
        else:
            bigram_count[bigram] = 1
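The scoring step later uses add-one (Laplace) smoothing over these counts. As a sketch (the helper name is my own, not from the original post), the smoothed bigram probability can be read off the two count tables like this:

def bigram_prob(prev_word, cur_word):
    # Add-one smoothed P(cur | prev) = (count(prev cur) + 1) / (count(prev) + V)
    V = len(term_count)
    bigram = prev_word + ' ' + cur_word
    return (bigram_count.get(bigram, 0) + 1.0) / (term_count.get(prev_word, 0) + V)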
The channel model: the probability of each user typo
channel_prob = {}
for line in open('/content/drive/My Drive/data/vocab_data/spell-errors.txt'):
    items = line.split(":")
    correct = items[0].strip()
    mistakes = [item.strip() for item in items[1].strip().split(",")]
    channel_prob[correct] = {}
    for mis in mistakes:
        # Assume each observed mistake for a word is equally likely
        channel_prob[correct][mis] = 1.0 / len(mistakes)
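For illustration, if spell-errors.txt contained a line such as "raining: rainning, raning" (a hypothetical entry; the real file's contents may differ), the channel model would assign each listed mistake equal probability:

# With the hypothetical entry above, this would print {'rainning': 0.5, 'raning': 0.5}
print(channel_prob.get('raining'))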
import numpy as np

V = len(term_count.keys())

file = open('/content/drive/My Drive/data/vocab_data/testdata.txt')
for line in file:
    items = line.rstrip().split('\t')
    line = items[2].split()
    # e.g. ["I", "loke", "palying"]
    for word in line:
        if word not in vocab:
            # We need to replace word with the correct word.
            # Step 1: generate the set of all valid candidates
            candidates = generate_candidates(word=word)
            if len(candidates) < 1:
                continue  # not recommended (this is wrong); see the fallback sketch after this loop
            # TODO: generate a larger candidate set (e.g. edit distance 2)
            probs = []
            # For each candidate, compute its score:
            #   score = p(correct) * p(mistake | correct)
            #   log score = log p(mistake | correct) + log p(correct)
            # Return the candidate with the highest score.
            # Prepend the <s> start symbol, matching the language model
            tokens = ['<s>'] + line
            idx = tokens.index(word)
            for candi in candidates:
                prob = 0.0
                # a. the channel probability
                if candi in channel_prob and word in channel_prob[candi]:
                    prob += np.log(channel_prob[candi][word])
                else:
                    prob += np.log(0.0001)
                # b. the language-model probability (add-one smoothing)
                bigram = tokens[idx - 1] + ' ' + candi
                if tokens[idx - 1] in term_count and bigram in bigram_count:
                    prob += np.log((bigram_count[bigram] + 1.0) /
                                   (term_count[tokens[idx - 1]] + V))
                else:
                    prob += np.log(1.0 / V)
                probs.append(prob)
            max_idx = probs.index(max(probs))
            print(word, candidates[max_idx])
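When the distance-1 candidate set is empty, one common fallback (a sketch under my own assumptions, not part of the original post) is to widen the search to edit distance 2 by applying the distance-1 edits twice before filtering against the vocabulary:

def edits1(word):
    # All strings at edit distance 1, without vocabulary filtering
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    inserts = [L + c + R for L, R in splits for c in letters]
    deletes = [L + R[1:] for L, R in splits if R]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    return set(inserts + deletes + replaces)

def generate_candidates_dist2(word):
    # Apply the distance-1 edits twice, then keep only in-vocabulary words
    return [w for e1 in edits1(word) for w in edits1(e1) if w in vocab]

In the loop above, the bare continue could then be replaced by trying candidates = generate_candidates_dist2(word) before skipping the word.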
To be continued...