用马尔科夫模型做拼写检查
原理和原文见 Peter Norvig的这篇文章
原文是基于词频,在文后提到可以通过上下文来提高准确率
下面这段代码只处理了待纠正词位于序列末尾的情况;完整的实现还应当考虑它位于序列中间和序列开头的情况。
import re, collections, sys, random
def words(text):
    """Return every maximal run of ASCII letters in ``text``, lower-cased.

    Anything that is not a-z after lower-casing (digits, punctuation,
    whitespace) acts as a separator and is dropped.
    """
    lowered = text.lower()
    return re.findall('[a-z]+', lowered)
def defaultdict_factoryn(n, default):
    """Build a zero-argument factory for an ``n``-level nested defaultdict.

    For ``n == 1`` the factory simply returns ``default`` (the leaf value).
    For any other ``n`` it returns a fresh ``defaultdict`` whose own
    default factory is the factory for ``n - 1`` levels.
    """
    if n != 1:
        def make_level():
            # Inner factory is built lazily, each time a level is created.
            return collections.defaultdict(defaultdict_factoryn(n - 1, default))
        return make_level

    def make_leaf():
        return default
    return make_leaf
def multidict_set(d, l, v):
    """Store ``v`` at the nested key path ``l`` inside ``d``.

    Every key except the last is used to descend one level
    (``d[k0][k1]...``); the last key is assigned ``v``. An empty path
    leaves ``d`` untouched.

    ``l`` must support ``len()`` and iteration (list, tuple, deque).
    Intermediate levels are reached by plain indexing, so a missing key
    raises ``KeyError`` unless that level is a ``defaultdict``.
    """
    depth = len(l)
    node = d
    for level, key in enumerate(l, 1):
        if level == depth:
            node[key] = v
        else:
            node = node[key]
def multidict_add(d, l, v):
    """Add ``v`` in place to the value at the nested key path ``l`` in ``d``.

    Every key except the last is used to descend one level; the value
    under the last key is updated with ``+=``. An empty path leaves ``d``
    untouched.

    ``l`` must support ``len()`` and iteration (list, tuple, deque).
    Levels are reached by plain indexing, so a missing key raises
    ``KeyError`` unless that level is a ``defaultdict``.
    """
    depth = len(l)
    node = d
    for level, key in enumerate(l, 1):
        if level == depth:
            node[key] += v
        else:
            node = node[key]
def multidict_get(d, l):
    """Return the value found by following the key path ``l`` down from ``d``.

    Equivalent to ``d[l[0]][l[1]]...``; an empty path returns ``d`` itself.
    Lookups use plain indexing, so on ``defaultdict`` levels a missing key
    is created with its default value as a side effect.
    """
    node = d
    for key in l:
        node = node[key]
    return node
def train(features, n):
    """Count every run of ``n`` consecutive items in ``features``.

    Returns an ``n``-level nested defaultdict in which
    ``model[w1][w2]...[wn]`` is one plus the number of times that sequence
    occurred; leaves start at 1, so an unseen sequence reads as 1.

    The window is counted as soon as it holds ``n`` items, so the final
    n-gram of the input is included (counting it only when the *next*
    item arrived used to drop it).
    """
    model = collections.defaultdict(defaultdict_factoryn(n, 1))
    # maxlen makes append() discard the oldest item once the window is full.
    window = collections.deque(maxlen=n)
    for f in features:
        window.append(f)
        if len(window) == n:
            multidict_add(model, window, 1)
    return model
def most_likely(prev):
    """Pick a random continuation of ``prev`` from among the most frequent.

    Looks up the sub-table stored in the global ``Model`` under the key
    path ``prev``, ranks its entries by descending count, and returns the
    key of one of the top entries chosen uniformly at random. Indexes
    0..10 are eligible, i.e. at most 11 entries. Returns an empty string
    when the sub-table is empty.
    """
    followers = multidict_get(Model, prev)
    if not followers:
        return ""
    # key/reverse is the stable equivalent of the old cmp=(y - x) ordering
    # and works on both Python 2 and Python 3.
    ranked = sorted(followers.items(), key=lambda item: item[1], reverse=True)
    top = min(len(ranked) - 1, 10)
    return ranked[random.randint(0, top)][0]
# Number of preceding words used as context; the model stores
# (stage + 1)-word sequences.
stage = 1
# Read the training corpus and close the file handle right away.
with open('big.txt') as corpus:
    Model = train(words(corpus.read()), stage + 1)
def train_1(features):
    """Build the single-word frequency table.

    Returns a defaultdict mapping each item of ``features`` to one plus
    its number of occurrences; an item never seen reads as 1.
    """
    counts = collections.defaultdict(lambda: 1)
    for token in features:
        counts[token] = counts[token] + 1
    return counts
# Single-word frequency table, used when there is not enough context.
# Read the corpus and close the file handle right away.
with open('big.txt') as corpus:
    NWORDS = train_1(words(corpus.read()))
# Letters tried for replacements and insertions in edits1().
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def edits1(word):
    """Return the set of strings exactly one edit away from ``word``.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement of one character by a letter of
    ``alphabet``, or an insertion of one letter of ``alphabet``.
    """
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            # Drop, or replace, the first character of the tail.
            results.add(head + tail[1:])
            for c in alphabet:
                results.add(head + c + tail[1:])
        if len(tail) > 1:
            # Swap the first two characters of the tail.
            results.add(head + tail[1] + tail[0] + tail[2:])
        # Insert a letter at the cut position.
        for c in alphabet:
            results.add(head + c + tail)
    return results
def known_edits2(word):
    """Return the ``NWORDS`` entries reachable from ``word`` in two edits.

    Applies ``edits1`` to ``word`` and again to each result, keeping only
    strings that are keys of ``NWORDS``.
    """
    reachable = set()
    for one_away in edits1(word):
        for two_away in edits1(one_away):
            if two_away in NWORDS:
                reachable.add(two_away)
    return reachable
def known(words):
    """Return, as a set, the items of ``words`` that are keys of ``NWORDS``."""
    found = set()
    for candidate in words:
        if candidate in NWORDS:
            found.add(candidate)
    return found
def correct(prev, word):
    """Return the most plausible spelling of ``word`` given preceding words.

    ``prev`` is the sequence of words before ``word``, nearest last.

    With fewer than ``stage`` context words the choice falls back to plain
    word frequency: the first non-empty tier among the word itself, known
    words one edit away, known words two edits away, and finally the word
    unchanged; the highest ``NWORDS`` count wins.

    Otherwise all of those tiers are pooled and each candidate is scored
    by its count in ``Model`` following the last ``stage`` context words,
    with the ``NWORDS`` count as tie-break.
    """
    if len(prev) < stage:
        candidates = known([word]) or known(edits1(word)) or known_edits2(word) or set([word])
        return max(candidates, key=NWORDS.get)

    # Model is only stage + 1 levels deep, so a longer context would index
    # into an integer count and raise TypeError. Keep the nearest `stage`
    # words (len(prev) - stage is >= 0 here, and also correct for stage 0).
    context = list(prev)[len(prev) - stage:]
    candidates = known([word]) | known(edits1(word)) | known_edits2(word) | set([word])

    # NOTE: multidict_get indexes defaultdicts, so scoring a candidate
    # creates its entry in Model (with the default count) as a side effect.
    def score(x):
        # Word frequency breaks ties between equal context counts, which
        # were previously resolved by arbitrary set iteration order.
        return (multidict_get(Model, context + [x]), NWORDS.get(x, 0))

    return max(candidates, key=score)
# Demo: correct "ove" with one word of context, then with none.
# Parenthesised single-argument print runs on both Python 2 and 3.
print(correct(["i"], "ove"))
print(correct([], "ove"))
def words(text):
    """Return every maximal run of ASCII letters in ``text``, lower-cased.

    Anything that is not a-z after lower-casing (digits, punctuation,
    whitespace) acts as a separator and is dropped.
    """
    lowered = text.lower()
    return re.findall('[a-z]+', lowered)
def defaultdict_factoryn(n, default):
    """Build a zero-argument factory for an ``n``-level nested defaultdict.

    For ``n == 1`` the factory simply returns ``default`` (the leaf value).
    For any other ``n`` it returns a fresh ``defaultdict`` whose own
    default factory is the factory for ``n - 1`` levels.
    """
    if n != 1:
        def make_level():
            # Inner factory is built lazily, each time a level is created.
            return collections.defaultdict(defaultdict_factoryn(n - 1, default))
        return make_level

    def make_leaf():
        return default
    return make_leaf
def multidict_set(d, l, v):
    """Store ``v`` at the nested key path ``l`` inside ``d``.

    Every key except the last is used to descend one level
    (``d[k0][k1]...``); the last key is assigned ``v``. An empty path
    leaves ``d`` untouched.

    ``l`` must support ``len()`` and iteration (list, tuple, deque).
    Intermediate levels are reached by plain indexing, so a missing key
    raises ``KeyError`` unless that level is a ``defaultdict``.
    """
    depth = len(l)
    node = d
    for level, key in enumerate(l, 1):
        if level == depth:
            node[key] = v
        else:
            node = node[key]
def multidict_add(d, l, v):
    """Add ``v`` in place to the value at the nested key path ``l`` in ``d``.

    Every key except the last is used to descend one level; the value
    under the last key is updated with ``+=``. An empty path leaves ``d``
    untouched.

    ``l`` must support ``len()`` and iteration (list, tuple, deque).
    Levels are reached by plain indexing, so a missing key raises
    ``KeyError`` unless that level is a ``defaultdict``.
    """
    depth = len(l)
    node = d
    for level, key in enumerate(l, 1):
        if level == depth:
            node[key] += v
        else:
            node = node[key]
def multidict_get(d, l):
    """Return the value found by following the key path ``l`` down from ``d``.

    Equivalent to ``d[l[0]][l[1]]...``; an empty path returns ``d`` itself.
    Lookups use plain indexing, so on ``defaultdict`` levels a missing key
    is created with its default value as a side effect.
    """
    node = d
    for key in l:
        node = node[key]
    return node
def train(features, n):
    """Count every run of ``n`` consecutive items in ``features``.

    Returns an ``n``-level nested defaultdict in which
    ``model[w1][w2]...[wn]`` is one plus the number of times that sequence
    occurred; leaves start at 1, so an unseen sequence reads as 1.

    The window is counted as soon as it holds ``n`` items, so the final
    n-gram of the input is included (counting it only when the *next*
    item arrived used to drop it).
    """
    model = collections.defaultdict(defaultdict_factoryn(n, 1))
    # maxlen makes append() discard the oldest item once the window is full.
    window = collections.deque(maxlen=n)
    for f in features:
        window.append(f)
        if len(window) == n:
            multidict_add(model, window, 1)
    return model
def most_likely(prev):
    """Pick a random continuation of ``prev`` from among the most frequent.

    Looks up the sub-table stored in the global ``Model`` under the key
    path ``prev``, ranks its entries by descending count, and returns the
    key of one of the top entries chosen uniformly at random. Indexes
    0..10 are eligible, i.e. at most 11 entries. Returns an empty string
    when the sub-table is empty.
    """
    followers = multidict_get(Model, prev)
    if not followers:
        return ""
    # key/reverse is the stable equivalent of the old cmp=(y - x) ordering
    # and works on both Python 2 and Python 3.
    ranked = sorted(followers.items(), key=lambda item: item[1], reverse=True)
    top = min(len(ranked) - 1, 10)
    return ranked[random.randint(0, top)][0]
# Number of preceding words used as context; the model stores
# (stage + 1)-word sequences.
stage = 1
# Read the training corpus and close the file handle right away.
with open('big.txt') as corpus:
    Model = train(words(corpus.read()), stage + 1)
def train_1(features):
    """Build the single-word frequency table.

    Returns a defaultdict mapping each item of ``features`` to one plus
    its number of occurrences; an item never seen reads as 1.
    """
    counts = collections.defaultdict(lambda: 1)
    for token in features:
        counts[token] = counts[token] + 1
    return counts
# Single-word frequency table, used when there is not enough context.
# Read the corpus and close the file handle right away.
with open('big.txt') as corpus:
    NWORDS = train_1(words(corpus.read()))
# Letters tried for replacements and insertions in edits1().
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def edits1(word):
    """Return the set of strings exactly one edit away from ``word``.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement of one character by a letter of
    ``alphabet``, or an insertion of one letter of ``alphabet``.
    """
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            # Drop, or replace, the first character of the tail.
            results.add(head + tail[1:])
            for c in alphabet:
                results.add(head + c + tail[1:])
        if len(tail) > 1:
            # Swap the first two characters of the tail.
            results.add(head + tail[1] + tail[0] + tail[2:])
        # Insert a letter at the cut position.
        for c in alphabet:
            results.add(head + c + tail)
    return results
def known_edits2(word):
    """Return the ``NWORDS`` entries reachable from ``word`` in two edits.

    Applies ``edits1`` to ``word`` and again to each result, keeping only
    strings that are keys of ``NWORDS``.
    """
    reachable = set()
    for one_away in edits1(word):
        for two_away in edits1(one_away):
            if two_away in NWORDS:
                reachable.add(two_away)
    return reachable
def known(words):
    """Return, as a set, the items of ``words`` that are keys of ``NWORDS``."""
    found = set()
    for candidate in words:
        if candidate in NWORDS:
            found.add(candidate)
    return found
def correct(prev, word):
    """Return the most plausible spelling of ``word`` given preceding words.

    ``prev`` is the sequence of words before ``word``, nearest last.

    With fewer than ``stage`` context words the choice falls back to plain
    word frequency: the first non-empty tier among the word itself, known
    words one edit away, known words two edits away, and finally the word
    unchanged; the highest ``NWORDS`` count wins.

    Otherwise all of those tiers are pooled and each candidate is scored
    by its count in ``Model`` following the last ``stage`` context words,
    with the ``NWORDS`` count as tie-break.
    """
    if len(prev) < stage:
        candidates = known([word]) or known(edits1(word)) or known_edits2(word) or set([word])
        return max(candidates, key=NWORDS.get)

    # Model is only stage + 1 levels deep, so a longer context would index
    # into an integer count and raise TypeError. Keep the nearest `stage`
    # words (len(prev) - stage is >= 0 here, and also correct for stage 0).
    context = list(prev)[len(prev) - stage:]
    candidates = known([word]) | known(edits1(word)) | known_edits2(word) | set([word])

    # NOTE: multidict_get indexes defaultdicts, so scoring a candidate
    # creates its entry in Model (with the default count) as a side effect.
    def score(x):
        # Word frequency breaks ties between equal context counts, which
        # were previously resolved by arbitrary set iteration order.
        return (multidict_get(Model, context + [x]), NWORDS.get(x, 0))

    return max(candidates, key=score)
# Demo: correct "ove" with one word of context, then with none.
# Parenthesised single-argument print runs on both Python 2 and 3.
print(correct(["i"], "ove"))
print(correct([], "ove"))
posted on 2011-05-13 17:12 Michael Peng 阅读(397) 评论(0) 编辑 收藏 举报