拼写检查算法 Golang 版
最近看了阮一峰的一篇介绍使用贝叶斯推断方法做拼写检查的文章,该文章的易懂程度输于 Google 技术总监写的原文及其优秀的译文。
这说明了什么?越是大师级的人写的文章往往越易懂。所以关于贝叶斯方法我就不解释了,只贴代码。
我使用golang对照实现了一遍:
一是为了弄懂其算法细节
二是不使前段时间看的golang语法忘记
就像几年前在学校时候对着C版的数据结构书用C#去实现一样。
package main
import (
	"fmt"
	"io/ioutil"
	"log"
	"regexp"
	"strings"
)

var (
	// NWORDS maps each word seen in the training corpus to the number of
	// times it occurred. It is filled in by main before correct is called.
	NWORDS map[string]int

	// wordPattern matches one run of lowercase ASCII letters. It is compiled
	// once at start-up instead of on every call to words.
	wordPattern = regexp.MustCompile("[a-z]+")
)

const (
	// alphabet is the set of letters tried for replacements and insertions.
	alphabet = "abcdefghijklmnopqrstuvwxyz"
)

// words splits text into lowercase ASCII words. The text is lowercased first
// so that capitalised words ("The") are counted as their lowercase form
// instead of being truncated to the lowercase tail ("he").
func words(text string) []string {
	return wordPattern.FindAllString(strings.ToLower(text), -1)
}

// train counts how often each string occurs in features and returns the
// counts keyed by string.
func train(features []string) map[string]int {
	result := make(map[string]int)
	for _, f := range features {
		result[f]++
	}
	return result
}

// edit1 returns every distinct string that is one edit away from word: one
// deletion, one transposition of adjacent bytes, one replacement by a letter
// of alphabet, or one insertion of a letter of alphabet. Results are listed
// in that order, duplicates removed, first occurrence kept.
func edit1(word string) []string {
	n := len(word)
	// 54n+25 is the exact number of edits generated before deduplication.
	edits := make([]string, 0, 54*n+25)

	// Deletions: drop the byte at position i.
	for i := 0; i < n; i++ {
		edits = append(edits, word[:i]+word[i+1:])
	}

	// Transpositions: swap the bytes at positions i and i+1. Slicing is used
	// rather than string(byte) so the bytes are copied verbatim.
	for i := 0; i+1 < n; i++ {
		edits = append(edits, word[:i]+word[i+1:i+2]+word[i:i+1]+word[i+2:])
	}

	// Replacements: substitute each letter at each position.
	for _, c := range alphabet {
		for i := 0; i < n; i++ {
			edits = append(edits, word[:i]+string(c)+word[i+1:])
		}
	}

	// Insertions: insert each letter at each of the n+1 gaps.
	for _, c := range alphabet {
		for i := 0; i <= n; i++ {
			edits = append(edits, word[:i]+string(c)+word[i:])
		}
	}

	return set(edits)
}

// known_edits2 returns the distinct keys of NWORDS that are reachable from
// word by applying edit1 twice, in order of first discovery.
func known_edits2(word string) []string {
	var found []string
	seen := make(map[string]struct{})
	for _, e1 := range edit1(word) {
		for _, e2 := range edit1(e1) {
			if _, ok := NWORDS[e2]; !ok {
				continue
			}
			if _, dup := seen[e2]; dup {
				continue
			}
			seen[e2] = struct{}{}
			found = append(found, e2)
		}
	}
	return found
}

// known returns the elements of words that are keys of NWORDS, preserving
// order. Duplicates in the input are kept.
func known(words []string) []string {
	var knows []string
	for _, w := range words {
		if _, ok := NWORDS[w]; ok {
			knows = append(knows, w)
		}
	}
	return knows
}

// appendIfMissing appends i to slice unless slice already contains it.
// It scans the whole slice, so use set to deduplicate large inputs.
func appendIfMissing(slice []string, i string) []string {
	for _, ele := range slice {
		if ele == i {
			return slice
		}
	}
	return append(slice, i)
}

// set returns the distinct elements of arr in order of first occurrence.
// A map tracks what has been emitted so the cost stays linear in len(arr).
func set(arr []string) []string {
	var result []string
	seen := make(map[string]struct{}, len(arr))
	for _, ele := range arr {
		if _, dup := seen[ele]; dup {
			continue
		}
		seen[ele] = struct{}{}
		result = append(result, ele)
	}
	return result
}

// correct returns the most frequent NWORDS entry among the closest
// candidates for word: word itself if it is known, otherwise the known words
// one edit away, otherwise the known words two edits away. When no candidate
// exists at any distance, word is returned unchanged.
func correct(word string) string {
	candidates := known([]string{word})
	if len(candidates) == 0 {
		candidates = known(edit1(word))
	}
	if len(candidates) == 0 {
		candidates = known_edits2(word)
	}
	if len(candidates) == 0 {
		return word
	}
	return max(candidates, NWORDS)
}

// max returns the element of arr with the highest positive count in dict.
// Ties go to the earliest element; if no element has a positive count the
// first element is returned. An empty arr yields "".
func max(arr []string, dict map[string]int) string {
	if len(arr) == 0 {
		return ""
	}
	best, bestCount := arr[0], 0
	for _, w := range arr {
		// A missing key reads as 0 and therefore never beats bestCount.
		if c := dict[w]; c > bestCount {
			best, bestCount = w, c
		}
	}
	return best
}

// main trains the model on big.txt in the working directory and prints the
// correction for a sample misspelling.
func main() {
	buf, err := ioutil.ReadFile("big.txt")
	if err != nil {
		// Without a corpus every lookup would fail, so stop here.
		log.Fatalf("reading training corpus: %v", err)
	}
	NWORDS = train(words(string(buf)))
	word := "beford"
	fmt.Println("input:", word, "correct word:", correct(word))
}
python 版本只有30行左右,golang对各种集合操作和python对比差了许多。
python 里用 set(arr) 即可将列表里重复的元素删除。简洁地用 for in 构造列表实在很 cool。