单词拼写检查器

哈哈，一个有趣的问题：写一个单词拼写检查器。这是在微博上看到的，各路大神用各种语言实现了它。
学习了一下人家的代码，里面有各种 malloc、realloc、char ** 等等。自己写静态分配写习惯了，
现在看动态分配都有点不习惯了。
我写了注释代码：
原文：http://marcelotoledo.com/2007/08/10/how-to-write-a-spelling-corrector/
View Code
/* 
 * spell.c --- spell corrector
 * 
 * Copyright  (C)  2007  Marcelo Toledo <marcelo@marcelotoledo.org>
 * 
 * Version: 1.0
 * Keywords: spell corrector
 * Author: Marcelo Toledo <marcelo@marcelotoledo.org>
 * Maintainer: Marcelo Toledo <marcelo@marcelotoledo.org>
 * URL: http://www.marcelotoledo.org
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 * 
 * Commentary: 
 * 
 * See http://www.marcelotoledo.org.
 * 
 * Code:
 */

#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <search.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

/* Path of the corpus file that read_file() loads into the hash table. */
#define DICTIONARY "./big.txt"
/* Capacity requested from hcreate() for the word table. */
#define DICT_SZ    3000000

/* Separator characters handed to strtok() when splitting the corpus into words. */
const char delim[]    = ".,:;`/\"+-_(){}[]<>*&^%$#@!?~/|\\=1234567890 \t\n";
/*
 * The 26 letters tried at every position by alteration() and insertion().
 * Note: the code bounds its loops with sizeof(alphabet), which also counts
 * the terminating NUL, so 27 candidates are generated per position.
 */
const char alphabet[] = "abcdefghijklmnopqrstuvwxyz";

/*
 * Convert a NUL-terminated string to lower case in place.
 *
 * word: writable string to convert.
 *
 * Returns word itself, so the call can be nested inside an expression.
 */
static char *strtolower(char *word)
{
        char *s;

        /*
         * The <ctype.h> functions require an argument representable as
         * unsigned char (or EOF); a negative plain char is undefined
         * behaviour, so convert before the call.
         */
        for (s = word; *s; s++)
                *s = (char) tolower((unsigned char) *s);

        return word;
}
/*
 * Look a word up in the process-wide hsearch() hash table.
 *
 * For reference, <search.h> declares the table entry as
 *     typedef struct entry { char *key; void *data; } ENTRY;
 *
 * word: key to search for.
 *
 * Returns the matching table entry, or NULL when the word is not present.
 */
static ENTRY *find(char *word)
{
        /* Only the key matters for a FIND query. */
        ENTRY probe = { .key = word };

        return hsearch(probe, FIND);
}
/*
 * Increment the occurrence counter of a word that is already in the table.
 *
 * The counter is kept in the entry's data pointer, treated as an integer.
 *
 * word: key to look up.
 *
 * Returns 1 when the word was found and its counter bumped, 0 when the word
 * is not in the table (nothing is modified in that case).
 */
static int update(char *word)
{
        ENTRY *e = find(word);

        if (!e)
                return 0;

        /* Arithmetic on void * is a compiler extension; go through intptr_t. */
        e->data = (void *) ((intptr_t) e->data + 1);

        return 1;
}
/*
 * Load the corpus at DICTIONARY and build the word-frequency table.
 *
 * The whole file is read into memory, split on the characters in delim,
 * and every token is lower-cased. A word seen for the first time is entered
 * with a counter of 0; each later occurrence increments that counter via
 * update().
 *
 * dict: scratch ENTRY used to pass new keys to hsearch(); its incoming
 *       contents are ignored.
 *
 * Returns 1 on success, 0 if the file cannot be opened, sized or read, or if
 * memory runs out. Keys entered into the table are heap copies that stay
 * allocated for the lifetime of the table.
 */
static int read_file(ENTRY dict)
{
        char *file, *word, *key;
        size_t len;
        FILE *fp = fopen(DICTIONARY, "r");
        struct stat sb;

        if (!fp)
                return 0;

        if (stat(DICTIONARY, &sb) || sb.st_size < 0) {
                fclose(fp);
                return 0;
        }

        /* One extra byte for the terminator that strtok() depends on. */
        file = malloc((size_t) sb.st_size + 1);
        if (!file) {
                fclose(fp);
                return 0;
        }

        len = fread(file, sizeof(char), (size_t) sb.st_size, fp);
        if (ferror(fp)) {
                free(file);
                fclose(fp);
                return 0;
        }
        file[len] = '\0';
        fclose(fp);

        for (word = strtok(file, delim); word; word = strtok(NULL, delim)) {
                /* Tokens live inside our own buffer, so lower-case in place. */
                strtolower(word);

                if (update(word))
                        continue;

                /* New word: the table keeps the key pointer, so it needs its own copy. */
                key = strdup(word);
                if (!key) {
                        free(file);
                        return 0;
                }

                dict.key  = key;
                dict.data = NULL;
                /* If the table is full the word is skipped; do not leak its copy. */
                if (!hsearch(dict, ENTER))
                        free(key);
        }

        free(file);

        return 1;
}

/*
 * Copy limit bytes of str, starting at offset, into a new heap string.
 *
 * str:    NUL-terminated source string.
 * offset: index of the first byte to copy.
 * limit:  number of bytes to copy.
 *
 * Returns a newly allocated NUL-terminated string the caller must free(),
 * or NULL when the requested range is empty, negative, or does not lie
 * entirely inside str, or when allocation fails.
 */
static char *substr(char *str, int offset, int limit)
{
        char *new_str;
        size_t str_size = strlen(str);

        /* Reject negative values before they are converted to size_t. */
        if (offset < 0 || limit <= 0)
                return NULL;

        /* Written as a subtraction so offset + limit cannot overflow. */
        if ((size_t) offset > str_size ||
            (size_t) limit > str_size - (size_t) offset)
                return NULL;

        new_str = malloc((size_t) limit + 1);
        if (!new_str)
                return NULL;

        /* The range is known to hold no terminator, so a plain copy suffices. */
        memcpy(new_str, str + offset, (size_t) limit);
        new_str[limit] = '\0';

        return new_str;
}
/*
 * Append str2 to str1, growing str1 as needed.
 *
 * str1: heap string (or NULL, treated as empty). Ownership passes to this
 *       function: the buffer is reallocated and must not be used afterwards.
 * str2: string to append (or NULL, treated as empty). It is only read; the
 *       caller keeps ownership.
 *
 * Returns the heap string holding the concatenation, which the caller must
 * free(). Returns NULL if memory cannot be obtained; str1 is released in
 * that case.
 */
static char *concat(char *str1, char *str2)
{
        const char *tail = str2 ? str2 : "";
        size_t head_len = str1 ? strlen(str1) : 0;
        size_t tail_len = strlen(tail);
        char *joined;

        /* realloc(NULL, n) acts like malloc(n), so a NULL str1 needs no special case. */
        joined = realloc(str1, head_len + tail_len + 1);
        if (!joined) {
                free(str1);
                return NULL;
        }

        /* Copy the terminator together with the tail. */
        memcpy(joined + head_len, tail, tail_len + 1);

        return joined;
}
/*
 * Generate every string obtained by deleting one character from word.
 *
 * word:      NUL-terminated source word.
 * array:     output array; slot start_idx + i receives word without its
 *            i-th character, as a heap string the caller must free().
 * start_idx: index of the first slot to fill.
 *
 * Returns the number of slots written (the length of word). A slot is set
 * to NULL if its allocation fails.
 */
static int deletion(char *word, char **array, int start_idx)
{
        int i, word_len = (int) strlen(word);

        for (i = 0; i < word_len; i++) {
                /* word_len - 1 surviving characters plus the terminator. */
                char *edit = malloc((size_t) word_len);

                if (edit) {
                        memcpy(edit, word, (size_t) i);
                        /* Copies the rest of the word including its terminator. */
                        memcpy(edit + i, word + i + 1, (size_t) (word_len - i));
                }

                array[start_idx + i] = edit;
        }

        return i;
}
/*
 * Generate every string obtained by swapping two adjacent characters.
 *
 * word:      NUL-terminated source word.
 * array:     output array; slot start_idx + i receives word with the
 *            characters at i and i + 1 exchanged, as a heap string the
 *            caller must free().
 * start_idx: index of the first slot to fill.
 *
 * Returns the number of slots written: strlen(word) - 1, or 0 for words
 * shorter than two characters. A slot is set to NULL if its allocation
 * fails.
 */
static int transposition(char *word, char **array, int start_idx)
{
        int i, word_len = (int) strlen(word);

        for (i = 0; i < word_len - 1; i++) {
                char *edit = malloc((size_t) word_len + 1);

                if (edit) {
                        /* Start from a full copy, then exchange the pair. */
                        memcpy(edit, word, (size_t) word_len + 1);
                        edit[i]     = word[i + 1];
                        edit[i + 1] = word[i];
                }

                array[start_idx + i] = edit;
        }

        return i;
}
/*
 * Generate every string obtained by replacing one character of word.
 *
 * For each position, one candidate is produced per byte of alphabet,
 * including its terminating NUL (sizeof(alphabet) candidates in total);
 * the NUL case yields the word with that character removed.
 *
 * word:      NUL-terminated source word.
 * array:     output array, filled from start_idx with heap strings the
 *            caller must free().
 * start_idx: index of the first slot to fill.
 *
 * Returns the number of slots written: strlen(word) * sizeof(alphabet).
 */
static int alteration(char *word, char **array, int start_idx)
{
        int i, k, word_len = (int) strlen(word);
        size_t j;
        char c[2] = { 0, 0 };

        for (i = 0, k = 0; i < word_len; i++) {
                /* Same for every replacement letter, so build it once. */
                char *tail = substr(word, i + 1, word_len - (i + 1));

                for (j = 0; j < sizeof(alphabet); j++, k++) {
                        c[0] = alphabet[j];
                        array[start_idx + k] = concat(concat(substr(word, 0, i), c), tail);
                }

                /* concat() only reads its second argument, so tail is still ours. */
                free(tail);
        }

        return k;
}
/*
 * Generate every string obtained by inserting one character into word.
 *
 * For each of the strlen(word) + 1 insertion points, one candidate is
 * produced per byte of alphabet, including its terminating NUL
 * (sizeof(alphabet) candidates in total); the NUL case yields an unchanged
 * copy of word.
 *
 * word:      NUL-terminated source word.
 * array:     output array, filled from start_idx with heap strings the
 *            caller must free().
 * start_idx: index of the first slot to fill.
 *
 * Returns the number of slots written:
 * (strlen(word) + 1) * sizeof(alphabet).
 */
static int insertion(char *word, char **array, int start_idx)
{
        int i, k, word_len = (int) strlen(word);
        size_t j;
        char c[2] = { 0, 0 };

        for (i = 0, k = 0; i <= word_len; i++) {
                /* Same for every inserted letter, so build it once. */
                char *tail = substr(word, i, word_len - i);

                for (j = 0; j < sizeof(alphabet); j++, k++) {
                        c[0] = alphabet[j];
                        array[start_idx + k] = concat(concat(substr(word, 0, i), c), tail);
                }

                /* concat() only reads its second argument, so tail is still ours. */
                free(tail);
        }

        return k;
}
/*
 * Number of candidate strings edits1() produces for word, i.e. the number
 * of slots its result array needs.
 *
 * The total is the sum of what the four generators write:
 *   deletion       strlen(word)
 *   transposition  strlen(word) - 1, but 0 for words shorter than 2 chars
 *   alteration     strlen(word) * sizeof(alphabet)
 *   insertion      (strlen(word) + 1) * sizeof(alphabet)
 */
static int edits1_rows(char *word)
{
        int size = (int) strlen(word);
        int per_position = (int) sizeof(alphabet);
        /*
         * transposition() writes nothing for an empty word; counting
         * size - 1 == -1 there would make the array one slot too small.
         */
        int swaps = size > 1 ? size - 1 : 0;

        return size                        + /* deletion */
               swaps                       + /* transposition */
               size * per_position         + /* alteration */
               (size + 1) * per_position;    /* insertion */
}
/*
 * Build every string that is one edit away from word.
 *
 * The four generators (deletion, transposition, alteration, insertion) are
 * run in that order, each filling the array right after the previous one.
 *
 * Returns a heap array of edits1_rows(word) heap strings; the caller frees
 * both the strings and the array. Returns NULL if the array cannot be
 * allocated.
 */
static char **edits1(char *word)
{
        char **candidates = malloc(edits1_rows(word) * sizeof(char *));
        int filled;

        if (candidates == NULL)
                return NULL;

        filled = deletion(word, candidates, 0);
        filled = filled + transposition(word, candidates, filled);
        filled = filled + alteration(word, candidates, filled);
        /* Last generator: its count is not needed. */
        insertion(word, candidates, filled);

        return candidates;
}
/*
 * Linear search for word among the first rows entries of array.
 *
 * Returns 1 if some entry compares equal to word, 0 otherwise. array is
 * not touched when rows is 0.
 */
static int array_exist(char **array, int rows, char *word)
{
        int idx = 0;

        while (idx < rows) {
                if (strcmp(array[idx], word) == 0)
                        return 1;
                idx++;
        }

        return 0;
}
/*
 * Collect the dictionary words that are two edits away from the original
 * word.
 *
 * array:   the one-edit candidates of the original word.
 * rows:    number of entries in array.
 * e2_rows: receives the number of strings in the returned array.
 *
 * Every entry of array is expanded with edits1(); a resulting string is kept
 * only if find() knows it and it has not been kept already. All other
 * generated strings, and the temporary arrays holding them, are released
 * here.
 *
 * Returns a heap array of *e2_rows heap strings, both owned by the caller,
 * or NULL when nothing was kept.
 */
static char **known_edits2(char **array, int rows, int *e2_rows)
{
        int i, j, res_size = 0, e1_rows;
        char **res = NULL, **e1, **grown;

        for (i = 0; i < rows; i++) {
                /* Defensive: skip a slot whose string is missing. */
                if (!array[i])
                        continue;

                e1 = edits1(array[i]);
                if (!e1)
                        continue;
                e1_rows = edits1_rows(array[i]);

                for (j = 0; j < e1_rows; j++) {
                        if (e1[j] && find(e1[j]) &&
                            !array_exist(res, res_size, e1[j])) {
                                /* Keep res valid if the array cannot grow. */
                                grown = realloc(res, sizeof(char *) * (res_size + 1));
                                if (grown) {
                                        res             = grown;
                                        res[res_size++] = e1[j];
                                        continue; /* string now owned by res */
                                }
                        }

                        /* Not kept: nobody else references this string. */
                        free(e1[j]);
                }

                free(e1);
        }

        *e2_rows = res_size;

        return res;
}
/*
 * Pick the most frequent dictionary word among the candidates.
 *
 * array: candidate strings; NULL slots are skipped.
 * rows:  number of entries in array.
 *
 * Returns the table's own key string for the known candidate with the
 * highest counter (the earliest one on ties), or NULL when no candidate is
 * in the dictionary. The returned pointer belongs to the hash table, not to
 * array, so it stays valid after the candidates are freed.
 */
static char *max(char **array, int rows)
{
        char *max_word = NULL;
        /*
         * Start below zero: a word seen once is stored with a counter of 0
         * and must still be able to win.
         */
        intptr_t count, max_count = -1;
        int i;
        ENTRY *e;

        for (i = 0; i < rows; i++) {
                if (!array[i])
                        continue;

                e = find(array[i]);
                if (!e)
                        continue;

                /* The counter is stored in the data pointer itself. */
                count = (intptr_t) e->data;
                if (count > max_count) {
                        max_count = count;
                        max_word  = e->key;
                }
        }

        return max_word;
}
/*
 * Release the first rows strings held in array.
 *
 * Only the strings are freed; the array itself is left for the caller.
 * array is not touched when rows is 0.
 */
static void array_cleanup(char **array, int rows)
{
        char **slot = array;
        int remaining = rows;

        while (remaining-- > 0)
                free(*slot++);
}
/*
 * Suggest the most likely intended spelling of word.
 *
 * Lookup order:
 *   1. word itself, if the dictionary knows it;
 *   2. the best-ranked known word one edit away;
 *   3. the best-ranked known word two edits away;
 *   4. word unchanged, when nothing matches or memory runs out.
 *
 * Returns either word itself or a key string owned by the hash table; the
 * caller must not free the result.
 */
static char *correct(char *word)
{
        char **e1, **e2, *best, *res_word = word;
        int e1_rows, e2_rows = 0;

        if (find(word))
                return word;

        e1_rows = edits1_rows(word);
        e1      = edits1(word);
        if (!e1)
                return word;

        best = max(e1, e1_rows);
        if (!best) {
                /* Nothing one edit away: widen the search to two edits. */
                e2   = known_edits2(e1, e1_rows, &e2_rows);
                best = max(e2, e2_rows);

                array_cleanup(e2, e2_rows);
                free(e2);
        }

        /* best points into the hash table, so freeing candidates is safe. */
        if (best)
                res_word = best;

        array_cleanup(e1, e1_rows);
        free(e1);

        return res_word;
}

/*
 * Entry point: load the dictionary, then check the word given as the first
 * command-line argument and print either a suggestion or a confirmation.
 *
 * Returns 0 on success, -1 when no word was given or the dictionary could
 * not be set up.
 */
int main(int argc, char **argv)
{
        char *corrected_word;
        ENTRY dict = { NULL, NULL };

        /* argv[1] is dereferenced below, so it has to exist. */
        if (argc < 2) {
                fprintf(stderr, "usage: %s <word>\n",
                        argc > 0 ? argv[0] : "spell");
                return -1;
        }

        /* hcreate() returns 0 when the table cannot be allocated. */
        if (!hcreate(DICT_SZ))
                return -1;

        if (!read_file(dict))
                return -1;

        corrected_word = correct(argv[1]);
        if (strcmp(corrected_word, argv[1])) {
                printf("Did you mean \"%s\"?\n", corrected_word);
        } else {
                printf("\"%s\" is correct!\n", argv[1]);
        }

        return 0;
}
posted on 2013-04-08 22:41 luckyboy1991 阅读(241) 评论(0) 编辑收藏举报