最近在看贝叶斯算法,该算法在不少方面都有应用,已有的开发实例有:拼写检查、文本分类、垃圾邮件过滤、中文分词等方面。根据需要,决定实现前面两种,拼写检查已经实现,先贴于此。
程序效果图:
有关贝叶斯算法的学习和拼写纠正方面请参照:原文这里,徐宥的翻译版这里。
程序流程:
1.根据训练语料统计训练语料中每个单词的出现次数、频率,计算出p(h)先验概率;训练语料在此下载 big.txt,内含几百万单词,可作为语料使用。
2.计算条件概率p(D|h),即假设用户真正想输入的单词是h时,实际输入为D的概率大小。这里使用了编辑距离的概念,简化起见,只计算了编辑距离为1的所有可能编辑,其他请Google或参见这里。
3.根据Bayes原理,后验概率与每个输入的生成概率p(D)无关,所以p(h|D)∝ P(h) * P(D | h),计算出最可能的拼写。
说明:
1.对于语料中没有出现的单词,采用平滑处理1/N,N为训练样本中所有单词的出现次数之和。
2.条件概率采用1/M,M为编辑距离为1的所有候选单词的总数,如speling的每一个猜测单词的条件概率为1/290,290是编辑距离为1的所有可能猜测的个数。(也可以将26个字母表示为矩阵,求出每个字母在键盘上的距离,相信会更有说服力。俄罗斯有人在1973年做过这方面研究。)
3.训练语料进行了简单的预处理,统一转换为小写字母。
4.输入exit可以退出程序。
关键代码:
入口:
代码
/// <summary>
/// Console entry point: trains on the corpus file "big.txt" in the current
/// directory, then repeatedly reads a word from standard input and prints the
/// suggestions returned by <c>Task.getRightWord</c> until the user types
/// "exit" or standard input is closed.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Path.Combine picks the correct directory separator instead of a hard-coded "/".
    string currentPath = System.IO.Path.Combine(Environment.CurrentDirectory, "big.txt");
    double sumWordNum = 0.0; // total number of words; filled in by getWordsHt through the ref parameter
    Train train = new Train();
    Task task = new Task();

    Console.WriteLine("System is now training the Corpus....");
    // Per-word occurrence counts from the training corpus.
    Hashtable htTmp = train.getWordsHt(currentPath, ref sumWordNum);
    // Per-word probabilities derived from the counts.
    Hashtable htProbability = train.getWordsProbality(htTmp, sumWordNum);
    Console.WriteLine("Training Corpus is finished....");

    Console.WriteLine("Please input the word....");
    string inputWord = Console.ReadLine();

    // Console.ReadLine returns null once standard input is exhausted (Ctrl+Z / EOF,
    // or redirected input running out); stop then instead of dereferencing null.
    while (inputWord != null && !inputWord.Equals("exit"))
    {
        Hashtable recommandWordHt = task.getRightWord(inputWord, htProbability, sumWordNum);
        if (recommandWordHt != null)
        {
            foreach (DictionaryEntry de in recommandWordHt)
            {
                Console.Write("Do you want to input: " + de.Key.ToString() + ";");
                Console.WriteLine("the probality:" + de.Value.ToString());
            }
        }
        else
        {
            // A null result is treated as "no correction needed".
            Console.WriteLine("The word 【" + inputWord + "】 you have input is right!");
        }

        Console.WriteLine("Please input the word....");
        inputWord = Console.ReadLine();
    }
}
// NOTE(review): this brace block repeats the body of Main without its method
// header — presumably a copy/paste or extraction artifact; confirm before use.
{
// bool trainFlag = false;// whether the corpus has already been trained
string currentPath = Environment.CurrentDirectory.ToString()+ "/big.txt";
double sumWordNum = 0.0;// total number of words
Hashtable htProbability = new Hashtable();// probability of every word in the training corpus
Hashtable htTmp = new Hashtable();// temporary table: occurrence count of every word in the training corpus
Hashtable recommandWordHt = new Hashtable();// candidate corrections recommended by the spell check
Train train = new Train();// corpus trainer
Task task = new Task();
Console.WriteLine("System is now training the Corpus....");
htTmp = train.getWordsHt(currentPath, ref sumWordNum);
htProbability = train.getWordsProbality(htTmp, sumWordNum);
Console.WriteLine("Training Corpus is finished....");
string inputWord = "";
Console.WriteLine("Please input the word....");
inputWord = Console.ReadLine();
// Keep prompting until the user types "exit".
while (!inputWord.Equals("exit"))
{
recommandWordHt = task.getRightWord(inputWord, htProbability, sumWordNum);
// A null result is reported as "the word is right"; otherwise every entry is printed.
if (recommandWordHt != null)
{
foreach (DictionaryEntry de in recommandWordHt)
{
Console.Write("Do you want to input: " + de.Key.ToString() + ";");
Console.WriteLine("the probality:" + de.Value.ToString());
}
}
else
{
Console.WriteLine("The word 【"+inputWord+"】 you have input is right!");
}
Console.WriteLine("Please input the word....");
inputWord = Console.ReadLine();
}
}
训练:
代码
/// <summary>
/// Reads the corpus file line by line, lower-cases each line, splits it on the
/// delimiter pattern and counts how often every non-empty token occurs.
/// </summary>
/// <param name="objFilePath">Path of the corpus text file to read.</param>
/// <param name="sumWordNum">
/// Running total of counted tokens; incremented once per non-empty token, on top
/// of whatever value the caller passes in.
/// </param>
/// <returns>
/// A Hashtable mapping each distinct token (string) to its occurrence count (int).
/// </returns>
/// <remarks>
/// I/O failures (for example a missing file) propagate with their original
/// exception type and stack trace instead of being re-wrapped in a bare Exception.
/// </remarks>
public Hashtable getWordsHt(string objFilePath, ref double sumWordNum)
{
    Hashtable ht = new Hashtable();
    // Token delimiters: punctuation, whitespace, CJK characters and digits.
    Regex regex = new Regex(@"\,|\.|\ |\n|\r|\?|\;|\:|\!|\(|\)|\042|\“|\”|\-|[\u4e00-\u9fa5]|[0-9]");

    // The using statement closes the reader even when reading throws.
    using (StreamReader objReader = new StreamReader(objFilePath))
    {
        string sLine;
        while ((sLine = objReader.ReadLine()) != null)
        {
            // Invariant lower-casing keeps the result independent of the current culture.
            string[] words = regex.Split(sLine.ToLowerInvariant());
            foreach (string word in words)
            {
                // Regex.Split yields empty strings between adjacent delimiters and at
                // line edges; they are not words and must not inflate the totals.
                if (word.Length == 0)
                {
                    continue;
                }

                object seen = ht[word];
                ht[word] = (seen == null) ? 1 : (int)seen + 1;
                sumWordNum = sumWordNum + 1;
            }
        }
    }

    return ht;
}
/// <summary>
/// Reads the corpus file line by line, lower-cases each line, splits it on the
/// delimiter pattern and counts how often every non-empty token occurs.
/// </summary>
/// <param name="objFilePath">Path of the corpus text file to read.</param>
/// <param name="sumWordNum">
/// Running total of counted tokens; incremented once per non-empty token, on top
/// of whatever value the caller passes in.
/// </param>
/// <returns>
/// A Hashtable mapping each distinct token (string) to its occurrence count (int).
/// </returns>
/// <remarks>
/// I/O failures (for example a missing file) propagate with their original
/// exception type and stack trace instead of being re-wrapped in a bare Exception.
/// </remarks>
public Hashtable getWordsHt(string objFilePath, ref double sumWordNum)
{
    Hashtable ht = new Hashtable();
    // Token delimiters: punctuation, whitespace, CJK characters and digits.
    Regex regex = new Regex(@"\,|\.|\ |\n|\r|\?|\;|\:|\!|\(|\)|\042|\“|\”|\-|[\u4e00-\u9fa5]|[0-9]");

    // The using statement closes the reader even when reading throws.
    using (StreamReader objReader = new StreamReader(objFilePath))
    {
        string sLine;
        while ((sLine = objReader.ReadLine()) != null)
        {
            // Invariant lower-casing keeps the result independent of the current culture.
            string[] words = regex.Split(sLine.ToLowerInvariant());
            foreach (string word in words)
            {
                // Regex.Split yields empty strings between adjacent delimiters and at
                // line edges; they are not words and must not inflate the totals.
                if (word.Length == 0)
                {
                    continue;
                }

                object seen = ht[word];
                ht[word] = (seen == null) ? 1 : (int)seen + 1;
                sumWordNum = sumWordNum + 1;
            }
        }
    }

    return ht;
}
编辑距离计算:
代码
// Generate every candidate one edit away from `word`, appending each to `al`
// and bumping `possibleNum` once per candidate.

// Deletion: drop the character at each position.
for (int pos = 0; pos < n; pos++)
{
    tempWord = word.Remove(pos, 1);
    al.Add(tempWord);
    possibleNum++;
}

// Transposition: swap each pair of adjacent characters.
for (int pos = 0; pos < n - 1; pos++)
{
    string head = word.Substring(0, pos);
    string first = word.Substring(pos, 1);
    string second = word.Substring(pos + 1, 1);
    string tail = word.Substring(pos + 2);
    tempWord = head + second + first + tail;
    al.Add(tempWord);
    possibleNum++;
}

// Alteration: replace the character at each position with every other letter a-z.
for (int pos = 0; pos < n; pos++)
{
    char current = Convert.ToChar(word.Substring(pos, 1));
    string head = word.Substring(0, pos);
    string tail = word.Substring(pos + 1);
    for (char letter = 'a'; letter <= 'z'; letter++)
    {
        if (letter == current)
        {
            continue; // replacing a letter with itself is not an edit
        }

        tempWord = head + letter + tail;
        al.Add(tempWord);
        possibleNum++;
    }
}

// Insertion: insert every letter a-z at each gap, including both ends.
for (int pos = 0; pos <= n; pos++)
{
    for (char letter = 'a'; letter <= 'z'; letter++)
    {
        tempWord = word.Insert(pos, letter.ToString());
        al.Add(tempWord);
        possibleNum++;
    }
}
// NOTE(review): this span repeats the candidate-generation loops, but the `for`
// header of the first (deletion) loop is missing, so `i` is undefined in the
// block below — presumably a copy/paste or extraction artifact; confirm before use.
{
tempWord = word.Substring(0, i) + word.Substring(i + 1);
al.Add(tempWord);
possibleNum++;
}
for (int i = 0; i < n - 1; i++)// transposition: swap two adjacent letters
{
tempWord = word.Substring(0, i) + word.Substring(i + 1, 1) + word.Substring(i, 1) + word.Substring(i + 2);
al.Add(tempWord);
possibleNum++;
}
for (int i = 0; i < n; i++)// alteration: replace one letter
{
string t = word.Substring(i, 1);
for (int ch = 'a'; ch <= 'z'; ch++)
{
// Skip the letter already at position i; replacing it with itself is not an edit.
if (ch != Convert.ToChar(t))
{
tempWord = word.Substring(0, i) + Convert.ToChar(ch) + word.Substring(i + 1);
al.Add(tempWord);
possibleNum++;
}
}
}
for (int i = 0; i <= n; i++)// insertion: insert one letter (i <= n also covers appending at the end)
{
//string t = word.Substring(i, 1);
for (int ch = 'a'; ch <= 'z'; ch++)
{
tempWord = word.Substring(0, i) + Convert.ToChar(ch) + word.Substring(i);
al.Add(tempWord);
possibleNum++;
// Console.WriteLine(tempWord);
}
}
程序采用vs2005实现. 如需要可跟我联系。
下一步准备使用Bayes写个文本分类程序。
转载请注明来源。