昨天有幸拜读了洞庭散人的《基于朴素贝叶斯分类器的文本分类算法》,我正在学习这个,我从内心感谢洞庭散人的分享!随即我把它移植到了C#平台上。
该程序用到了Lucene.Net,用到了基于词典的ICTCLAS中文分词1.0.
昨天有幸拜读了洞庭散人的<基于朴素贝叶斯分类器的文本分类算法>,我正在学习这个,我从内心感谢洞庭散人的分享!随即我把它移植到了c#平台上。
该程序用到了Lucene.Net,用到了基于词典的ICTCLAS中文分词1.0.
ICTCLAS中文分词for Lucene.Net接口代码(实现Analyzer):
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
using System.IO;
5![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
6
using Lucene.Net.Analysis;
7
using Lucene.Net.Analysis.Standard;
8![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
9
namespace AspxOn.Search.FenLei
10![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedBlockStart.gif)
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
{
11![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
12![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
/// <summary>
/// Lucene.Net analyzer that tokenizes text with <see cref="ICTCLASTokenizer"/>
/// and then applies StandardFilter, LowerCaseFilter and a stop-word filter.
/// </summary>
public class ICTCLASAnalyzer : Analyzer
{
    /// <summary>
    /// Stop-word table shared by all analyzer instances. It has a fixed
    /// capacity of 428 entries; slots that were never filled stay null.
    /// NOTE(review): every constructor call rewrites this shared array, so
    /// constructing analyzers concurrently is not safe.
    /// </summary>
    public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[428];

    /// <summary>
    /// Path of the stop-word file read by the constructor, one word per line.
    /// </summary>
    public string NoisePath = Environment.CurrentDirectory + "\\data\\stopwords.txt";

    /// <summary>
    /// Loads the stop words from <see cref="NoisePath"/> into
    /// <see cref="CHINESE_ENGLISH_STOP_WORDS"/>. Reading stops at the first
    /// empty line, at end of file, or when the table is full.
    /// </summary>
    public ICTCLASAnalyzer()
    {
        // "using" guarantees the file handle is released even if reading fails.
        using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.Default))
        {
            int i = 0;
            string noise = reader.ReadLine();

            // The capacity check keeps a long stop-word file from overrunning the array.
            while (!string.IsNullOrEmpty(noise) && i < CHINESE_ENGLISH_STOP_WORDS.Length)
            {
                CHINESE_ENGLISH_STOP_WORDS[i] = noise;
                noise = reader.ReadLine();
                i++;
            }
        }
    }

    /// <summary>
    /// Builds the analysis chain for a field: an <see cref="ICTCLASTokenizer"/>
    /// wrapped by StandardFilter, LowerCaseFilter and StopFilter.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used here).</param>
    /// <param name="reader">Source of the text to analyze.</param>
    /// <returns>The filtered token stream.</returns>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        TokenStream result = new ICTCLASTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new StopFilter(result, GetLoadedStopWords());
        return result;
    }

    /// <summary>
    /// Returns only the slots of the stop-word table that were actually filled,
    /// so the stop filter is never handed null entries.
    /// </summary>
    private static string[] GetLoadedStopWords()
    {
        List<string> loaded = new List<string>(CHINESE_ENGLISH_STOP_WORDS.Length);
        foreach (string word in CHINESE_ENGLISH_STOP_WORDS)
        {
            if (!string.IsNullOrEmpty(word))
            {
                loaded.Add(word);
            }
        }
        return loaded.ToArray();
    }
}
50
}
ICTCLAS中文分词for Lucene.Net接口代码(实现Tokenizer):
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
5
using Lucene.Net.Analysis;
6
using SharpICTCLAS;
7
using System.IO;
8![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
9
namespace AspxOn.Search.FenLei
10![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedBlockStart.gif)
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
{
11
/// <summary>
/// Lucene.Net tokenizer backed by the SharpICTCLAS word segmenter. The whole
/// input is read and segmented in the constructor; <see cref="Next"/> then
/// hands out the words one at a time.
/// </summary>
public class ICTCLASTokenizer : Tokenizer
{
    // Value passed as the second argument of WordSegment.Segment.
    int nKind = 1;

    // Segmentation output; only the first entry (result[0]) is consumed.
    List<WordResult[]> result;

    // Start offset of the next token and end offset of the last one,
    // measured in the sentence after line breaks were removed.
    int startIndex = 0;
    int endIndex = 0;

    // Index of the next word to emit. Starts at 1 and stops before the last
    // element — presumably to skip sentence begin/end markers added by the
    // segmenter; TODO confirm against SharpICTCLAS.
    int i = 1;

    /// <summary>
    /// The sentence to be segmented.
    /// </summary>
    private string sentence;

    /// <summary>
    /// Constructs a tokenizer for this reader: reads all text, strips "\r\n",
    /// initializes the dictionary from the "Data" directory under the current
    /// directory and segments the text.
    /// </summary>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <exception cref="ArgumentNullException">reader is null.</exception>
    public ICTCLASTokenizer(System.IO.TextReader reader)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }

        this.input = reader;
        sentence = input.ReadToEnd();
        sentence = sentence.Replace("\r\n", "");
        string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
        WordSegment wordSegment = new WordSegment();
        wordSegment.InitWordSegment(DictPath);
        result = wordSegment.Segment(sentence, nKind);
    }

    /// <summary>
    /// Returns the next token in the stream, or null when the stream is exhausted.
    /// </summary>
    public override Token Next()
    {
        // Nothing was segmented (e.g. empty input): behave as an exhausted stream
        // instead of failing on result[0].
        if (result == null || result.Count == 0 || result[0] == null)
        {
            return null;
        }

        if (i >= result[0].Length - 1)
        {
            return null;
        }

        string word = result[0][i].sWord;

        // Lucene end offsets are exclusive: one past the last character of the token.
        endIndex = startIndex + word.Length;
        Token token = new Token(word, startIndex, endIndex);
        startIndex = endIndex;
        i++;
        return token;
    }
}
58
}
中文分词器代码:
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
using System.IO;
5![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
6
using Lucene.Net.Analysis;
7
using Lucene.Net.Analysis.Standard;
8
using Lucene.Net.Documents;
9![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
10
using Lucene.Net.Analysis.Cn;
11
using Lucene.Net.Analysis.KTDictSeg;
12![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
13
namespace AspxOn.Search.FenLei
14![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedBlockStart.gif)
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
{
15![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
/// <summary>
/// Chinese word splitter: segments text with <see cref="ICTCLASAnalyzer"/>
/// and joins the resulting terms with a separator.
/// </summary>
public class ChineseSpliter
{
    /// <summary>
    /// Segments <paramref name="text"/> and returns the terms joined by
    /// <paramref name="splitToken"/>.
    /// </summary>
    /// <param name="text">Text to segment.</param>
    /// <param name="splitToken">Separator placed between consecutive terms; may be any length.</param>
    /// <returns>
    /// The joined terms, or an empty string when the text is null/empty or
    /// yields no tokens.
    /// </returns>
    public static string Split(string text, string splitToken)
    {
        if (string.IsNullOrEmpty(text))
        {
            return string.Empty;
        }

        StringBuilder sb = new StringBuilder();
        Analyzer an = new ICTCLASAnalyzer();
        TokenStream ts = an.TokenStream("", new StringReader(text));

        Lucene.Net.Analysis.Token token;
        bool first = true;
        while ((token = ts.Next()) != null)
        {
            // Write the separator only between terms, so nothing has to be
            // trimmed afterwards. Trimming one leading character failed for
            // empty output and for separators that are not exactly one character.
            if (!first)
            {
                sb.Append(splitToken);
            }
            sb.Append(token.TermText());
            first = false;
        }

        return sb.ToString();
    }
}
39
}
训练管理器代码:
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
using System.IO;
5![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
6
using System.Text.RegularExpressions;
7![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
8
namespace AspxOn.Search.FenLei
9![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedBlockStart.gif)
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
{
10![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
11![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
/// <summary>
/// Training data manager. The corpus is a root directory whose immediate
/// sub-directories are the classifications; each sub-directory holds the
/// sample files of that classification.
/// </summary>
public class TrainingDataManager
{
    private string[] trainingFileClassicfications; // names of the classification sub-directories
    private DirectoryInfo trainingTextDir;         // corpus root; assigned in the constructor, not read elsewhere in this class
    private string defaultDir = "D:\\SogouC.mini.20061127\\SogouC.mini\\Sample";

    /// <summary>
    /// Creates a manager over the built-in default corpus directory.
    /// </summary>
    /// <exception cref="DirectoryNotFoundException">The corpus directory does not exist.</exception>
    public TrainingDataManager()
    {
        Initialize(defaultDir);
    }

    /// <summary>
    /// Creates a manager over the given corpus directory.
    /// </summary>
    /// <param name="trainingDir">Root directory of the training corpus.</param>
    /// <exception cref="DirectoryNotFoundException">The corpus directory does not exist.</exception>
    public TrainingDataManager(string trainingDir)
    {
        Initialize(trainingDir);
    }

    /// <summary>
    /// Validates the corpus root and collects the classification names.
    /// </summary>
    private void Initialize(string rootDir)
    {
        if (string.IsNullOrEmpty(rootDir) || !Directory.Exists(rootDir))
        {
            throw new DirectoryNotFoundException("当前语料目录不存在!");
        }

        defaultDir = rootDir;
        trainingTextDir = new DirectoryInfo(rootDir);
        trainingFileClassicfications = Directory.GetDirectories(rootDir, "*", SearchOption.TopDirectoryOnly);

        for (int i = 0; i < trainingFileClassicfications.Length; i++)
        {
            // Keep only the last path segment (the directory name). Path.GetFileName
            // honours the platform's separators, unlike splitting on '\'.
            trainingFileClassicfications[i] = Path.GetFileName(trainingFileClassicfications[i]);
        }
    }

    /// <summary>
    /// Gets the list of classifications.
    /// </summary>
    /// <returns>The classification names (the internal array, not a copy).</returns>
    public string[] GetTrainingClassifications()
    {
        return trainingFileClassicfications;
    }

    /// <summary>
    /// Gets the paths of the files under the given classification.
    /// </summary>
    /// <param name="classification">Classification (sub-directory) name.</param>
    /// <returns>Paths of the files directly inside that sub-directory.</returns>
    public string[] GetFilesPath(string classification)
    {
        return Directory.GetFiles(Path.Combine(defaultDir, classification));
    }

    /// <summary>
    /// Gets the content of the file at the given path, decoded with Encoding.Default.
    /// </summary>
    /// <param name="filepath">Path of the file to read.</param>
    /// <returns>The decoded file content.</returns>
    public string GetFileText(string filepath)
    {
        // ReadAllBytes reads the whole file and always closes the handle; a single
        // Stream.Read call is allowed to return fewer bytes than requested.
        byte[] bytes = File.ReadAllBytes(filepath);
        return Encoding.Default.GetString(bytes);
    }

    /// <summary>
    /// Gets the total number of files in the training corpus.
    /// </summary>
    /// <returns>Sum of the file counts of all classifications.</returns>
    public int GetTrainFileCount()
    {
        int total = 0;
        for (int i = 0; i < trainingFileClassicfications.Length; i++)
        {
            total += GetTrainFileCountOfCertainClassification(trainingFileClassicfications[i]);
        }
        return total;
    }

    /// <summary>
    /// Gets the number of files under the given classification.
    /// </summary>
    /// <param name="classification">Classification (sub-directory) name.</param>
    /// <returns>Number of files directly inside that sub-directory.</returns>
    public int GetTrainFileCountOfCertainClassification(string classification)
    {
        return Directory.GetFiles(Path.Combine(defaultDir, classification)).Length;
    }

    /// <summary>
    /// Gets the number of samples of a classification whose text contains the key.
    /// </summary>
    /// <param name="classification">Classification to search.</param>
    /// <param name="key">Keyword or key character to look for.</param>
    /// <returns>Number of matching samples.</returns>
    /// <exception cref="Exception">
    /// Reading or searching a sample failed; the original error is available
    /// as InnerException.
    /// </exception>
    public int GetCountContainKeyOfClassification(string classification, string key)
    {
        int ret = 0;
        string[] filepaths = GetFilesPath(classification);
        try
        {
            for (int i = 0; i < filepaths.Length; i++)
            {
                string text = GetFileText(filepaths[i]);
                if (text.Contains(key))
                {
                    ret++;
                }
            }
        }
        catch (Exception ex)
        {
            // Same exception type and message as before, but the cause is preserved.
            throw new Exception("error!", ex);
        }
        return ret;
    }
}
131
}
132![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
先验概率计算代码:
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
5
namespace AspxOn.Search.FenLei
6![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedBlockStart.gif)
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
{
7![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
/// <summary>
/// Prior probability calculation.
/// </summary>
public class PriorProbability
{
    private static TrainingDataManager tdm = new TrainingDataManager();

    /// <summary>
    /// Calculates the prior probability of a classification: the number of
    /// training files in that classification divided by the number of
    /// training files in the whole corpus.
    /// </summary>
    /// <param name="c">The given classification.</param>
    /// <returns>
    /// The prior probability of <paramref name="c"/>, or 0 when the corpus
    /// contains no files.
    /// </returns>
    public static float CaculatePc(string c)
    {
        float Nc = tdm.GetTrainFileCountOfCertainClassification(c);
        float N = tdm.GetTrainFileCount();

        // An empty corpus would give 0/0 = NaN, which then poisons every
        // comparison made on the resulting scores.
        if (N == 0F)
        {
            return 0F;
        }

        return Nc / N;
    }
}
28
}
条件概率计算代码:
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
5
namespace AspxOn.Search.FenLei
6![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedBlockStart.gif)
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
{
7![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
/// <summary>
/// Class-conditional probability calculation.
/// </summary>
public class ClassConditionalProbability
{
    private static TrainingDataManager tdm = new TrainingDataManager();

    // Extra term added to the denominator; currently zero.
    private static float M = 0F;

    /// <summary>
    /// Estimates the class-conditional probability of a keyword as
    /// (samples in the class containing the keyword + 1) divided by
    /// (samples in the class + number of classifications + M).
    /// </summary>
    /// <param name="x">The given keyword.</param>
    /// <param name="c">The given classification.</param>
    /// <returns>The smoothed estimate; the +1 keeps it from being zero.</returns>
    public static float CaculatePxc(string x, string c)
    {
        float samplesWithKey = tdm.GetCountContainKeyOfClassification(c, x);
        float samplesInClass = tdm.GetTrainFileCountOfCertainClassification(c);
        float classCount = tdm.GetTrainingClassifications().Length;

        // Smoothed ratio, so a keyword absent from the class never yields zero.
        return (samplesWithKey + 1) / (samplesInClass + classCount + M);
    }
}
34
}
用于保存分类结果的类:
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
5
namespace AspxOn.Search.FenLei
6![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedBlockStart.gif)
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
{
7![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
/// <summary>
/// Holds one classification result: a classification name and its score.
/// </summary>
public class ClassifyResult
{
    /// <summary>Score computed for the classification; starts at 0.</summary>
    public double probability = 0;

    /// <summary>Name of the classification; starts as the empty string.</summary>
    public string classification = string.Empty;

    /// <summary>
    /// Creates a result with a zero score and an empty classification name.
    /// </summary>
    public ClassifyResult()
    {
    }
}
21
}
贝叶斯分类器代码:
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
5
namespace AspxOn.Search.FenLei
6![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedBlockStart.gif)
![](https://www.cnblogs.com/Images/OutliningIndicators/ContractedBlock.gif)
{
7![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
/// <summary>
/// Naive Bayes text classifier.
/// </summary>
public class BayesClassifier
{
    private TrainingDataManager tdm; // training set manager

    // Each per-term probability is multiplied by this factor to slow the
    // shrinking of the running product.
    private static float zoomFactor = 10.0F;

    /// <summary>
    /// Default constructor; initializes the training set manager.
    /// </summary>
    public BayesClassifier()
    {
        tdm = new TrainingDataManager();
    }

    /// <summary>
    /// Calculates the score of attribute vector X for classification Cj: the
    /// product of the scaled class-conditional probabilities of all terms,
    /// multiplied by the prior probability of Cj.
    /// </summary>
    /// <param name="X">Text attribute vector (the terms of the text).</param>
    /// <param name="Cj">The given classification.</param>
    /// <returns>The product described above.</returns>
    protected float CaluProd(string[] X, string Cj)
    {
        // NOTE(review): the product is kept in a float; for texts with many terms
        // it may underflow or overflow — consider summing logarithms instead.
        float ret = 1.0F;
        for (int i = 0; i < X.Length; i++)
        {
            string Xi = X[i];
            ret *= ClassConditionalProbability.CaculatePxc(Xi, Cj) * zoomFactor;
        }
        ret *= PriorProbability.CaculatePc(Cj); // finally multiply by the prior probability
        return ret;
    }

    /// <summary>
    /// Classifies the given text.
    /// </summary>
    /// <param name="text">The text to classify.</param>
    /// <returns>One result per known classification, in classification-list order.</returns>
    public List<ClassifyResult> Classify(string text)
    {
        // Segment the text. Empty entries are dropped because an empty term is
        // contained in every sample and would only distort the scores.
        string[] terms = ChineseSpliter.Split(text, "|")
            .Split(new char[] { '|' }, StringSplitOptions.RemoveEmptyEntries);
        string[] classes = tdm.GetTrainingClassifications();
        List<ClassifyResult> crs = new List<ClassifyResult>();

        for (int i = 0; i < classes.Length; i++)
        {
            string Ci = classes[i];
            ClassifyResult cr = new ClassifyResult();
            cr.classification = Ci;
            cr.probability = CaluProd(terms, Ci);
            crs.Add(cr);
        }
        return crs;
    }

    /// <summary>
    /// Returns the classification with the highest score.
    /// </summary>
    /// <param name="crs">Classification results to search.</param>
    /// <returns>
    /// The name of the best-scoring classification (the first one on ties),
    /// or an empty string when the list is null or empty.
    /// </returns>
    public string GetMaxNum(List<ClassifyResult> crs)
    {
        if (crs == null || crs.Count == 0)
        {
            return string.Empty;
        }

        // Seed BOTH the best score and the best name from the first entry;
        // otherwise a first entry that is the maximum is never reported.
        double best = crs[0].probability;
        string classification = crs[0].classification;

        for (int i = 1; i < crs.Count; i++)
        {
            if (crs[i].probability > best)
            {
                best = crs[i].probability;
                classification = crs[i].classification;
            }
        }
        return classification;
    }
}
82
}
代码太多,编辑的时候卡得很,于是另开一篇(二)继续。