基于朴素贝叶斯分类器的文本分类算法C#版
该程序使用了 Lucene.Net,以及基于词典的 ICTCLAS 中文分词 1.0。
ICTCLAS中文分词for Lucene.Net接口代码(实现Analyzer):
Co
1using System;
2using System.Collections.Generic;
3using System.Text;
4using System.IO;
5
6using Lucene.Net.Analysis;
7using Lucene.Net.Analysis.Standard;
8
9namespace AspxOn.Search.FenLei
10{
11
12 /**//// <summary>
13 /// ICTCLAS分词组件for Lucene.net接口
14 /// </summary>
15 public class ICTCLASAnalyzer : Analyzer
16 {
17 //定义要过滤的词
18 public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[428];
19 public string NoisePath = Environment.CurrentDirectory + "\\da
20
/// <summary>
/// Loads the stop-word list from the file at <c>NoisePath</c> into
/// <c>CHINESE_ENGLISH_STOP_WORDS</c>, one word per line.
/// </summary>
/// <remarks>
/// Reading stops at the first null or empty line, or as soon as the fixed-size
/// stop-word array is full (extra lines are ignored instead of overflowing it).
/// Slots that are not filled keep whatever value they already had.
/// </remarks>
public ICTCLASAnalyzer()
{
    // The using block releases the file handle even when reading throws.
    using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.Default))
    {
        int i = 0;
        string noise = reader.ReadLine();

        // The capacity check keeps a long stop-word file from running past
        // the end of the pre-allocated array.
        while (!string.IsNullOrEmpty(noise) && i < CHINESE_ENGLISH_STOP_WORDS.Length)
        {
            CHINESE_ENGLISH_STOP_WORDS[i] = noise;
            i++;
            noise = reader.ReadLine();
        }
    }
}
35
/// <summary>
/// Builds the analysis chain for a field: an <c>ICTCLASTokenizer</c> over the
/// reader, wrapped in a <c>StandardFilter</c>, then a <c>LowerCaseFilter</c>,
/// and finally a <c>StopFilter</c> fed with <c>CHINESE_ENGLISH_STOP_WORDS</c>.
/// </summary>
/// <param name="fieldName">Field name; not used by this implementation.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The outermost filter of the chain.</returns>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    TokenStream segmented = new ICTCLASTokenizer(reader);
    TokenStream normalized = new LowerCaseFilter(new StandardFilter(segmented));
    return new StopFilter(normalized, CHINESE_ENGLISH_STOP_WORDS);
}
47
48
49 }
50}
ICTCLAS中文分词for Lucene.Net接口代码(实现Tokenizer):
Co
1using System;
2using System.Collections.Generic;
3using System.Text;
4
5using Lucene.Net.Analysis;
6using SharpICTCLAS;
7using System.IO;
8
9namespace AspxOn.Search.FenLei
10{
11 public class ICTCLASTokenizer : Tokenizer
12 {
13 int nKind = 1;
14 List<WordResult[]> result;
15 int startIndex = 0;
16 int endIndex = 0;
17 int i = 1;
18 /**//**/
19 /**////
20 /// 待分词的句子
21 ///
22 private string sentence;
23 /**//**/
24 /**//// Constructs a tokenizer for this Reader.
// Constructs a tokenizer that segments the whole content of the reader up front.
// The reader is stored in the `input` member (not declared in this class;
// presumably inherited from Tokenizer) and read to the end in one go.
// NOTE(review): the DictPath string literal below is cut off in this listing,
// so the dictionary directory it points to cannot be determined from here.
25 public ICTCLASTokenizer(System.IO.TextReader reader)
26 {
27 this.input = reader;
28 sentence = input.ReadToEnd();
// Every "\r\n" sequence is removed before the text is handed to the segmenter.
29 sentence = sentence.Replace("\r\n", "");
30 string DictPath = Path.Combine(Environment.CurrentDirectory, "Da
31 //Console.WriteLine("Initializing the dictionary, please wait");
// A new WordSegment is created and initialized from DictPath on every construction.
32 WordSegment wordSegment = new WordSegment();
33 wordSegment.InitWordSegment(DictPath);
// The segmentation output is kept in `result`; Next() walks it afterwards.
34 result = wordSegment.Segment(sentence, nKind);
35 }
36
/// <summary>
/// Returns the next segmented word as a token, or null once the words are
/// exhausted.
/// </summary>
/// <remarks>
/// Only the first entry of the segmentation result is walked, and its first
/// and last elements are skipped (presumably begin/end markers added by the
/// segmenter; TODO confirm). Offsets are derived from the accumulated word
/// lengths; the end offset is exclusive, as Lucene expects.
/// </remarks>
/// <returns>The next token, or null at the end of the stream.</returns>
public override Token Next()
{
    // Nothing was segmented: report end of stream instead of failing on result[0].
    if (result == null || result.Count == 0)
    {
        return null;
    }

    WordResult[] words = result[0];
    if (i >= words.Length - 1)
    {
        return null;
    }

    string word = words[i].sWord;
    i++;

    // Lucene end offsets point one past the last character, so the next
    // token starts exactly where this one ends.
    endIndex = startIndex + word.Length;
    Token token = new Token(word, startIndex, endIndex);
    startIndex = endIndex;
    return token;
}
56
57 }
58}
中文分词器代码:
Co
1using System;
2using System.Collections.Generic;
3using System.Text;
4using System.IO;
5
6using Lucene.Net.Analysis;
7using Lucene.Net.Analysis.Standard;
8using Lucene.Net.Documents;
9
10using Lucene.Net.Analysis.Cn;
11using Lucene.Net.Analysis.KTDictSeg;
12
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Chinese word splitter: runs text through <c>ICTCLASAnalyzer</c> and joins
    /// the resulting terms with a caller-supplied separator.
    /// </summary>
    public class ChineseSpliter
    {
        /// <summary>
        /// Segments <paramref name="text"/> and returns its terms joined by
        /// <paramref name="splitToken"/>.
        /// </summary>
        /// <param name="text">Text to segment.</param>
        /// <param name="splitToken">Separator placed between terms; may have any length.</param>
        /// <returns>
        /// The joined terms without a leading separator, or an empty string when
        /// the text is null/empty or yields no terms.
        /// </returns>
        public static string Split(string text, string splitToken)
        {
            if (string.IsNullOrEmpty(text))
            {
                return string.Empty;
            }

            StringBuilder sb = new StringBuilder();
            Analyzer an = new ICTCLASAnalyzer();
            TokenStream ts = an.TokenStream("", new StringReader(text));

            try
            {
                Lucene.Net.Analysis.Token token;
                bool first = true;
                while ((token = ts.Next()) != null)
                {
                    // Write the separator only between terms, so nothing has to be
                    // trimmed afterwards; this works for any separator length and
                    // for an empty result.
                    if (!first)
                    {
                        sb.Append(splitToken);
                    }
                    sb.Append(token.TermText());
                    first = false;
                }
            }
            finally
            {
                ts.Close();
            }

            return sb.ToString();
        }
    }
}
训练管理器代码:
Co
1using System;
2using System.Collections.Generic;
3using System.Text;
4using System.IO;
5
6using System.Text.RegularExpressions;
7
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Training data manager. Treats every direct sub-directory of the corpus
    /// directory as one classification and the files inside it as that
    /// classification's training samples.
    /// </summary>
    public class TrainingDataManager
    {
        // Names (not full paths) of the classification sub-directories.
        private string[] trainingFileClassicfications;

        // Root directory of the training corpus.
        private string defaultDir = "D:\\SogouC.mini.20061127\\SogouC.mini\\Sample";
        // Alternative corpus location:
        //private string defaultDir = @"J:\SogouC.reduced.20061127\SogouC.reduced\Reduced";

        /// <summary>
        /// Collects the classification names from the corpus directory.
        /// </summary>
        /// <exception cref="DirectoryNotFoundException">The corpus directory does not exist.</exception>
        public TrainingDataManager()
        {
            if (!Directory.Exists(defaultDir))
            {
                throw new DirectoryNotFoundException("当前语料目录不存在!");
            }

            trainingFileClassicfications = Directory.GetDirectories(defaultDir, "*", SearchOption.TopDirectoryOnly);

            for (int i = 0; i < trainingFileClassicfications.Length; i++)
            {
                // Keep only the last path segment, i.e. the folder name.
                trainingFileClassicfications[i] = Path.GetFileName(trainingFileClassicfications[i]);
            }
        }

        /// <summary>
        /// Gets the list of classifications.
        /// </summary>
        /// <returns>The classification names collected by the constructor.</returns>
        public string[] GetTrainingClassifications()
        {
            return trainingFileClassicfications;
        }

        /// <summary>
        /// Gets the paths of the files stored under a classification.
        /// </summary>
        /// <param name="classification">Classification (sub-directory) name.</param>
        /// <returns>Full paths of the files in that sub-directory.</returns>
        public string[] GetFilesPath(string classification)
        {
            return Directory.GetFiles(GetClassificationDir(classification));
        }

        /// <summary>
        /// Reads the whole content of a file.
        /// </summary>
        /// <param name="filepath">Path of the file to read.</param>
        /// <returns>The file content decoded with <see cref="Encoding.Default"/>.</returns>
        public string GetFileText(string filepath)
        {
            // ReadAllBytes reads the complete file and always releases the handle,
            // unlike a single manual Stream.Read followed by Close.
            byte[] bt = File.ReadAllBytes(filepath);
            return Encoding.Default.GetString(bt);
        }

        /// <summary>
        /// Gets the number of files in the whole training set.
        /// </summary>
        /// <returns>Sum of the file counts of all classifications.</returns>
        public int GetTrainFileCount()
        {
            int ret = 0;
            foreach (string classification in trainingFileClassicfications)
            {
                ret += GetTrainFileCountOfCertainClassification(classification);
            }
            return ret;
        }

        /// <summary>
        /// Gets the number of files under one classification.
        /// </summary>
        /// <param name="classification">Classification (sub-directory) name.</param>
        /// <returns>Number of files in that sub-directory.</returns>
        public int GetTrainFileCountOfCertainClassification(string classification)
        {
            return Directory.GetFiles(GetClassificationDir(classification)).Length;
        }

        /// <summary>
        /// Counts the samples of a classification whose text contains the given key.
        /// </summary>
        /// <param name="classification">Classification to scan.</param>
        /// <param name="key">Keyword or character sequence to look for.</param>
        /// <returns>Number of matching samples.</returns>
        /// <exception cref="Exception">
        /// Reading or searching a sample failed; the cause is available as InnerException.
        /// </exception>
        public int GetCountContainKeyOfClassification(string classification, string key)
        {
            int ret = 0;
            string[] filepaths = GetFilesPath(classification);
            try
            {
                foreach (string filepath in filepaths)
                {
                    if (GetFileText(filepath).Contains(key))
                    {
                        ret++;
                    }
                }
            }
            catch (Exception ex)
            {
                // Same exception type and message as before, but the original
                // failure is no longer thrown away.
                throw new Exception("error!", ex);
            }
            return ret;
        }

        // Builds the directory path of a classification below the corpus root.
        private string GetClassificationDir(string classification)
        {
            return Path.Combine(defaultDir, classification);
        }
    }
}
132
先验概率计算代码:
Co
1using System;
2using System.Collections.Generic;
3using System.Text;
4
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Prior probability calculation.
    /// </summary>
    public class PriorProbability
    {
        private static TrainingDataManager tdm = new TrainingDataManager();

        /// <summary>
        /// Calculates the prior probability of a classification as the share of
        /// training files that belong to it.
        /// </summary>
        /// <param name="c">The classification.</param>
        /// <returns>
        /// Files in <paramref name="c"/> divided by all training files, or 0 when
        /// the training set contains no files.
        /// </returns>
        public static float CaculatePc(string c)
        {
            float Nc = tdm.GetTrainFileCountOfCertainClassification(c);
            float N = tdm.GetTrainFileCount();

            // An empty corpus would give 0/0 = NaN, which makes every later
            // comparison of scores false.
            if (N == 0F)
            {
                return 0F;
            }

            return Nc / N;
        }
    }
}
条件概率计算代码:
Co
1using System;
2using System.Collections.Generic;
3using System.Text;
4
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Class-conditional probability calculation.
    /// </summary>
    public class ClassConditionalProbability
    {
        private static TrainingDataManager tdm = new TrainingDataManager();

        // Extra term added to the denominator; currently zero.
        private static float M = 0F;

        /// <summary>
        /// Calculates the class-conditional probability of a keyword.
        /// </summary>
        /// <param name="x">The keyword.</param>
        /// <param name="c">The classification.</param>
        /// <returns>
        /// (samples of <paramref name="c"/> containing <paramref name="x"/> + 1) divided by
        /// (samples of <paramref name="c"/> + number of classifications + M).
        /// </returns>
        public static float CaculatePxc(string x, string c)
        {
            float samplesWithKey = tdm.GetCountContainKeyOfClassification(c, x);
            float samplesInClass = tdm.GetTrainFileCountOfCertainClassification(c);
            float classCount = tdm.GetTrainingClassifications().Length;

            // The +1 and the enlarged denominator keep the result away from the
            // extreme value 0 when the keyword never occurs in the classification.
            return (samplesWithKey + 1) / (samplesInClass + classCount + M);
        }
    }
}
用于保存分类结果的类:
Co
1using System;
2using System.Collections.Generic;
3using System.Text;
4
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Holds one classification outcome: a classification name and its score.
    /// </summary>
    public class ClassifyResult
    {
        /// <summary>Score computed for the classification; starts at 0.</summary>
        public double probability = 0;

        /// <summary>Classification name; starts as the empty string.</summary>
        public string classification = string.Empty;

        /// <summary>
        /// Creates a result with the default score and an empty classification.
        /// </summary>
        public ClassifyResult()
        {
        }
    }
}
贝叶斯分类器代码:
Co
1using System;
2using System.Collections.Generic;
3using System.Text;
4
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Naive Bayes classifier.
    /// </summary>
    public class BayesClassifier
    {
        private TrainingDataManager tdm; // training set manager

        // Each conditional probability is multiplied by this factor because the
        // raw product becomes very small.
        private static float zoomFactor = 10.0F;

        /// <summary>
        /// Default constructor; initializes the training set.
        /// </summary>
        public BayesClassifier()
        {
            tdm = new TrainingDataManager();
        }

        /// <summary>
        /// Calculates the score of the term vector X within classification Cj.
        /// </summary>
        /// <param name="X">Term vector of the text.</param>
        /// <param name="Cj">The classification.</param>
        /// <returns>
        /// Product of the scaled class-conditional probabilities of all terms,
        /// multiplied by the prior probability of <paramref name="Cj"/>.
        /// </returns>
        /// <remarks>
        /// The product is accumulated in a float, so a very long term vector can
        /// still underflow to 0 or overflow to infinity.
        /// </remarks>
        protected float CaluProd(string[] X, string Cj)
        {
            float ret = 1.0F;
            for (int i = 0; i < X.Length; i++)
            {
                string Xi = X[i];
                ret *= ClassConditionalProbability.CaculatePxc(Xi, Cj) * zoomFactor;
            }
            ret *= PriorProbability.CaculatePc(Cj); // finally apply the prior probability
            return ret;
        }

        /// <summary>
        /// Classifies the given text.
        /// </summary>
        /// <param name="text">Text to classify.</param>
        /// <returns>One result per known classification, in classification order.</returns>
        public List<ClassifyResult> Classify(string text)
        {
            // Segment the text (the terms may still contain stop words). Empty
            // terms are dropped: an empty key is contained in every sample and
            // would only add the same factor to every classification.
            string[] terms = ChineseSpliter.Split(text, "|")
                .Split(new char[] { '|' }, StringSplitOptions.RemoveEmptyEntries);
            string[] classes = tdm.GetTrainingClassifications();

            List<ClassifyResult> crs = new List<ClassifyResult>(classes.Length);
            for (int i = 0; i < classes.Length; i++)
            {
                ClassifyResult cr = new ClassifyResult();
                cr.classification = classes[i];
                cr.probability = CaluProd(terms, classes[i]);
                crs.Add(cr);
            }
            return crs;
        }

        /// <summary>
        /// Picks the classification with the highest score.
        /// </summary>
        /// <param name="crs">Results returned by <see cref="Classify"/>.</param>
        /// <returns>
        /// The classification of the highest-scoring entry (the earliest one on a
        /// tie), or an empty string when the list is null or empty.
        /// </returns>
        public string GetMaxNum(List<ClassifyResult> crs)
        {
            if (crs == null || crs.Count == 0)
            {
                return string.Empty;
            }

            // Start from the first entry so that it wins when nothing beats it;
            // previously that case returned an empty classification.
            ClassifyResult best = crs[0];
            for (int i = 1; i < crs.Count; i++)
            {
                if (crs[i].probability > best.probability)
                {
                    best = crs[i];
                }
            }
            return best.classification;
        }
    }
}