最近自己写了个文章分析程序,用的是 C# 2.0,数据库是 MySQL 5.0,自己写了两个 MySQL 的 Helper 类。其中涉及到线程和委托的东西,我找高手指点了一下,结果弄出来了:分析 1 万篇文章大概需要 1 个小时左右。想要这个程序或者想交流的可以联系我。
贴一下自己的代码:
代码
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Data.OleDb;
using WoWExpress.Core;
using MySql.Data.MySqlClient;
using System.Text.RegularExpressions;
using System.Threading;
using Rainsoft.WordSeg;
namespace CSVProject
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form. The only work done here is the call to
/// InitializeComponent, which is defined in another part of this partial class.
/// </summary>
public Form1()
{
InitializeComponent();
}
/// <summary>
/// Runs <c>select * from stopwords</c> and returns the result as a data set.
/// </summary>
/// <returns>The data set produced by the helper for that query.</returns>
public DataSet GetStopwords()
{
    string connection = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
    return WoWExpress.Core.MySqlHelper.ExecuteDataset(connection, CommandType.Text, "select * from stopwords");
}
/// <summary>
/// Runs <c>select * from ccl_addonarticle</c> and returns the result as a data set.
/// </summary>
/// <returns>The data set produced by the helper for that query.</returns>
public DataSet GetArticles()
{
    string connection = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
    return WoWExpress.Core.MySqlHelper.ExecuteDataset(connection, CommandType.Text, "select * from ccl_addonarticle");
}
/*一篇文章一个对象,对象包括文章主题,文章id等,现在就只要两个参数
文章对象放入ArrayList,这样可以循环操作文章
1.对文章使用停用词表,把文章隔开,如何隔开?利用停用词表集合循环的把文章中的停用词给用标识替换(如[%stopword%])
2.直接使用split(artirleBody,[%stopword%])来分隔文章,留下的词就全部分入数组,数组循环判断,从第一个开始,相同就数量加1
* 插入新的对象关键词对象,对象包括关键词id,关键词,关键词在本篇文章数量,关键词在本篇文章的百分比(这个需要在本篇文章循环
* 完才可以计算的出),关键词在本数据库中的数量,关键词在本数据库中的百分比(这个需要在所有文章循环
* 完才可以计算的出)
* 全部循环完之后,需要的数据就是关键词对象,这个也相应的显示出来,并且存到数据库静态化,但是当数据不断增加的时候,每次
* 就需要重新计算一次,得出当前最真实的结果,这样也会导致速度越来越慢,不过这是将来需要处理的。
*/
/// <summary>
/// Reads up to <paramref name="pageLength"/> rows of <c>ccl_addonarticle</c>
/// whose <c>isDo</c> column is 0 and wraps each one in an <see cref="ArticleInfo"/>.
/// </summary>
/// <param name="pageLength">Row limit as decimal text; it becomes the SQL LIMIT value.</param>
/// <returns>One entry per row, built from the aid and body columns.</returns>
/// <exception cref="ArgumentException"><paramref name="pageLength"/> is not a non-negative integer.</exception>
public List<ArticleInfo> GetMyArticles(string pageLength)
{
    // The limit is spliced into the SQL text, so accept nothing but a plain
    // non-negative integer.
    int limit;
    if (!int.TryParse(pageLength, out limit) || limit < 0)
    {
        throw new ArgumentException("pageLength must be a non-negative integer.", "pageLength");
    }

    List<ArticleInfo> articlesInfo = new List<ArticleInfo>();
    string myConnectionStr = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
    string mysqlStr = " select aid,body,isDo from ccl_addonarticle where isDo = 0 limit " + limit.ToString();
    using (MySqlDataReader rdr = WoWExpress.Core.MySqlHelper.ExecuteReader(myConnectionStr, CommandType.Text, mysqlStr))
    {
        while (rdr.Read())
        {
            // Read the body column once and reuse it.
            string body = rdr.GetString(1);
            // NOTE(review): the segmented text returned here is discarded, so the
            // article keeps its raw body. Confirm whether the result was meant to
            // be stored before changing this.
            this.segment(body);
            articlesInfo.Add(new ArticleInfo(Convert.ToInt32(rdr.GetString(0)), body));
        }
    }
    return articlesInfo;
}
/// <summary>
/// Counts the rows of <c>ccl_addonarticle</c> whose <c>isDo</c> column is 0.
/// </summary>
/// <returns>The count from the first result row, or 0 when the query yields no row.</returns>
public int GetArticlesCount()
{
    // The article list the original allocated here was never used; it is gone.
    string myConnectionStr = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
    string mysqlStr = " select count(*) from ccl_addonarticle where isDo = 0";
    int result = 0;
    using (MySqlDataReader rdr = WoWExpress.Core.MySqlHelper.ExecuteReader(myConnectionStr, CommandType.Text, mysqlStr))
    {
        if (rdr.Read())
        {
            result = rdr.GetInt32(0);
        }
    }
    return result;
}
/// <summary>
/// Loads the <c>stopwords</c> table as a list of <see cref="StopwordsInfo"/>.
/// </summary>
/// <returns>
/// One entry per row: column 0 converted to an integer, column 1 with
/// surrounding whitespace trimmed.
/// </returns>
public List<StopwordsInfo> GetMyStopwords()
{
    string connection = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
    const string query = "select * from stopwords";
    List<StopwordsInfo> loaded = new List<StopwordsInfo>();
    using (MySqlDataReader reader = WoWExpress.Core.MySqlHelper.ExecuteReader(connection, CommandType.Text, query))
    {
        while (reader.Read())
        {
            int id = Convert.ToInt32(reader.GetString(0));
            string word = reader.GetString(1).Trim();
            loaded.Add(new StopwordsInfo(id, word));
        }
    }
    return loaded;
}
/// <summary>
/// Cleans each article body and replaces every stopword occurrence with a space.
/// </summary>
/// <remarks>
/// Per article: stripHtml, then StripHTML3, then one literal replace per stopword,
/// then stripHtml again. The cleaned text is written back to the same
/// <see cref="ArticleInfo"/> instance, so the input objects are modified.
/// </remarks>
/// <param name="articlesInfo">Articles to clean.</param>
/// <param name="stopwords">Stopwords to blank out.</param>
/// <returns>A new list holding the same (now modified) article objects, in input order.</returns>
public List<ArticleInfo> UseStopwords(List<ArticleInfo> articlesInfo, List<StopwordsInfo> stopwords)
{
    List<ArticleInfo> targetArticles = new List<ArticleInfo>();
    foreach (ArticleInfo articleInfo in articlesInfo)
    {
        string curArticleBody = articleInfo.ArticleBody.ToString();
        // Remove HTML markup before the stopwords are applied.
        curArticleBody = this.stripHtml(curArticleBody);
        curArticleBody = this.StripHTML3(curArticleBody);
        foreach (StopwordsInfo stopwordsInfo in stopwords)
        {
            string curStopwords = stopwordsInfo.Stopwords.ToString();
            // string.Replace throws ArgumentException for an empty search string.
            // GetMyStopwords trims each word, so a whitespace-only row arrives
            // here as "" and has to be skipped.
            if (curStopwords.Length == 0)
            {
                continue;
            }
            curArticleBody = curArticleBody.Replace(curStopwords, " ");
        }
        // Second pass: collapses the runs of spaces the replacements left behind.
        curArticleBody = this.stripHtml(curArticleBody);
        articleInfo.ArticleBody = curArticleBody;
        targetArticles.Add(articleInfo);
    }
    return targetArticles;
}
/// <summary>
/// Splits each article body on spaces, counts how often every keyword occurs
/// within that article, and passes the result to UpdateArticleAndInsertKeywords.
/// </summary>
/// <remarks>
/// The original loop ran to <c>Length - 1</c> and so dropped the last keyword of
/// every article; all tokens are counted now. Lookup uses a per-article
/// dictionary instead of rescanning every keyword collected so far.
/// The per-article percentage is still not computed (always 0).
/// </remarks>
/// <param name="articlesInfo">Articles whose bodies are already space-separated.</param>
/// <returns>One entry per (article, keyword) pair, in order of first occurrence.</returns>
public List<SingleKeywords> SplitArticle(List<ArticleInfo> articlesInfo)
{
    List<SingleKeywords> singleKeywordsArray = new List<SingleKeywords>();
    // Progress-bar calls go through the On* helpers, which marshal to the
    // control's thread when required.
    OnRrogressBar2Set(articlesInfo.Count);
    foreach (ArticleInfo articleInfo in articlesInfo)
    {
        string curArticleBody = articleInfo.ArticleBody.ToString().Trim();
        int curArticleId = articleInfo.ArticleId;
        string[] keywordsArray = curArticleBody.Split(' ');

        // Keyword text -> entry already added for this article.
        Dictionary<string, SingleKeywords> seenInArticle = new Dictionary<string, SingleKeywords>();
        for (int i = 0; i < keywordsArray.Length; i++)
        {
            string keyword = keywordsArray[i].Trim();
            SingleKeywords existing;
            if (seenInArticle.TryGetValue(keyword, out existing))
            {
                existing.SingleCount += 1;
            }
            else
            {
                SingleKeywords added = new SingleKeywords(curArticleId, keyword, 1, 0);
                seenInArticle.Add(keyword, added);
                singleKeywordsArray.Add(added);
            }
        }
        // TODO: compute the per-article percentage here.
        OnRrogressBarAdd2(progressBar2.Step);
    }
    // Persist the keywords and mark the articles as processed.
    this.UpdateArticleAndInsertKeywords(singleKeywordsArray);
    return singleKeywordsArray;
}
/// <summary>
/// Inserts every keyword into <c>articlekeywords</c> and sets <c>isDo = 1</c>
/// on the article each keyword belongs to.
/// </summary>
/// <remarks>
/// The UPDATE is issued once per distinct article id; the original issued it
/// again for every keyword. Keyword text is passed through
/// Traditional2Simplified before it is stored. The statements are not wrapped
/// in a transaction (TODO carried over from the original author).
/// </remarks>
/// <param name="singleKeywordsArray">Keyword rows to store.</param>
/// <returns>Always the string "ok".</returns>
public string UpdateArticleAndInsertKeywords(List<SingleKeywords> singleKeywordsArray)
{
    // The original author notes that inserts only worked once the connection
    // string carried an explicit character set.
    string strSetCharset = "utf8";
    string myConnectionStr = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152", strSetCharset);
    string mysqlStr2 = "Insert articlekeywords(articleId,keywords,singleCount,singlePercent) values(?articleId,?KeywordsStr,?singleCount,?singlePercent)";

    // Article ids whose isDo flag has already been set during this call.
    Dictionary<int, bool> markedArticles = new Dictionary<int, bool>();
    foreach (SingleKeywords singleKeywords in singleKeywordsArray)
    {
        int articleId = singleKeywords.ArticleId;
        if (!markedArticles.ContainsKey(articleId))
        {
            string mysqlStr = " Update ccl_addonarticle set isDo =1 where aid = " + articleId.ToString();
            WoWExpress.Core.MySqlHelper.ExecuteNonQuery(myConnectionStr, CommandType.Text, mysqlStr);
            markedArticles.Add(articleId, true);
        }

        MySqlParameter[] keywordsParms = new MySqlParameter[] {
            new MySqlParameter("?articleId", MySqlDbType.Int32, 4),
            new MySqlParameter("?KeywordsStr", MySqlDbType.VarChar),
            new MySqlParameter("?singleCount", MySqlDbType.Int32, 4),
            new MySqlParameter("?singlePercent", MySqlDbType.Double,4)};
        keywordsParms[0].Value = articleId;
        keywordsParms[1].Value = Traditional2Simplified(singleKeywords.KeywordsStr);
        keywordsParms[2].Value = singleKeywords.SingleCount;
        keywordsParms[3].Value = singleKeywords.SinglePercent;
        WoWExpress.Core.MySqlHelper.ExecuteNonQuery(myConnectionStr, CommandType.Text, mysqlStr2, keywordsParms);
    }
    return "ok";
}
/// <summary>
/// Re-decodes a string: each character is narrowed to a single byte and the
/// resulting bytes are decoded with <see cref="System.Text.Encoding.Default"/>.
/// </summary>
/// <param name="dbStr">Text whose characters each carry one raw byte value.</param>
/// <returns>The decoded string.</returns>
private string DBStringToNormal(string dbStr)
{
    int length = dbStr.Length;
    byte[] raw = new byte[length];
    int position = 0;
    foreach (char ch in dbStr)
    {
        // Same narrowing cast as before: only the low byte survives.
        raw[position] = (byte)ch;
        position++;
    }
    return System.Text.Encoding.Default.GetString(raw, 0, length);
}
/// <summary>
/// Converts Traditional Chinese text to Simplified Chinese through
/// <c>Microsoft.VisualBasic.Strings.StrConv</c> (locale id 0).
/// </summary>
/// <param name="str">Text to convert.</param>
/// <returns>The converted text.</returns>
public string Traditional2Simplified(string str)
{
    string simplified = Microsoft.VisualBasic.Strings.StrConv(
        str,
        Microsoft.VisualBasic.VbStrConv.SimplifiedChinese,
        0);
    return simplified;
}
/// <summary>
/// 提取HTML代码中文字的C#函数
/// </summary>
/// <remarks>
/// Applies a fixed list of case-insensitive regex replacements (script blocks,
/// tags, a few character entities, comment markers) and then removes any
/// remaining angle brackets and CR/LF pairs.
/// Fixes over the original: the tag pattern had been mangled by URL rewriting
/// and is restored, and the three final Replace calls discarded their results.
/// NOTE(review): the script pattern does not use Singleline, so a script block
/// spanning several lines is not matched by the first pattern.
/// </remarks>
/// <param name="strHtml">HTML text.</param>
/// <returns>The text with markup removed.</returns>
public string StripHTML2(string strHtml)
{
    // aryReg[i] is replaced by aryRep[i]; the two arrays must stay aligned.
    string[] aryReg ={
        @"<script[^>]*?>.*?</script>",
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        @"([\r\n])[\s]+",
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        @"&#(\d+);",
        @"-->",
        @"<!--.*\n"
    };
    string[] aryRep = {
        "",
        "",
        "",
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\xa1",//chr(161),
        "\xa2",//chr(162),
        "\xa3",//chr(163),
        "\xa9",//chr(169),
        "",
        "\r\n",
        ""
    };
    string strOutput = strHtml;
    for (int i = 0; i < aryReg.Length; i++)
    {
        Regex regex = new Regex(aryReg[i], RegexOptions.IgnoreCase);
        strOutput = regex.Replace(strOutput, aryRep[i]);
    }
    // Strings are immutable: the result of Replace has to be assigned back.
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");
    return strOutput;
}
/// <summary>
/// 提取HTML代码中文字的C#函数
/// </summary>
/// <remarks>
/// Removes every character that is not an ASCII letter, an ASCII digit or in the
/// range U+4E00..U+9FBB; whitespace and punctuation are removed as well.
/// The original passed the pattern to <c>string.Replace</c>, which searches for
/// the literal text, so nothing was ever removed.
/// </remarks>
/// <param name="strHtml">Text to filter.</param>
/// <returns>The text reduced to the allowed characters.</returns>
public string StripHTML3(string strHtml)
{
    return Regex.Replace(strHtml, @"[^A-Za-z0-9\u4E00-\u9FBB]", "");
}
/// <summary>
/// Merges per-article keyword rows into one total per distinct keyword.
/// </summary>
/// <remarks>
/// Fixes over the original: the loop stopped one entry early, an empty input
/// list threw on index 0, and a repeated keyword added 1 rather than that row's
/// SingleCount (the first occurrence was already seeded with SingleCount).
/// Keywords are compared after trimming. The global percentage is still not
/// computed (always 0).
/// </remarks>
/// <param name="singleKeywords">Per-article keyword rows.</param>
/// <returns>One entry per distinct keyword, in order of first occurrence; empty for empty input.</returns>
public List<AllKeywords> ComputeKeywords(List<SingleKeywords> singleKeywords)
{
    List<AllKeywords> allKeywordsArray = new List<AllKeywords>();
    // Trimmed keyword text -> running total already in the result list.
    Dictionary<string, AllKeywords> totalsByKeyword = new Dictionary<string, AllKeywords>();
    foreach (SingleKeywords single in singleKeywords)
    {
        string key = single.KeywordsStr.Trim();
        AllKeywords total;
        if (totalsByKeyword.TryGetValue(key, out total))
        {
            total.AllCount += single.SingleCount;
        }
        else
        {
            total = new AllKeywords(single.KeywordsStr, single.SingleCount, 0);
            totalsByKeyword.Add(key, total);
            allKeywordsArray.Add(total);
        }
    }
    // TODO: compute the global percentage here.
    return allKeywordsArray;
}
/// <summary>
/// Removes HTML tags, decodes <c>&amp;lt;</c> and <c>&amp;gt;</c>, collapses
/// every whitespace run to a single space and trims the result.
/// </summary>
/// <remarks>
/// Fixes over the original: the two entity replacements had degraded into
/// no-ops (search text equal to replacement text), and the result of
/// <c>Trim()</c> was discarded.
/// </remarks>
/// <param name="strHtml">Text to convert.</param>
/// <returns>The converted text.</returns>
private string stripHtml(string strHtml)
{
    Regex objRegExp = new Regex("<(.|\n)+?>");
    string strOutput = objRegExp.Replace(strHtml, "");
    strOutput = strOutput.Replace("&lt;", "<");
    strOutput = strOutput.Replace("&gt;", ">");
    // Collapse every run of whitespace into one space.
    Regex r = new Regex(@"\s+");
    strOutput = r.Replace(strOutput, " ");
    return strOutput.Trim();
}
/// <summary>
/// Loads 10 unprocessed articles, applies the stopword list and shows the
/// cleaned articles in dataGridView3.
/// </summary>
private void btnUseStopword_Click(object sender, EventArgs e)
{
    // Articles are fetched before the stopwords, as in the original call order.
    List<ArticleInfo> articles = this.GetMyArticles("10");
    List<StopwordsInfo> stopwords = this.GetMyStopwords();
    dataGridView3.DataSource = this.UseStopwords(articles, stopwords);
}
/// <summary>
/// Shows the first table returned by GetArticles in dataGridView2.
/// </summary>
private void btnGetArticle_Click(object sender, EventArgs e)
{
    dataGridView2.DataSource = this.GetArticles().Tables[0];
}
/// <summary>
/// Shows the first table returned by GetStopwords in dataGridView1.
/// </summary>
private void btnStopwords_Click(object sender, EventArgs e)
{
    dataGridView1.DataSource = this.GetStopwords().Tables[0];
}
/// <summary>
/// Processes all unprocessed articles on a worker thread, in batches of 10:
/// each batch is cleaned, split into keywords, stored, and shown in the grid.
/// </summary>
/// <remarks>
/// The batch count and the size of the final partial batch are derived from
/// GetArticlesCount. Changes over the original: the three copies of the
/// pipeline are merged, the stopword list is loaded once per click instead of
/// once per batch, and the progress step is captured on the UI thread so the
/// worker does not read a control property.
/// </remarks>
private void btnGetKeywords_Click(object sender, EventArgs e)
{
    int articlesCount = this.GetArticlesCount();
    int pageLength = 10;
    int doCount = articlesCount / pageLength;    // number of full batches
    int lastLength = articlesCount % pageLength; // size of the trailing partial batch
    progressBar1.Maximum = doCount;
    progressBar1.Value = 0;
    progressBar1.Step = 1;
    int step = progressBar1.Step;

    System.Threading.Thread thread = new System.Threading.Thread(delegate(object arg)
    {
        List<StopwordsInfo> stopwords = this.GetMyStopwords();
        for (int i = 0; i < doCount; i++)
        {
            List<SingleKeywords> batchKeywords = this.SplitArticle(this.UseStopwords(this.GetMyArticles(pageLength.ToString()), stopwords));
            OnGridViewDataBind(batchKeywords);
            OnRrogressBarAdd(step);
        }
        // The trailing batch also runs when there were no full batches at all,
        // as the original did when the total was below one page.
        if (lastLength != 0 || doCount == 0)
        {
            List<SingleKeywords> lastKeywords = this.SplitArticle(this.UseStopwords(this.GetMyArticles(lastLength.ToString()), stopwords));
            OnGridViewDataBind(lastKeywords);
        }
    });
    thread.Start();
}
/// <summary>
/// Runs the whole pipeline for 10 unprocessed articles on the calling thread
/// and shows the aggregated keyword totals in dataGridView5.
/// </summary>
private void btnAllCompute_Click(object sender, EventArgs e)
{
    // Articles are fetched before the stopwords, as in the original call order.
    List<ArticleInfo> articles = this.GetMyArticles("10");
    List<StopwordsInfo> stopwords = this.GetMyStopwords();
    List<SingleKeywords> perArticle = this.SplitArticle(this.UseStopwords(articles, stopwords));
    dataGridView5.DataSource = this.ComputeKeywords(perArticle);
}
// Cross-thread helpers: each one marshals to the control via Invoke when required.

/// <summary>Signature of the grid-binding callback.</summary>
protected delegate void GridViewDataBind(object source);

/// <summary>
/// Sets dataGridView4.DataSource, going through Invoke when InvokeRequired is true.
/// Does nothing when dataGridView4 is null.
/// </summary>
/// <param name="source">The object to bind.</param>
protected void OnGridViewDataBind(object source)
{
    if (dataGridView4 == null)
    {
        return;
    }
    if (!dataGridView4.InvokeRequired)
    {
        dataGridView4.DataSource = source;
        return;
    }
    GridViewDataBind bind = delegate(object dataSource)
    {
        dataGridView4.DataSource = dataSource;
    };
    dataGridView4.Invoke(bind, source);
}
/// <summary>Signature of the overall-progress callback.</summary>
protected delegate void RrogressBarAdd(int step);

/// <summary>
/// Adds <paramref name="step"/> to progressBar1.Value, going through Invoke
/// when InvokeRequired is true. Does nothing when progressBar1 is null.
/// </summary>
/// <param name="step">Amount to add to the current value.</param>
protected void OnRrogressBarAdd(int step)
{
    if (progressBar1 == null)
    {
        return;
    }
    if (!progressBar1.InvokeRequired)
    {
        progressBar1.Value += step;
        return;
    }
    RrogressBarAdd advance = delegate(int mystep)
    {
        progressBar1.Value += mystep;
    };
    progressBar1.Invoke(advance, step);
}
/// <summary>Signature of the per-batch progress callback.</summary>
protected delegate void RrogressBarAdd2(int step);

/// <summary>
/// Adds <paramref name="step"/> to progressBar2.Value, going through Invoke
/// when InvokeRequired is true. Does nothing when progressBar2 is null.
/// </summary>
/// <param name="step">Amount to add to the current value.</param>
protected void OnRrogressBarAdd2(int step)
{
    if (progressBar2 == null)
    {
        return;
    }
    if (!progressBar2.InvokeRequired)
    {
        progressBar2.Value += step;
        return;
    }
    RrogressBarAdd2 advance = delegate(int mystep)
    {
        progressBar2.Value += mystep;
    };
    progressBar2.Invoke(advance, step);
}
/// <summary>Signature of the per-batch progress reset callback.</summary>
protected delegate void RrogressBar2Set(int maximum);

/// <summary>
/// Resets progressBar2: Maximum = <paramref name="maximum"/>, Value = 0, Step = 1.
/// Goes through Invoke when InvokeRequired is true; does nothing when
/// progressBar2 is null.
/// </summary>
/// <remarks>
/// The original same-thread branch executed <c>Value += maximum</c> instead of
/// the reset, so a call from the UI thread (btnAllCompute_Click via
/// SplitArticle) advanced the bar rather than resetting it. Both branches now
/// run the same reset.
/// </remarks>
/// <param name="maximum">New upper bound of the bar.</param>
protected void OnRrogressBar2Set(int maximum)
{
    if (progressBar2 == null)
        return;
    RrogressBar2Set reset = delegate(int myMaximum)
    {
        progressBar2.Maximum = myMaximum;
        progressBar2.Value = 0;
        progressBar2.Step = 1;
    };
    if (progressBar2.InvokeRequired)
        progressBar2.Invoke(reset, maximum);
    else
        reset(maximum);
}
/// <summary>
/// Passes the text to a new <c>WordSegV1</c> instance and returns the result
/// of its <c>Segment</c> call with a space as the second argument.
/// </summary>
/// <param name="articleStr">Text to segment.</param>
/// <returns>The string returned by <c>WordSegV1.Segment</c>.</returns>
public string segment(string articleStr)
{
    return new WordSegV1().Segment(articleStr, ' ');
}
}
}
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Data.OleDb;
using WoWExpress.Core;
using MySql.Data.MySqlClient;
using System.Text.RegularExpressions;
using System.Threading;
using Rainsoft.WordSeg;
namespace CSVProject
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form. The only work done here is the call to
/// InitializeComponent, which is defined in another part of this partial class.
/// </summary>
public Form1()
{
InitializeComponent();
}
/// <summary>
/// Runs <c>select * from stopwords</c> and returns the result as a data set.
/// </summary>
/// <returns>The data set produced by the helper for that query.</returns>
public DataSet GetStopwords()
{
    string connection = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
    return WoWExpress.Core.MySqlHelper.ExecuteDataset(connection, CommandType.Text, "select * from stopwords");
}
/// <summary>
/// Runs <c>select * from ccl_addonarticle</c> and returns the result as a data set.
/// </summary>
/// <returns>The data set produced by the helper for that query.</returns>
public DataSet GetArticles()
{
    string connection = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
    return WoWExpress.Core.MySqlHelper.ExecuteDataset(connection, CommandType.Text, "select * from ccl_addonarticle");
}
/*一篇文章一个对象,对象包括文章主题,文章id等,现在就只要两个参数
文章对象放入ArrayList,这样可以循环操作文章
1.对文章使用停用词表,把文章隔开,如何隔开?利用停用词表集合循环的把文章中的停用词给用标识替换(如[%stopword%])
2.直接使用split(artirleBody,[%stopword%])来分隔文章,留下的词就全部分入数组,数组循环判断,从第一个开始,相同就数量加1
* 插入新的对象关键词对象,对象包括关键词id,关键词,关键词在本篇文章数量,关键词在本篇文章的百分比(这个需要在本篇文章循环
* 完才可以计算的出),关键词在本数据库中的数量,关键词在本数据库中的百分比(这个需要在所有文章循环
* 完才可以计算的出)
* 全部循环完之后,需要的数据就是关键词对象,这个也相应的显示出来,并且存到数据库静态化,但是当数据不断增加的时候,每次
* 就需要重新计算一次,得出当前最真实的结果,这样也会导致速度越来越慢,不过这是将来需要处理的。
*/
/// <summary>
/// Reads up to <paramref name="pageLength"/> rows of <c>ccl_addonarticle</c>
/// whose <c>isDo</c> column is 0 and wraps each one in an <see cref="ArticleInfo"/>.
/// </summary>
/// <param name="pageLength">Row limit as decimal text; it becomes the SQL LIMIT value.</param>
/// <returns>One entry per row, built from the aid and body columns.</returns>
/// <exception cref="ArgumentException"><paramref name="pageLength"/> is not a non-negative integer.</exception>
public List<ArticleInfo> GetMyArticles(string pageLength)
{
    // The limit is spliced into the SQL text, so accept nothing but a plain
    // non-negative integer.
    int limit;
    if (!int.TryParse(pageLength, out limit) || limit < 0)
    {
        throw new ArgumentException("pageLength must be a non-negative integer.", "pageLength");
    }

    List<ArticleInfo> articlesInfo = new List<ArticleInfo>();
    string myConnectionStr = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
    string mysqlStr = " select aid,body,isDo from ccl_addonarticle where isDo = 0 limit " + limit.ToString();
    using (MySqlDataReader rdr = WoWExpress.Core.MySqlHelper.ExecuteReader(myConnectionStr, CommandType.Text, mysqlStr))
    {
        while (rdr.Read())
        {
            // Read the body column once and reuse it.
            string body = rdr.GetString(1);
            // NOTE(review): the segmented text returned here is discarded, so the
            // article keeps its raw body. Confirm whether the result was meant to
            // be stored before changing this.
            this.segment(body);
            articlesInfo.Add(new ArticleInfo(Convert.ToInt32(rdr.GetString(0)), body));
        }
    }
    return articlesInfo;
}
/// <summary>
/// Counts the rows of <c>ccl_addonarticle</c> whose <c>isDo</c> column is 0.
/// </summary>
/// <returns>The count from the first result row, or 0 when the query yields no row.</returns>
public int GetArticlesCount()
{
    // The article list the original allocated here was never used; it is gone.
    string myConnectionStr = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
    string mysqlStr = " select count(*) from ccl_addonarticle where isDo = 0";
    int result = 0;
    using (MySqlDataReader rdr = WoWExpress.Core.MySqlHelper.ExecuteReader(myConnectionStr, CommandType.Text, mysqlStr))
    {
        if (rdr.Read())
        {
            result = rdr.GetInt32(0);
        }
    }
    return result;
}
/// <summary>
/// Loads the <c>stopwords</c> table as a list of <see cref="StopwordsInfo"/>.
/// </summary>
/// <returns>
/// One entry per row: column 0 converted to an integer, column 1 with
/// surrounding whitespace trimmed.
/// </returns>
public List<StopwordsInfo> GetMyStopwords()
{
    string connection = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
    const string query = "select * from stopwords";
    List<StopwordsInfo> loaded = new List<StopwordsInfo>();
    using (MySqlDataReader reader = WoWExpress.Core.MySqlHelper.ExecuteReader(connection, CommandType.Text, query))
    {
        while (reader.Read())
        {
            int id = Convert.ToInt32(reader.GetString(0));
            string word = reader.GetString(1).Trim();
            loaded.Add(new StopwordsInfo(id, word));
        }
    }
    return loaded;
}
/// <summary>
/// Cleans each article body and replaces every stopword occurrence with a space.
/// </summary>
/// <remarks>
/// Per article: stripHtml, then StripHTML3, then one literal replace per stopword,
/// then stripHtml again. The cleaned text is written back to the same
/// <see cref="ArticleInfo"/> instance, so the input objects are modified.
/// </remarks>
/// <param name="articlesInfo">Articles to clean.</param>
/// <param name="stopwords">Stopwords to blank out.</param>
/// <returns>A new list holding the same (now modified) article objects, in input order.</returns>
public List<ArticleInfo> UseStopwords(List<ArticleInfo> articlesInfo, List<StopwordsInfo> stopwords)
{
    List<ArticleInfo> targetArticles = new List<ArticleInfo>();
    foreach (ArticleInfo articleInfo in articlesInfo)
    {
        string curArticleBody = articleInfo.ArticleBody.ToString();
        // Remove HTML markup before the stopwords are applied.
        curArticleBody = this.stripHtml(curArticleBody);
        curArticleBody = this.StripHTML3(curArticleBody);
        foreach (StopwordsInfo stopwordsInfo in stopwords)
        {
            string curStopwords = stopwordsInfo.Stopwords.ToString();
            // string.Replace throws ArgumentException for an empty search string.
            // GetMyStopwords trims each word, so a whitespace-only row arrives
            // here as "" and has to be skipped.
            if (curStopwords.Length == 0)
            {
                continue;
            }
            curArticleBody = curArticleBody.Replace(curStopwords, " ");
        }
        // Second pass: collapses the runs of spaces the replacements left behind.
        curArticleBody = this.stripHtml(curArticleBody);
        articleInfo.ArticleBody = curArticleBody;
        targetArticles.Add(articleInfo);
    }
    return targetArticles;
}
/// <summary>
/// Splits each article body on spaces, counts how often every keyword occurs
/// within that article, and passes the result to UpdateArticleAndInsertKeywords.
/// </summary>
/// <remarks>
/// The original loop ran to <c>Length - 1</c> and so dropped the last keyword of
/// every article; all tokens are counted now. Lookup uses a per-article
/// dictionary instead of rescanning every keyword collected so far.
/// The per-article percentage is still not computed (always 0).
/// </remarks>
/// <param name="articlesInfo">Articles whose bodies are already space-separated.</param>
/// <returns>One entry per (article, keyword) pair, in order of first occurrence.</returns>
public List<SingleKeywords> SplitArticle(List<ArticleInfo> articlesInfo)
{
    List<SingleKeywords> singleKeywordsArray = new List<SingleKeywords>();
    // Progress-bar calls go through the On* helpers, which marshal to the
    // control's thread when required.
    OnRrogressBar2Set(articlesInfo.Count);
    foreach (ArticleInfo articleInfo in articlesInfo)
    {
        string curArticleBody = articleInfo.ArticleBody.ToString().Trim();
        int curArticleId = articleInfo.ArticleId;
        string[] keywordsArray = curArticleBody.Split(' ');

        // Keyword text -> entry already added for this article.
        Dictionary<string, SingleKeywords> seenInArticle = new Dictionary<string, SingleKeywords>();
        for (int i = 0; i < keywordsArray.Length; i++)
        {
            string keyword = keywordsArray[i].Trim();
            SingleKeywords existing;
            if (seenInArticle.TryGetValue(keyword, out existing))
            {
                existing.SingleCount += 1;
            }
            else
            {
                SingleKeywords added = new SingleKeywords(curArticleId, keyword, 1, 0);
                seenInArticle.Add(keyword, added);
                singleKeywordsArray.Add(added);
            }
        }
        // TODO: compute the per-article percentage here.
        OnRrogressBarAdd2(progressBar2.Step);
    }
    // Persist the keywords and mark the articles as processed.
    this.UpdateArticleAndInsertKeywords(singleKeywordsArray);
    return singleKeywordsArray;
}
/// <summary>
/// Inserts every keyword into <c>articlekeywords</c> and sets <c>isDo = 1</c>
/// on the article each keyword belongs to.
/// </summary>
/// <remarks>
/// The UPDATE is issued once per distinct article id; the original issued it
/// again for every keyword. Keyword text is passed through
/// Traditional2Simplified before it is stored. The statements are not wrapped
/// in a transaction (TODO carried over from the original author).
/// </remarks>
/// <param name="singleKeywordsArray">Keyword rows to store.</param>
/// <returns>Always the string "ok".</returns>
public string UpdateArticleAndInsertKeywords(List<SingleKeywords> singleKeywordsArray)
{
    // The original author notes that inserts only worked once the connection
    // string carried an explicit character set.
    string strSetCharset = "utf8";
    string myConnectionStr = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152", strSetCharset);
    string mysqlStr2 = "Insert articlekeywords(articleId,keywords,singleCount,singlePercent) values(?articleId,?KeywordsStr,?singleCount,?singlePercent)";

    // Article ids whose isDo flag has already been set during this call.
    Dictionary<int, bool> markedArticles = new Dictionary<int, bool>();
    foreach (SingleKeywords singleKeywords in singleKeywordsArray)
    {
        int articleId = singleKeywords.ArticleId;
        if (!markedArticles.ContainsKey(articleId))
        {
            string mysqlStr = " Update ccl_addonarticle set isDo =1 where aid = " + articleId.ToString();
            WoWExpress.Core.MySqlHelper.ExecuteNonQuery(myConnectionStr, CommandType.Text, mysqlStr);
            markedArticles.Add(articleId, true);
        }

        MySqlParameter[] keywordsParms = new MySqlParameter[] {
            new MySqlParameter("?articleId", MySqlDbType.Int32, 4),
            new MySqlParameter("?KeywordsStr", MySqlDbType.VarChar),
            new MySqlParameter("?singleCount", MySqlDbType.Int32, 4),
            new MySqlParameter("?singlePercent", MySqlDbType.Double,4)};
        keywordsParms[0].Value = articleId;
        keywordsParms[1].Value = Traditional2Simplified(singleKeywords.KeywordsStr);
        keywordsParms[2].Value = singleKeywords.SingleCount;
        keywordsParms[3].Value = singleKeywords.SinglePercent;
        WoWExpress.Core.MySqlHelper.ExecuteNonQuery(myConnectionStr, CommandType.Text, mysqlStr2, keywordsParms);
    }
    return "ok";
}
/// <summary>
/// Re-decodes a string: each character is narrowed to a single byte and the
/// resulting bytes are decoded with <see cref="System.Text.Encoding.Default"/>.
/// </summary>
/// <param name="dbStr">Text whose characters each carry one raw byte value.</param>
/// <returns>The decoded string.</returns>
private string DBStringToNormal(string dbStr)
{
    int length = dbStr.Length;
    byte[] raw = new byte[length];
    int position = 0;
    foreach (char ch in dbStr)
    {
        // Same narrowing cast as before: only the low byte survives.
        raw[position] = (byte)ch;
        position++;
    }
    return System.Text.Encoding.Default.GetString(raw, 0, length);
}
/// <summary>
/// Converts Traditional Chinese text to Simplified Chinese through
/// <c>Microsoft.VisualBasic.Strings.StrConv</c> (locale id 0).
/// </summary>
/// <param name="str">Text to convert.</param>
/// <returns>The converted text.</returns>
public string Traditional2Simplified(string str)
{
    string simplified = Microsoft.VisualBasic.Strings.StrConv(
        str,
        Microsoft.VisualBasic.VbStrConv.SimplifiedChinese,
        0);
    return simplified;
}
/// <summary>
/// 提取HTML代码中文字的C#函数
/// </summary>
/// <remarks>
/// Applies a fixed list of case-insensitive regex replacements (script blocks,
/// tags, a few character entities, comment markers) and then removes any
/// remaining angle brackets and CR/LF pairs.
/// Fixes over the original: the tag pattern had been mangled by URL rewriting
/// and is restored, and the three final Replace calls discarded their results.
/// NOTE(review): the script pattern does not use Singleline, so a script block
/// spanning several lines is not matched by the first pattern.
/// </remarks>
/// <param name="strHtml">HTML text.</param>
/// <returns>The text with markup removed.</returns>
public string StripHTML2(string strHtml)
{
    // aryReg[i] is replaced by aryRep[i]; the two arrays must stay aligned.
    string[] aryReg ={
        @"<script[^>]*?>.*?</script>",
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        @"([\r\n])[\s]+",
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        @"&#(\d+);",
        @"-->",
        @"<!--.*\n"
    };
    string[] aryRep = {
        "",
        "",
        "",
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\xa1",//chr(161),
        "\xa2",//chr(162),
        "\xa3",//chr(163),
        "\xa9",//chr(169),
        "",
        "\r\n",
        ""
    };
    string strOutput = strHtml;
    for (int i = 0; i < aryReg.Length; i++)
    {
        Regex regex = new Regex(aryReg[i], RegexOptions.IgnoreCase);
        strOutput = regex.Replace(strOutput, aryRep[i]);
    }
    // Strings are immutable: the result of Replace has to be assigned back.
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");
    return strOutput;
}
/// <summary>
/// 提取HTML代码中文字的C#函数
/// </summary>
/// <remarks>
/// Removes every character that is not an ASCII letter, an ASCII digit or in the
/// range U+4E00..U+9FBB; whitespace and punctuation are removed as well.
/// The original passed the pattern to <c>string.Replace</c>, which searches for
/// the literal text, so nothing was ever removed.
/// </remarks>
/// <param name="strHtml">Text to filter.</param>
/// <returns>The text reduced to the allowed characters.</returns>
public string StripHTML3(string strHtml)
{
    return Regex.Replace(strHtml, @"[^A-Za-z0-9\u4E00-\u9FBB]", "");
}
/// <summary>
/// Merges per-article keyword rows into one total per distinct keyword.
/// </summary>
/// <remarks>
/// Fixes over the original: the loop stopped one entry early, an empty input
/// list threw on index 0, and a repeated keyword added 1 rather than that row's
/// SingleCount (the first occurrence was already seeded with SingleCount).
/// Keywords are compared after trimming. The global percentage is still not
/// computed (always 0).
/// </remarks>
/// <param name="singleKeywords">Per-article keyword rows.</param>
/// <returns>One entry per distinct keyword, in order of first occurrence; empty for empty input.</returns>
public List<AllKeywords> ComputeKeywords(List<SingleKeywords> singleKeywords)
{
    List<AllKeywords> allKeywordsArray = new List<AllKeywords>();
    // Trimmed keyword text -> running total already in the result list.
    Dictionary<string, AllKeywords> totalsByKeyword = new Dictionary<string, AllKeywords>();
    foreach (SingleKeywords single in singleKeywords)
    {
        string key = single.KeywordsStr.Trim();
        AllKeywords total;
        if (totalsByKeyword.TryGetValue(key, out total))
        {
            total.AllCount += single.SingleCount;
        }
        else
        {
            total = new AllKeywords(single.KeywordsStr, single.SingleCount, 0);
            totalsByKeyword.Add(key, total);
            allKeywordsArray.Add(total);
        }
    }
    // TODO: compute the global percentage here.
    return allKeywordsArray;
}
/// <summary>
/// Removes HTML tags, decodes <c>&amp;lt;</c> and <c>&amp;gt;</c>, collapses
/// every whitespace run to a single space and trims the result.
/// </summary>
/// <remarks>
/// Fixes over the original: the two entity replacements had degraded into
/// no-ops (search text equal to replacement text), and the result of
/// <c>Trim()</c> was discarded.
/// </remarks>
/// <param name="strHtml">Text to convert.</param>
/// <returns>The converted text.</returns>
private string stripHtml(string strHtml)
{
    Regex objRegExp = new Regex("<(.|\n)+?>");
    string strOutput = objRegExp.Replace(strHtml, "");
    strOutput = strOutput.Replace("&lt;", "<");
    strOutput = strOutput.Replace("&gt;", ">");
    // Collapse every run of whitespace into one space.
    Regex r = new Regex(@"\s+");
    strOutput = r.Replace(strOutput, " ");
    return strOutput.Trim();
}
/// <summary>
/// Loads 10 unprocessed articles, applies the stopword list and shows the
/// cleaned articles in dataGridView3.
/// </summary>
private void btnUseStopword_Click(object sender, EventArgs e)
{
    // Articles are fetched before the stopwords, as in the original call order.
    List<ArticleInfo> articles = this.GetMyArticles("10");
    List<StopwordsInfo> stopwords = this.GetMyStopwords();
    dataGridView3.DataSource = this.UseStopwords(articles, stopwords);
}
/// <summary>
/// Shows the first table returned by GetArticles in dataGridView2.
/// </summary>
private void btnGetArticle_Click(object sender, EventArgs e)
{
    dataGridView2.DataSource = this.GetArticles().Tables[0];
}
/// <summary>
/// Shows the first table returned by GetStopwords in dataGridView1.
/// </summary>
private void btnStopwords_Click(object sender, EventArgs e)
{
    dataGridView1.DataSource = this.GetStopwords().Tables[0];
}
/// <summary>
/// Processes all unprocessed articles on a worker thread, in batches of 10:
/// each batch is cleaned, split into keywords, stored, and shown in the grid.
/// </summary>
/// <remarks>
/// The batch count and the size of the final partial batch are derived from
/// GetArticlesCount. Changes over the original: the three copies of the
/// pipeline are merged, the stopword list is loaded once per click instead of
/// once per batch, and the progress step is captured on the UI thread so the
/// worker does not read a control property.
/// </remarks>
private void btnGetKeywords_Click(object sender, EventArgs e)
{
    int articlesCount = this.GetArticlesCount();
    int pageLength = 10;
    int doCount = articlesCount / pageLength;    // number of full batches
    int lastLength = articlesCount % pageLength; // size of the trailing partial batch
    progressBar1.Maximum = doCount;
    progressBar1.Value = 0;
    progressBar1.Step = 1;
    int step = progressBar1.Step;

    System.Threading.Thread thread = new System.Threading.Thread(delegate(object arg)
    {
        List<StopwordsInfo> stopwords = this.GetMyStopwords();
        for (int i = 0; i < doCount; i++)
        {
            List<SingleKeywords> batchKeywords = this.SplitArticle(this.UseStopwords(this.GetMyArticles(pageLength.ToString()), stopwords));
            OnGridViewDataBind(batchKeywords);
            OnRrogressBarAdd(step);
        }
        // The trailing batch also runs when there were no full batches at all,
        // as the original did when the total was below one page.
        if (lastLength != 0 || doCount == 0)
        {
            List<SingleKeywords> lastKeywords = this.SplitArticle(this.UseStopwords(this.GetMyArticles(lastLength.ToString()), stopwords));
            OnGridViewDataBind(lastKeywords);
        }
    });
    thread.Start();
}
/// <summary>
/// Runs the whole pipeline for 10 unprocessed articles on the calling thread
/// and shows the aggregated keyword totals in dataGridView5.
/// </summary>
private void btnAllCompute_Click(object sender, EventArgs e)
{
    // Articles are fetched before the stopwords, as in the original call order.
    List<ArticleInfo> articles = this.GetMyArticles("10");
    List<StopwordsInfo> stopwords = this.GetMyStopwords();
    List<SingleKeywords> perArticle = this.SplitArticle(this.UseStopwords(articles, stopwords));
    dataGridView5.DataSource = this.ComputeKeywords(perArticle);
}
// Cross-thread helpers: each one marshals to the control via Invoke when required.

/// <summary>Signature of the grid-binding callback.</summary>
protected delegate void GridViewDataBind(object source);

/// <summary>
/// Sets dataGridView4.DataSource, going through Invoke when InvokeRequired is true.
/// Does nothing when dataGridView4 is null.
/// </summary>
/// <param name="source">The object to bind.</param>
protected void OnGridViewDataBind(object source)
{
    if (dataGridView4 == null)
    {
        return;
    }
    if (!dataGridView4.InvokeRequired)
    {
        dataGridView4.DataSource = source;
        return;
    }
    GridViewDataBind bind = delegate(object dataSource)
    {
        dataGridView4.DataSource = dataSource;
    };
    dataGridView4.Invoke(bind, source);
}
/// <summary>Signature of the overall-progress callback.</summary>
protected delegate void RrogressBarAdd(int step);

/// <summary>
/// Adds <paramref name="step"/> to progressBar1.Value, going through Invoke
/// when InvokeRequired is true. Does nothing when progressBar1 is null.
/// </summary>
/// <param name="step">Amount to add to the current value.</param>
protected void OnRrogressBarAdd(int step)
{
    if (progressBar1 == null)
    {
        return;
    }
    if (!progressBar1.InvokeRequired)
    {
        progressBar1.Value += step;
        return;
    }
    RrogressBarAdd advance = delegate(int mystep)
    {
        progressBar1.Value += mystep;
    };
    progressBar1.Invoke(advance, step);
}
/// <summary>Signature of the per-batch progress callback.</summary>
protected delegate void RrogressBarAdd2(int step);

/// <summary>
/// Adds <paramref name="step"/> to progressBar2.Value, going through Invoke
/// when InvokeRequired is true. Does nothing when progressBar2 is null.
/// </summary>
/// <param name="step">Amount to add to the current value.</param>
protected void OnRrogressBarAdd2(int step)
{
    if (progressBar2 == null)
    {
        return;
    }
    if (!progressBar2.InvokeRequired)
    {
        progressBar2.Value += step;
        return;
    }
    RrogressBarAdd2 advance = delegate(int mystep)
    {
        progressBar2.Value += mystep;
    };
    progressBar2.Invoke(advance, step);
}
/// <summary>Signature of the per-batch progress reset callback.</summary>
protected delegate void RrogressBar2Set(int maximum);

/// <summary>
/// Resets progressBar2: Maximum = <paramref name="maximum"/>, Value = 0, Step = 1.
/// Goes through Invoke when InvokeRequired is true; does nothing when
/// progressBar2 is null.
/// </summary>
/// <remarks>
/// The original same-thread branch executed <c>Value += maximum</c> instead of
/// the reset, so a call from the UI thread (btnAllCompute_Click via
/// SplitArticle) advanced the bar rather than resetting it. Both branches now
/// run the same reset.
/// </remarks>
/// <param name="maximum">New upper bound of the bar.</param>
protected void OnRrogressBar2Set(int maximum)
{
    if (progressBar2 == null)
        return;
    RrogressBar2Set reset = delegate(int myMaximum)
    {
        progressBar2.Maximum = myMaximum;
        progressBar2.Value = 0;
        progressBar2.Step = 1;
    };
    if (progressBar2.InvokeRequired)
        progressBar2.Invoke(reset, maximum);
    else
        reset(maximum);
}
/// <summary>
/// Passes the text to a new <c>WordSegV1</c> instance and returns the result
/// of its <c>Segment</c> call with a space as the second argument.
/// </summary>
/// <param name="articleStr">Text to segment.</param>
/// <returns>The string returned by <c>WordSegV1.Segment</c>.</returns>
public string segment(string articleStr)
{
    return new WordSegV1().Segment(articleStr, ' ');
}
}
}
程序开发完毕后,我突然发现分词不是那么容易的。找了一下,又发现了好东西:C# 版本的开源中文分词 ICTCLAS,和一个简单的 C# 版本的分词组件。中文分词组件好慢,等申请首页发布后我再给出另外下载的代码吧,呵呵,看博客园园长的了。