用mysql数据库写的分词算法代码
我辛苦地整了几天才整好,拿来给大家分享一下,希望可以帮助大家。以下分为四步,每步都有注释说明。
#region 1. Load articles from the article table

/// <summary>
/// Step 1. Reads every row of the <c>article</c> table and runs word
/// segmentation on each article whose timestamp (column index 4) is more than
/// one day old.
/// </summary>
public void fenciBind()
{
    string sql = "select * from article;";
    string connectionString = ConfigurationManager.ConnectionStrings["ConnectionString"].ToString();
    DataTable articles = new DataTable();

    // The table is loaded completely into memory, so the connection can be
    // released before the (slow) per-article processing starts.
    using (MySqlConnection con = new MySqlConnection(connectionString))
    using (MySqlDataAdapter adapter = new MySqlDataAdapter(sql, con))
    {
        con.Open();
        adapter.Fill(articles);
    }

    // Articles stamped within the last day are not segmented.
    DateTime cutoff = DateTime.Now.AddDays(-1);

    foreach (DataRow row in articles.Rows)
    {
        DateTime stamp = Convert.ToDateTime(row[4]); // time column
        if (stamp < cutoff)
        {
            // Column 1 is the title, column 3 the content.
            string text = row[1].ToString() + row[3].ToString();
            fencistr(text, row["id"].ToString());
        }
    }
}

#endregion

#region 2. Segment the article text into words

/// <summary>
/// Step 2. Splits the article text into tokens with the Lucene Chinese analyzer
/// and hands the comma-separated token list to <see cref="pinyinstr"/>.
/// </summary>
/// <param name="strcon">Title and content of the article, concatenated.</param>
/// <param name="artsrt">Value of the article's <c>id</c> column.</param>
/// <returns>The unmodified <paramref name="strcon"/>.</returns>
public string fencistr(string strcon, string artsrt)
{
    StringBuilder tokens = new StringBuilder();
    Analyzer analyzer = new Lucene.China.ChineseAnalyzer();

    using (StringReader reader = new StringReader(strcon))
    {
        TokenStream stream = analyzer.TokenStream(null, reader);
        for (Token token = stream.Next(); token != null; token = stream.Next())
        {
            // Token.ToString() is rendered as "(term,start,end)"; keep only the term.
            string term = token.ToString().Replace("(", "");
            term = term.Split(',')[0];

            // Every term is PREFIXED with a comma, so the list always starts
            // with an empty entry; the later steps rely on that and start at index 1.
            tokens.Append(',').Append(term);
        }
    }

    string tokenList = tokens.ToString();

    // Step 3: convert the tokens to pinyin (which in turn stores them).
    pinyinstr(tokenList, tokenList, artsrt);
    return strcon;
}

#endregion

#region 3. Convert Chinese words to pinyin

/// <summary>
/// Step 3. Converts a comma-separated list of words to a comma-separated list
/// of pinyin strings and passes both lists to <see cref="Opestr"/>.
/// The result has one entry per input word, at the same index, so the two
/// lists stay aligned; a word without any Chinese character yields an empty entry.
/// </summary>
/// <param name="sb">Comma-separated words to convert to pinyin.</param>
/// <param name="sbstr">Comma-separated words to store in the keywords table.</param>
/// <param name="artsrt">Value of the article's <c>id</c> column.</param>
/// <returns>
/// The comma-separated pinyin list with trailing commas removed, or an empty
/// string when <paramref name="sb"/> is null or empty.
/// </returns>
public string pinyinstr(string sb, string sbstr, string artsrt)
{
    if (string.IsNullOrEmpty(sb))
    {
        // Nothing was tokenized; there is nothing to convert or store.
        return string.Empty;
    }

    string[] words = sb.Split(',');
    StringBuilder joined = new StringBuilder();
    for (int i = 0; i < words.Length; i++)
    {
        if (i > 0)
        {
            joined.Append(',');
        }
        joined.Append(WordToPinyin(words[i]));
    }

    string pystr = joined.ToString().TrimEnd(',');

    // Step 4: create the pinyin tables and store the data.
    Opestr(pystr, sbstr, artsrt);
    return pystr;
}

/// <summary>
/// Concatenates the lower-cased pinyin readings of every Chinese character in
/// <paramref name="word"/>; characters that are not valid Chinese characters are skipped.
/// </summary>
private static string WordToPinyin(string word)
{
    StringBuilder pinyin = new StringBuilder();
    foreach (char c in word)
    {
        if (!ChineseChar.IsValidChar(c))
        {
            continue;
        }

        ChineseChar chineseChar = new ChineseChar(c);
        for (int k = 0; k < chineseChar.PinyinCount; k++)
        {
            // Invariant casing: the result becomes a table name and must not
            // depend on the current culture.
            string reading = chineseChar.Pinyins[k].ToString().ToLowerInvariant();

            // Keep only readings whose last character sorts below '5'
            // (presumably the trailing character is a tone digit - TODO confirm).
            if (reading.Length > 0 && reading[reading.Length - 1] < '5')
            {
                pinyin.Append(reading);
            }
        }
    }
    return pinyin.ToString();
}

#endregion

#region 4. Create the pinyin tables and store keywords and article links

/// <summary>
/// Step 4. For every pinyin entry (starting at index 1, because both lists begin
/// with an empty entry): registers the matching word in the <c>keywords</c>
/// table when that pinyin is not stored yet, creates a table named after the
/// pinyin if it does not exist, and records the article id in that table when
/// it is not already there. If at least one new link was recorded, the
/// article's <c>update_time</c> is set to the current time.
/// </summary>
/// <param name="pystr">Comma-separated pinyin strings; each one names a table.</param>
/// <param name="sbstr">Comma-separated words, index-aligned with <paramref name="pystr"/>.</param>
/// <param name="artsrt">Value of the article's <c>id</c> column.</param>
public void Opestr(string pystr, string sbstr, string artsrt)
{
    if (string.IsNullOrEmpty(pystr))
    {
        return;
    }

    string[] pinyins = pystr.Trim().Split(',');
    string[] keywords = (sbstr ?? string.Empty).Trim().Split(',');
    string connectionString = ConfigurationManager.ConnectionStrings["ConnectionString"].ToString();

    // One connection for the whole batch, always released.
    using (MySqlConnection con = new MySqlConnection(connectionString))
    {
        con.Open();
        bool linkedArticle = false;

        for (int i = 1; i < pinyins.Length; i++)
        {
            string pinyin = pinyins[i];

            // The pinyin is spliced into SQL as a table name and cannot be
            // passed as a parameter, so anything that is not a plain
            // identifier (including the empty entry of a non-Chinese word) is skipped.
            if (!IsUsableTableName(pinyin))
            {
                continue;
            }

            if (i < keywords.Length && keywords[i].Length > 0)
            {
                AddKeywordIfMissing(con, keywords[i], pinyin);
            }

            if (LinkArticleToPinyinTable(con, pinyin, artsrt))
            {
                linkedArticle = true;
            }
        }

        if (linkedArticle)
        {
            // Stamp the article once, after all of its links were stored.
            string updateSql = "update `hww_article_search`.`article` set `update_time`=@updateTime where `id`=@articleId;";
            using (MySqlCommand update = new MySqlCommand(updateSql, con))
            {
                update.Parameters.AddWithValue("@updateTime", DateTime.Now);
                update.Parameters.AddWithValue("@articleId", artsrt);
                update.ExecuteNonQuery();
            }
        }
    }
}

/// <summary>
/// Returns true when <paramref name="name"/> consists only of ASCII letters and
/// digits and is 1 to 64 characters long (64 is MySQL's table-name limit).
/// </summary>
private static bool IsUsableTableName(string name)
{
    if (string.IsNullOrEmpty(name) || name.Length > 64)
    {
        return false;
    }

    foreach (char c in name)
    {
        bool isAsciiLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        bool isAsciiDigit = c >= '0' && c <= '9';
        if (!isAsciiLetter && !isAsciiDigit)
        {
            return false;
        }
    }
    return true;
}

/// <summary>
/// Inserts the (keyword, pinyin) pair into the keywords table unless a row with
/// exactly this pinyin already exists.
/// </summary>
private static void AddKeywordIfMissing(MySqlConnection con, string keyword, string pinyin)
{
    string existsSql = "select count(*) from `hww_article_search`.`keywords` where `pinyin`=@value;";
    if (HasMatchingRow(con, existsSql, pinyin))
    {
        return;
    }

    // The keyword comes from article text, so it is passed as a parameter.
    string insertSql = "INSERT INTO `hww_article_search`.`keywords` (`keyword`, `pinyin`) VALUES (@keyword, @pinyin);";
    using (MySqlCommand insert = new MySqlCommand(insertSql, con))
    {
        insert.Parameters.AddWithValue("@keyword", keyword);
        insert.Parameters.AddWithValue("@pinyin", pinyin);
        insert.ExecuteNonQuery();
    }
}

/// <summary>
/// Creates the table named <paramref name="pinyin"/> if necessary and records
/// <paramref name="artsrt"/> in it unless that id is already present.
/// The caller must have validated <paramref name="pinyin"/> as a table name.
/// </summary>
/// <returns>True when a new row was inserted.</returns>
private static bool LinkArticleToPinyinTable(MySqlConnection con, string pinyin, string artsrt)
{
    string createSql = "create table if not exists `hww_article_search`.`" + pinyin + "` ( `id` int(10) not null auto_increment, `article_id` int(10) unsigned not null,primary key(`id`));";
    using (MySqlCommand create = new MySqlCommand(createSql, con))
    {
        create.ExecuteNonQuery();
    }

    // Exact match on the id instead of a substring search over all ids.
    string existsSql = "select count(*) from `hww_article_search`.`" + pinyin + "` where `article_id`=@value;";
    if (HasMatchingRow(con, existsSql, artsrt))
    {
        return false;
    }

    string insertSql = "insert into `hww_article_search`.`" + pinyin + "`(`article_id`) values(@articleId);";
    using (MySqlCommand insert = new MySqlCommand(insertSql, con))
    {
        insert.Parameters.AddWithValue("@articleId", artsrt);
        insert.ExecuteNonQuery();
    }
    return true;
}

/// <summary>
/// Runs a <c>select count(*)</c> statement that has a single <c>@value</c>
/// parameter and reports whether it counted at least one row.
/// </summary>
private static bool HasMatchingRow(MySqlConnection con, string countSql, string value)
{
    using (MySqlCommand count = new MySqlCommand(countSql, con))
    {
        count.Parameters.AddWithValue("@value", value);
        return Convert.ToInt64(count.ExecuteScalar()) > 0;
    }
}

#endregion
运行结果如下图: