Daily Report 2012/11/07 陈伯雄(step 8)
今天针对PIPE组对数据表的修改,对建立倒排索引的程序做了相应的修改。由于表DOC、VEDIO、QUESTION(由QAPAIR改名为QUESTION)的属性并不完全相同,因此对数据处理方法进行了少量修改:
DOC表和VEDIO表具有的相同属性:title;
DOC独有属性:author,keywords;
QUESTION独有属性:question;
3个表最后得到的倒排索引结构是相同的,都是WORDLIST和对应的ID;
以下功能整合到分词模块和更新倒排索引模块中
// Word segmentation.
// Builds the list of index terms for the current row of `reader`.
//   type == 0 : row has title, keywords and author columns (DOC-style row)
//   type == 1 : row has a title column only
//   otherwise : row has a question column only
// For type 0 the three term lists are merged with Union, so duplicates are
// removed; for the other types the segmenter output is returned unchanged.
// Segmentation of a description column is currently disabled.
private static List<string> getWords(int type, SqlDataReader reader)
{
    if (type == 0)
    {
        string title = reader[_Title].ToString();
        string keyword = reader[_KeyWords].ToString();
        string author = reader[_Author].ToString();

        List<string> titleWords = ChineseWordSegmentation.word_segmentation(title);
        // Keywords are separated by spaces or colons.
        List<string> keywordWords = keyword
            .Split(new[] { ' ', ':' }, StringSplitOptions.RemoveEmptyEntries)
            .ToList();
        // Author names are separated by spaces or dots.
        List<string> authorWords = author
            .Split(new[] { ' ', '.' }, StringSplitOptions.RemoveEmptyEntries)
            .ToList();

        return titleWords.Union(keywordWords).Union(authorWords).ToList();
    }

    if (type == 1)
    {
        string title = reader[_Title].ToString();
        return ChineseWordSegmentation.word_segmentation(title);
    }

    string question = reader[_Question].ToString();
    return ChineseWordSegmentation.word_segmentation(question);
}

// Update the inverted index.
// For every word in `words`, records `ID` in table index3: a new row is
// inserted when the word is not indexed yet, otherwise `ID` is appended to the
// existing comma-separated value.
// NOTE(review): table index3 is assumed to have the columns (key, value), as
// implied by the original UPDATE statement - confirm against the real schema.
// NOTE(review): `connection` must be open and able to execute commands; if the
// caller still has an open SqlDataReader on it, this presumably requires
// MultipleActiveResultSets=True - confirm.
private static void updateIndex(List<string> words, SqlConnection connection, string ID)
{
    using (SqlCommand cmd = new SqlCommand())
    {
        cmd.Connection = connection;

        foreach (string word in words)
        {
            // Values are passed as parameters; embedding the identifiers
            // `word` / `ID` in the SQL text would make the server look for
            // columns with those names instead of using the C# values.
            cmd.Parameters.Clear();
            cmd.Parameters.AddWithValue("@word", word);

            // KEY is a reserved T-SQL keyword, hence the brackets.
            cmd.CommandText = "SELECT value FROM index3 WHERE [key] = @word";
            object val = cmd.ExecuteScalar();

            if (val == null)
            {
                // ExecuteScalar returns null (not DBNull) when no row matches:
                // the word is new, so add it to the inverted index.
                cmd.Parameters.AddWithValue("@id", ID);
                cmd.CommandText = "INSERT INTO index3 VALUES(@word, @id)";
            }
            else
            {
                // The word already exists: append this ID to its posting list.
                // A NULL or empty stored value is replaced by the ID itself.
                string existing = (val == DBNull.Value) ? string.Empty : val.ToString();
                string newValue = existing.Length == 0 ? ID : existing + "," + ID;
                cmd.Parameters.AddWithValue("@newValue", newValue);
                cmd.CommandText = "UPDATE index3 SET value = @newValue WHERE [key] = @word";
            }

            cmd.ExecuteNonQuery();
        }
    }
}
主函数部分:
1 List<Result> resultList = new List<Result>(); 2 string connectionString = GetConnectionString(); //SQL Server链接字符串 3 using (SqlConnection connection = new SqlConnection(connectionString)) //SQL链接类的实例化 4 { 5 connection.Open(); //打开数据库 6 //建立倒排表 7 string sqlstr = "CREATE table index_doc(key varchar(50) primary key, ID varchar(50))"; 8 SqlCommand cmd = new SqlCommand(); 9 cmd.Connection = connection; 10 cmd.CommandText = sqlstr; 11 cmd.ExecuteNonQuery(); 12 sqlstr = "CREATE table index_vedio(key varchar(50) primary key, ID varchar(50))"; 13 cmd.CommandText = sqlstr; 14 cmd.ExecuteNonQuery(); 15 sqlstr = "CREATE table index_question(key varchar(50) primary key, ID varchar(50))"; 16 cmd.CommandText = sqlstr; 17 cmd.ExecuteNonQuery(); 18 19 for (int i = 0; i < 3;i++ ) 20 { 21 string table = ""; 22 if (i == 0) table = _TableDoc; 23 else if (i == 1) table = _TableVideo; 24 else table = _TableQuestion; 25 //读取顺序表 26 sqlstr = "SELECT * FROM" + table; 27 cmd.CommandText = sqlstr; 28 SqlDataReader reader = cmd.ExecuteReader(); 29 try 30 { 31 while (reader.Read()) 32 { 33 string ID = reader[_ID].ToString(); 34 //分词处理 35 List<string> words = getWords(i, reader); 36 //将keyword信息添加到倒排表 37 updateIndex(words, connection, ID); 38 } 39 } 40 finally 41 { 42 // Always call Close when done reading. 43 reader.Close(); 44 } 45 }