Daily Report 2012/11/06 陈伯雄(step 7)
今天完成了中文分词功能,并修改了用 SQL 建立倒排索引的框架。原来打算只建立一个倒排索引,但考虑到数据库中存在 4 个对应不同文件类型的表(DOC、PDF、VEDIO、QAPAIR),现在改为分别针对每个表建立倒排索引,并根据用户的搜索需求检索相应的表(效率问题有待商榷)。
// Builds one inverted-index table per document type (index_doc, index_pdf,
// index_vedio, index_qapair). For every row of the matching source table the
// title/keyword text is segmented into words by getWords, and each word is
// stored as a key whose ID column holds a comma-separated list of the IDs of
// the rows that contain it.
List<Result> resultList = new List<Result>();
string connectionString = GetConnectionString(); // SQL Server connection string
using (SqlConnection connection = new SqlConnection(connectionString))
{
    connection.Open();

    // Source tables and the index table built for each one (same position in
    // both arrays). The source table names are taken from the report text
    // above -- NOTE(review): confirm they match the real schema.
    // These names are fixed constants, never user input, so concatenating
    // them into SQL text is safe; table names cannot be passed as parameters.
    string[] sourceTables = { "DOC", "PDF", "VEDIO", "QAPAIR" };
    string[] indexTables = { "index_doc", "index_pdf", "index_vedio", "index_qapair" };

    // Create the inverted-index tables.
    //  * [key] is bracketed because KEY is a reserved T-SQL keyword.
    //  * ID is varchar(max): it accumulates a comma-separated ID list and
    //    would overflow varchar(50) after only a few entries.
    // NOTE(review): [key] stays varchar(50) as in the original; if the
    // database collation cannot represent Chinese text, switch to nvarchar.
    // CREATE TABLE still fails if a table already exists, as before.
    foreach (string indexTable in indexTables)
    {
        string createSql = "CREATE TABLE " + indexTable
            + "([key] varchar(50) PRIMARY KEY, ID varchar(max))";
        using (SqlCommand create = new SqlCommand(createSql, connection))
        {
            create.ExecuteNonQuery();
        }
    }

    for (int i = 0; i < sourceTables.Length; i++)
    {
        string sourceTable = sourceTables[i];
        string indexTable = indexTables[i];

        // Pass 1: read the source table and collect word -> IDs in memory.
        // The reader is fully consumed and closed before any write is issued,
        // because a connection cannot run another command while a
        // SqlDataReader is still open on it.
        Dictionary<string, List<string>> postings = new Dictionary<string, List<string>>();
        string selectSql = "SELECT title, keyword, ID FROM " + sourceTable;
        using (SqlCommand select = new SqlCommand(selectSql, connection))
        using (SqlDataReader reader = select.ExecuteReader())
        {
            while (reader.Read())
            {
                string title = reader["title"].ToString();
                string keyword = reader["keyword"].ToString();
                string id = reader["ID"].ToString();

                // Chinese word segmentation (implemented elsewhere).
                List<string> words = getWords(title, keyword);
                if (words == null)
                {
                    continue;
                }

                foreach (string word in words)
                {
                    if (string.IsNullOrEmpty(word))
                    {
                        continue; // nothing to index
                    }

                    List<string> ids;
                    if (!postings.TryGetValue(word, out ids))
                    {
                        ids = new List<string>();
                        postings.Add(word, ids);
                    }
                    ids.Add(id);
                }
            }
        }

        // Pass 2: write the postings. Values are sent as parameters so that
        // the word / ID text is actually substituted and correctly escaped.
        // A lookup-then-insert/update is kept (instead of a blind INSERT)
        // because two words that differ in the in-memory dictionary may still
        // compare equal under the database collation and hit the primary key.
        string lookupSql = "SELECT ID FROM " + indexTable + " WHERE [key] = @key";
        string insertSql = "INSERT INTO " + indexTable + "([key], ID) VALUES(@key, @ids)";
        string updateSql = "UPDATE " + indexTable + " SET ID = @ids WHERE [key] = @key";
        using (SqlCommand lookup = new SqlCommand(lookupSql, connection))
        using (SqlCommand insert = new SqlCommand(insertSql, connection))
        using (SqlCommand update = new SqlCommand(updateSql, connection))
        {
            foreach (KeyValuePair<string, List<string>> posting in postings)
            {
                string word = posting.Key;
                string idList = string.Join(",", posting.Value.ToArray());

                lookup.Parameters.Clear();
                lookup.Parameters.AddWithValue("@key", word);
                // ExecuteScalar returns null when no row matches, and
                // DBNull.Value when the row exists but its ID column is NULL.
                object existing = lookup.ExecuteScalar();

                SqlCommand write;
                if (existing == null)
                {
                    // New keyword: add it to the inverted index.
                    write = insert;
                }
                else
                {
                    // Known keyword: append the new IDs to the stored list.
                    string stored = existing as string; // null for DBNull
                    if (!string.IsNullOrEmpty(stored))
                    {
                        idList = stored + "," + idList;
                    }
                    write = update;
                }

                write.Parameters.Clear();
                write.Parameters.AddWithValue("@key", word);
                write.Parameters.AddWithValue("@ids", idList);
                write.ExecuteNonQuery();
            }
        }
    }

    // Matching (not finished yet)
    string userWord = "用户搜索的关键词";
    //match(userWord);

    // Kept from the original; the enclosing using block also disposes the
    // connection. Its closing brace lies beyond the end of this excerpt.
    connection.Close();