Daily Report 2012/11/07 陈伯雄(step 8)

  今天针对PIPE组对数据表的修改,对建立倒排索引的代码做了系统性的调整。由于表DOC、VEDIO、QUESTION(原名QAPAIR,现改为QUESTION)的属性并不完全相同,各表的数据处理方法也做了少量修改:

  DOC表和VEDIO表具有的相同属性:title;

  DOC独有属性:author,keywords;

  QUESTION独有属性:question;

  3个表最后得到的倒排索引结构是相同的,都得到WORDLIST和对应ID;

  以下功能整合到分词模块和更新倒排索引模块中

//分词
        /// <summary>
        /// Builds the list of index words for the row <paramref name="reader"/> is currently positioned on.
        /// </summary>
        /// <param name="type">
        /// Selects which columns are read: 0 reads the title, keywords and author columns,
        /// 1 reads the title column only, and any other value reads the question column.
        /// </param>
        /// <param name="reader">Data reader whose current row supplies the text columns.</param>
        /// <returns>
        /// For type 0, the duplicate-free union of the segmented title, the keywords split on
        /// ' ' and ':', and the author split on ' ' and '.'. For every other type, the result of
        /// segmenting the single text column. The description column is not indexed here.
        /// </returns>
        static private List<string> getWords(int type, SqlDataReader reader)
        {
            switch (type)
            {
                case 0:
                {
                    // Read all three columns first, then tokenize each one.
                    string titleText = reader[_Title].ToString();
                    string keywordText = reader[_KeyWords].ToString();
                    string authorText = reader[_Author].ToString();

                    char[] keywordSeparators = { ' ', ':' };
                    char[] authorSeparators = { ' ', '.' };

                    List<string> titleWords = ChineseWordSegmentation.word_segmentation(titleText);
                    string[] keywordParts = keywordText.Split(keywordSeparators, StringSplitOptions.RemoveEmptyEntries);
                    string[] authorParts = authorText.Split(authorSeparators, StringSplitOptions.RemoveEmptyEntries);

                    // Union keeps first-seen order and drops duplicates across the three sources.
                    return titleWords.Union(keywordParts).Union(authorParts).ToList();
                }

                case 1:
                {
                    string titleText = reader[_Title].ToString();
                    return ChineseWordSegmentation.word_segmentation(titleText);
                }

                default:
                {
                    string questionText = reader[_Question].ToString();
                    return ChineseWordSegmentation.word_segmentation(questionText);
                }
            }
        }

        //更新倒排索引
        /// <summary>
        /// Records <paramref name="ID"/> in the inverted index for every word in
        /// <paramref name="words"/>: a word that has no row yet gets a new row, and a word
        /// that already has one gets the ID appended to its comma-separated list.
        /// </summary>
        /// <param name="words">Words extracted from one source row.</param>
        /// <param name="connection">Connection the commands run on; it must already be open.</param>
        /// <param name="ID">Identifier of the source row the words came from.</param>
        /// <remarks>
        /// The table name (index3) and the column names (key, value) are taken from the
        /// original statements. NOTE(review): confirm they match the table actually created
        /// by the caller.
        /// NOTE(review): if the caller still has a data reader open on the same connection,
        /// these commands presumably require MARS to be enabled - confirm.
        /// </remarks>
        static private void updateIndex(List<string> words, SqlConnection connection, string ID)
        {
            using (SqlCommand cmd = new SqlCommand())
            {
                cmd.Connection = connection;
                foreach (string word in words)
                {
                    // Values are passed as parameters; the original embedded the identifiers
                    // "word", "ID" and "newValue" literally in the SQL text.
                    cmd.Parameters.Clear();
                    cmd.Parameters.AddWithValue("@word", word);

                    // Look the word up by its key. [key] is bracketed because KEY is a
                    // reserved word in T-SQL.
                    cmd.CommandText = "SELECT [value] FROM index3 WHERE [key] = @word";
                    object val = cmd.ExecuteScalar();

                    if (val == null)
                    {
                        // ExecuteScalar returns null (not DBNull) when no row matches:
                        // the word is new, so add it to the inverted index.
                        cmd.Parameters.AddWithValue("@value", ID);
                        cmd.CommandText = "INSERT INTO index3 ([key], [value]) VALUES (@word, @value)";
                    }
                    else
                    {
                        // The word is already indexed: append this ID to its list.
                        // A NULL or empty stored value is treated as an empty list.
                        string existing = (val == System.DBNull.Value) ? "" : val.ToString();
                        string newValue = existing.Length == 0 ? ID : existing + "," + ID;
                        cmd.Parameters.AddWithValue("@value", newValue);
                        cmd.CommandText = "UPDATE index3 SET [value] = @value WHERE [key] = @word";
                    }
                    cmd.ExecuteNonQuery();
                }
            }
        }

主函数部分:

 // Main-routine fragment (the listing is truncated): opens a connection, creates three
 // index tables, then reads every row of the three source tables, segments it with
 // getWords and passes the words to updateIndex.
 // NOTE(review): the tables created here are index_doc / index_vedio / index_question,
 // while updateIndex queries index3 - confirm which table is intended.
 // NOTE(review): updateIndex runs commands on `connection` while `reader` is still open;
 // presumably this needs MARS or buffering the rows first - confirm.
 1  List<Result> resultList = new List<Result>();
 2             string connectionString = GetConnectionString();                            //SQL Server connection string
 3             using (SqlConnection connection = new SqlConnection(connectionString))      //Instantiate the SQL connection
 4             {
 5                 connection.Open();                                                      //Open the database connection
 6                 //Create the inverted-index tables
 7                 string sqlstr = "CREATE table index_doc(key varchar(50) primary key, ID varchar(50))";                // NOTE(review): KEY is a reserved T-SQL word; presumably needs [key] - confirm
 8                 SqlCommand cmd = new SqlCommand();
 9                 cmd.Connection = connection;
10                 cmd.CommandText = sqlstr;
11                 cmd.ExecuteNonQuery();
12                 sqlstr = "CREATE table index_vedio(key varchar(50) primary key, ID varchar(50))";
13                 cmd.CommandText = sqlstr;
14                 cmd.ExecuteNonQuery();
15                 sqlstr = "CREATE table index_question(key varchar(50) primary key, ID varchar(50))";
16                 cmd.CommandText = sqlstr;
17                 cmd.ExecuteNonQuery();
18 
19                 for (int i = 0; i < 3;i++ )
20                 {
21                     string table = "";
22                     if (i == 0) table = _TableDoc;
23                     else if (i == 1) table = _TableVideo;
24                     else table = _TableQuestion;
25                     //Read the sequential (source) table
26                     sqlstr = "SELECT * FROM" + table; // NOTE(review): no space after FROM; works only if the table constants start with a space - confirm
27                     cmd.CommandText = sqlstr;
28                     SqlDataReader reader = cmd.ExecuteReader();                             
29                     try
30                     {
31                         while (reader.Read())
32                         {
33                             string ID = reader[_ID].ToString();
34                             //Word segmentation; i selects which columns getWords reads
35                             List<string> words = getWords(i, reader);
36                             //Add the keyword information to the inverted index
37                             updateIndex(words, connection, ID);
38                         }
39                     }
40                     finally
41                     {
42                         // Always call Close when done reading.
43                         reader.Close();
44                     }
45                 }
posted @ 2012-11-07 23:36  DOOM_buaascse  阅读(239)  评论(0编辑  收藏  举报