歌词VSM实现!!!

主程序文件:VSM_SetUP.cpp

 1 /*[ar:歌词作者]
 2 [ti:歌词(歌曲)的标题]
 3 */
 4 //编写程序实现lrc(歌词) 文件的检索,检索模型要求采用向量空间模型。请将源程序和文档压缩后,一并上传。
 5 #include<iostream>
 6 
 7 #include"Document_Index.h"
 8 
 9 using namespace std;
10 void main() {
11     cout << "*****本程序实现一个向量空间模型,对(D:\\暂时的)文件夹下的lrc文件进行遍历建立一个对应于歌曲名,作曲者,歌词主体的检索系统,请根据需要进行查询*****" << endl << endl;
12     cout << "*****检索出的文档编号对应的文档名字请在(检索结果.txt)内查找*****" << endl;
13     string query;
14     int select,result;
15     int isornot = 2;
16     Doc_Analysis doc_analysis;
17     doc_analysis.SETUP_Index();
18     ReQuery getResult(&doc_analysis);
19     
20     while (isornot>=1) {
21         cout << "输入查询词项:";
22         if (isornot != 2) {
23             getline(cin, query);
24         }
25         getline(cin, query);
26         
27         cout << "请选择查询模式(1为查歌曲,2为查歌手,3为查歌词主体):";
28         cin >> select;
29         cout << "请选择返回结果的数量:";
30         cin >> result;
31         getResult.Query(query, result, select);
32         cout << "is or not(1表示继续查询,0表示退出查询) ?    " ;
33         cin >> isornot;
34     }
35 }

 

类的声明保存在 Document_Index.h 中,实现保存在 Document_Index.cpp 中(下面依次给出这两个文件):

  1 #pragma once
  2 #include<iostream>
  3 #include<fstream>
  4 #include<vector>
  5 #include<math.h>
  6 #include<string>
  7 #include<iomanip>
  8 #include <stdio.h>
  9 #include<io.h>
 10 #include <windows.h>
 11 using namespace std;
 12 
 13 const static int Maxsize = 10000;
 14 const static int maxsize = 100;//a line and a smalllist
 15 
 16 //得分和相应的文档编号
 17 struct ScoreandDoc {
 18     float score;
 19     int text_number;
 20 };
 21 
 22 //包含词的所在的文档编号,词项在此文档出现的频率tf
 23 struct Word_Doc {
 24     int text_number;
 25     int text_fre;
 26 };
 27 
 28 //设计保存词项倒排记录头部的一个结构
 29 struct Index_List {//存储每个词的头项,包含单词和指向倒排记录的指针,存储df,df是包含词项的文档的数目,同时也是倒排记录的长度。next指向下一个词项
 30     float  df;
 31     string word;
 32     vector<Word_Doc> head_docID;// = nullptr;
 33     Index_List * next = nullptr;
 34 };
 35 
 36 //文档检索类,VSM的主体
 37 class Doc_Analysis {
 38 
 39     string BTEMP[Maxsize];
 40     string TEMP[maxsize];
 41     int arsize;//
 42     int tisize;
 43     Index_List * arofMusic_idList;  //歌曲作者索引的链表头
 44     Index_List * tiofMusic_idList;  //歌曲名字索引的链表头
 45     Index_List * idList;              //主体歌词倒排索引的链表头
 46     int size; //文档的词项的数目,即文档长度
 47     int allsize;//总的词数
 48      int N;                     //歌词主体文档集的大小,共有N篇文档
 49      int sizeofmusicname;          //歌词名字和歌词作者的大小
 50     string Inp_Temp_Lyrics[Maxsize];    //存放歌词主体
 51     string ti_Temp[maxsize/5];//歌曲名称和歌曲作者的暂时存放之地
 52     string ar_Temp[maxsize / 5];//歌曲名称和歌曲作者的暂时存放之地
 53 public:
 54     Doc_Analysis() {
 55         size = 0;
 56         allsize = 0;
 57         arsize = 0;
 58         tisize = 0;
 59         N = 0;
 60         idList = nullptr;
 61         arofMusic_idList = nullptr;
 62         tiofMusic_idList = nullptr;
 63     };
 64     ~Doc_Analysis() {
 65 
 66     };
 67 
 68 
 69     //把从文档中检索的词插入Inp_Temp_Words[Maxsize],如果不在就直接插入,如果已经存在则加一个
 70 
 71     //打开文件输入歌词,对词项进行分析,把歌曲的作者和歌曲名取出,存入对应的倒排索引,但是这个索引很小,所以可以直接构建倒排索引表。参数分别为:文件名,文档的编号。先对文档内歌词的作者和调用分离出的词项最终存储在Inp_Temp_Words[Maxsize],返回文档的词数
 72     int Doc_input(string filename, int number);
 73 
 74     //被int Doc_input(string filename, int number);调用,将分离出的词项存储在temp_Words[]中,size表示其大小,j表示其从哪一个数开始放入
 75     int Temp_Insert(string temp_words[],char T[],int &size);
 76 
 77     //对此文档的词项的表进行归并排序(按字典序)
 78     void Doc_mergesort(string *inputWord, string* Temp, int left, int right);
 79 
 80     //将此次输入的文档分词排序后得到的词项表存入最终的倒排索引中
 81     Index_List* insert_IndexList(string *inputWord, int n,int NofDoc, Index_List * idList);
 82 
 83     //歌曲名,作者,歌词主体倒排索引总体构建
 84     void SETUP_Index();// {};
 85 
 86     //返回最终查询的文档集大小
 87     int SizeOfDocSet() {
 88         return N;
 89     }
 90 
 91     //返回最终生成的歌词主体倒排索引表
 92     Index_List* tiIndex_head() {
 93         return tiofMusic_idList;
 94     };
 95 
 96     //返回最终生成的歌ming倒排索引表
 97     Index_List* DocIndex_head() {
 98         return idList;
 99     };
100 
101     //返回最终生成的作者倒排索引表
102     Index_List* arIndex_head() {
103         return arofMusic_idList;
104     };
105 
106     //此函数实现寻找指定文件夹下的指定后缀文件,并且保存其完整的路径
107     void GetAllFormatFiles(string path, vector<string>& files, string format);
108 //
109 };
110 
111 //对输入的查询词项进行分析,返回输入结果
112 class ReQuery {
113     Doc_Analysis* LMA;//歌词倒排索引的链表头
114     float *Scores;//每个查询词项的初始得分
115     float *arLength;//每个文档的长度
116     float *tiLength;
117     float *Length;
118     int N;//文档集大小
119 public:
120     ReQuery(Doc_Analysis* TEMP) {//得到Doc_Analysis返回的文档集长度和链表头
121         N = TEMP->SizeOfDocSet();
122         Scores = new float[N];
123         Length = new float[N];
124         
125         LMA = TEMP;
126         for (int i = 0; i < N; i++) {
127             Scores[i] = 0;
128             Length[i] = 0;
129         }
130     };
131     ~ReQuery() {
132         delete[] Scores;
133         delete[] Length;
134         delete[] arLength;
135         delete[] tiLength;
136     };
137 
138     //查询所有词项,对所有倒排索引表遍历一次,将每个向量的长度计算出来,初始化得分数组
139     void initialLength(Index_List * idList, float *tempLength);
140 
141     //输入查询歌词词项,输出查询结果,返回排名前k的文档编号,select 为选择的查询模式,1为查歌名,2为查歌手,3为查歌词主体
142     bool Query(string query, int k,int select=1);
143 
144     //查询某一个词是否在其中,有则返回其df,无则返回0
145     int isInner(string elem, Index_List idList[],Index_List &nowTemp);
146 
147     //某个词项对于suoyou文档的得分
148     float ScoreofaDoc(Index_List *idList, Index_List* word, int iQ);
149 
150     //进行堆排序,将所有的得分进行排序
151     bool HeapSort(float Scores[], int n,int k);
152 
153 };
154 
155 //堆排序,建立最大堆
156 class Max_Heap {
157     ScoreandDoc *Heap;
158     int size;
159     int n;
160     void siftdown(int elem);
161 public:
162     Max_Heap(int num, int max, ScoreandDoc *temp) {
163         n = num;
164         size = max;
165         Heap = temp;
166         buildHeap();
167     };
168     void buildHeap() {
169         for (int i = n / 2 - 1; i >= 0; i--)
170             siftdown(i);
171     };
172     int heapsize()const {
173         return n;
174     }
175     bool isLeaf(int pos)const {
176         return (pos >= n / 2) && (pos < n);
177     }
178     int leftchild(int pos)const {
179         return 2 * pos + 1;
180     }
181     int rightchild(int pos)const {
182         return 2 * pos + 2;
183     }
184     ScoreandDoc removemax(float it);
185 
186 };

——————————————————————————————————————————————————————————————————————————————————

  1 #pragma once
#include"Document_Index.h"
#include <stdio.h>
#include<math.h>
#include<io.h>
#include <windows.h>
#include<cstdlib>
#include<cstring>
#include<fstream>
#include<iomanip>
#include<iostream>
#include<string>
#include<vector>
 12 using namespace std;
 13 
 14 ///////////类:Doc_Analysis///////////////////////
 15 
 16 //打开文件输入歌词,对词项进行分析,把歌曲的作者和歌曲名取出,存入对应的倒排索引,但是这个索引很小,所以可以直接构建倒排索引表。参数分别为:文件名,文档的编号。调用分离出的词项最终存储在Inp_Temp_Words[Maxsize],返回歌词文档的词数
 17 int Doc_Analysis::Doc_input(string filename, int number) {
 18     ifstream fin(filename);
 19     if (!fin.is_open()) {
 20         exit(0);
 21     }
 22     N++;
 23 //    cout << "此文件夹文档数目:" << N << endl;
 24     char c[maxsize] = { '\0' };
 25     int ic = 0, i = 2;
 26     int numberofDoc = 0;
 27     string str;
 28     getline(fin, str);
 29 
 30     //分离出作者
 31     for (; str[i] != ']'; i++) {
 32         if (str[i] == ':'&&str[i - 1] == 'r'&&str[i - 2] == 'a') {
 33             for (; str[i] != ']'; i++) {
 34                 if (((int)str[i] >= 65) && ((int)str[i] <= 91))
 35                     c[ic++] = (int)str[i] + 32;
 36                 else
 37                     c[ic++] = str[i];
 38             }
 39             i--;
 40         }
 41     }
 42     Temp_Insert(ar_Temp,c,arsize);
 43     
 44     Doc_mergesort(ar_Temp, TEMP, 0, arsize - 1);
 45     
 46     
 47     //分离出歌名
 48     getline(fin, str);
 49     i = 2;
 50     ic = 0;
 51     for (; str[i] != ']'; i++) {
 52         if (str[i] == ':'&&str[i - 1] == 'i'&&str[i - 2] == 't') {
 53             for (; str[i] != ']'; i++) {
 54                 if (((int)str[i] >= 65) && ((int)str[i] <= 91))
 55                     c[ic++] = (int)str[i] + 32;
 56                 else
 57                     c[ic++] = str[i];
 58             }
 59             i--;
 60         }
 61     }
 62 
 63     //cout << "歌名:" << c << endl;
 64     Temp_Insert(ti_Temp,c, tisize);
 65     
 66     //遍历整个geci文档主体,每次读取一行,然后进行分析
 67     getline(fin, str);
 68     
 69     do {
 70         //cout <<"收到: " <<str << endl;
 71         ic = 0;
 72         for (i = 0; str[i] != ']'; i++);
 73         for (i++; str[i] != '\r'&&str[i] != '\n'&&str[i] != '\0'; i++) {
 74 
 75             //去掉引号后面的字符,但是如果是t的话就不去
 76             if ((int)str[i] == 39) {
 77                 while (str[i] != ' '&&str[i] != '\r'&&str[i] != '\n'&&str[i] != '\0') {
 78                     i++;
 79                     if (str[i] == 't') {
 80                         i--;
 81                         break;
 82                     }
 83                 }
 84                 if (str[i] == '\r' || str[i] == '\n' || str[i] == '\0')
 85                     break;
 86             }
 87 
 88             //除去大小写
 89             if (((int)str[i] >= 65) && ((int)str[i] <= 91))
 90                 c[ic++] = (int)str[i] + 32;
 91             else
 92                 c[ic++] = str[i];
 93         }
 94         c[ic] = '\0';
 95         
 96         Temp_Insert(Inp_Temp_Lyrics, c, numberofDoc);
 97         getline(fin, str);
 98     } while (!fin.eof());
 99     fin.close();
100     allsize += numberofDoc;
101 
102     //cout << "本文档最终分离出词数:" << size << endl;
103     size = numberofDoc;
104     return numberofDoc;
105 };
106 
107 //被int Doc_input(string filename, int number);调用,将分离出的词项存储在temp_Words[]中,size表示其大小
108 int Doc_Analysis::Temp_Insert(string temp_words[], char T[],int &size) {
109     const char *d = "[] -;,:/?!.()\"";//以这些字符为分界符[] -;,:/?!.()\"
110     char *p = NULL;
111     char *next_p = NULL;
112     p = strtok_s(T, d, &next_p);
113     while (p)
114     {
115         //cout << p << endl;
116         temp_words[size++] = p;//put the char* into temp table
117         p = strtok_s(NULL, d, &next_p);
118     }
119     
120     return size;
121 };
122 
123 //对此文档的词项的表进行归并排序(按字典序)
124 void Doc_Analysis::Doc_mergesort(string *inputWord, string* Temp, int left, int right) {
125     int i, j, k, mid = (left + right) / 2;
126     if (left == right)
127         return;
128     Doc_mergesort(inputWord, Temp, left, mid);
129     Doc_mergesort(inputWord, Temp, mid + 1, right);
130     for (i = mid; i >= left; i--)
131         Temp[i] = inputWord[i];
132     for (j = 1; j <= right - mid; j++)
133         Temp[right - j + 1] = inputWord[j + mid];
134     for (i = left, j = right, k = left; k <= right; k++)
135         if (Temp[i]<= Temp[j])
136             inputWord[k] = Temp[i++];
137         else
138             inputWord[k] = Temp[j--];
139 };
140 
141 //将此次输入的文档分词排序后得到的词项表存入最终的倒排索引中,numberofDoc为此文档分离出的词的数目,(不是词项)NofDoc为文档的编号
142 Index_List* Doc_Analysis::insert_IndexList(string *inputWord, int numberofDoc,int NofDoc, Index_List * idListx) {
143     int i = 0, j = 0;
144     Index_List* pre_idList = idListx,*idList=idListx, *idListHead = idListx;    
145     //cout << "词数" << numberofDoc << endl;
146     if (i < numberofDoc) {
147         //cout << " 当前文档的词: " << inputWord[i] << endl;
148         while ((idList != nullptr)&&(i<numberofDoc)) {//将整个倒排索引在此遍历完全,在文档也未结束的情况下
149             //1.词项和目前监测的节点值一样,则直接在其后的此词项的后面加上本文档的相关信息即可
150             if (inputWord[i] == idList->word) {
151                 //cout << "此时词项" << inputWord[i] << "已存在索引表中" << endl;
152                 Word_Doc *temp = new Word_Doc;
153                 temp->text_number = NofDoc;//这个词项的文档编号,把所有相同的词项合并在一起
154                 temp->text_fre = 0;//肯定已经在这个文档出现了一次
155                 do {
156                     temp->text_fre++;
157                     i++;
158                     if (i == numberofDoc)
159                         break;
160                 } while (inputWord[i] == idList->word);//只有当文档检测的词项不一样时退出
161 
162                 idList->df++;//出现该词项的文档数增1,应该为df
163 
164                 idList->head_docID.push_back(*temp);
165                 
166                 /*cout << (idList->head_docID)[idList->head_docID.size() - 1].text_number << endl;*/
167                 
168                 pre_idList = idList;//前一个链表值
169 
170                 //print2(idList);//查看这个idList的具体值
171 
172                 idList = idList->next;//索引表下移
173             }
174             //2.当这个词项比当前索引的词项小时,说明词项肯定在倒排索引中排在当前词项的前面,则将其插入在其之前,注意区分第一个和中间的
175             else if (inputWord[i] < idList->word) {
176                 //cout << inputWord[i] << "比索引表的——" << idList->word << " 小" << endl;
177                 Index_List* newidList = new Index_List;
178                 vector<Word_Doc> forID ;//因为是单独建一个词项的索引,故建立存储倒排索引的容器
179                 Word_Doc *temp = new Word_Doc;
180                 temp->text_number = NofDoc;
181                 temp->text_fre = 0;//肯定已经出现过一次,把所有相同的词项合并在一起
182                 if (idList->word==pre_idList->word) {
183                     //cout << "这个词即将插入索引头。" << endl;
184                     idListHead = newidList;
185                 }
186                 else {
187                     pre_idList->next = newidList;
188                 }
189                 do {
190                     temp->text_fre++;
191                     i++;
192                     if (i == numberofDoc)
193                         break;
194                 } while (inputWord[i] == inputWord[i - 1]);//只有当文档检测的词项不一样时退出
195                 
196                 forID.push_back(*temp);
197 
198                 newidList->df = 1;
199                 newidList->next = idList;
200                 
201                 pre_idList = newidList;
202                 newidList->word = inputWord[i-1];
203                 newidList->head_docID = forID;
204 
205 
206             }
207             //3.当目前文档的词比索引的词项大时,倒排索引表向后走
208             else {
209                 //cout << inputWord[i] << "比索引表的——" << idList->word << " 大" << endl;
210 
211                 pre_idList = idList;
212                 //cout << idList->word << endl;
213                 idList = idList->next;
214             }
215         }
216         //idList==nullptr,,,if条件句成立意味着倒排索引表已经到达尾部,接下来的所有词项都大于索引表内任何词项,可以直接插入,注意区分第一个和中间的
217         while (i < numberofDoc) {
218             idList = new Index_List;
219             if (idListHead == nullptr) {//如果是
220                 pre_idList = idList;
221                 idListHead = idList;
222             }
223             else {
224                 pre_idList->next = idList;
225             }
226             
227             vector<Word_Doc> forID;//建立存储这个词项的倒排索引的容器
228             Word_Doc *temp = new Word_Doc;
229             temp->text_number = NofDoc;
230             temp->text_fre = 0;
231             do {
232                 temp->text_fre++;
233                 i++;
234                 if (i == numberofDoc)
235                     break;
236                 
237             } while (inputWord[i] == inputWord[i - 1]);//把所有相同的词项合并在一起,只有词项不一致时才退出
238             forID.push_back(*temp);
239 
240             idList->df = 1;
241             
242             pre_idList = idList;
243             idList->word = inputWord[i-1];
244             idList->head_docID = forID;
245 
246             //print2(idList); 
247             idList = idList->next;
248         }
249     }
250     //print1(idListHead);
251     return idListHead;
252 };
253 
254 //歌曲名,作者,歌词主体倒排索引总体构建
255 void Doc_Analysis::SETUP_Index(){
256     int i;
257     string tx_filePath = "\0", filePath = "D:\\暂时的", distAll = "检索结果.txt", format = ".lrc";
258     vector<string> files;
259 
260     GetAllFormatFiles(filePath, files, format);
261     distAll = filePath + "\\" + distAll;
262     ofstream ofn(distAll);
263     int tsize = files.size();
264     cout << "文件夹下的.lrc数目:" << tsize << endl;//查询出文件夹下文档的数目
265     for (i = 0; i < tsize; i++)//一次遍历,每检索一个文档将其存入相应的缓冲区,然后建立倒排索引
266     {
267         ofn <<"文档"<<i<<""<< files[i] << endl; // 写入文件  
268         Doc_input(files[i], i);
269         Doc_mergesort(ar_Temp, TEMP, 0, arsize-1);
270         Doc_mergesort(ti_Temp, TEMP, 0, tisize-1);
271         Doc_mergesort(Inp_Temp_Lyrics,BTEMP, 0, size-1);
272     
273         //插入倒排索引
274 
275         arofMusic_idList = insert_IndexList(ar_Temp, arsize, i, arofMusic_idList);
276         tiofMusic_idList = insert_IndexList(ti_Temp, tisize, i, tiofMusic_idList);
277         idList=insert_IndexList(Inp_Temp_Lyrics, size, i, idList);
278     
279 
280         arsize = 0;
281         tisize = 0;
282         size = 0;
283         
284         //cout << "索引链表内容如下:" << endl;
285         //cout << "作者:" << endl;
286         //print1(arofMusic_idList);
287         //cout << "歌名:" << endl;
288         //print1(tiofMusic_idList);
289         //cout << "主体:" << endl;
290         //print1(idList)
291         //insert_IndexList(ti_Temp, arsize, i, tiofMusic_idList);
292         //insert_IndexList(Inp_Temp_Lyrics, arsize, i, idList);
293     }
294     //至此,索引构建完毕
295     /*cout << "主体:" << endl;
296     print1(idList);*/
297     ofn <<endl<< "文件夹下的.lrc数目:" << tsize << endl;
298     cout << endl;
299     ofn << "检索出词数(非词项数):" << allsize << endl;
300     ofn.close();
301     cout << "一共检索出词数(非词项数):" << allsize << endl;
302     cout << "歌曲名索引构建完毕!!!" << endl;
303     cout << "作曲者索引构建完毕!!!" << endl;
304     cout << "歌词主体索引构建完毕!!!" << endl;
305     cout << endl;
306 };
307 
308 //此函数实现寻找指定文件夹下的指定后缀文件,并且保存其完整的路径
309 void Doc_Analysis::GetAllFormatFiles(string path, vector<string>& files, string format)
310 {
311     //文件句柄    
312     long   hFile = 0;
313     //文件信息    
314     struct _finddata_t fileinfo;
315     string p;
316     if ((hFile = _findfirst(p.assign(path).append("\\*" + format).c_str(), &fileinfo)) != -1)
317     {
318         do
319         {
320             if ((fileinfo.attrib &  _A_SUBDIR))
321             {
322                 if (strcmp(fileinfo.name, ".") != 0 && strcmp(fileinfo.name, "..") != 0)
323                 {
324                     files.push_back(p.assign(path).append("\\").append(fileinfo.name));
325                     GetAllFormatFiles(p.assign(path).append("\\").append(fileinfo.name), files, format);
326                 }
327             }
328             else
329             {
330                 files.push_back(p.assign(path).append("\\").append(fileinfo.name));; //将文件路径保存,也可以只保存文件名:    p.assign(fileinfo.name)
331             }
332         } while (_findnext(hFile, &fileinfo) == 0);
333 
334         _findclose(hFile);
335     }
336 };
337 
338 
339 ////////////////////类:ReQuery////////////////////
340 
341 //查询所有词项,对倒排索引表遍历一次,初始化得分数组
342 
343 void ReQuery::initialLength(Index_List * idList, float *tempLength) {
344     float idf;//记录log N/df
345     int size;
346     int i;
347     
348     while (idList != nullptr) {
349         idf = log(N/idList->df) / log(10);
350         size = idList->head_docID.size();
351         i = 0;
352         for (; i < size; i++) {
353             tempLength[idList->head_docID[i].text_number] += (idf*(1 + log(idList->head_docID[i].text_fre) / log(10)))*(idf*(1 + log(idList->head_docID[i].text_fre) / log(10)));
354         }
355         idList = idList->next;
356     }
357     for (i = 0; i < N; i++)
358         tempLength[i] = sqrt(tempLength[i]);
359     /*for (int i = 0; i < LMA->SizeOfDocSet();i++)
360         cout << "文档" << i <<" 长度为 "<< tempLength[i] << endl;*/
361 }
362 
363 //输入查询词项,输出查询结果,返回排名前k的文档编号,select 为选择的查询模式,1为查歌名,2为查歌手,3为查歌词主体
364 bool ReQuery::Query(string query, int k, int select) {
365     Index_List QUERY[10];
366     char aa[50] = { '\0' };
367     int ia = 0, i = 0, iQ = 0, j = 0;
368     
369     for (; i<query.length(); i++) {
370         while (query[i] != ' '&& i < query.length()) {
371             if (((int)query[i] >= 65) && ((int)query[i] <= 91))
372                 aa[ia++] = query[i++] + 32;
373             else
374                 aa[ia++] = query[i++];
375         }
376             
377         aa[ia] = '\0';
378         for (; j < iQ; j++) {
379             if (QUERY[j].word == aa) {
380                 QUERY[j].df++;
381                 j = -1;
382                 break;
383             }
384         }
385         if (j != -1) {
386             QUERY[iQ++].word = aa;
387             QUERY[iQ-1].df = 1;
388         }
389         j = 0;
390         ia = 0;
391         //cout << f[ic1 - 1] << endl;
392     }
393 
394     //查歌名
395     if (select == 1) {
396         initialLength(LMA->tiIndex_head(), Length);
397         //计算查询de得分
398         ScoreofaDoc(LMA->tiIndex_head(), QUERY, iQ);
399     }
400     //查作者
401     if (select == 2) {
402         initialLength(LMA->arIndex_head(), Length);
403         //计算查询de得分
404         ScoreofaDoc(LMA->arIndex_head(), QUERY, iQ);
405     }
406     //查歌词主体
407     if (select == 3) {
408         initialLength(LMA->DocIndex_head(), Length);
409         //计算查询de得分
410         ScoreofaDoc(LMA->DocIndex_head(), QUERY, iQ);
411     }
412     /*for (int i = 0; i < N; i++) {
413         cout << Scores[i] << endl;
414     }*/
415     //对得分数组建堆,并且返回前K个
416     HeapSort(Scores, N, k);
417     return true;
418 };
419 
420 //词项对于suoyou文档的得分
421 float ReQuery::ScoreofaDoc(Index_List *idList, Index_List Tword[],int iQ) {
422     int size = 0;
423     int df;
424     float idf;
425     Word_Doc TEMPS;
426     Index_List nowTemp;
427     for (int i = 0; i < iQ;i++) {
428         
429         if ((df = isInner(Tword[i].word,idList,nowTemp)) != 0) {
430             idf = log(N/df) / log(10);
431             size =nowTemp.head_docID.size();
432             for (int j = 0; j < size; j++) {
433                 TEMPS = (nowTemp.head_docID)[j];
434                 Scores[TEMPS.text_number] += ((idf)*Tword[i].df)*(idf*(1 + log(nowTemp.head_docID[j].text_fre) / log(10)));
435             }
436         }
437     }
438     cout <<endl <<"各文档依次得分:" << endl;
439     for (int i = 0; i < N; i++) {
440         if(Scores[i]!=0)
441             Scores[i] = Scores[i]/ Length[i];
442         cout << Scores[i] <<" ";
443     }
444     return 0;
445 }
446 
447 //查询某一个词是否在其中,有则返回其df,无则返回0
448 int ReQuery::isInner(string elem, Index_List idList[], Index_List &nowTemp) {
449     int i = 0;
450     //cout << "查找单词" << elem << endl;
451     while (idList != nullptr) {
452         if (idList[i].word == elem) {
453             //cout << "单词" << elem << "在其中,文档编号"<<idList[i].head_docID[0].text_number<<endl;
454             nowTemp = idList[i];
455             return idList->df;
456         }
457         if (idList->word > elem)
458             return 0;
459         idList = idList->next;
460     };
461     return 0;
462 }
463 
464 //进行堆排序,将所有的得分进行排序,找出前k个,n为数组大小
465 bool ReQuery::HeapSort(float Scores[],int n,int k) {
466     float doc = -1;
467     int i = 0;
468     ScoreandDoc *TScores = new ScoreandDoc[n];
469     for (; i < n; i++) {
470         TScores[i].score = Scores[i];
471         TScores[i].text_number = i;
472     }
473 
474     Max_Heap H(n, n, TScores);
475     i = 0;
476     cout << endl << endl << "---------------------向您推荐如下文档--------------------------------" << endl;
477     for (; i < k; i++) {
478         ScoreandDoc temp= H.removemax(doc);
479         cout << "文档编号:" << temp.text_number << " 得分:" << temp.score << endl;
480     }
481 
482     cout << endl;
483     delete[]TScores;
484     return true;
485 
486 };
487 
488 //////////////////////////////lei Max_Heap///////////////////////////
489 //堆建立
490 
491 //建立整堆
492 void Max_Heap::siftdown(int pos) {
493     while (!isLeaf(pos)) {
494         int j = leftchild(pos);
495         int rc = rightchild(pos);
496         if ((rc < n) && (Heap[j].score < Heap[rc].score)) {
497             j = rc;
498         }
499         if (!(Heap[pos].score < Heap[j].score))
500             return;
501         ScoreandDoc xxx = Heap[pos];
502         Heap[pos] = Heap[j];
503         Heap[j] = xxx;
504         pos = j;
505     }
506 };
507 
508 
509 //每次找出移除最大的
510 ScoreandDoc Max_Heap::removemax(float it) {
511     //if (n == 0)
512     //    return ;
513     ScoreandDoc xxx = Heap[--n];
514     Heap[n] = Heap[0];
515     Heap[0] = xxx;
516     if (n != 0)
517         siftdown(0);
518     //it = Heap[n].score;
519     return Heap[n];
520 }

 

posted @ 2016-11-12 00:02  天下岂有长生不灭者  阅读(344)  评论(0编辑  收藏  举报