CDocument

 1 #ifndef _Document_H_040410_
 2 #define _Document_H_040410_
 3 
 4 #include <string>
 5 
 6 typedef struct{
 7         int docid;
 8         int offset;
 9 }DocIdx;
10 
11 using namespace std;
12 
// CDocument bundles the pieces of one document record (identifiers, URL,
// raw record text and its parts) with a few text-processing helpers.
// Every member is public.
class CDocument
{
public:
    CDocument();
    ~CDocument();

    // Both take the text by non-const reference and report success as bool.
    bool ParseRecord(std::string &content) const;
    bool CleanBody(std::string &body) const;

    // Strips <...> markup from the NUL-terminated buffer s, in place.
    void RemoveTags(char *s);

    // --- identification / bookkeeping ---
    int m_nDocId;
    int m_nPos;
    int m_nLength;
    std::string m_sChecksum;

    // --- textual content ---
    std::string m_sUrl;
    std::string m_sRecord;    // a record including a HEAD, a header and body
    std::string m_sHead;
    std::string m_sHeader;
    std::string m_sBody;

    std::string m_sBodyNoTags;
};
39 
40 #endif /* _Document_H_040410_ */

 

 

 1 /*Document handling
 2  */
 3 
 4 #include "Document.h"
 5 
 6 CDocument::CDocument()
 7 {
 8     m_nDocId = -1;
 9     m_nPos = -1;
10     m_nLength = 0;
11     m_sChecksum = "";
12 
13     m_sUrl = "";
14 }
15 
16 CDocument::~CDocument()
17 {
18 }
19 
20 bool CDocument::ParseRecord(string &content) const
21 {
22     return true;
23 }
24 
25 bool CDocument::CleanBody(string &body) const
26 {
27     return true;
28 }
29 
30 //把  <...> 删掉
31 void CDocument::RemoveTags(char *s)
32 {
33     int intag;
34     char *p, *q;
35 
36     if (!s || !*s)    return;
37 
38     for (p=q=s, intag=0; *q; q++) {
39         switch (*q){
40         case '<':
41             intag = 1;
42             *p++ = ' ';
43             break;
44         case '>':
45             intag = 0;
46             break;
47         default:
48             if (!intag) {
49                 *p++ = *q;
50             }
51             break;
52         }
53     }
54 
55     *p = '\0';
56 
57 /* second method
58     char *d = s;
59     while (*s) {
60         if (*s == '<') {
61             while (*s && *s!='>') s++;
62             if( *s == '\0') break;
63             s++; 
64             continue; 
65         } 
66 
67         *d++ = *s++; 
68     }
69     *d = 0;
70 */
71 }

posted on 2012-07-14 13:55  kakamilan  阅读(289)  评论(0编辑  收藏  举报

导航