通过tinyxml解析百度热词榜单

百度热词榜单

初学TinyXml,盯上了解析百度热词榜单。通过curl获取rss地址上的数据。

由于TinyXml不支持GB2312编码,而获取到的数据是GB2312编码,所以要先转换为UTF-8格式,命令为

iconv -f gb2312 -t utf-8 baidutop.xml > baidutop.utf8.xml && mv baidutop.utf8.xml baidutop.xml

(注意:不能把输出直接重定向到输入文件本身,否则shell会在iconv读取之前把该文件清空。)

写得很烂,写的时候,心里对内存管理部分总是瞻前顾后的,真是奇怪啊。

View Code
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

#include <tinyxml/tinyxml.h>
#include <tinyxml/tinystr.h>

using namespace std;

// A hyperlink taken from the hot-word table: the anchor's visible text plus
// the address it points to.
struct CHrefLink
{
    std::string word;  // text content of the anchor element
    std::string url;   // link target read from the anchor element's attributes
};

// One entry of the hot-word ranking as filled in by GetHotRank.
struct CBaiDuTopKey
{
    int rank;      // ranking position; has no default value until assigned
    long account;  // search volume column; has no default value until assigned

    std::string keyWord;          // text of the keyword anchor
    std::string wordUrl;          // link target of the keyword anchor
    std::vector<CHrefLink> href;  // additional links belonging to this entry
};



// Parses the table markup in `xml` and appends one entry per data row to `hot`.
void GetHotRank(vector <CBaiDuTopKey> & hot,const char *xml);
// Extracts an anchor element's text into link.word and its link target into link.url.
void ExtraHrefLink(TiXmlElement *hrefElement,CHrefLink &link);
// Same extraction, written into two separate strings instead of a CHrefLink.
void ExtraHrefLink(TiXmlElement *hrefElement,string & word,string & url);
// Appends one CHrefLink to `vKey` for every child element of `tableColumn`.
void ExtraKeyWords(TiXmlElement *tableColumn,vector <CHrefLink> &vKey);
int main()
{
//创建一个XML的文档对象。
TiXmlDocument *myDocument = new TiXmlDocument("baidutop.xml");
myDocument
->LoadFile();

//获得根元素
TiXmlElement *RootElement = myDocument->RootElement();

//输出根元素名称
cout << RootElement->Value() << endl;

//解析获得有用的信息
TiXmlElement *channelElement = RootElement->FirstChildElement();
string title = channelElement->FirstChildElement()->GetText();
TiXmlElement
*itemElement = (TiXmlElement *)channelElement->LastChild();


int ttl = atoi(((TiXmlElement *)itemElement->PreviousSibling())->GetText());

TiXmlElement
* descriptionElement = (TiXmlElement *)(itemElement->LastChild());

vector
<CBaiDuTopKey> hotword;

GetHotRank(hotword,descriptionElement
->GetText());



cout
<<"Name :"<<title<<"\t Life Time:"<<ttl<<endl;
for(vector<CBaiDuTopKey>::iterator iter = hotword.begin();
iter
!=hotword.end();++iter)
{
cout
<<"\nrank \t"<<(*iter).rank<<"\t amount:"<<(*iter).account;
cout
<<"\nkeyword\t"<<(*iter).keyWord<<"\nurl\t"<<(*iter).wordUrl;
}
cout
<<endl;
delete myDocument;
return 0;
}


void GetHotRank(vector <CBaiDuTopKey> & hotVector,const char *xml)
{
TiXmlDocument
*doc = new TiXmlDocument();

int row = 0;


doc
->Parse(xml);
TiXmlElement
*tableElement = doc->RootElement();
//
TiXmlElement *tableRoot = tableElement->FirstChildElement();
if(!tableRoot)
{
cout
<<"errro !"<<endl;
return ;
}
//第一个元素为table的表头,剔除
for(TiXmlElement *tableRow = tableRoot->FirstChildElement()->NextSiblingElement();
tableRow;
tableRow
= tableRow->NextSiblingElement())
{
CBaiDuTopKey rankedword;
TiXmlElement
*tableColumn = tableRow->FirstChildElement();

//关键词排名
rankedword.rank=atoi(tableColumn->GetText());
tableColumn
= tableColumn->NextSiblingElement();
//关键词
ExtraHrefLink(tableColumn->FirstChildElement(),rankedword.keyWord,rankedword.wordUrl);
tableColumn
= tableColumn->NextSiblingElement();
//搜索量
rankedword.account=atol(tableColumn->GetText());
//链接
tableColumn = tableColumn->NextSiblingElement();

hotVector.push_back(rankedword);
}
delete doc;
}
/**
 * Reads the link text and link target from an anchor element.
 *
 * @param hrefElement  anchor element to read; may be null.
 * @param word         receives the element's text, or "" when it has none.
 * @param url          receives the value of the "href" attribute; when no
 *                     attribute has that name, the value of the first
 *                     attribute; "" when the element has no attributes.
 *
 * Both outputs are cleared first, so a caller that reuses the same strings
 * never sees values left over from an earlier element.
 */
void ExtraHrefLink(TiXmlElement *hrefElement,string & word,string & url)
{
    word.clear();
    url.clear();
    if (!hrefElement)
    {
        return;
    }

    // Prefer the attribute by name; attribute order in the markup is not
    // guaranteed to put href first.
    const char *target = hrefElement->Attribute("href");
    if (!target)
    {
        const TiXmlAttribute *firstAttribute = hrefElement->FirstAttribute();
        if (firstAttribute)
        {
            target = firstAttribute->Value();
        }
    }
    if (target)
    {
        url = target;
    }

    // GetText() returns null for an element without leading text; assigning
    // null to std::string is undefined behaviour.
    const char *text = hrefElement->GetText();
    if (text)
    {
        word = text;
    }
}

/**
 * Convenience overload: fills `link.word` and `link.url` from `hrefElement`
 * using the string-based overload, so both share one implementation.
 */
void ExtraHrefLink(TiXmlElement *hrefElement,CHrefLink &link)
{
    ExtraHrefLink(hrefElement, link.word, link.url);
}
/**
 * Appends one CHrefLink to `vKey` for every child element of `tableColumn`,
 * in document order, using ExtraHrefLink to fill each link.
 *
 * @param tableColumn  table cell whose child elements are the anchors; a null
 *                     pointer leaves `vKey` untouched.
 * @param vKey         receives the links; existing content is kept.
 */
void ExtraKeyWords(TiXmlElement *tableColumn,vector <CHrefLink> &vKey)
{
    if (!tableColumn)
    {
        return;
    }

    for (TiXmlElement *hrefElement = tableColumn->FirstChildElement();
         hrefElement;
         hrefElement = hrefElement->NextSiblingElement())
    {
        // A fresh link per anchor keeps values from one anchor out of the next.
        CHrefLink link;
        ExtraHrefLink(hrefElement, link);
        vKey.push_back(link);
    }
}

posted @ 2011-06-12 17:26  westfly  阅读(428)  评论(0编辑  收藏  举报