通过tinyxml解析百度热词榜单
百度热词榜单
初学TinyXml,盯上了解析百度热词榜单。通过curl获取rss地址上的数据。
由于TinyXml不支持GB2312编码,所以要先把文件转换为UTF-8格式,命令为
iconv -f gb2312 -t utf-8 baidutop.xml > baidutop.utf8.xml && mv baidutop.utf8.xml baidutop.xml
写得很烂,写的时候,心里对内存管理部分总是瞻前顾后的,真是奇怪啊。
View Code
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

#include <tinyxml/tinyxml.h>
#include <tinyxml/tinystr.h>
using namespace std;
// A hyperlink taken from an XML/HTML element: its visible text plus its target.
struct CHrefLink
{
    std::string word;  // text content of the link element
    std::string url;   // value of the link element's first attribute
};
// One data row of the hot-word ranking table, as filled in by GetHotRank().
struct CBaiDuTopKey
{
    int rank;                     // number parsed from the first column
    long account;                 // search volume parsed from the third column
    std::string keyWord;          // text of the link in the second column
    std::string wordUrl;          // first attribute value of that link
    std::vector<CHrefLink> href;  // extra links; nothing visible here fills it
};
// Parses the table markup in `xml` and appends one entry per data row to `hot`.
void GetHotRank(std::vector<CBaiDuTopKey> &hot, const char *xml);

// Copies the element's first attribute value and its text into `link`.
void ExtraHrefLink(TiXmlElement *hrefElement, CHrefLink &link);

// Same extraction as above, writing into two separate strings.
void ExtraHrefLink(TiXmlElement *hrefElement, std::string &word, std::string &url);

// Appends one CHrefLink to `vKey` for every child element of `tableColumn`.
void ExtraKeyWords(TiXmlElement *tableColumn, std::vector<CHrefLink> &vKey);
int main()
{
//创建一个XML的文档对象。
TiXmlDocument *myDocument = new TiXmlDocument("baidutop.xml");
myDocument->LoadFile();
//获得根元素
TiXmlElement *RootElement = myDocument->RootElement();
//输出根元素名称
cout << RootElement->Value() << endl;
//解析获得有用的信息
TiXmlElement *channelElement = RootElement->FirstChildElement();
string title = channelElement->FirstChildElement()->GetText();
TiXmlElement *itemElement = (TiXmlElement *)channelElement->LastChild();
int ttl = atoi(((TiXmlElement *)itemElement->PreviousSibling())->GetText());
TiXmlElement * descriptionElement = (TiXmlElement *)(itemElement->LastChild());
vector <CBaiDuTopKey> hotword;
GetHotRank(hotword,descriptionElement->GetText());
cout<<"Name :"<<title<<"\t Life Time:"<<ttl<<endl;
for(vector<CBaiDuTopKey>::iterator iter = hotword.begin();
iter!=hotword.end();++iter)
{
cout<<"\nrank \t"<<(*iter).rank<<"\t amount:"<<(*iter).account;
cout<<"\nkeyword\t"<<(*iter).keyWord<<"\nurl\t"<<(*iter).wordUrl;
}
cout<<endl;
delete myDocument;
return 0;
}
void GetHotRank(vector <CBaiDuTopKey> & hotVector,const char *xml)
{
TiXmlDocument *doc = new TiXmlDocument();
int row = 0;
doc->Parse(xml);
TiXmlElement *tableElement = doc->RootElement();
//
TiXmlElement *tableRoot = tableElement->FirstChildElement();
if(!tableRoot)
{
cout<<"errro !"<<endl;
return ;
}
//第一个元素为table的表头,剔除
for(TiXmlElement *tableRow = tableRoot->FirstChildElement()->NextSiblingElement();
tableRow;
tableRow = tableRow->NextSiblingElement())
{
CBaiDuTopKey rankedword;
TiXmlElement *tableColumn = tableRow->FirstChildElement();
//关键词排名
rankedword.rank=atoi(tableColumn->GetText());
tableColumn = tableColumn->NextSiblingElement();
//关键词
ExtraHrefLink(tableColumn->FirstChildElement(),rankedword.keyWord,rankedword.wordUrl);
tableColumn = tableColumn->NextSiblingElement();
//搜索量
rankedword.account=atol(tableColumn->GetText());
//链接
tableColumn = tableColumn->NextSiblingElement();
hotVector.push_back(rankedword);
}
delete doc;
}
/**
 * Extracts a link from `hrefElement` into `link`.
 *
 * link.url receives the value of the element's first attribute and link.word
 * receives the element's text. A field whose source is missing (NULL element,
 * no attributes, or no text) is set to the empty string rather than being
 * read through a NULL pointer.
 *
 * NOTE(review): this takes the FIRST attribute, whatever its name; it is the
 * link target only if href comes first in the markup -- confirm.
 *
 * @param hrefElement  element to read; may be NULL.
 * @param link         receives the extracted url and word.
 */
void ExtraHrefLink(TiXmlElement *hrefElement,CHrefLink &link)
{
    const TiXmlAttribute *firstAttribute = hrefElement ? hrefElement->FirstAttribute() : NULL;
    const char *target = firstAttribute ? firstAttribute->Value() : NULL;
    const char *text = hrefElement ? hrefElement->GetText() : NULL;

    link.url = target ? target : "";
    link.word = text ? text : "";
}
/**
 * Extracts a link from `hrefElement` into two separate strings.
 *
 * `url` receives the value of the element's first attribute and `word`
 * receives the element's text. An output whose source is missing (NULL
 * element, no attributes, or no text) is set to the empty string rather than
 * being read through a NULL pointer.
 *
 * NOTE(review): this takes the FIRST attribute, whatever its name; it is the
 * link target only if href comes first in the markup -- confirm.
 *
 * @param hrefElement  element to read; may be NULL.
 * @param word         receives the element text.
 * @param url          receives the first attribute's value.
 */
void ExtraHrefLink(TiXmlElement *hrefElement,string & word,string & url)
{
    const TiXmlAttribute *firstAttribute = hrefElement ? hrefElement->FirstAttribute() : NULL;
    const char *target = firstAttribute ? firstAttribute->Value() : NULL;
    const char *text = hrefElement ? hrefElement->GetText() : NULL;

    url = target ? target : "";
    word = text ? text : "";
}
/**
 * Appends one CHrefLink to `vKey` for each usable child element of
 * `tableColumn`, in document order.
 *
 * A child is usable when it has at least one attribute and some text, which
 * are the two things ExtraHrefLink reads; other children are skipped. A NULL
 * `tableColumn` appends nothing.
 *
 * @param tableColumn  cell whose child elements are the links; may be NULL.
 * @param vKey         receives the extracted links; existing content is kept.
 */
void ExtraKeyWords(TiXmlElement *tableColumn,vector <CHrefLink> &vKey)
{
    if (!tableColumn)
    {
        return;
    }

    for (TiXmlElement *hrefElement = tableColumn->FirstChildElement();
         hrefElement;
         hrefElement = hrefElement->NextSiblingElement())
    {
        if (!hrefElement->FirstAttribute() || !hrefElement->GetText())
        {
            continue;  // nothing to extract from this child
        }

        CHrefLink link;
        ExtraHrefLink(hrefElement, link);
        vKey.push_back(link);
    }
}