[wbia 1.6] 计算抓取结果的Pagerank
这里,我们已经得到了网页间的链接关系,可以比较方便地开始进行pagerank的计算了。首先需要预处理,得到整个链接关系图。一个url的定义如下:
// One page of the crawl, i.e. one node of the link graph.
struct url {
    int id;               // number assigned to this page when it was first seen
    string urlTxt;        // the URL text as read from the link map file
    int outDegree;        // count of links leaving this page
    vector<int> refList;  // ids of the pages that link to this page
    int inDegree;         // count of links pointing at this page
    double score[2];      // two rank buffers; the global scoreIdx picks the one used for ordering

    // Orders pages by the buffer selected through the global scoreIdx, so that
    // sorting with greater<url>() puts the highest score first.
    bool operator > (const url& other) const {
        return other.score[scoreIdx] < score[scoreIdx];
    }
};
预处理得到Url集合,为了加速运算,将每个url编号,在图中只存储每个url的编号。
// Returns true when `address` ends in ".css" or ".js"; such entries are kept
// out of the link graph.
// The length guards are essential: address.substr(address.size() - 4) wraps
// around for a token shorter than four characters and throws
// std::out_of_range.
static bool isStaticAsset(const string& address) {
    const string::size_type len = address.size();
    return (len >= 4 && address.compare(len - 4, 4, ".css") == 0)
        || (len >= 3 && address.compare(len - 3, 3, ".js") == 0);
}

// Reads every whitespace-separated token of the link map file (mapPath) and
// numbers the URLs that are not static assets, in order of first appearance.
// Fills the globals `urlId` (URL text -> id) and `urls` (id -> node), so that
// urls[k].id == k for the entries appended here.
void initUrl() {
    cout << nowTime() << " Begin init url." << endl;
    ifstream in(mapPath.c_str());
    if (!in) {
        // Without this the run silently continues with an empty graph.
        cerr << "Cannot open " << mapPath << endl;
        return;
    }
    string urlStr;
    int id = 0;
    while (in >> urlStr) {
        if (isStaticAsset(urlStr)) continue;
        if (urlId.find(urlStr) != urlId.end()) continue;  // already numbered
        urlId[urlStr] = id;
        url newUrl;
        newUrl.id = id;
        newUrl.inDegree = 0;
        newUrl.outDegree = 0;
        newUrl.urlTxt = urlStr;
        urls.push_back(newUrl);
        id++;
    }
    in.close();
}
得到整个图关系代码如下:
// Returns true when `address` ends in ".css" or ".js"; such entries are kept
// out of the link graph.
// The length guards are essential: address.substr(address.size() - 4) wraps
// around for a token shorter than four characters and throws
// std::out_of_range.
static bool isStaticAsset(const string& address) {
    const string::size_type len = address.size();
    return (len >= 4 && address.compare(len - 4, 4, ".css") == 0)
        || (len >= 3 && address.compare(len - 3, 3, ".js") == 0);
}

// Builds the link graph from the link map file (mapPath): after one leading
// token, the file is read as (from, to) pairs. Each distinct pair adds `from`
// to urls[to].refList and bumps urls[to].inDegree and urls[from].outDegree.
// Expects `urlId` and `urls` to have been filled by initUrl() beforehand.
void initNet() {
    cout << nowTime() << ": Begin init net." << endl;
    ifstream in(mapPath.c_str());
    string fromStr, toStr, rootStr;
    // Consume the leading token so that the remaining tokens pair up.
    in >> rootStr;
    while (in >> fromStr >> toStr) {
        // set::insert reports whether the key was new, so one call both
        // tests for and records a link that has not been seen yet.
        if (!links.insert(fromStr + "#" + toStr).second) continue;

        // Both endpoints must pass the same filter initUrl() applies. Testing
        // only one suffix per side lets an unnumbered URL reach the lookup.
        if (isStaticAsset(fromStr) || isStaticAsset(toStr)) continue;

        // urlId[...] would silently insert id 0 for an unknown URL and credit
        // the link to page 0, so look up without inserting and skip misses.
        map<string,int>::const_iterator fromIt = urlId.find(fromStr);
        map<string,int>::const_iterator toIt = urlId.find(toStr);
        if (fromIt == urlId.end() || toIt == urlId.end()) continue;

        const int from = fromIt->second;
        const int to = toIt->second;
        urls[to].refList.push_back(from);
        urls[to].inDegree++;
        urls[from].outDegree++;
    }
    in.close();
    cout << nowTime() << ": Complete init net." << endl;
}
计算pagerank的代码如下:
int calPagerank() { cout << nowTime() << ": Begin calculate pagerank." << endl; int oldIdx = 0; int newIdx = 1; int urlNum=urls.size(); for(int i=0;i<urlNum;i++) urls[i].score[oldIdx]=sum/urlNum; for(int iteration=0; iteration <= 10000; iteration++) { cout<<nowTime()<<": iteration "<<iteration<<endl; for(int i =0;i<urlNum;i++) { urls[i].score[newIdx] = 0; } double perSum = 0; for(int i = 0; i < urlNum; i ++) { urls[i].score[newIdx] = sum*rem/urlNum; if(urls[i].inDegree> 0) { for(vector<int>::iterator iter = urls[i].refList.begin(); iter != urls[i].refList.end(); iter++) { int id = *iter; urls[i].score[newIdx] += (1-rem)*urls[id].score[oldIdx]/urls[id].outDegree; } } perSum += urls[i].score[newIdx]; } for(int i=0;i<urlNum;i++) { urls[i].score[newIdx] = urls[i].score[newIdx] / perSum; } double dif = 0; for(int i = 0; i < urlNum; i ++) { dif += abs(urls[i].score[newIdx] - urls[i].score[oldIdx]); } if( dif < bound ) { break; } oldIdx = newIdx; newIdx = 1 - oldIdx; } cout << nowTime() << ": Complete calculate pagerank." << endl; return newIdx; }
将得到的pagerank结果写入文件:
void pagerankWriteToFile() { cout << nowTime() << ": Begin write pagerank to file." << endl; ofstream out(rankPath.c_str(), ios::trunc|ios::out); out << setiosflags(ios::fixed) << setprecision(20); sort(urls.begin(),urls.end(),greater<url>()); for(int i=0;i<urls.size();i++) { out<<urls[i].urlTxt<<"\t"<<urls[i].score[scoreIdx]<<"\t"<<urls[i].inDegree<<endl; } out.close(); cout << nowTime() << ": Complete write pagerank to file." << endl; }
进行完这些计算后,和同学交流,发现得到的结果出入较大。交流发现,是对出度为0的点没有处理的原因。由于我抓的图是部分的图,所以出度为0的点,实际上是和其他网络交流的一个Saddle,但是应有的pagerank并没有从这里流出去。这个出度为0的点像黑洞一样只吸收pagerank值却不支出pagerank值。所以,需要将这些出度为0的点进行特殊处理,即认为这些节点向其他每个节点都连了一条边。经过修改后计算pagerank的代码如下:
int calPagerank() { cout << nowTime() << ": Begin calculate pagerank." << endl; int oldIdx = 0; int newIdx = 1; int urlNum=urls.size(); for(int i=0;i<urlNum;i++) urls[i].score[oldIdx]=sum/urlNum; for(int iteration=0; iteration <= 10000; iteration++) { cout<<nowTime()<<": iteration "<<iteration<<endl; for(int i=0;i<urlNum;i++) urls[i].score[newIdx] = sum*rem/urlNum; for(int i=0;i<urlNum;i++) { if(urls[i].outDegree!=0) continue; for(int j=0;j<urlNum;j++) { if(i==j) continue; urls[j].score[newIdx]+=(1-rem)*urls[i].score[oldIdx]/(urlNum-1); } } double perSum = 0; for(int i = 0; i < urlNum; i ++) { if(urls[i].inDegree> 0) { for(vector<int>::iterator iter = urls[i].refList.begin(); iter != urls[i].refList.end(); iter++) { int id = *iter; urls[i].score[newIdx] += (1-rem)*urls[id].score[oldIdx]/urls[id].outDegree; } } perSum += urls[i].score[newIdx]; } for(int i=0;i<urlNum;i++) { urls[i].score[newIdx] = urls[i].score[newIdx] / perSum; } double dif = 0; for(int i = 0; i < urlNum; i ++) { dif += abs(urls[i].score[newIdx] - urls[i].score[oldIdx]); } if( dif < bound ) { break; } oldIdx = newIdx; newIdx = 1 - oldIdx; } cout << nowTime() << ": Complete calculate pagerank." << endl; return newIdx; }
以下是程序完整的C++代码:
#include <iostream>
#include <fstream>
#include <iomanip>
#include <time.h>
#include <map>
#include <list>
#include <cstdlib>
#include <cstring>
#include <string>
#include <memory.h>
#include <algorithm>
#include <functional>
#include <vector>
#include <cmath>
#include <set>
using namespace std;

// Index (0 or 1) of the score buffer used when ordering and printing pages;
// main() sets it to the value returned by calPagerank().
int scoreIdx;

// One page of the crawl, i.e. one node of the link graph.
struct url {
    int id;               // position of this page inside `urls`
    string urlTxt;        // the URL text as read from the link map file
    int outDegree;        // count of distinct links leaving this page
    vector<int> refList;  // ids of the pages that link to this page
    int inDegree;         // count of distinct links pointing at this page
    double score[2];      // two rank buffers, alternated between rounds

    // Start from defined values so that a node is never copied into `urls`
    // with indeterminate members.
    url() : id(0), outDegree(0), inDegree(0) {
        score[0] = 0.0;
        score[1] = 0.0;
    }

    // Orders pages by the buffer selected through scoreIdx, so that sorting
    // with greater<url>() puts the highest score first.
    bool operator > (const url& u) const {
        return score[scoreIdx] > u.score[scoreIdx];
    }
};

map<string,int> urlId;          // URL text -> id
set<string> links;              // "from#to" keys of the links already counted
vector<url> urls;               // id -> node
double sum = 1.0;               // total rank shared by all pages
double rem = 0.15;              // weight of the uniform share in every update
double bound = 0.000001;        // stop once the summed change per round is below this
string mapPath = "linkMap.txt";
string rankPath = "pagerank.txt";

// Returns the current local time formatted with "%Y/%m/%d %X", used as the
// prefix of the progress messages.
string nowTime() {
    char outTime[64] = "";
    time_t t = time(0);
    strftime(outTime, sizeof(outTime), "%Y/%m/%d %X", localtime(&t));
    return outTime;
}

// Returns true when `address` ends in ".css" or ".js"; such entries are kept
// out of the link graph.
// The length guards are essential: address.substr(address.size() - 4) wraps
// around for a token shorter than four characters and throws
// std::out_of_range.
static bool isStaticAsset(const string& address) {
    const string::size_type len = address.size();
    return (len >= 4 && address.compare(len - 4, 4, ".css") == 0)
        || (len >= 3 && address.compare(len - 3, 3, ".js") == 0);
}

// Reads every whitespace-separated token of the link map file and numbers the
// URLs that are not static assets, in order of first appearance. Fills
// `urlId` and `urls` so that urls[k].id == k.
void initUrl() {
    cout << nowTime() << " Begin init url." << endl;
    ifstream in(mapPath.c_str());
    if (!in) {
        // Without this the run silently continues with an empty graph.
        cerr << "Cannot open " << mapPath << endl;
        return;
    }
    string urlStr;
    int id = 0;
    while (in >> urlStr) {
        if (isStaticAsset(urlStr)) continue;
        if (urlId.find(urlStr) != urlId.end()) continue;  // already numbered
        urlId[urlStr] = id;
        url newUrl;
        newUrl.id = id;
        newUrl.urlTxt = urlStr;
        urls.push_back(newUrl);
        id++;
    }
    in.close();
}

// Builds the link graph: after one leading token, the link map file is read
// as (from, to) pairs. Each distinct pair adds `from` to urls[to].refList and
// bumps urls[to].inDegree and urls[from].outDegree. Prints the number of
// links that were added.
void initNet() {
    cout << nowTime() << ": Begin init net." << endl;
    ifstream in(mapPath.c_str());
    string fromStr, toStr, rootStr;
    // Consume the leading token so that the remaining tokens pair up.
    in >> rootStr;
    int cnt = 0;
    while (in >> fromStr >> toStr) {
        // set::insert reports whether the key was new, so one call both
        // tests for and records a link that has not been seen yet.
        if (!links.insert(fromStr + "#" + toStr).second) continue;
        if (isStaticAsset(fromStr) || isStaticAsset(toStr)) continue;

        // urlId[...] would silently insert id 0 for an unknown URL and credit
        // the link to page 0, so look up without inserting and skip misses.
        map<string,int>::const_iterator fromIt = urlId.find(fromStr);
        map<string,int>::const_iterator toIt = urlId.find(toStr);
        if (fromIt == urlId.end() || toIt == urlId.end()) continue;

        cnt++;
        const int from = fromIt->second;
        const int to = toIt->second;
        urls[to].refList.push_back(from);
        urls[to].inDegree++;
        urls[from].outDegree++;
    }
    cout << "The link number is " << cnt << endl;
    in.close();
    cout << nowTime() << ": Complete init net." << endl;
}

// Iterates the rank update over `urls` until the summed absolute change
// between two rounds drops below `bound`, or 10001 rounds have run.
// Every round gives each page
//   - the uniform share sum*rem/urlNum,
//   - (1-rem) times the rank its referrers pass along
//     (referrer score / referrer outDegree),
//   - (1-rem) times an equal share of the rank held by pages with
//     outDegree == 0, which are treated as linking to every other page,
// and then rescales all scores so that they add up to 1.
// Returns the index (0 or 1) of the score buffer holding the latest result.
int calPagerank() {
    cout << nowTime() << ": Begin calculate pagerank." << endl;
    int oldIdx = 0;
    int newIdx = 1;
    // Buffer written by the most recent round. Returning newIdx is wrong when
    // the round limit is reached, because the buffers were already swapped.
    int latestIdx = newIdx;
    const int urlNum = static_cast<int>(urls.size());
    for (int i = 0; i < urlNum; i++) urls[i].score[oldIdx] = sum / urlNum;

    for (int iteration = 0; iteration <= 10000; iteration++) {
        cout << nowTime() << ": iteration " << iteration << endl;

        // Add up the rank of all pages without out-links once, instead of
        // looping over every page for each of them (quadratic when most
        // pages have no out-links).
        double danglingSum = 0;
        for (int i = 0; i < urlNum; i++) {
            if (urls[i].outDegree == 0) danglingSum += urls[i].score[oldIdx];
        }

        double perSum = 0;
        for (int i = 0; i < urlNum; i++) {
            double& fresh = urls[i].score[newIdx];
            fresh = sum * rem / urlNum;

            // A page without out-links feeds every page except itself.
            double fromDangling = danglingSum;
            if (urls[i].outDegree == 0) fromDangling -= urls[i].score[oldIdx];
            // With a single page there is no "other" page to receive a share.
            if (urlNum > 1) fresh += (1 - rem) * fromDangling / (urlNum - 1);

            // An empty refList (inDegree == 0) simply adds nothing.
            const vector<int>& refs = urls[i].refList;
            for (vector<int>::const_iterator iter = refs.begin(); iter != refs.end(); ++iter) {
                const url& referrer = urls[*iter];
                fresh += (1 - rem) * referrer.score[oldIdx] / referrer.outDegree;
            }
            perSum += fresh;
        }

        for (int i = 0; i < urlNum; i++) {
            urls[i].score[newIdx] = urls[i].score[newIdx] / perSum;
        }

        double dif = 0;
        for (int i = 0; i < urlNum; i++) {
            // fabs always works on double; plain abs depends on which
            // overloads the included headers happen to declare.
            dif += fabs(urls[i].score[newIdx] - urls[i].score[oldIdx]);
        }

        latestIdx = newIdx;
        if (dif < bound) {
            break;
        }
        oldIdx = newIdx;
        newIdx = 1 - oldIdx;
    }
    cout << nowTime() << ": Complete calculate pagerank." << endl;
    return latestIdx;
}

// Sorts `urls` by descending score (buffer chosen by scoreIdx) and writes one
// line per page to rankPath: URL, score, in-degree, tab-separated. The file
// is truncated first; scores are printed in fixed notation with 20 digits
// after the decimal point. Note that `urls` stays sorted afterwards, so
// urls[k].id == k no longer holds.
void pagerankWriteToFile() {
    cout << nowTime() << ": Begin write pagerank to file." << endl;
    ofstream out(rankPath.c_str(), ios::trunc | ios::out);
    if (!out) {
        cerr << "Cannot open " << rankPath << endl;
        return;
    }
    out << setiosflags(ios::fixed) << setprecision(20);
    sort(urls.begin(), urls.end(), greater<url>());
    for (vector<url>::const_iterator page = urls.begin(); page != urls.end(); ++page) {
        // '\n' instead of endl: flushing after every line is unnecessary,
        // close() flushes once at the end.
        out << page->urlTxt << "\t" << page->score[scoreIdx] << "\t" << page->inDegree << '\n';
    }
    out.close();
    cout << nowTime() << ": Complete write pagerank to file." << endl;
}

// Runs the pipeline: number the URLs, build the link graph, compute the
// ranks, and write them to rankPath.
int main() {
    initUrl();
    initNet();
    scoreIdx = calPagerank();
    pagerankWriteToFile();
    return 0;
}
经过如此计算后,结果虽然有一定变化,但是变化不大。也许是数据集或者抓取过程的原因,我也没在我的代码以及他的代码中发现什么问题。如果有发现代码问题的,请留言,不胜感激。下面将对得到的结果进行介绍。
posted on 2012-04-16 17:12 liugoodness 阅读(1013) 评论(6) 编辑 收藏 举报