[wbia 1.6] 计算抓取结果的Pagerank

    这里,我们得到了网页间的链接关系,可以比较方便地开始进行PageRank的计算了。首先需要预处理得到整个链接关系图。对于一个url的定义如下:

// One crawled page, i.e. one node of the link graph.
// operator> compares the score slot selected by the global `scoreIdx`,
// so `scoreIdx` has to be set before the pages are sorted.
struct url
{
    int id;               // numeric id assigned by initUrl()
    string urlTxt;        // URL text as read from the link map
    int outDegree;        // number of outgoing links counted by initNet()
    vector<int> refList;  // ids of the pages that link to this page
    int inDegree;         // number of incoming links counted by initNet()
    double score[2];      // two score buffers, swapped between iterations

    // Give every member a defined value; without this a freshly declared
    // `url` carries indeterminate ints and scores.
    url() : id(0), outDegree(0), inDegree(0)
    {
        score[0] = 0.0;
        score[1] = 0.0;
    }

    // Orders pages by the currently selected score, larger first.
    bool operator > (const url& u) const
    {
        return score[scoreIdx] > u.score[scoreIdx];
    }
};

预处理得到Url集合,为了加速运算,将每个url编号,在图中只存储每个url的编号。

// Reads every whitespace-separated token of the link map (mapPath) and
// registers each distinct URL as a node: the URL gets the next free id,
// is recorded in urlId and appended to urls with both degrees set to 0.
// Tokens ending in ".css" or ".js" are ignored.
void initUrl()
{
    cout<<nowTime()<<" Begin init user."<<endl;
    ifstream in(mapPath.c_str());
    if(!in)
    {
        cerr<<"Cannot open link map: "<<mapPath<<endl;
        return;
    }

    string urlStr;
    while(in>>urlStr)
    {
        // Length-checked suffix tests: substr(size()-4) would wrap around and
        // throw std::out_of_range for tokens shorter than the suffix.
        const string::size_type len=urlStr.size();
        const bool isCss = len>=4 && urlStr.compare(len-4,4,".css")==0;
        const bool isJs  = len>=3 && urlStr.compare(len-3,3,".js")==0;
        if(isCss||isJs) continue;

        // The id is the position the node will take in urls, so an id can
        // always be used directly as an index into urls.
        const int id=static_cast<int>(urls.size());

        // insert() reports whether the URL was new; one lookup instead of two.
        if(!urlId.insert(make_pair(urlStr,id)).second) continue;

        url newUrl;
        newUrl.id=id;
        newUrl.inDegree=0;
        newUrl.outDegree=0;
        newUrl.urlTxt=urlStr;
        urls.push_back(newUrl);
    }
}

得到整个图关系代码如下:

void initNet()
{
    cout << nowTime() << ": Begin init net." << endl;
    ifstream in(mapPath.c_str());
        string fromStr,toStr,rootStr;
        in>>rootStr;
    while(in>>fromStr>>toStr)
    {
                if(links.find(fromStr+"#"+toStr)!=links.end()) continue;
                else links.insert(fromStr+"#"+toStr);
                if(fromStr.substr(fromStr.size()-4)==".css"||toStr.substr(toStr.size()-3)==".js")
                {
                        continue;
                }
                int from=urlId[fromStr];
                int to=urlId[toStr];

                urls[to].refList.push_back(from);
                urls[to].inDegree++;

                urls[from].outDegree++;
    }
    in.close();
    cout << nowTime() << ": Complete init net." << endl;
}

计算pagerank的代码如下:

int calPagerank()
{
    cout << nowTime() << ": Begin calculate pagerank." << endl;
        int oldIdx = 0;
        int newIdx = 1;
        int urlNum=urls.size();
        for(int i=0;i<urlNum;i++) urls[i].score[oldIdx]=sum/urlNum;

        for(int iteration=0; iteration <= 10000; iteration++)
        {
                cout<<nowTime()<<": iteration "<<iteration<<endl;
                for(int i =0;i<urlNum;i++)
                {
                        urls[i].score[newIdx] = 0;
                }
                double perSum = 0;
                for(int i = 0; i < urlNum; i ++)
                {
                        urls[i].score[newIdx] = sum*rem/urlNum;
                        if(urls[i].inDegree> 0)
                        {
                                for(vector<int>::iterator iter = urls[i].refList.begin(); iter != urls[i].refList.end(); iter++)
                                {
                                        int id = *iter;
                                        urls[i].score[newIdx] += (1-rem)*urls[id].score[oldIdx]/urls[id].outDegree;
                                }
                        }
                        perSum += urls[i].score[newIdx];
                }
                for(int i=0;i<urlNum;i++)
                {
                        urls[i].score[newIdx] = urls[i].score[newIdx] / perSum;
                }
                double dif = 0;
                for(int i = 0; i < urlNum; i ++)
                {
                        dif += abs(urls[i].score[newIdx] - urls[i].score[oldIdx]);
                }

                if( dif < bound )
                {
                        break;
                }
                oldIdx = newIdx;
                newIdx = 1 - oldIdx;
        }
        cout << nowTime() << ": Complete calculate pagerank." << endl;
        return newIdx;
}

将得到的pagerank结果写入文件:

void pagerankWriteToFile()
{
        cout << nowTime() << ": Begin write pagerank to file." << endl;
        ofstream out(rankPath.c_str(), ios::trunc|ios::out);

        out << setiosflags(ios::fixed) << setprecision(20);

        sort(urls.begin(),urls.end(),greater<url>());
        for(int i=0;i<urls.size();i++)
        {
                out<<urls[i].urlTxt<<"\t"<<urls[i].score[scoreIdx]<<"\t"<<urls[i].inDegree<<endl;
        }
        out.close();
        cout << nowTime() << ": Complete write pagerank to file." << endl;
}

进行完这些计算后,和同学交流,发现得到的结果出入较大。交流发现,是对出度为0的点没有处理的原因。由于我抓的图是部分的图,所以出度为0的点,实际上是和其他网络交流的一个Saddle,但是应有的pagerank并没有从这里流出去。这个出度为0的点像黑洞一样只吸收pagerank值却不支出pagerank值。所以,需要将这些出度为0的点进行特殊处理,即认为这些节点向其他每个节点都连了一条边。经过修改后计算pagerank的代码如下:

int calPagerank()
{
    cout << nowTime() << ": Begin calculate pagerank." << endl;
        int oldIdx = 0;
        int newIdx = 1;
        int urlNum=urls.size();
        for(int i=0;i<urlNum;i++) urls[i].score[oldIdx]=sum/urlNum;

        for(int iteration=0; iteration <= 10000; iteration++)
        {
                cout<<nowTime()<<": iteration "<<iteration<<endl;
                for(int i=0;i<urlNum;i++) urls[i].score[newIdx] = sum*rem/urlNum;

                for(int i=0;i<urlNum;i++)
                {
                        if(urls[i].outDegree!=0) continue;
                        for(int j=0;j<urlNum;j++)
                        {
                                if(i==j) continue;
                                urls[j].score[newIdx]+=(1-rem)*urls[i].score[oldIdx]/(urlNum-1);
                        }
                }
                double perSum = 0;
                for(int i = 0; i < urlNum; i ++)
                {
                        if(urls[i].inDegree> 0)
                        {
                                for(vector<int>::iterator iter = urls[i].refList.begin(); iter != urls[i].refList.end(); iter++)
                                {
                                        int id = *iter;
                                        urls[i].score[newIdx] += (1-rem)*urls[id].score[oldIdx]/urls[id].outDegree;
                                }
                        }
                        perSum += urls[i].score[newIdx];
                }
                for(int i=0;i<urlNum;i++)
                {
                        urls[i].score[newIdx] = urls[i].score[newIdx] / perSum;
                }
                double dif = 0;
                for(int i = 0; i < urlNum; i ++)
                {

                       dif += abs(urls[i].score[newIdx] - urls[i].score[oldIdx]);
                }

                if( dif < bound )
                {
                        break;
                }
                oldIdx = newIdx;
                newIdx = 1 - oldIdx;
        }
        cout << nowTime() << ": Complete calculate pagerank." << endl;
        return newIdx;
}

以下是程序完整的C++代码:

#include <iostream>
#include <fstream>
#include <iomanip>
#include <time.h>
#include <map>
#include <list>
#include <cstdlib>
#include <cstring>
#include <string>
#include <memory.h>
#include <algorithm>
#include <vector>
#include <cmath>
#include <set>
using namespace std;

// Index (0 or 1) of the score slot read by url::operator> and by
// pagerankWriteToFile(); main() sets it from calPagerank()'s return value.
int scoreIdx;

// One crawled page, i.e. one node of the link graph.
// operator> compares the score slot selected by `scoreIdx`, so `scoreIdx`
// has to be set before the pages are sorted.
struct url
{
    int id;               // numeric id assigned by initUrl()
    string urlTxt;        // URL text as read from the link map
    int outDegree;        // number of outgoing links counted by initNet()
    vector<int> refList;  // ids of the pages that link to this page
    int inDegree;         // number of incoming links counted by initNet()
    double score[2];      // two score buffers, swapped between iterations

    // Give every member a defined value; without this a freshly declared
    // `url` carries indeterminate ints and scores.
    url() : id(0), outDegree(0), inDegree(0)
    {
        score[0] = 0.0;
        score[1] = 0.0;
    }

    // Orders pages by the currently selected score, larger first.
    bool operator > (const url& u) const
    {
        return score[scoreIdx] > u.score[scoreIdx];
    }
};

// URL text -> numeric id, filled by initUrl().
map<string,int> urlId;
// One key per (from, to) link already processed by initNet(); used to skip duplicates.
set<string> links;

// All pages; initUrl() appends them and initNet() fills in their links and degrees.
vector<url> urls;

// Total score spread over the pages at the start of calPagerank().
double sum = 1.0;
// Weight of the uniform sum*rem/urlNum term; link contributions are weighted by (1-rem).
double rem = 0.15;
// calPagerank() stops once the summed absolute score change of a round is below this.
double bound = 0.000001;

// Input file read by initUrl() and initNet().
string mapPath="linkMap.txt";
// Output file written by pagerankWriteToFile().
string rankPath="pagerank.txt";


// Returns the current local time formatted with "%Y/%m/%d %X"
// (date with slashes, then the locale's time representation).
// Returns an empty string if the time cannot be converted or formatted.
string nowTime()
{
    char outTime[64] = "";
    const time_t t = time(0);

    // localtime() may return a null pointer, which must not reach strftime().
    const struct tm* local = localtime(&t);
    if(local == 0) return string();

    // strftime() returns 0 when the result does not fit; the buffer contents
    // are then unspecified and must not be returned.
    if(strftime(outTime, sizeof(outTime), "%Y/%m/%d %X", local) == 0) return string();

    return outTime;
}

// Reads every whitespace-separated token of the link map (mapPath) and
// registers each distinct URL as a node: the URL gets the next free id,
// is recorded in urlId and appended to urls with both degrees set to 0.
// Tokens ending in ".css" or ".js" are ignored.
void initUrl()
{
    cout<<nowTime()<<" Begin init user."<<endl;
    ifstream in(mapPath.c_str());
    if(!in)
    {
        cerr<<"Cannot open link map: "<<mapPath<<endl;
        return;
    }

    string urlStr;
    while(in>>urlStr)
    {
        // Length-checked suffix tests: substr(size()-4) would wrap around and
        // throw std::out_of_range for tokens shorter than the suffix.
        const string::size_type len=urlStr.size();
        const bool isCss = len>=4 && urlStr.compare(len-4,4,".css")==0;
        const bool isJs  = len>=3 && urlStr.compare(len-3,3,".js")==0;
        if(isCss||isJs)
        {
            continue;
        }

        // The id is the position the node will take in urls, so an id can
        // always be used directly as an index into urls.
        const int id=static_cast<int>(urls.size());

        // insert() reports whether the URL was new; one lookup instead of two.
        if(!urlId.insert(make_pair(urlStr,id)).second)
        {
            continue;
        }

        url newUrl;
        newUrl.id=id;
        newUrl.inDegree=0;
        newUrl.outDegree=0;
        newUrl.urlTxt=urlStr;
        urls.push_back(newUrl);
    }
}

void initNet()
{
    cout << nowTime() << ": Begin init net." << endl;
    ifstream in(mapPath.c_str());
    string fromStr,toStr,rootStr;
    in>>rootStr;
    int cnt=0;
    while(in>>fromStr>>toStr)
    {
        if(links.find(fromStr+"#"+toStr)!=links.end()) continue;
        else links.insert(fromStr+"#"+toStr);
        if(fromStr.substr(fromStr.size()-4)==".css"||toStr.substr(toStr.size()-4)==".css"||fromStr.substr(fromStr.size()-3)==".js"||toStr.substr(toStr.size()-3)==".js")
        {
            continue;
        }
        cnt++;
        int from=urlId[fromStr];
        int to=urlId[toStr];

        urls[to].refList.push_back(from);
        urls[to].inDegree++;

        urls[from].outDegree++;
    }
    cout<<"The link number is " << cnt<<endl;
    in.close();
    cout << nowTime() << ": Complete init net." << endl;
}

int calPagerank()
{
    cout << nowTime() << ": Begin calculate pagerank." << endl;
    int oldIdx = 0;
    int newIdx = 1;
    int urlNum=urls.size();
    for(int i=0;i<urlNum;i++) urls[i].score[oldIdx]=sum/urlNum;

    for(int iteration=0; iteration <= 10000; iteration++)
    {
        cout<<nowTime()<<": iteration "<<iteration<<endl;
        for(int i=0;i<urlNum;i++) urls[i].score[newIdx] = sum*rem/urlNum;

        for(int i=0;i<urlNum;i++)
        {
            if(urls[i].outDegree!=0) continue;
            for(int j=0;j<urlNum;j++)
            {
                if(i==j) continue;
                urls[j].score[newIdx]+=(1-rem)*urls[i].score[oldIdx]/(urlNum-1);
            }
        }
        double perSum = 0;
        for(int i = 0; i < urlNum; i ++)
        {
            if(urls[i].inDegree> 0)
            {
                for(vector<int>::iterator iter = urls[i].refList.begin(); iter != urls[i].refList.end(); iter++)
                {
                    int id = *iter;
                    urls[i].score[newIdx] += (1-rem)*urls[id].score[oldIdx]/urls[id].outDegree;
                }
            }
            perSum += urls[i].score[newIdx];
        }
        for(int i=0;i<urlNum;i++)
        {
            urls[i].score[newIdx] = urls[i].score[newIdx] / perSum;
        }
        double dif = 0;
        for(int i = 0; i < urlNum; i ++)
        {
            dif += abs(urls[i].score[newIdx] - urls[i].score[oldIdx]);
        }

        if( dif < bound )
        {
            break;
        }
        oldIdx = newIdx;
        newIdx = 1 - oldIdx;
    }
    cout << nowTime() << ": Complete calculate pagerank." << endl;
    return newIdx;
}

void pagerankWriteToFile()
{
    cout << nowTime() << ": Begin write pagerank to file." << endl;
    ofstream out(rankPath.c_str(), ios::trunc|ios::out);

    out << setiosflags(ios::fixed) << setprecision(20);

    sort(urls.begin(),urls.end(),greater<url>());
    for(int i=0;i<urls.size();i++)
    {
        out<<urls[i].urlTxt<<"\t"<<urls[i].score[scoreIdx]<<"\t"<<urls[i].inDegree<<endl;
    }
    out.close();
    cout << nowTime() << ": Complete write pagerank to file." << endl;
}

// Program entry: register the pages, wire up the links, rank, then dump.
int main()
{
    // Nodes first, edges second: initNet() looks up ids created by initUrl().
    initUrl();
    initNet();

    // calPagerank() reports which score slot holds the final values;
    // the writer and the sort comparator read it through scoreIdx.
    const int finalSlot = calPagerank();
    scoreIdx = finalSlot;

    pagerankWriteToFile();
    return 0;
}

经过如此计算后,结果虽然有一定变化,但是变化不大。也许是数据集或者抓取过程的原因,我也没在我的代码以及他的代码中发现什么问题。如果有发现代码问题的,请留言,不胜感激。下面将对得到的结果进行介绍。

posted on 2012-04-16 17:12  liugoodness  阅读(1013)  评论(6)  编辑  收藏  举报

导航