将WebGraph合并为HostGraph
LAW实验室提供了很多WebGraph数据集,但是没有提供相应的HostGraph。所谓HostGraph,就是将WebGraph中属于同一站点(host)的所有URL合并成一个结点。注意到这些WebGraph中同一站点的URL编号是连续的,这为合并提供了很大的方便。本来想用Java来写,但是考虑到Java的IO效率在Windows下比较差,就改用C#了。我用IKVM将webgraph.jar及其依赖的jar文件打包成了webgraph.dll。
合并算法相对比较简单,分为两步:第一步,扫描url文件,建立URL结点与站点结点之间的对应关系;第二步,读取WebGraph进行合并,并生成HostGraph。
代码如下:
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using it.unimi.dsi.webgraph;
using org.apache.log4j;

namespace indochina_2004host
{
    /// <summary>
    /// Collapses a WebGraph (one node per URL) into a host graph (one node per host).
    /// Step 1 (<see cref="MergeUrl"/>) scans "&lt;basename&gt;.urls" and writes
    /// "&lt;basename&gt;.hostnames.txt" and "&lt;basename&gt;.map".
    /// Step 2 (<see cref="MergeGraph"/>) reads the map plus the BVGraph and writes
    /// "&lt;basename&gt;.hostgraph.graph-txt".
    /// </summary>
    class Program
    {
        // Default dataset basename; can be overridden by the first command-line argument.
        // NOTE(review): this literal looks like it lost its directory separators
        // (e.g. between "D:" and the folder names) -- confirm the intended path.
        static string basename = @"D:研究数据集law datasetsuk-2007-05uk-2007-05";
        static Logger logger;

        // When true the graph is opened with BVGraph.loadOffline instead of BVGraph.load.
        static bool offline = false;

        static Program()
        {
            Logger.getRootLogger().addAppender(new ConsoleAppender(new TTCCLayout(), ConsoleAppender.SYSTEM_OUT));
            logger = Logger.getLogger(typeof(Program));
        }

        /// <summary>
        /// Entry point. Runs the URL pass and then the graph pass.
        /// </summary>
        /// <param name="args">
        /// Optional: args[0] replaces the built-in dataset basename.
        /// </param>
        static void Main(string[] args)
        {
            if (args != null && args.Length > 0 && !string.IsNullOrWhiteSpace(args[0]))
            {
                basename = args[0];
            }

            MergeUrl();
            MergeGraph();
        }

        /// <summary>
        /// Reads "&lt;basename&gt;.map" and the BVGraph and writes the host graph as text:
        /// the first line is the number of hosts, then one line per host holding the
        /// ascending, space-separated ids of the distinct hosts it links to
        /// (links from a host to itself are dropped).
        /// </summary>
        /// <exception cref="InvalidDataException">
        /// A map line is malformed or refers to a node outside the graph.
        /// </exception>
        static void MergeGraph()
        {
            logger.info("start merge graph");

            // Parse the map file first so a missing or broken map fails before the
            // graph is loaded. Lines are streamed, so the raw text is never held
            // in memory as a whole.
            List<int> starts = new List<int>();
            List<int> ends = new List<int>();
            List<int> hostIds = new List<int>();
            foreach (string line in File.ReadLines(basename + ".map"))
            {
                if (line.Length == 0)
                {
                    continue;
                }

                // Line format written by MergeUrl: "<firstNode>-<lastNode> <hostId>".
                string[] parts = line.Split(' ', '-');
                if (parts.Length < 3)
                {
                    throw new InvalidDataException("Malformed map line: '" + line + "'");
                }

                starts.Add(int.Parse(parts[0], CultureInfo.InvariantCulture));
                ends.Add(int.Parse(parts[1], CultureInfo.InvariantCulture));
                hostIds.Add(int.Parse(parts[2], CultureInfo.InvariantCulture));
            }

            BVGraph bg;
            logger.info("loading graph");
            if (offline)
            {
                bg = BVGraph.loadOffline(basename);
            }
            else
            {
                bg = BVGraph.load(basename);
            }

            // Build the node -> host lookup table.
            logger.info("generate map");
            int[] nodeToHost = new int[bg.numNodes()];
            long coveredNodes = 0;
            for (int k = 0; k < starts.Count; k++)
            {
                int first = starts[k];
                int last = ends[k];
                if (first < 0 || last < first || last >= nodeToHost.Length)
                {
                    throw new InvalidDataException(
                        "Map range " + first + "-" + last + " does not fit a graph with " + nodeToHost.Length + " nodes");
                }

                for (int node = first; node <= last; node++)
                {
                    nodeToHost[node] = hostIds[k];
                }

                coveredNodes += (long)last - first + 1;
            }

            if (coveredNodes != nodeToHost.Length)
            {
                logger.warn("map covers " + coveredNodes + " nodes but the graph has " + nodeToHost.Length);
            }

            logger.info("map length: " + starts.Count);

            using (StreamWriter sw = new StreamWriter(basename + ".hostgraph.graph-txt"))
            {
                // The iterator is advanced exactly once per node listed in the map, so
                // the ranges are expected to be contiguous and in ascending order
                // starting at node 0 (which is how MergeUrl writes them).
                NodeIterator it = bg.nodeIterator();
                sw.WriteLine(starts.Count);

                // Start merging.
                logger.info("merging...");
                for (int k = 0; k < starts.Count; k++)
                {
                    logger.info("merging " + starts[k] + "-" + ends[k] + ": " + nodeToHost[starts[k]]);

                    // Successors are translated to host ids as they are read, so only
                    // one (much smaller) sorted set is needed per host.
                    SortedSet<int> hostSuccessors = new SortedSet<int>();
                    for (int node = starts[k]; node <= ends[k]; node++)
                    {
                        it.nextInt();
                        LazyIntIterator lit = it.successors();
                        int successor;
                        while ((successor = lit.nextInt()) != -1)
                        {
                            int targetHost = nodeToHost[successor];

                            // Host ids in the map are the line indices, so k is this host.
                            if (targetHost != k)
                            {
                                hostSuccessors.Add(targetHost);
                            }
                        }
                    }

                    // An empty set yields an empty line.
                    sw.WriteLine(string.Join(" ", hostSuccessors));
                }

                sw.Flush();
            }

            logger.info("end merge graph");
        }

        /// <summary>
        /// Scans "&lt;basename&gt;.urls" (one URL per line, line index = node id) and groups
        /// consecutive URLs that share a host. Writes one host name per group to
        /// "&lt;basename&gt;.hostnames.txt" and one "&lt;firstNode&gt;-&lt;lastNode&gt; &lt;hostId&gt;" line
        /// per group to "&lt;basename&gt;.map". An empty input produces two empty files.
        /// </summary>
        static void MergeUrl()
        {
            logger.info("start merge url");
            using (StreamReader urlSr = new StreamReader(basename + ".urls"))
            using (StreamWriter hostnamesSw = new StreamWriter(basename + ".hostnames.txt"))
            using (StreamWriter mapSw = new StreamWriter(basename + ".map"))
            {
                long nodeCount = 0;   // number of URLs read so far (= id of the next node)
                long rangeStart = 0;  // first node id of the host currently being collected
                long hostId = 0;      // id assigned to the host currently being collected
                string currentHost = null;

                string url;
                while ((url = urlSr.ReadLine()) != null)
                {
                    string host = ExtractHost(url);
                    if (host != currentHost)
                    {
                        if (currentHost != null)
                        {
                            hostnamesSw.WriteLine(currentHost);
                            mapSw.WriteLine(rangeStart + "-" + (nodeCount - 1) + " " + hostId);
                            hostId++;
                            rangeStart = nodeCount;
                        }

                        currentHost = host;
                    }

                    nodeCount++;
                }

                // Emit the last group; skipped when the file had no lines at all so
                // that no bogus "0--1 0" range is written.
                if (currentHost != null)
                {
                    hostnamesSw.WriteLine(currentHost);
                    mapSw.WriteLine(rangeStart + "-" + (nodeCount - 1) + " " + hostId);
                }

                hostnamesSw.Flush();
                mapSw.Flush();
            }

            logger.info("end merge url");
        }

        /// <summary>
        /// Returns the lower-cased host part of a URL: a leading "http://" or
        /// "https://" is removed and everything from the first '/' on is cut off.
        /// A URL without any '/' after the scheme is returned whole instead of throwing.
        /// </summary>
        /// <param name="url">One line of the .urls file.</param>
        private static string ExtractHost(string url)
        {
            string rest = url.Trim().ToLowerInvariant();
            if (rest.StartsWith("http://", StringComparison.Ordinal))
            {
                rest = rest.Substring("http://".Length);
            }
            else if (rest.StartsWith("https://", StringComparison.Ordinal))
            {
                rest = rest.Substring("https://".Length);
            }

            int slash = rest.IndexOf('/');
            string host = slash >= 0 ? rest.Substring(0, slash) : rest;
            return host.Trim();
        }
    }
}
本文基于署名 2.5 中国大陆许可协议发布,欢迎转载,演绎或用于商业目的,但是必须保留本文的署名小橋流水(包含链接)。如您有任何疑问或者授权方面的协商,请给我发邮件。