HITS算法Java实现
HITS算法是重要的链接分析算法。很多书上是用矩阵的形式来描述HITS算法:
其中,为邻接矩阵,和分别为权威值和中心值,幂法迭代算法如下:
但是为了空间的考虑,我们在存储Web图的时候,一般都是用的邻接矩阵表示。
经过分析发现,一个页面的权威值,其实是指向它的页面的中心值之和;一个页面的中心值,是它指向的页面的权威值的过程。这是一个相互加强的过程。
下面是用Java实现的代码:
package cn.edu.dlut.wisdom;
import it.unimi.dsi.webgraph.ImmutableGraph;
import it.unimi.dsi.webgraph.LazyIntIterator;
import it.unimi.dsi.webgraph.NodeIterator;
import it.unimi.dsi.webgraph.Transform;
import org.apache.log4j.Logger;
/**
** @author You Wang*/public class HITS {/**
* 正向图*/private ImmutableGraph g;
/**
* 反向图*/private ImmutableGraph ig;
/**
* 日志*/private final Logger logger;/**
* 结点数目*/private int numNodes;/**
* 权威分数*/private double[] authorityScores;/**
* 中心分数*/private double[] hubScores;/**
* 两次权威分数之差绝对值的和*/private double authorityNorm;/**
* 两次中心分数之差绝对值的和*/private double hubNorm;/**
* 迭代次数*/private double numIter = 0;/**
* 获取中心差值* @return*/public double getHubNorm() {return hubNorm;
}/**
* 获取权威差值* @return*/public double getAuthorityNorm() {return authorityNorm;
}/**
* 获取权威分数* @return*/public double[] getAuthorityScores() {return authorityScores;
}/**
* 获取中心分数* @return*/public double[] getHubScores() {return hubScores;
}/**
* 构造函数* @param g 要计算的Web图*/public HITS(ImmutableGraph g) {
this.g = g;
ig = Transform.transpose(g);numNodes = g.numNodes();authorityScores = new double[numNodes];hubScores = new double[numNodes];double is = 1.0 / numNodes;
for (int i = 0; i < numNodes; i++) {authorityScores[i] = is;hubScores[i] = is;}logger = Logger.getLogger(HITS.class);
}/**
* 设定初始权威分数* @param scores*/public void setInitialAuthorityScores(double[] scores) {if (scores.length != numNodes)
throw new IllegalArgumentException("array length mismatch");this.authorityScores = scores;
}/**
* 设定初始中心分数* @param scores*/public void setInitialHubScores(double[] scores) {if (scores.length != numNodes)
throw new IllegalArgumentException("array lenght mismatch");this.hubScores = scores;
}/**
* 迭代中的一步*/public void step() {logger.info("iter " + ++numIter);
authorityNorm = 0;hubNorm = 0;NodeIterator nit = g.nodeIterator();NodeIterator init = ig.nodeIterator();double[] as = new double[numNodes];double[] hs = new double[numNodes];while(nit.hasNext() && init.hasNext()) {
int i = nit.nextInt();
int j = init.nextInt();
assert (i == j);LazyIntIterator it = init.successors();as[i] = 0;int k;
while ((k = it.nextInt()) != -1) {
as[i] += hubScores[k];}hs[i] = 0;it = nit.successors();while ((k = it.nextInt()) != -1) {
hs[i] += authorityScores[k];}}// 归一化处理
normalize(as);normalize(hs);authorityNorm = computeNorm(authorityScores, as);hubNorm = computeNorm(hubScores, hs);authorityScores = as;hubScores = hs;logger.info("authority norm: " + authorityNorm);
logger.info("hub norm: " + hubNorm);
}/**
* 归一化* @param a*/private void normalize(double[] a) {double s = 0;
for (double d : a)s += d;for (int i = 0; i < a.length; i++)a[i] /= s;}/**
* 计算绝对差和* @param a* @param b* @return*/private double computeNorm(double[] a, double[] b) {if (a.length != b.length)
throw new IllegalArgumentException("array length mismath");double norm = 0;
for (int i = 0; i < a.length; i++) {norm += Math.abs(a[i] - b[i]);}return norm;
}/**
* 一直迭代,知道达到最大次数限制* @param iter 最大迭代次数*/public void stepUntil(int iter) {while (iter-- > 0)
step();}/**
* 一直迭代,直到达到规定的停止基准* @param stopNorm 停止基准*/public void stepUntil(double stopNorm) {while (authorityNorm > stopNorm || hubNorm > stopNorm)
step();}}
本文基于署名 2.5 中国大陆许可协议发布,欢迎转载,演绎或用于商业目的,但是必须保留本文的署名小橋流水(包含链接)。如您有任何疑问或者授权方面的协商,请给我发邮件。