HITS算法是由Jon Kleinberg在20世纪90年代提出的一种链接分析算法。HITS算法是Hyperlink-Induced Topic Search(基于超链接推演的主题搜索算法)的简称,它的核心思想是对网页如下两个方面的权威程度进行评价。首先,内容权威度(Authority Value),即网页本身内容的受欢迎程度;其次,链接权威度(Hub Value),即网页链接到其他受欢迎资源的程度。
HITS算法的实施包括两个阶段,对用户输入的查询主题而言,首先是通过文本搜索过程获取与此查询主题内容相关的网页集合,并适当扩充该网页集合,以包括尽可能多的结果候选网页,同时使结果集合中网页间的链接结构关系更加完整;随后则是通过一个“迭代—收敛”的过程计算网页集合中每个页面对应的链接权威度和内容权威度数值。算法最后输出的是分别按照链接权威度与内容权威度排序的结果列表,用户可以根据需求不同,选择其中的结果页面进行浏览。
算法描述:
(此图片转载于http://blog.csdn.net/hguisu/article/details/8013489)
(此图片转载于http://blog.csdn.net/hguisu/article/details/8013489)
具体代码实现如下:
Vertex类主要是定义图的节点
Graph类主要是对图的一些操作
Handle类主要是对数据进行HITS算法的处理
最后一个类包含主函数,这个类主要工作:读取数据;输出处理结果
Vertex
/// <summary>
/// A graph vertex: a text label together with a numeric value attached to it.
/// </summary>
public class Vertex
{
    /// <summary>Label stored in this vertex.</summary>
    public string Data;

    /// <summary>Numeric value attached to this vertex; remains 0 unless the two-argument constructor sets it.</summary>
    public double Index;

    /// <summary>Creates a vertex that carries only a label.</summary>
    /// <param name="Vertexdata">Label to store in <see cref="Data"/>.</param>
    public Vertex(string Vertexdata)
    {
        Data = Vertexdata;
    }

    /// <summary>Creates a vertex that carries both a label and a numeric value.</summary>
    /// <param name="Index">Value to store in <see cref="Index"/>.</param>
    /// <param name="Vertexdata">Label to store in <see cref="Data"/>.</param>
    public Vertex(double Index, string Vertexdata)
        : this(Vertexdata)
    {
        this.Index = Index;
    }
}
Graph
/// <summary>
/// Directed graph stored as a fixed-capacity adjacency matrix plus an ordered vertex list.
/// A vertex's position in <see cref="vertiexes"/> is its row/column index in <see cref="Adjmatrix"/>.
/// </summary>
public class Graph
{
    // Maximum number of vertices the adjacency matrix can address.
    private const int Number = 10000;

    // Vertices in insertion order; the list position doubles as the matrix index.
    public IList<Vertex> vertiexes = new List<Vertex>();

    // Adjmatrix[i, j] == 1 means a directed edge from vertex i to vertex j was added.
    public int[,] Adjmatrix;

    /// <summary>Creates an empty graph with an all-zero adjacency matrix.</summary>
    public Graph()
    {
        // A freshly allocated int array is already zero-filled by the runtime,
        // so no explicit clearing pass over the Number x Number cells is needed.
        Adjmatrix = new int[Number, Number];
    }

    /// <summary>Adds a vertex with the given label unless one with the same label already exists.</summary>
    /// <param name="v">Label of the vertex.</param>
    /// <exception cref="InvalidOperationException">The graph already holds the maximum number of vertices.</exception>
    public void AddVertex(String v)
    {
        if (IndexOfVertex(v) >= 0)
        {
            return;
        }

        // One more vertex would get an index outside the adjacency matrix.
        if (vertiexes.Count >= Number)
        {
            throw new InvalidOperationException("The graph cannot hold more than " + Number + " vertices.");
        }

        vertiexes.Add(new Vertex(v));
    }

    /// <summary>Records a directed edge from <paramref name="vertex1"/> to <paramref name="vertex2"/>.</summary>
    /// <param name="vertex1">Label of the source vertex; must already be in the graph.</param>
    /// <param name="vertex2">Label of the target vertex; must already be in the graph.</param>
    /// <exception cref="ArgumentException">Either label does not belong to a vertex of this graph.</exception>
    public void AddEdge(string vertex1, string vertex2)
    {
        int from = IndexOfVertex(vertex1);
        if (from < 0)
        {
            // Fail loudly instead of silently attaching the edge to some other vertex.
            throw new ArgumentException("Unknown vertex: " + vertex1, "vertex1");
        }

        int to = IndexOfVertex(vertex2);
        if (to < 0)
        {
            throw new ArgumentException("Unknown vertex: " + vertex2, "vertex2");
        }

        Adjmatrix[from, to] = 1;
    }

    /// <summary>
    /// Writes the part of the adjacency matrix that belongs to the current vertices
    /// to the console, one matrix row per output line.
    /// </summary>
    public void PrintGraph()
    {
        // Clamp in case the public list was grown past the matrix capacity from outside.
        int used = Math.Min(vertiexes.Count, Number);
        for (int row = 0; row < used; row++)
        {
            for (int col = 0; col < used; col++)
            {
                Console.Write(Adjmatrix[row, col]);
            }
            Console.WriteLine();
        }
    }

    // Returns the list position of the first vertex whose label equals `label`, or -1 when there is none.
    private int IndexOfVertex(string label)
    {
        for (int i = 0; i < vertiexes.Count; i++)
        {
            if (vertiexes[i].Data == label)
            {
                return i;
            }
        }
        return -1;
    }
}
Handle
/// <summary>
/// Runs the iterative HITS computation over an adjacency matrix and prints the
/// pages ordered by authority score.
/// </summary>
public class Handle
{
    // Capacity of the public score arrays.
    private const int Number = 10000;

    // Two successive results closer than this (sum of absolute differences) count as converged.
    private const double Tolerance = 0.000001;

    // Number of consecutive converged rounds required before the iteration stops.
    private const int RequiredStableRounds = 10;

    /// <summary>
    /// Number of distinct pages; only the first <c>size</c> rows/columns of the matrix
    /// and the first <c>size</c> score entries are used.
    /// </summary>
    private int size;

    // Pages in matrix-index order; their labels are used when printing the result.
    private IList<Vertex> vertiexes;

    /// <summary>Creates a handler for <paramref name="size"/> pages described by <paramref name="Vertiexes"/>.</summary>
    public Handle(int size, IList<Vertex> Vertiexes)
    {
        this.size = size;
        this.vertiexes = Vertiexes;
    }

    /// <summary>Creates a handler with three built-in pages labelled "A", "B" and "C".</summary>
    public Handle(int size)
    {
        this.size = size;
        this.vertiexes = new List<Vertex>();
        vertiexes.Add(new Vertex("A"));
        vertiexes.Add(new Vertex("B"));
        vertiexes.Add(new Vertex("C"));
    }

    /// <summary>Authority score per page index, filled by <see cref="cal_hits"/>.</summary>
    public double[] auth = new double[Number];

    /// <summary>Hub score per page index, filled by <see cref="cal_hits"/>.</summary>
    public double[] hub = new double[Number];

    private int[,] net;

    /// <summary>Adjacency matrix; <c>Net[i, j] == 1</c> is treated as a link from page i to page j.</summary>
    public int[,] Net
    {
        get { return net; }
        set { net = value; }
    }

    /// <summary>Sums the first <c>size</c> entries of <paramref name="a"/> (used for normalisation).</summary>
    public double cal_sum(double[] a)
    {
        double res = 0.0;
        for (int i = 0; i < size; i++)
        {
            res += a[i];
        }
        return res;
    }

    /// <summary>
    /// Returns the sum of absolute differences between the first <c>size</c> entries
    /// of <paramref name="a"/> and <paramref name="b"/>; used as the convergence measure.
    /// </summary>
    public double dis(double[] a, double[] b)
    {
        double sum_dis = 0.0;
        for (int i = 0; i < size; i++)
        {
            sum_dis += Math.Abs(a[i] - b[i]);
        }
        return sum_dis;
    }

    /// <summary>
    /// Computes the authority and hub score of every page, stores them in
    /// <see cref="auth"/> and <see cref="hub"/>, and prints the pages by descending authority score.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// <see cref="Net"/> has not been assigned, or <c>size</c> exceeds the score arrays or the matrix.
    /// </exception>
    public void cal_hits()
    {
        if (net == null)
        {
            throw new InvalidOperationException("Net must be assigned before calling cal_hits.");
        }
        if (size > auth.Length || size > hub.Length || size > net.GetLength(0) || size > net.GetLength(1))
        {
            throw new InvalidOperationException("size exceeds the capacity of the score arrays or of Net.");
        }

        double[] prevAuth = new double[size];
        double[] prevHub = new double[size];

        // Every page starts with the same score.
        for (int i = 0; i < size; i++)
        {
            auth[i] = 1.0;
            hub[i] = 1.0;
        }

        int stableRounds = 0;
        while (stableRounds < RequiredStableRounds)
        {
            // Remember the previous round so the change can be measured afterwards.
            Array.Copy(auth, prevAuth, size);
            Array.Copy(hub, prevHub, size);

            // Authority of page i = sum of the hub scores of the pages linking to i.
            for (int i = 0; i < size; i++)
            {
                double incoming = 0.0;
                for (int j = 0; j < size; j++)
                {
                    if (net[j, i] == 1)
                    {
                        incoming += hub[j];
                    }
                }
                auth[i] = incoming;
            }

            // Hub of page i = sum of the (just updated) authority scores of the pages i links to.
            for (int i = 0; i < size; i++)
            {
                double outgoing = 0.0;
                for (int j = 0; j < size; j++)
                {
                    if (net[i, j] == 1)
                    {
                        outgoing += auth[j];
                    }
                }
                hub[i] = outgoing;
            }

            Normalize(auth);
            Normalize(hub);

            // Count only *consecutive* converged rounds: any round that moves too much resets the streak.
            if (dis(auth, prevAuth) < Tolerance && dis(hub, prevHub) < Tolerance)
            {
                stableRounds++;
            }
            else
            {
                stableRounds = 0;
            }
        }

        // Pair every page label with its authority score, then print in descending score order.
        IList<Vertex> scored = new List<Vertex>();
        int position = 0;
        foreach (var page in vertiexes)
        {
            scored.Add(new Vertex(auth[position], page.Data));
            position++;
        }

        int rank = 0;
        foreach (var entry in scored.OrderByDescending((Vertex v) => v.Index))
        {
            rank++;
            Console.WriteLine(" 网页" + rank + ":" + entry.Data + " 内容权威度:" + entry.Index);
        }
    }

    // Scales the first `size` entries so they sum to 1. When the sum is exactly 0
    // (e.g. a graph without links) the entries are left as they are: dividing would
    // produce NaN, which can never satisfy the convergence test and would loop forever.
    private void Normalize(double[] scores)
    {
        double total = cal_sum(scores);
        if (total == 0.0)
        {
            return;
        }
        for (int i = 0; i < size; i++)
        {
            scores[i] = scores[i] / total;
        }
    }
}
Main
/// <summary>
/// Program entry point: loads the link data from an Excel sheet, runs the HITS
/// computation and prints the ranking together with the elapsed time.
/// </summary>
public class PrintOut
{
    /// <summary>Reads the graph, ranks the pages by authority score and reports the run time.</summary>
    public static void Main(string[] args)
    {
        // Stopwatch measures elapsed time independently of wall-clock adjustments.
        System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

        Console.WriteLine("网页根据其自身内容权威度排序如下:");
        Graph Gra = Read();
        Handle Info = new Handle(Gra.vertiexes.Count, Gra.vertiexes);
        Info.Net = Gra.Adjmatrix;
        Info.cal_hits();

        timer.Stop();
        Console.Write("运行时间:");
        Console.Write(timer.Elapsed);
        Console.ReadLine();
    }

    /// <summary>
    /// Builds the graph from the first table of the imported sheet: each row is read as
    /// one directed link from the page in column 0 to the page in column 1.
    /// </summary>
    /// <returns>The graph containing every page and link found in the sheet.</returns>
    public static Graph Read()
    {
        Graph Gra = new Graph();
        DataSet ds = ImportExcel();
        DataTable dt = ds.Tables[0];

        foreach (DataRow row in dt.Rows)
        {
            string source = row[0].ToString();
            string target = row[1].ToString();

            // Empty cells (DBNull converts to "") would otherwise create a page with an empty label.
            if (source.Length == 0 || target.Length == 0)
            {
                continue;
            }

            Gra.AddVertex(source);
            Gra.AddVertex(target);
            Gra.AddEdge(source, target);
        }

        return Gra;
    }

    /// <summary>Loads the contents of <c>Sheet1</c> of the workbook into a <see cref="DataSet"/>.</summary>
    /// <returns>A data set whose first table holds the rows of the sheet.</returns>
    public static DataSet ImportExcel()
    {
        string url = "\\link.xlsx";
        DataSet ds = new DataSet();
        string strConn = "Provider=Microsoft.ACE.OLEDB.12.0;Data Source=" + url + ";Extended Properties=Excel 12.0";

        // Fill opens a closed connection and closes it again when done; the using blocks
        // make sure the connection and the adapter are released even if Fill throws.
        using (OleDbConnection Conn = new OleDbConnection(strConn))
        using (OleDbDataAdapter data = new OleDbDataAdapter("select * from [Sheet1$]", Conn))
        {
            data.Fill(ds);
        }

        return ds;
    }
}