在Google这个由10的100次方得名的站点中,各种评估网站的算法层出不穷,而PageRank即是其中之一。
Google的PageRank根据网站的外部链接和内部链接的数量和质量来衡量网站的价值。PageRank背后的概念是,每个到页面的链接都是对该页面的一次投票,被链接得越多,就意味着被其他网站投票越多。这个就是所谓的“链接流行度”——衡量多少人愿意将他们的网站和你的网站挂钩。PageRank这个概念借鉴自学术界中论文被引述的频度——即被别人引述的次数越多,一般判断这篇论文的权威性就越高。
通常来讲,原创内容越多的站点,PageRank越容易提升,反之则相对比较困难,PageRank最大上限值为10。在Google的评估中,能上10的网站真可谓凤毛麟角,即使算上Google,能成就PageRank 10这“伟业”者,放眼全球也不足40家。一般来说,个人站点评估值达到4即算办得不错,商业网站到6以上便算步入正轨了。
网上虽然有不少现成的查询器及源码,但是光用别人的毕竟不符合程序员风格,所以今天自己用Java重造轮子又写了个PageRank查询实现,捎带着把一些常用搜索引擎的网站链接及反向链接查询也加上了。
源码如下:
GooglePageRank.java
- package org.loon.test;
- import java.io.IOException;
- import java.util.Random;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- /**
- * Copyright 2008
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- *
- * @project loonframework
- * @author chenpeng
- * @email:ceponline@yahoo.com.cn
- * @version 0.1
- */
- public class GooglePageRank {
- // google pagerank服务器ip地址列表(最近google小气了很多,反复查询一个封ip)
- final static String[] GoogleServiceIP = new String[] { "64.233.161.100",
- "64.233.161.101", "64.233.183.91", "64.233.189.44", "66.102.1.103",
- "66.102.9.115", "66.249.89.83", "66.249.91.99", "66.249.93.190" };
- // google用识别标记
- final static private int GOOGLE_MAGIC = 0xE6359A60;
- // ch数值混合器
- private class CHMix {
- int a;
- int b;
- int c;
- public CHMix() {
- this(0, 0, 0);
- }
- public CHMix(int a, int b, int c) {
- this.a = a;
- this.b = b;
- this.c = c;
- }
- }
- /**
- * 按google要求混合成ch数据
- *
- * @param mix
- */
- private static void mix(final CHMix mix) {
- mix.a -= mix.b;
- mix.a -= mix.c;
- mix.a ^= mix.c >> 13;
- mix.b -= mix.c;
- mix.b -= mix.a;
- mix.b ^= mix.a << 8;
- mix.c -= mix.a;
- mix.c -= mix.b;
- mix.c ^= mix.b >> 13;
- mix.a -= mix.b;
- mix.a -= mix.c;
- mix.a ^= mix.c >> 12;
- mix.b -= mix.c;
- mix.b -= mix.a;
- mix.b ^= mix.a << 16;
- mix.c -= mix.a;
- mix.c -= mix.b;
- mix.c ^= mix.b >> 5;
- mix.a -= mix.b;
- mix.a -= mix.c;
- mix.a ^= mix.c >> 3;
- mix.b -= mix.c;
- mix.b -= mix.a;
- mix.b ^= mix.a << 10;
- mix.c -= mix.a;
- mix.c -= mix.b;
- mix.c ^= mix.b >> 15;
- }
- /**
- * 获得ch数值混合器
- *
- * @return
- */
- public static CHMix getInnerCHMix() {
- return new GooglePageRank().new CHMix();
- }
- /**
- * 通过url获得googlech(google数据库针对页面的全球唯一标识)
- *
- * @param url
- * @return
- */
- public static String GoogleCH(final String url) {
- // 格式化为google要求的info:url模式
- String nUrl = String.format("info:%s", new Object[] { url });
- // 获得新url字符串格式
- char[] urls = nUrl.toCharArray();
- // 获得新url长度
- int length = urls.length;
- // 获得一个ch数值混合器
- CHMix chMix = GooglePageRank.getInnerCHMix();
- // 为c注入google识别标识
- chMix.c = GOOGLE_MAGIC;
- // 为a、b项注入google要求的初始标识
- chMix.a = chMix.b = 0x9E3779B9;
- int k = 0;
- int len = length;
- while (len >= 12) {
- chMix.a += (int) (urls[k + 0] + (urls[k + 1] << 8)
- + (urls[k + 2] << 16) + (urls[k + 3] << 24));
- chMix.b += (int) (urls[k + 4] + (urls[k + 5] << 8)
- + (urls[k + 6] << 16) + (urls[k + 7] << 24));
- chMix.c += (int) (urls[k + 8] + (urls[k + 9] << 8)
- + (urls[k + 10] << 16) + (urls[k + 11] << 24));
- // 获得混合运算后的数据
- GooglePageRank.mix(chMix);
- k += 12;
- len -= 12;
- }
- chMix.c += length;
- // 产生googlech的11位标识
- switch (len) {
- case 11:
- chMix.c += (int) (urls[k + 10] << 24);
- case 10:
- chMix.c += (int) (urls[k + 9] << 16);
- case 9:
- chMix.c += (int) (urls[k + 8] << 8);
- case 8:
- chMix.b += (int) (urls[k + 7] << 24);
- case 7:
- chMix.b += (int) (urls[k + 6] << 16);
- case 6:
- chMix.b += (int) (urls[k + 5] << 8);
- case 5:
- chMix.b += (int) (urls[k + 4]);
- case 4:
- chMix.a += (int) (urls[k + 3] << 24);
- case 3:
- chMix.a += (int) (urls[k + 2] << 16);
- case 2:
- chMix.a += (int) (urls[k + 1] << 8);
- case 1:
- chMix.a += (int) (urls[k + 0]);
- break;
- default:
- break;
- }
- // 获得混合运算后的数据
- GooglePageRank.mix(chMix);
- // 获得未修订的CH
- String tch = String.valueOf(chMix.c);
- // 矫正差值后反馈正确CH
- return String
- .format("6%s", new Object[] { tch.length() < 10 ? ("-" + tch)
- .intern() : tch });
- }
- /**
- * 正则匹配pagerank结果
- *
- * @param value
- * @return
- */
- private static String MatchRank(final String value) {
- Pattern pattern = Pattern.compile("Rank_1:[0-9]:([0-9]+)");
- Matcher matcher = pattern.matcher(value);
- if (matcher.find()) {
- return matcher.group(1);
- }
- return "0";
- }
- /**
- * 获得指定页面的google pagerank值
- *
- * @param url
- * @return
- */
- public static String GooglePR(final String url) {
- String rip = GoogleServiceIP[new Random()
- .nextInt(GoogleServiceIP.length)];
- return GooglePR(url, rip);
- }
- /**
- * 以指定的google服务器获得指定页面的google pagerank值
- *
- * @param url
- * @param ip
- * @return
- */
- public static String GooglePR(final String url, final String ip) {
- // 产生查询用唯一标识
- String checksum = GoogleCH(url);
- // 产生查询用url
- String queryUrl = String
- .format(
- "http://%s/search?client=navclient-auto&ch=%s&features=Rank&q=info:%s",
- new Object[] { ip, checksum, url });
- String response;
- try {
- response = SimpleWebClient.getRequestHttp(queryUrl);
- } catch (IOException e) {
- response = "";
- }
- if (response.length() == 0) {
- return "0";
- } else {
- return GooglePageRank.MatchRank(response);
- }
- }
- }
SimpleWebClient.java
- package org.loon.test;
- import java.io.BufferedInputStream;
- import java.io.ByteArrayOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.OutputStreamWriter;
- import java.net.HttpURLConnection;
- import java.net.URL;
- import java.util.HashMap;
- import java.util.Iterator;
- import java.util.Map;
- import java.util.Set;
- import java.util.Map.Entry;
- import sun.misc.BASE64Encoder;
- /**
- * Copyright 2008
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- *
- * @project loonframework
- * @author chenpeng
- * @email:ceponline@yahoo.com.cn
- * @version 0.1
- */
// Minimal blocking HTTP client built on HttpURLConnection.
//
// NOTE: Basic-auth encoding now uses java.util.Base64 (fully qualified below),
// so the file-level "import sun.misc.BASE64Encoder;" is unused and can be
// dropped; that internal class does not exist on current JDKs.
public class SimpleWebClient {

    // Redirect chains longer than this are treated as a loop and aborted.
    private static final int MAX_REDIRECTS = 10;

    // Size of the buffer used while draining a response body.
    private static final int BUFFER_SIZE = 4096;

    /**
     * Sends a GET request and returns the response body decoded as UTF-8,
     * using a 5000 ms timeout.
     *
     * @param urlString target address; "http://" is assumed when no scheme
     *                  is given
     * @return the response body
     * @throws IOException if the request fails
     */
    public static String getRequestHttp(String urlString) throws IOException {
        return getRequestHttp(urlString, "utf-8");
    }

    /**
     * Sends a GET request and returns the response body decoded with the
     * given charset, using a 5000 ms timeout.
     *
     * @param urlString target address; "http://" is assumed when no scheme
     *                  is given
     * @param encoding  charset name for decoding the body, or null for the
     *                  platform default
     * @return the response body
     * @throws IOException if the request fails
     */
    public static String getRequestHttp(String urlString, String encoding)
            throws IOException {
        return getRequestHttp(urlString, encoding, null, 5000);
    }

    /**
     * Sends an HTTP request and returns the response body as text.
     *
     * Redirects (301, 302, 303, 307, 308) are followed manually, replaying
     * the same request against the new location, up to MAX_REDIRECTS hops.
     * For status codes of 400 and above the error body is returned instead
     * of throwing.
     *
     * @param urlString target address; "http://" is assumed when no scheme
     *                  is given
     * @param encoding  charset name for decoding the body, or null for the
     *                  platform default
     * @param parameter optional String-to-String settings. The keys "user"
     *                  and "pass" enable HTTP Basic authentication, "method"
     *                  sets the request method, "post" supplies a request
     *                  body; every other entry is sent as a request header.
     *                  May be null.
     * @param timeout   connect and read timeout in milliseconds; values
     *                  &lt;= 0 leave the connection defaults untouched
     * @return the response body, or "" when the server sent none
     * @throws IOException if the connection fails, a redirect has no
     *                     Location header, or too many redirects occur
     */
    public static String getRequestHttp(final String urlString,
            final String encoding, final Map<?, ?> parameter, final int timeout)
            throws IOException {
        final String target;
        if (urlString.startsWith("http://") || urlString.startsWith("https://")) {
            target = urlString;
        } else if (urlString.startsWith("//")) {
            // Protocol-relative form: only the scheme name is missing.
            target = "http:" + urlString;
        } else {
            target = "http://" + urlString;
        }

        String user = null;
        String password = null;
        String method = "GET";
        String post = null;
        final Map<String, String> headers = new HashMap<String, String>();
        if (parameter != null) {
            for (Entry<?, ?> setting : parameter.entrySet()) {
                final String key = (String) setting.getKey();
                final String value = (String) setting.getValue();
                if ("user".equals(key)) {
                    user = value;
                } else if ("pass".equals(key)) {
                    password = value;
                } else if ("method".equals(key)) {
                    method = value;
                } else if ("post".equals(key)) {
                    post = value;
                } else {
                    headers.put(key, value);
                }
            }
        }

        String authorization = null;
        if (user != null && password != null) {
            // The basic encoder never inserts line breaks, which would
            // corrupt the header value for long credentials.
            authorization = "Basic "
                    + java.util.Base64.getEncoder().encodeToString(
                            (user + ":" + password).getBytes());
        }

        URL url = new URL(target);
        for (int redirects = 0;; redirects++) {
            final HttpURLConnection connection = (HttpURLConnection) url
                    .openConnection();
            try {
                if (authorization != null) {
                    connection.setRequestProperty("Authorization", authorization);
                }
                connection.setDoInput(true);
                connection.setUseCaches(false);
                // Redirects are handled below so that the limit applies.
                connection.setInstanceFollowRedirects(false);
                connection.setRequestMethod(method);
                if (timeout > 0) {
                    connection.setConnectTimeout(timeout);
                    // Without a read timeout a stalled server blocks forever.
                    connection.setReadTimeout(timeout);
                }
                // Browser-like default headers.
                connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0;)");
                connection.setRequestProperty("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/msword, application/vnd.ms-excel, application/vnd.ms-powerpoint, */*");
                // Caller-supplied headers; these may override the defaults.
                for (Entry<String, String> header : headers.entrySet()) {
                    connection.setRequestProperty(header.getKey(), header.getValue());
                }
                if (post != null) {
                    connection.setDoOutput(true);
                    final OutputStreamWriter body = new OutputStreamWriter(
                            connection.getOutputStream());
                    try {
                        body.write(post);
                        body.flush();
                    } finally {
                        body.close();
                    }
                }

                final int responseCode = connection.getResponseCode();
                if (isRedirect(responseCode)) {
                    final String location = connection.getHeaderField("Location");
                    if (location == null) {
                        throw new IOException("HTTP " + responseCode
                                + " without a Location header from " + url);
                    }
                    if (redirects >= MAX_REDIRECTS) {
                        throw new IOException("More than " + MAX_REDIRECTS
                                + " redirects, last target " + location);
                    }
                    // Resolve against the current URL so that relative
                    // Location values work as well.
                    url = new URL(url, location);
                    continue;
                }

                // getErrorStream() only carries data for failure responses.
                final InputStream body = responseCode < 400
                        ? connection.getInputStream()
                        : connection.getErrorStream();
                if (body == null) {
                    return "";
                }
                return read(new BufferedInputStream(body), encoding);
            } finally {
                connection.disconnect();
            }
        }
    }

    /**
     * Tells whether a status code is one of the redirects this client follows.
     */
    private static boolean isRedirect(final int responseCode) {
        return responseCode == 301 || responseCode == 302
                || responseCode == 303 || responseCode == 307
                || responseCode == 308;
    }

    /**
     * Drains a stream into a String and closes it.
     *
     * @param in       stream to read; always closed on return
     * @param encoding charset name, or null for the platform default
     * @return the decoded content
     * @throws IOException if reading fails or the charset is unsupported
     */
    private static String read(final InputStream in, final String encoding)
            throws IOException {
        try {
            final InputStreamReader reader = encoding != null
                    ? new InputStreamReader(in, encoding)
                    : new InputStreamReader(in);
            final StringBuilder text = new StringBuilder();
            // Fixed, non-zero buffer: sizing it from Content-Length could
            // give a zero-length buffer, on which read() returns 0 forever.
            final char[] buffer = new char[BUFFER_SIZE];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                text.append(buffer, 0, count);
            }
            return text.toString();
        } finally {
            in.close();
        }
    }
}
WebAppraise.java
- package org.loon.test;
- import java.io.IOException;
- /**
- * Copyright 2008
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- *
- * @project loonframework
- * @author chenpeng
- * @email:ceponline@yahoo.com.cn
- * @version 0.1
- */
- public class WebAppraise {
- private String googleSum;
- private String baiduSum;
- private String msnSum;
- private String altaVistaSum;
- private String allTheWebSum;
- private String yahooSum;
- private String testURL;
- public WebAppraise(final String url) {
- if (url != null && !"".equals(url)) {
- this.testURL = url.trim();
- if (this.testURL.startsWith("http://")) {
- this.testURL = this.testURL.substring(7);
- }
- if (this.testURL.startsWith("https://")) {
- this.testURL = this.testURL.substring(8);
- }
- } else {
- throw new RuntimeException("url is NULL!");
- }
- }
- /**
- * 分析指定链接结果,并返回整型数值
- *
- * @param searchURL
- * @param anchor
- * @param trail
- * @return
- */
- private static int getLinks(final String searchURL, final String anchor,
- final String trail) {
- int count = 0;
- String serverResponse;
- try {
- // 我国特色……
- if (searchURL.startsWith("http://www.baidu.com")) {
- // 永不离休的gb2312同志(-_-||)
- serverResponse = SimpleWebClient.getRequestHttp(searchURL,
- "gb2312");
- } else {
- serverResponse = SimpleWebClient.getRequestHttp(searchURL);
- }
- } catch (IOException e) {
- serverResponse = e.getMessage();
- }
- int pos = serverResponse.indexOf(anchor);
- if (pos > 1) {
- serverResponse = serverResponse.substring(pos + anchor.length());
- pos = serverResponse.indexOf(trail);
- String value = serverResponse.substring(0, pos).trim();
- value = value.replace(",", "");
- value = value.replace(".", "");
- count = Integer.parseInt(value);
- }
- return count;
- }
- public String getAllTheWebSite() {
- return getAllTheWebSite(false);
- }
- public String getAllTheWebSite(boolean isDomain) {
- try {
- String allTheWeb;
- if (isDomain) {
- allTheWeb = "http://www.alltheweb.com/search?cat=web&cs=utf8&rys=0&itag=crv&_sb_lang=any&q=linkdomain%3A"
- + this.testURL;
- } else {
- allTheWeb = "http://www.alltheweb.com/search?cat=web&cs=utf-8&q=link%3Ahttp%3A%2F%2F"
- + this.testURL + "&_sb_lang=any";
- }
- allTheWebSum = ""
- + getLinks(allTheWeb, "<span class=/"ofSoMany/">",
- "</span>");
- } catch (Exception ex) {
- allTheWebSum = ex.getMessage();
- }
- return allTheWebSum;
- }
- public String getAltaVistaSite() {
- return getAltaVistaSite(false);
- }
- public String getAltaVistaSite(boolean isDomain) {
- try {
- String altaVista;
- if (isDomain) {
- altaVista = "http://www.altavista.com/web/results?itag=ody&q=link%3A"
- + this.testURL + "&kgs=0&kls=0";
- } else {
- altaVista = "http://www.altavista.com/web/results?itag=ody&kgs=0&kls=0&q=site%3A"
- + this.testURL;
- }
- altaVistaSum = "" + getLinks(altaVista, "AltaVista found ", " ");
- } catch (Exception ex) {
- altaVistaSum = ex.getMessage();
- }
- return altaVistaSum;
- }
- public String getGooglePR() {
- return GooglePageRank.GooglePR(this.testURL);
- }
- public String getGoogleSite() {
- return getGoogleSite(false);
- }
- public String getGoogleSite(final boolean isDomian) {
- try {
- String google;
- // 反向链接
- if (isDomian) {
- google = "http://www.google.com/search?hl=en&q=link%3A"
- + this.testURL;
- } else {
- google = "http://www.google.com/search?hl=en&q=site%3A"
- + this.testURL + "&btnG=Google+Search&aq=f&oq=";
- }
- googleSum = "" + getLinks(google, "about <b>", "</b>");
- } catch (Exception ex) {
- googleSum = ex.getMessage();
- }
- return googleSum;
- }
- public String getBaiduSite() {
- return getBaiduSite(false);
- }
- public String getBaiduSite(final boolean isDomian) {
- try {
- String baidu;
- if (isDomian) {
- baidu = "http://www.baidu.com/s?wd=domain%3A" + this.testURL
- + "&cl=3";
- } else {
- baidu = "http://www.baidu.com/s?wd=site%3A" + this.testURL;
- }
- baiduSum = "" + getLinks(baidu, "找到相关网页", "篇");
- } catch (Exception ex) {
- String baidu;
- if (isDomian) {
- baidu = "http://www.baidu.com/s?wd=domain%3A" + this.testURL
- + "&cl=3";
- } else {
- baidu = "http://www.baidu.com/s?wd=site%3A" + this.testURL;
- }
- baiduSum = "" + getLinks(baidu, "找到相关网页约", "篇");
- }
- return baiduSum;
- }
- public String getYahooSite() {
- return getYahooSite(false);
- }
- public String getYahooSite(final boolean isDomian) {
- try {
- String yahoo;
- if (isDomian) {
- yahoo = "http://sitemap.cn.yahoo.com/search?p=" + this.testURL
- + "&bwm=i";
- yahooSum = "" + getLinks(yahoo, "<strong>", "</strong>");
- } else {
- yahoo = "http://www.yahoo.cn/s?p=site%3A" + this.testURL
- + "&pid=hp&v=web";
- yahooSum = "" + getLinks(yahoo, "找到相关网页约", "条");
- }
- } catch (Exception ex) {
- yahooSum = ex.getMessage();
- }
- return yahooSum;
- }
- public String getMsnSite() {
- return getMsnSite(false);
- }
- public String getMsnSite(boolean isDomain) {
- try {
- String msn;
- if (isDomain) {
- msn = "http://cnweb.search.live.com/results.aspx?q=link%3A"
- + this.testURL + "&mkt=zh-cn&scope=&FORM=LIVSO";
- } else {
- msn = "http://cnweb.search.live.com/results.aspx?q=site%3A"
- + this.testURL + "&go=&form=QBRE";
- }
- msnSum = "" + getLinks(msn, "共", "条搜索结果");
- } catch (Exception ex) {
- msnSum = ex.getMessage();
- }
- return msnSum;
- }
- public String getTestURL() {
- return testURL;
- }
- }
Test.java
- package org.loon.test;
- /**
- * Copyright 2008
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- *
- * @project loonframework
- * @author chenpeng
- * @email:ceponline@yahoo.com.cn
- * @version 0.1
- */
- public class Test {
- public static void main(String[] args) {
- WebAppraise appraise = new WebAppraise("http://blog.csdn.net/cping1982");
- System.out.println("GooglePagerRank值:" + appraise.getGooglePR());
- System.out.println("google收录:" + appraise.getGoogleSite());
- System.out.println("google反向收录:" + appraise.getGoogleSite(true));
- System.out.println("yahoo收录:" + appraise.getYahooSite());
- System.out.println("yahoo反向收录:" + appraise.getYahooSite(true));
- System.out.println("baidu收录:" + appraise.getBaiduSite());
- System.out.println("baidu反向收录:" + appraise.getBaiduSite(true));
- System.out.println("msn收录:" + appraise.getMsnSite());
- System.out.println("msn反向收录:" + appraise.getMsnSite(true));
- System.out.println("AllTheWeb收录:" + appraise.getAllTheWebSite());
- System.out.println("AllTheWeb反向收录:" + appraise.getAllTheWebSite(true));
- System.out.println("AltaVista收录:" + appraise.getAltaVistaSite());
- System.out.println("AltaVista反向收录:" + appraise.getAltaVistaSite(true));
- }
- }
检测http://blog.csdn.net/cping1982运行结果如下图:

源码下载地址:http://download.csdn.net/source/929348