爬虫程序判断是否已抓URL
http://www.cnblogs.com/yuandong/archive/2008/08/28/Web_Spider_Url_Index.html
看了这篇文章后得到的灵感,不过原作者是用C++实现的,我这里用C#实现。不多说,直接贴代码。
URLIdentity类:用于对已抓取的URL进行标识,并判断某个URL是否已被抓取。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Security.Cryptography;
namespace TestMD5
{
/// <summary>
/// In-memory record of which URLs have already been crawled.
/// Each URL is reduced to an integer in the range 0..99,999,999 (see
/// <see cref="GetIntHashCode"/>) and that integer selects one bit in a set of
/// lazily allocated <see cref="BitArray"/> segments.
/// Distinct URLs that reduce to the same integer share a bit, so
/// <see cref="GetUrlIdentity"/> can report a URL as crawled when only a
/// colliding URL was marked.
/// </summary>
public class URLIdentity
{
    /// <summary>Number of bits held by one segment.</summary>
    private const int SegmentSize = 25000;

    /// <summary>
    /// Number of segment slots. An 8-digit hash divided by
    /// <see cref="SegmentSize"/> is at most 3999, so every hash fits.
    /// </summary>
    private const int SegmentCount = 4096;

    /// <summary>Number of leading decimal digits kept from the digest text.</summary>
    private const int HashDigits = 8;

    // Segments are created on first write so untouched ranges cost no memory.
    private readonly BitArray[] SegmentArray = new BitArray[SegmentCount];

    /// <summary>Returns the index of the segment that holds the bit for <paramref name="hashValue"/>.</summary>
    private int GetSegmentIndex(int hashValue)
    {
        return hashValue / SegmentSize;
    }

    /// <summary>Returns the bit position of <paramref name="hashValue"/> inside its segment.</summary>
    private int GetSegmentOffset(int hashValue)
    {
        return hashValue % SegmentSize;
    }

    /// <summary>
    /// Reduces a URL to a non-negative integer below 100,000,000: the MD5 digest
    /// bytes are written out as decimal numbers, concatenated, and the first
    /// eight characters of that text are parsed as an integer.
    /// </summary>
    /// <param name="url">The URL to hash.</param>
    /// <returns>An integer in the range 0..99,999,999.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    public int GetIntHashCode(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        byte[] digest;
        // The hash algorithm owns a native handle; release it deterministically.
        using (MD5 md5 = MD5.Create())
        {
            // Encoding.Default is kept so results match the previous implementation.
            // NOTE(review): its byte output for non-ASCII URLs depends on the platform - confirm this is intended.
            digest = md5.ComputeHash(Encoding.Default.GetBytes(url));
        }

        // Every byte contributes at least one digit, so the first eight characters
        // are complete after at most eight bytes; the remaining bytes never matter.
        StringBuilder digits = new StringBuilder(HashDigits + 3);
        for (int i = 0; i < digest.Length && digits.Length < HashDigits; i++)
        {
            digits.Append(digest[i].ToString(System.Globalization.CultureInfo.InvariantCulture));
        }

        return int.Parse(digits.ToString(0, HashDigits), System.Globalization.CultureInfo.InvariantCulture);
    }

    /// <summary>
    /// Marks <paramref name="url"/> as crawled. The method name keeps its
    /// original spelling so existing callers continue to compile.
    /// </summary>
    /// <param name="url">The URL to mark.</param>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    public void SetUrlIndentity(string url)
    {
        int hash = GetIntHashCode(url);
        int segment = GetSegmentIndex(hash);

        if (SegmentArray[segment] == null)
        {
            SegmentArray[segment] = new BitArray(SegmentSize);
        }

        SegmentArray[segment][GetSegmentOffset(hash)] = true;
    }

    /// <summary>
    /// Correctly spelled equivalent of <see cref="SetUrlIndentity"/>; marks
    /// <paramref name="url"/> as crawled.
    /// </summary>
    /// <param name="url">The URL to mark.</param>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    public void SetUrlIdentity(string url)
    {
        SetUrlIndentity(url);
    }

    /// <summary>
    /// Reports whether the bit for <paramref name="url"/> has been set.
    /// </summary>
    /// <param name="url">The URL to look up.</param>
    /// <returns>
    /// <c>true</c> if this URL, or another URL with the same hash, was marked;
    /// otherwise <c>false</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    public bool GetUrlIdentity(string url)
    {
        int hash = GetIntHashCode(url);
        BitArray segment = SegmentArray[GetSegmentIndex(hash)];

        // A segment that was never allocated cannot contain a set bit.
        return segment != null && segment[GetSegmentOffset(hash)];
    }
}
}
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Security.Cryptography;
namespace TestMD5
{
/// <summary>
/// In-memory record of which URLs have already been crawled.
/// Each URL is reduced to an integer in the range 0..99,999,999 (see
/// <see cref="GetIntHashCode"/>) and that integer selects one bit in a set of
/// lazily allocated <see cref="BitArray"/> segments.
/// Distinct URLs that reduce to the same integer share a bit, so
/// <see cref="GetUrlIdentity"/> can report a URL as crawled when only a
/// colliding URL was marked.
/// </summary>
public class URLIdentity
{
    /// <summary>Number of bits held by one segment.</summary>
    private const int SegmentSize = 25000;

    /// <summary>
    /// Number of segment slots. An 8-digit hash divided by
    /// <see cref="SegmentSize"/> is at most 3999, so every hash fits.
    /// </summary>
    private const int SegmentCount = 4096;

    /// <summary>Number of leading decimal digits kept from the digest text.</summary>
    private const int HashDigits = 8;

    // Segments are created on first write so untouched ranges cost no memory.
    private readonly BitArray[] SegmentArray = new BitArray[SegmentCount];

    /// <summary>Returns the index of the segment that holds the bit for <paramref name="hashValue"/>.</summary>
    private int GetSegmentIndex(int hashValue)
    {
        return hashValue / SegmentSize;
    }

    /// <summary>Returns the bit position of <paramref name="hashValue"/> inside its segment.</summary>
    private int GetSegmentOffset(int hashValue)
    {
        return hashValue % SegmentSize;
    }

    /// <summary>
    /// Reduces a URL to a non-negative integer below 100,000,000: the MD5 digest
    /// bytes are written out as decimal numbers, concatenated, and the first
    /// eight characters of that text are parsed as an integer.
    /// </summary>
    /// <param name="url">The URL to hash.</param>
    /// <returns>An integer in the range 0..99,999,999.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="url"/> is null.</exception>
    public int GetIntHashCode(string url)
    {
        if (url == null)
        {
            // Fully qualified so this block does not depend on a "using System;" directive.
            throw new System.ArgumentNullException("url");
        }

        byte[] digest;
        // The hash algorithm owns a native handle; release it deterministically.
        using (MD5 md5 = MD5.Create())
        {
            // Encoding.Default is kept so results match the previous implementation.
            // NOTE(review): its byte output for non-ASCII URLs depends on the platform - confirm this is intended.
            digest = md5.ComputeHash(Encoding.Default.GetBytes(url));
        }

        // Every byte contributes at least one digit, so the first eight characters
        // are complete after at most eight bytes; the remaining bytes never matter.
        StringBuilder digits = new StringBuilder(HashDigits + 3);
        for (int i = 0; i < digest.Length && digits.Length < HashDigits; i++)
        {
            digits.Append(digest[i].ToString(System.Globalization.CultureInfo.InvariantCulture));
        }

        return int.Parse(digits.ToString(0, HashDigits), System.Globalization.CultureInfo.InvariantCulture);
    }

    /// <summary>
    /// Marks <paramref name="url"/> as crawled. The method name keeps its
    /// original spelling so existing callers continue to compile.
    /// </summary>
    /// <param name="url">The URL to mark.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="url"/> is null.</exception>
    public void SetUrlIndentity(string url)
    {
        int hash = GetIntHashCode(url);
        int segment = GetSegmentIndex(hash);

        if (SegmentArray[segment] == null)
        {
            SegmentArray[segment] = new BitArray(SegmentSize);
        }

        SegmentArray[segment][GetSegmentOffset(hash)] = true;
    }

    /// <summary>
    /// Correctly spelled equivalent of <see cref="SetUrlIndentity"/>; marks
    /// <paramref name="url"/> as crawled.
    /// </summary>
    /// <param name="url">The URL to mark.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="url"/> is null.</exception>
    public void SetUrlIdentity(string url)
    {
        SetUrlIndentity(url);
    }

    /// <summary>
    /// Reports whether the bit for <paramref name="url"/> has been set.
    /// </summary>
    /// <param name="url">The URL to look up.</param>
    /// <returns>
    /// <c>true</c> if this URL, or another URL with the same hash, was marked;
    /// otherwise <c>false</c>.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="url"/> is null.</exception>
    public bool GetUrlIdentity(string url)
    {
        int hash = GetIntHashCode(url);
        BitArray segment = SegmentArray[GetSegmentIndex(hash)];

        // A segment that was never allocated cannot contain a set bit.
        return segment != null && segment[GetSegmentOffset(hash)];
    }
}
}
DEMO:
Code