mini爬虫程序
Code
/// <summary>
/// A minimal interactive crawler: downloads a page, lists the absolute
/// links it contains one at a time, and lets the user follow one of them.
/// </summary>
class MiniCrawler
{
    /// <summary>
    /// Finds the next <c>href="http...</c> attribute value in a page.
    /// </summary>
    /// <param name="htmlstr">The page content to search.</param>
    /// <param name="startloc">
    /// Index at which the search starts. When a link is returned this is
    /// advanced to the index of the link's closing quote; otherwise it is
    /// left unchanged.
    /// </param>
    /// <returns>
    /// The text between the quotes of the attribute, or <c>null</c> when no
    /// further complete link exists.
    /// </returns>
    static string FindLink(string htmlstr, ref int startloc)
    {
        // Case-insensitive search directly on the page avoids building a
        // lower-cased copy of the whole document on every call.
        int i = htmlstr.IndexOf("href=\"http", startloc, StringComparison.OrdinalIgnoreCase);
        if (i == -1)
        {
            return null;
        }

        int start = htmlstr.IndexOf('"', i) + 1;
        int end = htmlstr.IndexOf('"', start);
        if (end == -1)
        {
            // Truncated page / unterminated attribute: there is no complete
            // link left, so report "none" instead of letting Substring throw.
            return null;
        }

        startloc = end;
        return htmlstr.Substring(start, end - start);
    }

    /// <summary>
    /// Interactively crawls starting from <paramref name="uristr"/>. For each
    /// link found the user is prompted: "L" follows the link, "M" shows the
    /// next link, "Q" stops. Crawling also stops when a page has no more links.
    /// </summary>
    /// <param name="uristr">
    /// The URI of the first page to fetch. The request is cast to
    /// <c>HttpWebRequest</c>, so only HTTP(S) URIs are supported.
    /// </param>
    public static void Crawle(string uristr)
    {
        try
        {
            do
            {
                Console.WriteLine("Linking to " + uristr);

                // Create a WebRequest for the given URI.
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uristr);

                // Send the request and read the whole page; the using blocks
                // release the response, stream and reader even on failure.
                string str;
                using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                using (Stream istrm = resp.GetResponseStream())
                using (StreamReader rdr = new StreamReader(istrm))
                {
                    str = rdr.ReadToEnd();
                }

                // Unless the user picks a link to follow, crawling ends after
                // this page (otherwise the same page would be fetched forever).
                uristr = null;

                int curloc = 0; // current search position in the page
                while (true)
                {
                    // Look for the next URI.
                    string link = FindLink(str, ref curloc);
                    if (link == null)
                    {
                        Console.WriteLine("No link found.");
                        break;
                    }

                    Console.WriteLine("发现链接: " + link);
                    Console.Write("Link, More, Quit?");
                    string answer = Console.ReadLine();

                    if (string.Equals(answer, "L", StringComparison.OrdinalIgnoreCase))
                    {
                        uristr = link;
                        break;
                    }

                    if (string.Equals(answer, "Q", StringComparison.OrdinalIgnoreCase))
                    {
                        break;
                    }

                    if (string.Equals(answer, "M", StringComparison.OrdinalIgnoreCase))
                    {
                        Console.WriteLine("Searching for another link.");
                    }
                    // Any other answer simply moves on to the next link.
                }
            } while (uristr != null);
        }
        catch (WebException exc)
        {
            Console.WriteLine("Network Error: " + exc.Message +
                "\nStatus code: " + exc.Status);
        }
        catch (ProtocolViolationException exc)
        {
            Console.WriteLine("Protocol Error: " + exc.Message);
        }
        catch (UriFormatException exc)
        {
            Console.WriteLine("URI Format Error: " + exc.Message);
        }
        catch (NotSupportedException exc)
        {
            Console.WriteLine("Unknown Protocol: " + exc.Message);
        }
        catch (IOException exc)
        {
            Console.WriteLine("I/O Error: " + exc.Message);
        }

        Console.WriteLine("Terminating MiniCrawler.");
    }
}
/// <summary>
/// A minimal interactive crawler: downloads a page, lists the absolute
/// links it contains one at a time, and lets the user follow one of them.
/// </summary>
class MiniCrawler
{
    /// <summary>
    /// Finds the next <c>href="http...</c> attribute value in a page.
    /// </summary>
    /// <param name="htmlstr">The page content to search.</param>
    /// <param name="startloc">
    /// Index at which the search starts. When a link is returned this is
    /// advanced to the index of the link's closing quote; otherwise it is
    /// left unchanged.
    /// </param>
    /// <returns>
    /// The text between the quotes of the attribute, or <c>null</c> when no
    /// further complete link exists.
    /// </returns>
    static string FindLink(string htmlstr, ref int startloc)
    {
        // Case-insensitive search directly on the page avoids building a
        // lower-cased copy of the whole document on every call.
        int i = htmlstr.IndexOf("href=\"http", startloc, StringComparison.OrdinalIgnoreCase);
        if (i == -1)
        {
            return null;
        }

        int start = htmlstr.IndexOf('"', i) + 1;
        int end = htmlstr.IndexOf('"', start);
        if (end == -1)
        {
            // Truncated page / unterminated attribute: there is no complete
            // link left, so report "none" instead of letting Substring throw.
            return null;
        }

        startloc = end;
        return htmlstr.Substring(start, end - start);
    }

    /// <summary>
    /// Interactively crawls starting from <paramref name="uristr"/>. For each
    /// link found the user is prompted: "L" follows the link, "M" shows the
    /// next link, "Q" stops. Crawling also stops when a page has no more links.
    /// </summary>
    /// <param name="uristr">
    /// The URI of the first page to fetch. The request is cast to
    /// <c>HttpWebRequest</c>, so only HTTP(S) URIs are supported.
    /// </param>
    public static void Crawle(string uristr)
    {
        try
        {
            do
            {
                Console.WriteLine("Linking to " + uristr);

                // Create a WebRequest for the given URI.
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uristr);

                // Send the request and read the whole page; the using blocks
                // release the response, stream and reader even on failure.
                string str;
                using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                using (Stream istrm = resp.GetResponseStream())
                using (StreamReader rdr = new StreamReader(istrm))
                {
                    str = rdr.ReadToEnd();
                }

                // Unless the user picks a link to follow, crawling ends after
                // this page (otherwise the same page would be fetched forever).
                uristr = null;

                int curloc = 0; // current search position in the page
                while (true)
                {
                    // Look for the next URI.
                    string link = FindLink(str, ref curloc);
                    if (link == null)
                    {
                        Console.WriteLine("No link found.");
                        break;
                    }

                    Console.WriteLine("发现链接: " + link);
                    Console.Write("Link, More, Quit?");
                    string answer = Console.ReadLine();

                    if (string.Equals(answer, "L", StringComparison.OrdinalIgnoreCase))
                    {
                        uristr = link;
                        break;
                    }

                    if (string.Equals(answer, "Q", StringComparison.OrdinalIgnoreCase))
                    {
                        break;
                    }

                    if (string.Equals(answer, "M", StringComparison.OrdinalIgnoreCase))
                    {
                        Console.WriteLine("Searching for another link.");
                    }
                    // Any other answer simply moves on to the next link.
                }
            } while (uristr != null);
        }
        catch (WebException exc)
        {
            Console.WriteLine("Network Error: " + exc.Message +
                "\nStatus code: " + exc.Status);
        }
        catch (ProtocolViolationException exc)
        {
            Console.WriteLine("Protocol Error: " + exc.Message);
        }
        catch (UriFormatException exc)
        {
            Console.WriteLine("URI Format Error: " + exc.Message);
        }
        catch (NotSupportedException exc)
        {
            Console.WriteLine("Unknown Protocol: " + exc.Message);
        }
        catch (IOException exc)
        {
            Console.WriteLine("I/O Error: " + exc.Message);
        }

        Console.WriteLine("Terminating MiniCrawler.");
    }
}
作者:Lance
出处:http://www.cnblogs.com/nuaalfm/
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文链接,否则保留追究法律责任的权利。