随笔- 157 文章- 2 评论- 640 阅读- 38万

mini爬虫程序

Code
class MiniCrawler
    {
        // Finds the next absolute link (an href="http..." attribute value) in an
        // HTML string.
        //
        // htmlstr:  the page content to scan.
        // startloc: index at which the search starts. When a link is found it is
        //           advanced to the index of that link's closing quote, so the
        //           next call continues after it; otherwise it is left unchanged.
        //
        // Returns the link text, or null when no complete link remains (including
        // a trailing href whose closing quote is missing).
        static string FindLink(string htmlstr, ref int startloc)
        {
            if (string.IsNullOrEmpty(htmlstr) || startloc < 0 || startloc >= htmlstr.Length)
            {
                return null;
            }

            // Search the original text case-insensitively instead of lower-casing
            // a copy, so the index found always refers to htmlstr itself.
            const string attributeOpen = "href=\"";
            int attributeAt = htmlstr.IndexOf(attributeOpen + "http", startloc,
                StringComparison.OrdinalIgnoreCase);
            if (attributeAt == -1)
            {
                return null;
            }

            int start = attributeAt + attributeOpen.Length;
            int end = htmlstr.IndexOf('"', start);
            if (end == -1)
            {
                // Unterminated attribute value (e.g. truncated page): no usable link.
                return null;
            }

            startloc = end;
            return htmlstr.Substring(start, end - start);
        }

        // Downloads the whole page at the given URI and returns it as text.
        // The response, its stream and the reader are disposed before returning,
        // even when reading fails.
        private static string DownloadPage(string uri)
        {
            // Create a WebRequest for the specified URI.
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);

            // Send the request and read the entire response body.
            using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
            using (Stream istrm = resp.GetResponseStream())
            using (StreamReader rdr = new StreamReader(istrm))
            {
                return rdr.ReadToEnd();
            }
        }

        // Interactive crawler: downloads the page at uristr, shows each absolute
        // link it contains and asks the user what to do with it:
        //   L - follow the link (it becomes the next page to download)
        //   M - show the next link on the current page
        //   Q - quit
        // Any other answer is treated like M without the confirmation message.
        // Crawling also stops when the current page has no more links or when
        // console input is closed. Network, protocol, URI and I/O errors are
        // reported on the console and end the crawl.
        public static void Crawle(string uristr)
        {
            try
            {
                string current = uristr;
                while (current != null)
                {
                    Console.WriteLine("Linking to " + current);
                    string page = DownloadPage(current);

                    // The URI has been used; unless the user chooses to follow
                    // a link below, there is nothing further to crawl.
                    current = null;

                    int curloc = 0; // current search position within the page
                    while (true)
                    {
                        // Look for the next URI.
                        string link = FindLink(page, ref curloc);
                        if (link == null)
                        {
                            Console.WriteLine("No link found.");
                            break;
                        }

                        Console.WriteLine("发现链接: " + link);
                        Console.Write("Link, More, Quit?");
                        string answer = Console.ReadLine();

                        // A null answer means console input was closed: treat as quit.
                        if (answer == null ||
                            string.Equals(answer, "Q", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }

                        if (string.Equals(answer, "L", StringComparison.OrdinalIgnoreCase))
                        {
                            current = link;
                            break;
                        }

                        if (string.Equals(answer, "M", StringComparison.OrdinalIgnoreCase))
                        {
                            Console.WriteLine("Searching for another link.");
                        }
                    }
                }
            }
            catch (WebException exc)
            {
                Console.WriteLine("Network Error: " + exc.Message +
                "\nStatus code: " + exc.Status);
            }
            catch (ProtocolViolationException exc)
            {
                Console.WriteLine("Protocol Error: " + exc.Message);
            }
            catch (UriFormatException exc)
            {
                Console.WriteLine("URI Format Error: " + exc.Message);
            }
            catch (NotSupportedException exc)
            {
                Console.WriteLine("Unknown Protocol: " + exc.Message);
            }
            catch (IOException exc)
            {
                Console.WriteLine("I/O Error: " + exc.Message);
            }
            Console.WriteLine("Terminating MiniCrawler.");
        }
    }