通过HtmlAgilityPack实现网页信息抓取。

1. 下载 Html Agility Pack,解压并保存到本地。下载地址:http://htmlagilitypack.codeplex.com/

 void caijisoufun()
2     {
3         try
4         {
5            
6             HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
7             String str = "http://esf.wuxi.soufun.com/agent/agent/AloneHouseList.aspx?agentid=160148311&housetype=esf&price=&roomtype=&district=&page=1";
8         
9             String htmlstr = fhttp2(str);//获取html页面的源文件
10             doc.LoadHtml(htmlstr);
11             HtmlNode navNode = doc.GetElementbyId("right");//获取id为right的节点
12             //print(navNode.InnerHtml);
13             HtmlNodeCollection categoryNodeList = navNode.SelectNodes("//div[1]/table/tr[1]/td[1]/a[1]"); //分析html结构
14
15             HtmlNode temp = null;
16
17             foreach (HtmlNode categoryNode in categoryNodeList)
18             {
19                 temp = HtmlNode.CreateNode(categoryNode.OuterHtml);
20               
21                 String url = "http://esf.wuxi.soufun.com" + temp.Attributes["href"].Value;
22                 println(url);//其实就是个Response.Write
23                 String showstr = fhttp2(url);
24                 HtmlAgilityPack.HtmlDocument doc2 = new HtmlAgilityPack.HtmlDocument();
25                 doc2.LoadHtml(showstr);
26                 HtmlNode cnode = doc2.GetElementbyId("wrap");
27                 HtmlNode title = cnode.SelectSingleNode("//div[2]/div[1]/h1[1]/font[1]");
28                 println(title.InnerText);//
29                 //这里就可以做很多事情了,包括楼盘户型全部可以通过抓取获得信息,导入自己的数据库。
30                 flush();
31                 sleep(10);
32              
33                 //println(temp.Attributes["href"].Value);
34             }
35            
36         }
37         catch (Exception ex)
38         {
39             println(ex);
40         }
41     }
42
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body,
/// decoded as GB2312 text.
/// </summary>
/// <param name="url">URL to request.</param>
/// <returns>
/// The response text with each line terminated by "\n", converted to a string
/// by the <c>tostr</c> helper (defined elsewhere in the file); an empty string
/// if the request or the read fails for any reason.
/// </returns>
String fhttp2(String url)
{
    try
    {
        StringBuilder sb = new StringBuilder();

        WebRequest rGet = WebRequest.Create(url);

        // using blocks release the response, stream and reader even when
        // reading throws part-way through.
        using (WebResponse rSet = rGet.GetResponse())
        using (Stream s = rSet.GetResponseStream())
        using (StreamReader sr = new StreamReader(s, Encoding.GetEncoding("GB2312")))
        {
            String line;
            while ((line = sr.ReadLine()) != null)
            {
                // Chained Append avoids allocating a temporary string per line.
                sb.Append(line).Append('\n');
            }
        }

        return tostr(sb);
    }
    catch (Exception e)
    {
        // Deliberate best-effort: callers treat "" as "download failed".
        // Leave a trace so the failure is still diagnosable.
        System.Diagnostics.Trace.TraceWarning("fhttp2 failed for " + url + ": " + e.Message);
        return "";
    }
}

posted @ 2012-10-21 21:00  brokge  阅读(311)  评论(0编辑  收藏  举报