通过HtmlAgilityPack实现网页信息抓取
1. 下载 Html Agility Pack,解压并保存到本地。下载地址:http://htmlagilitypack.codeplex.com/
/// <summary>
/// Fetches page 1 of one agent's listing index on esf.wuxi.soufun.com, follows every
/// listing link found there, and prints each listing's URL and title.
/// </summary>
/// <remarks>
/// Depends on helpers that are not part of this snippet: fhttp2 (HTTP GET returning the
/// page source), println, flush and sleep. A listing whose link or detail page cannot be
/// parsed is reported and skipped so that the remaining listings are still processed.
/// Any other exception is caught and printed; nothing is thrown to the caller.
/// </remarks>
void caijisoufun()
{
    try
    {
        String siteRoot = "http://esf.wuxi.soufun.com";
        String str = "http://esf.wuxi.soufun.com/agent/agent/AloneHouseList.aspx?agentid=160148311&housetype=esf&price=&roomtype=&district=&page=1";

        // fhttp2 returns the HTML source of the page, or "" when the request fails.
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(fhttp2(str));

        // The listing table lives under the element with id "right".
        HtmlNode navNode = doc.GetElementbyId("right");
        if (navNode == null)
        {
            println("Element with id 'right' not found in " + str);
            return;
        }

        // NOTE(review): an XPath that starts with "//" is presumably evaluated against the
        // whole document rather than relative to navNode; ".//" may have been intended.
        // Left unchanged - confirm against the live markup before altering it.
        HtmlNodeCollection categoryNodeList = navNode.SelectNodes("//div[1]/table/tr[1]/td[1]/a[1]");
        if (categoryNodeList == null)
        {
            // SelectNodes yields null (not an empty collection) when nothing matches.
            println("No listing links found in " + str);
            return;
        }

        foreach (HtmlNode categoryNode in categoryNodeList)
        {
            // Read href straight from the matched anchor; re-parsing its OuterHtml is unnecessary.
            HtmlAgilityPack.HtmlAttribute href = categoryNode.Attributes["href"];
            if (href == null)
            {
                println("Listing link without href skipped");
                continue;
            }

            String url = siteRoot + href.Value;
            println(url); // println is effectively a Response.Write

            HtmlAgilityPack.HtmlDocument doc2 = new HtmlAgilityPack.HtmlDocument();
            doc2.LoadHtml(fhttp2(url));

            HtmlNode cnode = doc2.GetElementbyId("wrap");
            HtmlNode title = null;
            if (cnode != null)
            {
                // NOTE(review): same leading "//" caveat as for the listing XPath above.
                title = cnode.SelectSingleNode("//div[2]/div[1]/h1[1]/font[1]");
            }

            if (title != null)
            {
                println(title.InnerText);
            }
            else
            {
                println("Title not found for " + url);
            }

            // From here much more can be done: listing details, floor plans, etc. can all
            // be scraped and imported into your own database.
            flush();
            sleep(10); // pause between detail requests; unit is defined by the sleep helper
        }
    }
    catch (Exception ex)
    {
        println(ex);
    }
}
42
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text,
/// decoded with the GB2312 encoding.
/// </summary>
/// <param name="url">Address to request.</param>
/// <returns>
/// The response body with every line terminated by "\n", converted to a string by the
/// tostr helper (defined outside this snippet), or an empty string if any exception
/// occurs while creating the request, fetching or reading the response.
/// </returns>
String fhttp2(String url)
{
    try
    {
        StringBuilder sb = new StringBuilder();
        WebRequest rGet = WebRequest.Create(url);

        // using blocks release the response, stream and reader even when reading throws;
        // previously they were closed only on the success path.
        using (WebResponse rSet = rGet.GetResponse())
        using (Stream s = rSet.GetResponseStream())
        using (StreamReader sr = new StreamReader(s, Encoding.GetEncoding("GB2312")))
        {
            String line;
            while ((line = sr.ReadLine()) != null)
            {
                // ReadLine strips the original terminator; normalise every line to "\n".
                sb.Append(line).Append('\n');
            }
        }

        return tostr(sb);
    }
    catch (Exception)
    {
        // Best-effort contract relied on by callers: any failure yields an empty page.
        return "";
    }
}