关于抓取百度搜索内容
在一次模拟抓取百度搜索内容的实战中,我写下了这篇文章,用来记录遇到的几个问题。
(嘿嘿,没什么技术含量,只是记录遇到的麻烦和思路)
在C#中,为了实现网页内容的抓取,我们一般使用 System.Net 下的一些常用类,如
HttpWebRequest、HttpWebResponse等等。。。。。
为了抓取网页内容,首先我们要根据我们的目的来分析这个网站。。。
目的:根据关键字来抓取百度搜索的前50条记录(排除推广)
抓取内容:标题、链接、显示内容
分析结果:
搜索链接 : http://www.baidu.com/s?wd=关键字&rn=pagesize
显示结果 :采用每条记录一个table来显示内容,table的 id从1开始递增。。
根据我们的目的和分析结果,我们就知道这次抓取任务很简单啦,由于我们只要抓取50条记录,那么就可以把链接
修改成 http://www.baidu.com/s?wd=关键字&rn=50 ,然后直接带上关键字访问这个链接就可以了。。
此时第一个问题出现:
中文编码:
刚开始我以为使用 UrlEncode把中文编码一下就可以了,结果发现返回的结果出现乱码,又用firebug看了下返回的编码是utf8的,于是修改我使用的编码,采用utf8,结果发现除了我的关键字其他都是正确的,晕啦,还有这种事情哇?
终于在一次无意中查看到,原来正常请求都是按照gb2312来传的,所以我们得把“关键字”转成gb2312编码的格式再去请求就可以了
方法一:
string keyWord = System.Web.HttpUtility.UrlEncode(_keyWord, System.Text.UnicodeEncoding.GetEncoding("GB2312")).ToUpper();
方法二:(一个朋友的方法)
/// <summary>
/// Percent-encodes every byte of <paramref name="_str"/> after converting it
/// to the named encoding (for example "GB2312").
/// </summary>
/// <param name="_str">The text to encode.</param>
/// <param name="enconding">Name of the target encoding, as accepted by <c>Encoding.GetEncoding</c>.</param>
/// <returns>The encoded text, one <c>%XX</c> escape (two upper-case hex digits) per byte.</returns>
public string ConvertText(string _str, string enconding)
{
    byte[] buffer = Encoding.GetEncoding(enconding).GetBytes(_str);

    // Three output characters per input byte.
    StringBuilder encoded = new StringBuilder(buffer.Length * 3);
    foreach (byte b in buffer)
    {
        // X2 pads to two digits; plain X would turn 0x0A into "%A", which is not a valid escape.
        encoded.AppendFormat("%{0:X2}", b);
    }

    return encoded.ToString();
}
当我们正确请求拿到html字符串的时候就可以去分析处理了,原先我采用的是用HttpWebRequest来请求并返回html字符串的方式,发现取里面的内容挺麻烦,尤其正则表达式写起来还挺烦的呢。
此时出现第二个问题:(也不算问题啦,只能算思路)
改用WebBrowser代替HttpWebRequest来请求
由于我们知道我们要根据结果table的 id来取得里面的内容,所以我放弃了原先的请求方式,采用了WebBrowser,原因很简单,WebBrowser里可以直接根据id来获取对象来直接操作,嘿嘿,岂不是比我们手写正则来匹配方便?而且正则还不太好写,还真有点麻烦。。。
使用 webbrowser 遇到的问题,就是 DocumentCompleted事件会调用多次
解决办法:
判断ReadyState == WebBrowserReadyState.Complete
if (myBrowser.ReadyState != WebBrowserReadyState.Complete)
{
return;
}
一般都可以解决,如果还是有问题,就像我的一样,那就定义一个变量判断变量的值好了。。。嘿嘿
到此为止 这次实战存在的2个问题全部得以解决,顺利抓取到想要的数据。。。
附:封装的httphelper
1 using System;
2 using System.Data;
3 using System.Configuration;
4 using System.Web;
5 using System.Net;
6 using System.IO;
7 using System.Threading;
8 using System.Text;
9
namespace End.AdWords
{
    /// <summary>
    /// Thin wrapper around <see cref="HttpWebRequest"/> that downloads a page as text
    /// (or as a raw stream), sharing one cookie container, a fixed set of request
    /// headers and a simple retry policy across calls.
    /// </summary>
    /// <remarks>
    /// The text-returning methods never throw: failures are written to the console and
    /// reported through the return value (an empty string or a fixed message).
    /// </remarks>
    public class HttpHelper
    {
        #region Private fields
        // Messages returned by the POST/GET-with-data overload; callers may compare against them.
        private const string AccessFailedMessage = "访问失败!";
        private const string TimeoutMessage = "操作超时";

        // Timeouts (milliseconds) used by the overload that sends data.
        private const int RequestStreamTimeout = 50000;
        private const int ResponseTimeout = 20000;

        private CookieContainer cc;
        private string contentType = "application/x-www-form-urlencoded";
        private string accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
        private string userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729)";
        private Encoding encoding = Encoding.GetEncoding("UTF-8");
        private int delay = 500;
        private int maxTry = 3;
        #endregion

        #region Properties
        /// <summary>
        /// Cookie container used by the overloads that do not take one explicitly.
        /// </summary>
        public CookieContainer CookieContainer
        {
            get
            {
                return cc;
            }
        }

        /// <summary>
        /// Encoding used to decode response bodies (UTF-8 by default).
        /// </summary>
        public Encoding Encoding
        {
            get
            {
                return encoding;
            }
            set
            {
                encoding = value;
            }
        }

        /// <summary>
        /// Configured network delay value (defaults to 500). No request method in this
        /// class currently reads it.
        /// </summary>
        public int NetworkDelay
        {
            get
            {
                return delay;
            }
            set
            {
                delay = value;
            }
        }

        /// <summary>
        /// Number of retries allowed after a failed attempt; a request is therefore
        /// attempted at most <c>MaxTry + 1</c> times (defaults to 3).
        /// </summary>
        public int MaxTry
        {
            get
            {
                return maxTry;
            }
            set
            {
                maxTry = value;
            }
        }
        #endregion

        #region Constructors
        /// <summary>
        /// Initializes a new instance of the <see cref="HttpHelper"/> class with a fresh
        /// cookie container and the default headers.
        /// </summary>
        public HttpHelper()
            : this(new CookieContainer())
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="HttpHelper"/> class that shares
        /// an existing cookie container.
        /// </summary>
        /// <param name="cc">The cookie container to use for requests.</param>
        public HttpHelper(CookieContainer cc)
        {
            this.cc = cc;
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="HttpHelper"/> class with custom
        /// headers and a fresh cookie container.
        /// </summary>
        /// <param name="contentType">Value of the Content-Type header.</param>
        /// <param name="accept">Value of the Accept header.</param>
        /// <param name="userAgent">Value of the User-Agent header.</param>
        public HttpHelper(string contentType, string accept, string userAgent)
            : this(new CookieContainer(), contentType, accept, userAgent)
        {
            // Chaining guarantees the cookie container is never left null here.
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="HttpHelper"/> class with custom
        /// headers and an existing cookie container.
        /// </summary>
        /// <param name="cc">The cookie container to use for requests.</param>
        /// <param name="contentType">Value of the Content-Type header.</param>
        /// <param name="accept">Value of the Accept header.</param>
        /// <param name="userAgent">Value of the User-Agent header.</param>
        public HttpHelper(CookieContainer cc, string contentType, string accept, string userAgent)
        {
            this.cc = cc;
            this.contentType = contentType;
            this.accept = accept;
            this.userAgent = userAgent;
        }
        #endregion

        #region Public methods
        /// <summary>
        /// Downloads the HTML of a page, sending <paramref name="postData"/> with the request.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <param name="postData">Data to send; when null or empty a plain GET is issued.</param>
        /// <param name="isPost">
        /// True to send the data as a POST body; false to issue a GET with the data
        /// appended to the query string (a GET cannot carry a request body).
        /// </param>
        /// <param name="cookieContainer">Cookie container for the request.</param>
        /// <returns>
        /// The response text; "访问失败!" when the status is not 200 OK; "操作超时" when
        /// reading the response fails; an empty string when the request could not be
        /// prepared or sent after all retries.
        /// </returns>
        public string GetHtml(string url, string postData, bool isPost, CookieContainer cookieContainer)
        {
            if (string.IsNullOrEmpty(postData))
            {
                return GetHtml(url, cookieContainer);
            }

            int attempt = 0;
            while (true)
            {
                attempt++;

                try
                {
                    HttpWebRequest request = PrepareDataRequest(url, postData, isPost, cookieContainer);

                    try
                    {
                        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                        {
                            if (response.StatusCode != HttpStatusCode.OK)
                            {
                                return AccessFailedMessage;
                            }

                            return ReadToEnd(response);
                        }
                    }
                    catch (Exception responseError)
                    {
                        // Response-phase failures are reported, not retried, as before.
                        LogError(responseError);
                        return TimeoutMessage;
                    }
                }
                catch (Exception requestError)
                {
                    LogError(requestError);

                    // The counter is local, so the loop always terminates.
                    if (attempt > maxTry)
                    {
                        return string.Empty;
                    }
                }
            }
        }

        /// <summary>
        /// Downloads the HTML of a page with a GET request.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <param name="cookieContainer">Cookie container for the request.</param>
        /// <returns>
        /// The response text with its lines concatenated (line terminators removed),
        /// or an empty string when every attempt failed.
        /// </returns>
        public string GetHtml(string url, CookieContainer cookieContainer)
        {
            int attempt = 0;
            while (true)
            {
                attempt++;

                try
                {
                    HttpWebRequest request = CreateRequest(url, "GET", cookieContainer);

                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    using (Stream responseStream = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(responseStream, encoding))
                    {
                        StringBuilder html = new StringBuilder();
                        string line;

                        // ReadLine() == null is the reliable end-of-stream test; Peek() can
                        // report "no data" early on network streams and truncate the page.
                        while ((line = reader.ReadLine()) != null)
                        {
                            html.Append(line);
                        }

                        return html.ToString();
                    }
                }
                catch (Exception e)
                {
                    LogError(e);

                    if (attempt > maxTry)
                    {
                        return string.Empty;
                    }
                }
            }
        }

        /// <summary>
        /// Downloads the HTML of a page with a GET request, using this instance's cookie container.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <returns>The response text, or an empty string when every attempt failed.</returns>
        public string GetHtml(string url)
        {
            return GetHtml(url, cc);
        }

        /// <summary>
        /// Same as <see cref="GetHtmlByXiAHang(string, CookieContainer)"/>, using this
        /// instance's cookie container.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <returns>The response text, or an empty string when every attempt failed.</returns>
        public string GetHtmlByXiAHang(string url)
        {
            return GetHtmlByXiAHang(url, cc);
        }

        /// <summary>
        /// Downloads the HTML of a page with a GET request that carries an extra
        /// <c>content: text/html; charset=GB2312</c> request header.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <param name="cookieContainer">Cookie container for the request.</param>
        /// <returns>The full response text, or an empty string when every attempt failed.</returns>
        public string GetHtmlByXiAHang(string url, CookieContainer cookieContainer)
        {
            int attempt = 0;
            while (true)
            {
                attempt++;

                try
                {
                    HttpWebRequest request = CreateRequest(url, "GET", cookieContainer);
                    request.Headers.Add("content", "text/html; charset=GB2312");

                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        return ReadToEnd(response);
                    }
                }
                catch (Exception e)
                {
                    LogError(e);

                    if (attempt > maxTry)
                    {
                        return string.Empty;
                    }
                }
            }
        }

        /// <summary>
        /// Downloads the HTML of a page, sending <paramref name="postData"/> and using
        /// this instance's cookie container.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <param name="postData">Data to send; when null or empty a plain GET is issued.</param>
        /// <param name="isPost">True to POST the data; false to append it to the query string of a GET.</param>
        /// <returns>See <see cref="GetHtml(string, string, bool, CookieContainer)"/>.</returns>
        public string GetHtml(string url, string postData, bool isPost)
        {
            return GetHtml(url, postData, isPost, cc);
        }

        /// <summary>
        /// Opens the response body of a GET request as a stream.
        /// </summary>
        /// <param name="url">Address of the resource.</param>
        /// <param name="cookieContainer">Cookie container for the request.</param>
        /// <returns>
        /// The response stream, which the caller must close, or null when every attempt failed.
        /// </returns>
        public Stream GetStream(string url, CookieContainer cookieContainer)
        {
            int attempt = 0;
            while (true)
            {
                attempt++;

                try
                {
                    HttpWebRequest request = CreateRequest(url, "GET", cookieContainer);
                    request.Referer = url;

                    HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                    try
                    {
                        return response.GetResponseStream();
                    }
                    catch
                    {
                        // Do not leak the connection if the stream cannot be obtained.
                        response.Close();
                        throw;
                    }
                }
                catch (Exception e)
                {
                    LogError(e);

                    if (attempt > maxTry)
                    {
                        return null;
                    }
                }
            }
        }
        #endregion

        #region Private helpers
        /// <summary>
        /// Creates a request for <paramref name="url"/> with the configured headers applied.
        /// </summary>
        private HttpWebRequest CreateRequest(string url, string method, CookieContainer cookieContainer)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.CookieContainer = cookieContainer;
            request.ContentType = contentType;
            request.Accept = accept;
            request.UserAgent = userAgent;
            request.Method = method;
            return request;
        }

        /// <summary>
        /// Builds the request for the data-sending overload and, for POST, uploads the body.
        /// </summary>
        private HttpWebRequest PrepareDataRequest(string url, string postData, bool isPost, CookieContainer cookieContainer)
        {
            string target = isPost ? url : AppendQuery(url, postData);

            HttpWebRequest request = CreateRequest(target, isPost ? "POST" : "GET", cookieContainer);
            request.Referer = url;
            request.Timeout = RequestStreamTimeout;

            if (isPost)
            {
                // The body uses the system default encoding, as before; the Encoding
                // property only controls how responses are decoded.
                byte[] payload = System.Text.Encoding.Default.GetBytes(postData);
                request.ContentLength = payload.Length;

                using (Stream body = request.GetRequestStream())
                {
                    body.Write(payload, 0, payload.Length);
                }
            }

            request.Timeout = ResponseTimeout;
            return request;
        }

        /// <summary>
        /// Appends <paramref name="query"/> to <paramref name="url"/> with the right separator.
        /// </summary>
        private static string AppendQuery(string url, string query)
        {
            if (url.IndexOf('?') < 0)
            {
                return url + "?" + query;
            }

            char last = url[url.Length - 1];
            if (last == '?' || last == '&')
            {
                return url + query;
            }

            return url + "&" + query;
        }

        /// <summary>
        /// Reads the whole response body as text using the configured encoding.
        /// </summary>
        private string ReadToEnd(HttpWebResponse response)
        {
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, encoding))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Writes a time-stamped error line to the console in red, then restores the previous colour.
        /// </summary>
        private static void LogError(Exception e)
        {
            ConsoleColor previous = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine(DateTime.Now.ToString("HH:mm:ss ") + e.Message);
            Console.ForegroundColor = previous;
        }
        #endregion
    }
}