HttpWebRequest 模拟浏览器访问网站

最近抓网页时报错:
要么返回 The remote server returned an error: (442)
要么返回: 非法访问,您的行为已被WAF系统记录!
想了想,就当是人家加了抓网页的东西,于是改了一下方法 加上Request.Header 之类的东西就行了。
具体加什么,咱们可以先用 fildder 抓一下包就可以了如:
 
GET http://www.baidu.com/ HTTP/1.1
Host: www.baidu.com
Connection: keep-alive
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
  

  

 1 public static string GetHtml()
 2         {
 3             string url = "http://www.baidu.com";
 4             string Html = string.Empty;//初始化新的webRequst
 5             HttpWebRequest Request = (HttpWebRequest)WebRequest.Create(url);
 6             Request.Timeout = 300000;
 7             Request.ReadWriteTimeout = 300000;
 8          //   Request.ImpersonationLevel = TokenImpersonationLevel.Anonymous;
 9           
10             Request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
11           //  Request.Headers.Add("Accept-Encoding", "gzip, deflate");
12       
13             Request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;           
14             Request.KeepAlive = true;
15             Request.ProtocolVersion = HttpVersion.Version11;
16             Request.Method = "GET";
17             Request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
18             Request.Host = "www.baidu.com";
19             //Request.Accept = "text/json,*/*;q=0.5";
20             //Request.Headers.Add("Accept-Charset", "utf-8;q=0.7,*;q=0.7");
21             //Request.Headers.Add("Accept-Encoding", "gzip, deflate, x-gzip, identity; q=0.9");
22             Request.UserAgent = @"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36";
23             Request.Referer = url;
24             Request.IfModifiedSince = DateTime.UtcNow;
25 
26             HttpWebResponse htmlResponse = (HttpWebResponse)Request.GetResponse();
27             //从Internet资源返回数据流
28              Stream htmlStream = htmlResponse.GetResponseStream();
29            // Stream htmlStream = new System.IO.Compression.GZipStream(htmlResponse.GetResponseStream(), System.IO.Compression.CompressionMode.Decompress);
30             //读取数据流
31             StreamReader weatherStreamReader = new StreamReader(htmlStream, Encoding.GetEncoding("gb2312"));
32             //读取数据
33             Html = weatherStreamReader.ReadToEnd();
34             weatherStreamReader.Close();
35             htmlStream.Close();
36             htmlResponse.Close();
37             //针对不同的网站查看html源文件
38             return Html;
39         }      

 

再加一段PHP的代码: 在不修改本页面utf-8编码的情况下如何让抓取的gb2312页面不乱码。

$headers = array();
$headers[] = 'X-Apple-Tz: 0';
$headers[] = 'X-Apple-Store-Front: 143444,12';
$headers[] = 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
$headers[] = 'Accept-Encoding: gzip, deflate';
$headers[] = 'Accept-Language: en-US,en;q=0.5';
$headers[] = 'Cache-Control: no-cache';
$headers[] = 'Content-Type: application/x-www-form-urlencoded; charset=gb2312';//utf-8
$headers[] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36';

$dat = cUrlGetData($url, $post_fields, $headers);
function cUrlGetData($url, $post_fields = null, $headers = null) { $ch = curl_init(); $timeout = 50000; curl_setopt($ch, CURLOPT_URL, $url); if ($post_fields && !empty($post_fields)) { curl_setopt($ch, CURLOPT_POST, 1); curl_setopt($ch, CURLOPT_POSTFIELDS, $post_fields); } if ($headers && !empty($headers)) { curl_setopt($ch, CURLOPT_HTTPHEADER, $headers); } curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout); curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');//这个是解释gzip内容................. $data = curl_exec($ch); if (curl_errno($ch)) { echo 'Error:' . curl_error($ch); } curl_close($ch); return $data; } //php脚本开始 /*POST请求远程内容函数*/ function ppost($url,$data,$ref){ // 模拟提交数据函数 $curl = curl_init(); // 启动一个CURL会话 curl_setopt($curl, CURLOPT_URL, $url); // 要访问的地址 curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 0); // 对认证证书来源的检查 curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 1); // 从证书中检查SSL加密算法是否存在 curl_setopt($curl, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']); // 模拟用户使用的浏览器 curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1); // 使用自动跳转 curl_setopt($curl, CURLOPT_REFERER, $ref); curl_setopt($curl, CURLOPT_POST, 1); // 发送一个常规的Post请求 curl_setopt($curl, CURLOPT_POSTFIELDS, $data); // Post提交的数据包 curl_setopt($curl, CURLOPT_COOKIEFILE,$GLOBALS ['cookie_file']); // 读取上面所储存的Cookie信息 curl_setopt($curl, CURLOPT_COOKIEJAR, $GLOBALS['cookie_file']); // 存放Cookie信息的文件名称 curl_setopt($curl, CURLOPT_HTTPHEADER,array('Accept-Encoding: gzip, deflate')); curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');//这个是解释gzip内容................. curl_setopt($curl, CURLOPT_TIMEOUT, 30); // 设置超时限制防止死循环 curl_setopt($curl, CURLOPT_HEADER, 0); // 显示返回的Header区域内容 curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); // 获取的信息以文件流的形式返回 $tmpInfo = curl_exec($curl); // 执行操作 if (curl_errno($curl)) { echo 'Errno'.curl_error($curl); } curl_close($curl); // 关键CURL会话 return $tmpInfo; // 返回数据 }

 

 

posted on 2018-06-22 12:01  newr2006  阅读(3822)  评论(1编辑  收藏  举报

导航