HttpWebRequest 模拟浏览器访问网站
最近抓网页时报错:
要么返回 The remote server returned an error: (442)
要么返回: 非法访问,您的行为已被WAF系统记录!
要么返回: 非法访问,您的行为已被WAF系统记录!
想了想,就当是人家加了抓网页的东西,于是改了一下方法 加上Request.Header 之类的东西就行了。
具体加什么,咱们可以先用 fildder 抓一下包就可以了如:
GET http://www.baidu.com/ HTTP/1.1
Host: www.baidu.com
Connection: keep-alive
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Host: www.baidu.com
Connection: keep-alive
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
1 public static string GetHtml() 2 { 3 string url = "http://www.baidu.com"; 4 string Html = string.Empty;//初始化新的webRequst 5 HttpWebRequest Request = (HttpWebRequest)WebRequest.Create(url); 6 Request.Timeout = 300000; 7 Request.ReadWriteTimeout = 300000; 8 // Request.ImpersonationLevel = TokenImpersonationLevel.Anonymous; 9 10 Request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5"); 11 // Request.Headers.Add("Accept-Encoding", "gzip, deflate"); 12 13 Request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate; 14 Request.KeepAlive = true; 15 Request.ProtocolVersion = HttpVersion.Version11; 16 Request.Method = "GET"; 17 Request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"; 18 Request.Host = "www.baidu.com"; 19 //Request.Accept = "text/json,*/*;q=0.5"; 20 //Request.Headers.Add("Accept-Charset", "utf-8;q=0.7,*;q=0.7"); 21 //Request.Headers.Add("Accept-Encoding", "gzip, deflate, x-gzip, identity; q=0.9"); 22 Request.UserAgent = @"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"; 23 Request.Referer = url; 24 Request.IfModifiedSince = DateTime.UtcNow; 25 26 HttpWebResponse htmlResponse = (HttpWebResponse)Request.GetResponse(); 27 //从Internet资源返回数据流 28 Stream htmlStream = htmlResponse.GetResponseStream(); 29 // Stream htmlStream = new System.IO.Compression.GZipStream(htmlResponse.GetResponseStream(), System.IO.Compression.CompressionMode.Decompress); 30 //读取数据流 31 StreamReader weatherStreamReader = new StreamReader(htmlStream, Encoding.GetEncoding("gb2312")); 32 //读取数据 33 Html = weatherStreamReader.ReadToEnd(); 34 weatherStreamReader.Close(); 35 htmlStream.Close(); 36 htmlResponse.Close(); 37 //针对不同的网站查看html源文件 38 return Html; 39 }
再加一段PHP的代码: 在不修改本页面utf-8编码的情况下如何让抓取的gb2312页面不乱码。
$headers = array(); $headers[] = 'X-Apple-Tz: 0'; $headers[] = 'X-Apple-Store-Front: 143444,12'; $headers[] = 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'; $headers[] = 'Accept-Encoding: gzip, deflate'; $headers[] = 'Accept-Language: en-US,en;q=0.5'; $headers[] = 'Cache-Control: no-cache'; $headers[] = 'Content-Type: application/x-www-form-urlencoded; charset=gb2312';//utf-8 $headers[] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'; $dat = cUrlGetData($url, $post_fields, $headers);
function cUrlGetData($url, $post_fields = null, $headers = null) { $ch = curl_init(); $timeout = 50000; curl_setopt($ch, CURLOPT_URL, $url); if ($post_fields && !empty($post_fields)) { curl_setopt($ch, CURLOPT_POST, 1); curl_setopt($ch, CURLOPT_POSTFIELDS, $post_fields); } if ($headers && !empty($headers)) { curl_setopt($ch, CURLOPT_HTTPHEADER, $headers); } curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout); curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');//这个是解释gzip内容................. $data = curl_exec($ch); if (curl_errno($ch)) { echo 'Error:' . curl_error($ch); } curl_close($ch); return $data; } //php脚本开始 /*POST请求远程内容函数*/ function ppost($url,$data,$ref){ // 模拟提交数据函数 $curl = curl_init(); // 启动一个CURL会话 curl_setopt($curl, CURLOPT_URL, $url); // 要访问的地址 curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 0); // 对认证证书来源的检查 curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 1); // 从证书中检查SSL加密算法是否存在 curl_setopt($curl, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']); // 模拟用户使用的浏览器 curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1); // 使用自动跳转 curl_setopt($curl, CURLOPT_REFERER, $ref); curl_setopt($curl, CURLOPT_POST, 1); // 发送一个常规的Post请求 curl_setopt($curl, CURLOPT_POSTFIELDS, $data); // Post提交的数据包 curl_setopt($curl, CURLOPT_COOKIEFILE,$GLOBALS ['cookie_file']); // 读取上面所储存的Cookie信息 curl_setopt($curl, CURLOPT_COOKIEJAR, $GLOBALS['cookie_file']); // 存放Cookie信息的文件名称 curl_setopt($curl, CURLOPT_HTTPHEADER,array('Accept-Encoding: gzip, deflate')); curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');//这个是解释gzip内容................. curl_setopt($curl, CURLOPT_TIMEOUT, 30); // 设置超时限制防止死循环 curl_setopt($curl, CURLOPT_HEADER, 0); // 显示返回的Header区域内容 curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); // 获取的信息以文件流的形式返回 $tmpInfo = curl_exec($curl); // 执行操作 if (curl_errno($curl)) { echo 'Errno'.curl_error($curl); } curl_close($curl); // 关键CURL会话 return $tmpInfo; // 返回数据 }