爬虫学习 错误获取
http://blog.csdn.net/column/details/why-bug.html
1.模拟浏览器获取数据
import urllib.request

# Build a request object for the target page.
req = urllib.request.Request('http://www.baidu.com')
# Use the response as a context manager so the connection is released
# as soon as the body has been read.
with urllib.request.urlopen(req) as response:
    the_page = response.read()
# read() returns bytes, so this prints a b'...' literal.
print(the_page)
结果:
b'<!DOCTYPE html><html><body><script type="text/javascript">var u=\'https://www.baidu.com/?tn=91774473_hao_pg&lans=132\',ua=navigator.userAgent.toLowerCase();if(u.indexOf(\'baidu.com\')>0||u.indexOf(\'360.cn\')>0||u.indexOf(\'hao123.com\')>0){var cc = document.cookie.split(\';\');for(var i=0;i<cc.length;i++){var name = cc[i].split("=")[0];document.cookie = name + \'=; path=/; domain=.baidu.com; expires=Thu, 01 Jan 1970 00:00:01 GMT;\';}}if(ua.indexOf(\'applewebkit\')>0){var h = document.createElement(\'a\');h.rel = \'noreferrer\';h.href = u;document.body.appendChild(h);var evt = document.createEvent("MouseEvents");evt.initEvent("click", true, true);h.dispatchEvent(evt);} else {document.write(\'<meta http-equiv="Refresh" Content="0; Url=\' + u + \'" >\');}</script></body></html>'
2.在发送请求的同时,向服务器提交一部分参数(表单数据)
下面这段代码仅适用于 Python 2,不适用于 Python 3 及以上版本
# Python 2 only: this snippet relies on urllib2 and urllib.urlencode.
import urllib
import urllib2

url = 'http://www.someserver.com/register.cgi'
# Form fields submitted together with the request.
values = {
    'name': 'WHY',
    'location': 'SDU',
    'language': 'Python',
}
# URL-encode the form fields.
data = urllib.urlencode(values)
# Build the request and attach the encoded form data to it.
req = urllib2.Request(url, data)
# Send the request and receive the server's reply.
response = urllib2.urlopen(req)
# Read the body of the reply.
the_page = response.read()
下面是对应的 Python 3 版本:
import urllib.parse
import urllib.request

url = 'http://www.baidu.com/s'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
    'name': 'WHY',
    'location': 'SDU',
    'language': 'Python',
    'ie': 'utf-8',
    'wd': 'python',
}
headers = {'User-Agent': user_agent}

# Encode the parameters as a query string (a str, not bytes).
data = urllib.parse.urlencode(values)
# The parameters travel in the URL, so this is a GET request. To POST them
# instead, pass data.encode('UTF8') as the second argument to Request.
# The custom User-Agent header is sent along with the request.
req = urllib.request.Request(url + '?' + data, headers=headers)
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(req) as response:
    the_page = response.read()
# Decode the raw bytes before printing so readable text is shown.
print(the_page.decode('UTF8'))
import urllib.error
import urllib.request

# Demonstrates URLError handling: e.reason describes why the request failed.
req = urllib.request.Request('http://www.baibai.com')
try:
    # Only the failure path matters here, so a successful response is
    # closed immediately instead of being left open.
    urllib.request.urlopen(req).close()
except urllib.error.URLError as e:
    print(e.reason)
(2)HTTPError
import urllib.request
import urllib.error

# Demonstrates HTTPError handling: e.code is the HTTP status code
# returned by the server.
req = urllib.request.Request('http://bbs.csdn.net/callmewhy')
try:
    # Only the failure path matters here, so a successful response is
    # closed immediately instead of being left open.
    urllib.request.urlopen(req).close()
except urllib.error.HTTPError as e:
    print(e.code)
//
(3)同时捕获两种异常:注意 except HTTPError 一定要放在 except URLError 前面,因为 HTTPError 是 URLError 的子类
第一种方式:
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

req = Request('http://bbs.csdn.net/callmewhy')
try:
    response = urlopen(req)
except HTTPError as e:
    # HTTPError is a subclass of URLError, so it must be caught first;
    # otherwise the URLError clause below would handle it.
    print ('The server couldn\'t fulfill the request.')
    print ('Error code: ', e.code)
except URLError as e:
    print ('We failed to reach a server.')
    print ('Reason: ', e.reason)
else:
    # everything is fine
    print ('No exception was raised.')
    # Release the connection now that the request has succeeded.
    response.close()
第二种方式:
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

req = Request('http://bbs.csdn.net/callmewhy')
try:
    response = urlopen(req)
except URLError as e:
    # A single clause handles both error kinds; 'code' is checked before
    # 'reason' to single out errors that carry an HTTP status code.
    if hasattr(e, 'code'):
        print ('The server couldn\'t fulfill the request.')
        print ('Error code: ', e.code)
    elif hasattr(e, 'reason'):
        print ('We failed to reach a server.')
        print ('Reason: ', e.reason)
else:
    # everything is fine
    print ('No exception was raised.')
    # Release the connection now that the request has succeeded.
    response.close()