
Crawler learning notes: catching errors

http://blog.csdn.net/column/details/why-bug.html

 

1. Fetching data by simulating a browser

import urllib.request

req = urllib.request.Request('http://www.baidu.com')   # build the request object
response = urllib.request.urlopen(req)                  # send it and receive the response
the_page = response.read()                              # read the response body (bytes)
print(the_page)

 

Result:

b'<!DOCTYPE html><html><body><script type="text/javascript">var u=\'https://www.baidu.com/?tn=91774473_hao_pg&lans=132\',ua=navigator.userAgent.toLowerCase();if(u.indexOf(\'baidu.com\')>0||u.indexOf(\'360.cn\')>0||u.indexOf(\'hao123.com\')>0){var cc = document.cookie.split(\';\');for(var i=0;i<cc.length;i++){var name = cc[i].split("=")[0];document.cookie = name + \'=; path=/; domain=.baidu.com; expires=Thu, 01 Jan 1970 00:00:01 GMT;\';}}if(ua.indexOf(\'applewebkit\')>0){var h = document.createElement(\'a\');h.rel = \'noreferrer\';h.href = u;document.body.appendChild(h);var evt = document.createEvent("MouseEvents");evt.initEvent("click", true, true);h.dispatchEvent(evt);} else {document.write(\'<meta http-equiv="Refresh" Content="0; Url=\' + u + \'" >\');}</script></body></html>'
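
The request above does not really identify itself as a browser. A minimal sketch of the same fetch with a User-Agent header added (the header string below is just an illustrative value, not something from the original post):

import urllib.request

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}   # example browser string
req = urllib.request.Request('http://www.baidu.com', headers=headers)
response = urllib.request.urlopen(req)
print(response.read()[:200])   # only the first 200 bytes, to keep the output short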

2. Sending form parameters along with the request

The following is for Python 2 only (it uses urllib/urllib2 and does not work on Python 3 and above):

import urllib
import urllib2

url = 'http://www.someserver.com/register.cgi'

values = {'name' : 'WHY',
          'location' : 'SDU',
          'language' : 'Python' }

data = urllib.urlencode(values)    # URL-encode the form data
req = urllib2.Request(url, data)   # send the request together with the form data
response = urllib2.urlopen(req)    # receive the response
the_page = response.read()         # read the response body

 

This version works on Python 3:

import urllib.parse
import urllib.request

url = 'http://www.baidu.com/s'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'name': 'WHY',
          'location': 'SDU',
          'language': 'Python',
          'ie': 'utf-8',
          'wd': 'python'}
headers = {'User-Agent': user_agent}

data = urllib.parse.urlencode(values)                # build the query string
# data = data.encode('utf-8')                        # only needed if sending data as a POST body
req = urllib.request.Request(url + '?' + data, headers=headers)   # GET request with a custom User-Agent
response = urllib.request.urlopen(req)
the_page = response.read()
print(the_page.decode('utf-8'))
 
Result:
<script>var cc = document.cookie.split(';');for(var i=0;i<cc.length;i++){var name = cc[i].split("=")[0];document.cookie = name + '=; path=/; domain=.baidu.com; expires=Thu, 01 Jan 1970 00:00:01 GMT;';}document.cookie = 'BDRCVFR[sxyN8Jn90s_]=mk3SLVN4HKm; path=/; domain=.baidu.com';window.location.href='http://www.baidu.com/s?location=SDU&wd=python&language=Python&name=WHY&ie=utf-8&tn=91783609_hao_pg&oras=1&tn=sogouie_dg';</script>
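
The commented-out encode line above points at the POST variant: in Python 3 the form data has to be encoded to bytes before it is passed as the data argument. A minimal sketch of that variant, reusing the register.cgi example from the Python 2 code:

import urllib.parse
import urllib.request

url = 'http://www.someserver.com/register.cgi'
values = {'name': 'WHY', 'location': 'SDU', 'language': 'Python'}
data = urllib.parse.urlencode(values).encode('utf-8')   # bytes are required for POST in Python 3

req = urllib.request.Request(url, data)   # passing data makes this a POST request
response = urllib.request.urlopen(req)
the_page = response.read()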
 
 
3. Catching errors
 
(1) URLError
 

import urllib.error
import urllib.request

req = urllib.request.Request('http://www.baibai.com')

try:
    urllib.request.urlopen(req)
except urllib.error.URLError as e:
    print(e.reason)
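
A URLError can also come from a connection timeout, not only from an unreachable host. A small sketch with a timeout added (the 3-second value is just an example):

import urllib.error
import urllib.request

req = urllib.request.Request('http://www.baibai.com')
try:
    urllib.request.urlopen(req, timeout=3)   # give up after 3 seconds
except urllib.error.URLError as e:
    print(e.reason)                          # e.g. DNS failure or timed out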

 

(2) HTTPError

import urllib.request
import urllib.error

req = urllib.request.Request('http://bbs.csdn.net/callmewhy')

try:
    urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
    print(e.code)
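
The HTTPError object carries more than the status code: it also behaves like a response, so the error page body and headers can be read from it. A minimal sketch along those lines:

import urllib.request
import urllib.error

req = urllib.request.Request('http://bbs.csdn.net/callmewhy')
try:
    urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
    print(e.code)           # numeric status code, e.g. 404
    print(e.headers)        # headers sent with the error response
    print(e.read()[:200])   # first bytes of the error page body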


 

(3) Catching both. Note that the except HTTPError clause must come before except URLError, because HTTPError is a subclass of URLError.

First approach:

from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

req = Request('http://bbs.csdn.net/callmewhy')

try:
    response = urlopen(req)
except HTTPError as e:
    print('The server couldn\'t fulfill the request.')
    print('Error code: ', e.code)
except URLError as e:
    print('We failed to reach a server.')
    print('Reason: ', e.reason)
else:
    print('No exception was raised.')
    # everything is fine

 

 

Second approach:

from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

req = Request('http://bbs.csdn.net/callmewhy')

try:
    response = urlopen(req)
except URLError as e:
    if hasattr(e, 'code'):
        print('The server couldn\'t fulfill the request.')
        print('Error code: ', e.code)
    elif hasattr(e, 'reason'):
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
else:
    print('No exception was raised.')
    # everything is fine
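
Because HTTPError is a subclass of URLError, either approach can be wrapped into a small reusable helper. A sketch of such a wrapper (the fetch name is just illustrative, not from the original post):

from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

def fetch(url):
    """Return the response body as bytes, or None if the request fails."""
    try:
        return urlopen(Request(url)).read()
    except HTTPError as e:       # the server answered, but with an error status
        print('Error code: ', e.code)
    except URLError as e:        # the server could not be reached at all
        print('Reason: ', e.reason)
    return None

page = fetch('http://bbs.csdn.net/callmewhy')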
