Python 爬虫 (三)

 

# Update of the Baidu Translate wrapper function from Chapter 1
from urllib import request, parse
from urllib.error import HTTPError, URLError
# http.cookiejar stores cookies received from responses
from http import cookiejar


class session(object):
    """HTTP session that persists cookies across requests.

    Wraps a cookie-aware ``urllib`` opener so that cookies set by one
    response are sent back on subsequent requests made through the same
    session instance.
    """

    def __init__(self):
        cookie_object = cookiejar.CookieJar()
        # A handler corresponds to one operation: this one extracts any
        # cookies found in a response and stores them in cookie_object.
        handler = request.HTTPCookieProcessor(cookie_object)
        # The opener invokes the handler whenever a response carries
        # cookies, so they accumulate in cookie_object across calls.
        self.opener = request.build_opener(handler)

    def get(self, url, headers=None):
        """GET *url* through this session's cookie-aware opener."""
        return get(url, headers, self.opener)

    def post(self, url, form, headers=None):
        """POST *form* to *url* through this session's cookie-aware opener."""
        return post(url, form, headers, self.opener)


def get(url, headers=None, opener=None):
    """Issue a GET request; return the response body as bytes."""
    return urlrequests(url, headers=headers, opener=opener)


def post(url, form, headers=None, opener=None):
    """Issue a POST request with *form* data; return the body as bytes."""
    return urlrequests(url, form, headers=headers, opener=opener)


def urlrequests(url, form=None, headers=None, opener=None):
    """Fetch *url* and return the response body as a bytes object.

    Parameters:
        url: target URL string.
        form: optional dict of POST fields; when given the request is a
            POST with the urlencoded form as body, otherwise a GET.
        headers: optional header dict; when None, a default desktop
            Chrome User-Agent header is supplied.
        opener: optional opener (e.g. a session's cookie-aware opener);
            when None, ``request.urlopen`` is used directly.

    Network errors (HTTPError / URLError) are printed and an empty
    bytes object is returned — best-effort, never raises.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    # Only supply the default User-Agent when the caller sent no headers
    # (fixed: identity comparison `is None`, not `== None`).
    if headers is None:
        headers = {
            'User-Agent': user_agent
        }
    html_bytes = b''
    try:
        if form:
            # POST: urlencode the form dict to str, then encode to bytes,
            # since Request's `data` parameter requires bytes.
            form_str = parse.urlencode(form, encoding='utf-8')
            form_bytes = form_str.encode('utf-8')
            req = request.Request(url, data=form_bytes, headers=headers)
        else:
            # GET: no body.
            req = request.Request(url, headers=headers)
        if opener:
            response = opener.open(req)
        else:
            response = request.urlopen(req)
        html_bytes = response.read()
    except HTTPError as e:
        print(e)
    except URLError as e:
        print(e)

    return html_bytes


if __name__ == '__main__':
    url = 'http://www.baidu.com'
    html_byte = get(url)
    print(html_byte)

 

posted @ 2018-08-14 22:42  razryang  阅读(260)  评论(2编辑  收藏  举报