Learning the requests module
- The study of the requests module is organized around the following points:
What is the requests module
- requests is a third-party Python module for issuing network requests; its main purpose is to simulate a browser sending requests. It is powerful yet simple and efficient to use, and it dominates the web-scraping field.
Why use the requests module
- The urllib module is inconvenient to use in several ways, summarized below:
- URL encoding must be handled manually
- POST request parameters must be handled manually
- Handling cookies and proxies is cumbersome
- ......
- With the requests module (see the comparison sketch after this list):
- URL encoding is handled automatically
- POST request parameters are handled automatically
- Cookie and proxy handling is simplified
- ......
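A minimal comparison sketch of the first point, assuming a Sogou keyword search (the query value 周杰伦 is only an illustration): urllib needs the query string encoded by hand, while requests encodes a plain params dict for you.
# urllib: the query string has to be URL-encoded manually
from urllib import parse, request
url = 'https://www.sogou.com/web?' + parse.urlencode({'query': '周杰伦', 'ie': 'utf-8'})
page = request.urlopen(url).read()

# requests: pass a plain dict and the library encodes it
import requests
page = requests.get('https://www.sogou.com/web', params={'query': '周杰伦', 'ie': 'utf-8'}).text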
How to use the requests module
- Installation:
- pip install requests
- Usage workflow (a minimal sketch follows this list):
- Specify the URL
- Send the request with the requests module
- Extract the data from the response object
- Persist the data
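A minimal sketch of the four-step workflow, assuming the Sogou home page as the target and an arbitrary output file name:
import requests

# 1. specify the URL
url = 'https://www.sogou.com/'
# 2. send the request with the requests module
response = requests.get(url=url)
# 3. extract the data from the response object
page_text = response.text
# 4. persist the data
with open('./sogou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)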
Learning and reinforcing the module through five requests-based crawler projects
- GET requests with the requests module
- Requirement: crawl the Sogou results page for a given search term
- POST requests with the requests module
- Requirement: log in to Douban Movies and crawl the page shown after a successful login
- AJAX GET requests with the requests module
- Requirement: crawl movie detail data from the Douban category rankings at https://movie.douban.com/
- AJAX POST requests with the requests module
- Requirement: crawl restaurant data for a given location from the KFC store locator at http://www.kfc.com.cn/kfccda/index.aspx
- http://www.kfc.com.cn/kfccda/storelist/index.aspx
- Comprehensive exercise
- Requirement: crawl Sogou Zhihu result pages for a given search term over a given page range
- GET requests with the requests module
Common attributes of the requests response object
import requests

url = 'https://www.sogou.com/'
response = requests.get(url=url)
# text returns the page data of the response as a string
page_data = response.text
# content returns the page data of the response as bytes
print(response.content)
# status_code returns the HTTP status code of the response
print(response.status_code)
# headers returns the response headers
print(response.headers)
# url returns the requested URL
print(response.url)
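One related attribute worth knowing (a small addition to the notes above): response.encoding is the charset requests guessed from the headers, and it can be overridden before reading response.text if a Chinese page comes back garbled.
import requests

response = requests.get('https://www.sogou.com/')
print(response.encoding)       # charset guessed from the response headers
response.encoding = 'utf-8'    # override it if response.text looks garbled
print(response.text[:200])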
- Code examples
Requirement: crawl the Sogou results page for a given search term
import requests

url = 'https://www.sogou.com/web?query=周杰伦&ie=utf-8'
response = requests.get(url=url)
page_text = response.text
with open('./sougou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
import requests

# search keyword entered by the user
word = input('enter a word you want to search:')

# custom request headers (the User-Agent makes the request look like a browser)
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
# specify the URL
url = 'https://www.sogou.com/web'
# wrap the GET request parameters
params = {
    'query': word,
    'ie': 'utf-8'
}
# send the request
response = requests.get(url=url, params=params, headers=headers)
# read the response data
page_text = response.text
with open('./sougou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
Requirement: log in to Douban Movies and crawl the page shown after a successful login
import requests

url = 'https://accounts.douban.com/login'
# wrap the POST request parameters
data = {
    "source": "movie",
    "redir": "https://movie.douban.com/",
    "form_email": "15027900535",
    "form_password": "bobo@15027900535",
    "login": "登录",
}
# custom request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
response = requests.post(url=url, data=data, headers=headers)
page_text = response.text
with open('./douban111.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
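If later requests need to carry the cookies set by the login response, a requests.Session keeps them automatically. A minimal sketch reusing the data and headers dicts from the block above; the post-login URL here is only a hypothetical example:
import requests

session = requests.Session()
# the session stores any cookies returned by the login response
session.post('https://accounts.douban.com/login', data=data, headers=headers)
# subsequent requests through the same session send those cookies automatically
page_text = session.get('https://movie.douban.com/mine', headers=headers).text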
Requirement: crawl movie detail data from the Douban category rankings at https://movie.douban.com/
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests

if __name__ == "__main__":
    # URL of the AJAX GET request (captured with the browser devtools / a packet-capture tool)
    url = 'https://movie.douban.com/j/chart/top_list?'
    # request header fields must be wrapped in a dict
    headers = {
        # customize the User-Agent; other header fields can be customized the same way
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }
    # GET parameters carried by the request (taken from the capture)
    param = {
        'type': '5',
        'interval_id': '100:90',
        'action': '',
        'start': '0',
        'limit': '20'
    }
    # send the GET request and get the response object
    response = requests.get(url=url, headers=headers, params=param)
    # the response content is a JSON string
    print(response.text)
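Since the body is JSON, it can also be parsed directly with response.json() instead of printing the raw string. A small sketch; the 'title' and 'score' keys are assumed field names from the captured response, so verify them against the actual data:
# parse the JSON body into Python objects (this endpoint returns a list of movie dicts)
movies = response.json()
for movie in movies:
    # 'title' and 'score' are assumed field names; adjust after inspecting the response
    print(movie.get('title'), movie.get('score'))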
Requirement: crawl restaurant data for a given location from the KFC store locator at http://www.kfc.com.cn/kfccda/index.aspx
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests

if __name__ == "__main__":
    # URL of the AJAX POST request (captured with the browser devtools / a packet-capture tool)
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    # request header fields must be wrapped in a dict
    headers = {
        # customize the User-Agent; other header fields can be customized the same way
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }
    # POST parameters carried by the request (taken from the capture)
    data = {
        'cname': '',
        'pid': '',
        'keyword': '北京',
        'pageIndex': '1',
        'pageSize': '10'
    }
    # send the POST request and get the response object
    response = requests.post(url=url, headers=headers, data=data)
    # the response content is a JSON string
    print(response.text)
Requirement: crawl Sogou Zhihu result pages for a given search term over a given page range
import requests
import os

# search keyword entered by the user
word = input('enter a word you want to search:')

# start and end page numbers
start_page = int(input('enter start page num:'))
end_page = int(input('enter end page num:'))

# custom request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
# specify the URL
url = 'https://zhihu.sogou.com/zhihu'

# create the output folder
if not os.path.exists('./sougou'):
    os.mkdir('./sougou')

for page in range(start_page, end_page + 1):
    # wrap the GET request parameters
    params = {
        'query': word,
        'ie': 'utf-8',
        'page': str(page)
    }
    # send the GET request and get the response object
    response = requests.get(url=url, params=params, headers=headers)
    # read the page data
    page_text = response.text
    fileName = word + '_' + str(page) + '.html'
    filePath = './sougou/' + fileName
    with open(filePath, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('finished crawling page ' + str(page))
Requirement: Weibo login
# This login flow is adapted from another user's code; it is efficient, but I don't think it generalizes well
import time
import base64
import rsa
import math
import random
import binascii
import requests
import re
from urllib.parse import quote_plus
from code_verification import code_verificate  # local helper wrapping the Yundama captcha-solving service

# build the request headers
agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
cookie = 'SINAGLOBAL=4474016812274.648.1551834025909; Ugrow-G0=e66b2e50a7e7f417f6cc12eec600f517; login_sid_t=4155ebc7ff7585d4cb8b72f5606940c9; cross_origin_proto=SSL; YF-V5-G0=da1eb9ea7ccc47f9e865137ccb4cf9f3; _s_tentry=passport.weibo.com; Apache=2362810556711.141.1552442918345; ULV=1552442918361:2:2:1:2362810556711.141.1552442918345:1551834025935; WBtopGlobal_register_version=ae9a9ec008078a68; un=18301386736; YF-Page-G0=140ad66ad7317901fc818d7fd7743564; UOR=vjudge.net,widget.weibo.com,login.sina.com.cn; TC-V5-G0=b993e9b6e353749ed3459e1837a0ae89; TC-Page-G0=45685168db6903150ce64a1b7437dbbb; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5_Eisgje0ZlIg_NsklAL7I5JpX5K2hUgL.FoM7e054So.Ee0q2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMESheReKzfehqc; SCF=Aqjpvvx8Bbc0McLP1gE6KUoWNHFOzdA7-lIGd4nYy09L5SNxg2aaU8WG1oV1A6QNZB4DWWnkJABYucqfe0hhomQ.; SUB=_2A25xjM37DeRhGeFO6FIY9ifOyDqIHXVS-7gzrDV8PUNbmtAKLRjWkW9NQWM3Io3wOsFpIJ1BUaHJtRVTa3YkPDam; SUHB=0YhRTIYRErWBOm; wvr=6; wb_view_log_7030969236=1366*7681; webim_unReadCount=%7B%22time%22%3A1552521886375%2C%22dm_pub_total%22%3A0%2C%22chat_group_pc%22%3A0%2C%22allcountNum%22%3A9%2C%22msgbox%22%3A0%7D'
headers = {
    'User-Agent': agent,
    'Cookie': cookie,
}
session = requests.session()
# initial page, visited with the cookie attached
index_url = "http://weibo.com/login.php"
yundama_username = ''
yundama_password = ''
verify_code_path = './pincode.png'


def get_pincode_url(pcid):
    size = 0
    url = "http://login.sina.com.cn/cgi/pin.php"
    pincode_url = '{}?r={}&s={}&p={}'.format(url, math.floor(random.random() * 100000000), size, pcid)
    return pincode_url


def get_img(url):
    # download the captcha image to disk
    resp = requests.get(url, headers=headers, stream=True)
    with open(verify_code_path, 'wb') as f:
        for chunk in resp.iter_content(1000):
            f.write(chunk)


def get_su(username):
    """
    The email address or phone number is first run through JavaScript's encodeURIComponent,
    which corresponds to urllib.parse.quote_plus in Python 3,
    then base64-encoded and decoded back to a string.
    """
    username_quote = quote_plus(username)
    username_base64 = base64.b64encode(username_quote.encode("utf-8"))
    return username_base64.decode("utf-8")


# pre-login request that returns servertime, nonce, pubkey and rsakv
def get_server_data(su):
    pre_url = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su="
    pre_url = pre_url + su + "&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)&_="
    prelogin_url = pre_url + str(int(time.time() * 1000))
    pre_data_res = session.get(prelogin_url, headers=headers)
    sever_data = eval(pre_data_res.content.decode("utf-8").replace("sinaSSOController.preloginCallBack", ''))
    return sever_data


# encrypt the password; the scheme mirrors the site's JavaScript encryption code
def get_password(password, servertime, nonce, pubkey):
    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537)  # build the RSA public key
    message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)  # plaintext layout taken from the JS encryption code
    message = message.encode("utf-8")
    passwd = rsa.encrypt(message, key)  # encrypt
    passwd = binascii.b2a_hex(passwd)  # convert the ciphertext to hex
    return passwd


def login(username, password):
    # su is the encoded username
    su = get_su(username)
    sever_data = get_server_data(su)
    servertime = sever_data["servertime"]
    nonce = sever_data['nonce']
    rsakv = sever_data["rsakv"]
    pubkey = sever_data["pubkey"]
    password_secret = get_password(password, servertime, nonce, pubkey)
    postdata = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'useticket': '1',
        'pagerefer': "http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl",
        'vsnf': '1',
        'su': su,
        'service': 'miniblog',
        'servertime': servertime,
        'nonce': nonce,
        'pwencode': 'rsa2',
        'rsakv': rsakv,
        'sp': password_secret,
        'sr': '1366*768',
        'encoding': 'UTF-8',
        'prelt': '115',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }
    need_pin = sever_data['showpin']
    if need_pin == 1:
        # you can also switch this to entering the captcha manually
        if not yundama_username:
            raise Exception('This login requires a captcha; please configure the Yundama username and password at the top of the file ({})'.format(yundama_username))
        pcid = sever_data['pcid']
        postdata['pcid'] = pcid
        img_url = get_pincode_url(pcid)
        get_img(img_url)
        verify_code = code_verificate(yundama_username, yundama_password, verify_code_path)
        postdata['door'] = verify_code
    login_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
    login_page = session.post(login_url, data=postdata, headers=headers)
    login_loop = (login_page.content.decode("GBK"))
    pa = r'location\.replace\([\'"](.*?)[\'"]\)'
    loop_url = re.findall(pa, login_loop)[0]
    login_index = session.get(loop_url, headers=headers)
    uuid = login_index.text
    uuid_pa = r'"uniqueid":"(.*?)"'
    uuid_res = re.findall(uuid_pa, uuid, re.S)[0]
    web_weibo_url = "http://weibo.com/%s/profile?topnav=1&wvr=6&is_all=1" % uuid_res
    weibo_page = session.get(web_weibo_url, headers=headers)
    weibo_pa = r'<title>(.*?)</title>'
    user_name = re.findall(weibo_pa, weibo_page.content.decode("utf-8", 'ignore'), re.S)[0]
    print('Login succeeded; your user name is: ' + user_name)
    print(uuid_res)
    print(uuid)
    # # save the home-page data
    # web_weibo_url22 = r'"redirect":"(.*?)"'
    # uuid_res22 = re.findall(web_weibo_url22, uuid, re.S)[0]
    # uuid_res22 = uuid_res22.replace("\\", '')
    # print(uuid_res22)
    # print(loop_url)
    # response = requests.get(url='https://weibo.com/u/7030969236/home', headers=headers)
    # print(time.ctime())
    # time.sleep(2)
    # print(time.ctime())
    # print(response.content)
    # print(response.text)
    # page_content = response.content
    # page_text = response.text
    # page_text1 = r'"html":"(.*?)"'
    # page_text1 = re.findall(page_text1, page_text, re.S)
    # for i in page_text1:
    #     print(i)
    # with open('./douban111.html', 'wb') as fp:
    #     fp.write(page_content)
    # with open('./weibo.html', 'w', encoding='utf-8') as fp:
    #     fp.write(page_text)


if __name__ == "__main__":
    username = input('Weibo username:')
    password = input('Weibo password:')
    login(username, password)
Requirement: crawl KFC restaurant data and store it in a CSV file (tabular form)
import requests
import json
import csv

url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
headers = {
    'User-Agent': agent,
}
city = input('city name:')
size = input('page size:')
data = {
    'cname': '',
    'pid': '',
    'keyword': city,
    'pageIndex': '1',
    'pageSize': size,
}
response = requests.post(url, data=data, headers=headers)
print(response.text)
dict_text = json.loads(response.text)
# print(type(dict_text))
dict_list = dict_text.get('Table1')

fileName = './kdj.csv'

# # write the data as key/value rows, one pair per line
# with open(fileName, "w", newline="") as csvFile:
#     csvWriter = csv.writer(csvFile)
#     for data_di in dict_list:
#         for k, v in data_di.items():
#             csvWriter.writerow([k, v])

# write the data as a table
with open(fileName, 'w', newline='') as csvf:
    fieldnames = ['rownum', 'storeName', 'addressDetail', 'pro', 'provinceName', 'cityName']
    # extrasaction='ignore' skips any extra fields the API may return
    writer = csv.DictWriter(csvf, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()
    # writer.writerow({'id': 1, 'name': 'lisii', 'age': 22, 'date': 20180627})
    writer.writerows(dict_list)