python之路: Web Scraping, a Supplement on the requests Module
I. The Response Object
1. Response attributes
import requests

response = requests.get('http://www.jianshu.com')

# Response attributes
print(response.text)                # body decoded as text
print(response.content)             # body as raw bytes
print(response.status_code)         # HTTP status code
print(response.headers)             # response headers
print(response.cookies)             # cookies
print(response.cookies.get_dict())  # cookies as a dict
print(response.cookies.items())     # cookies as a list of tuples
print(response.url)                 # final URL
print(response.history)             # redirect history
print(response.encoding)            # encoding used to decode the body

# Closing the connection: response.close()
from contextlib import closing
with closing(requests.get('xxx', stream=True)) as response:
    for line in response.iter_content():
        pass
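Besides inspecting status_code by hand, requests can raise an exception for error responses. A minimal sketch of that, reusing the URL from above:

import requests

response = requests.get('http://www.jianshu.com')
print(response.ok)           # True for any status code below 400
response.raise_for_status()  # raises requests.exceptions.HTTPError for 4xx/5xx responses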
2. Response encoding
As shown above, response.encoding returns the encoding of the response; assigning to it changes the encoding used to decode the body, as in the following example:
import requests

response = requests.get('http://www.autohome.com/news')
response.encoding = 'gbk'
print(response.text)

# Note: the Autohome site returns pages encoded as gb2312, while requests falls back to
# ISO-8859-1 by default, so the Chinese text comes out garbled unless the encoding is set to gbk.
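If you would rather not hard-code the charset, requests can also guess it from the body via response.apparent_encoding. A small sketch of that approach, reusing the Autohome URL:

import requests

response = requests.get('http://www.autohome.com/news')
# apparent_encoding is detected from the response body, which is usually more
# reliable than the ISO-8859-1 fallback taken from the headers
response.encoding = response.apparent_encoding
print(response.text)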
3. Fetching binary data
Method 1: response.content
import requests

response = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000.jpg')

with open('a.jpg', 'wb') as f:
    f.write(response.content)
Method 2: the stream=True parameter
Fetch the data piece by piece. When downloading a video, for example, if the file is 100 GB it is unreasonable to load it all at once with response.content and then write it to disk; instead, pass stream=True and read the data chunk by chunk, writing each chunk to the file as it arrives.
import requests

response = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000.jpg', stream=True)

with open('b.jpg', 'wb') as f:
    for chunk in response.iter_content():  # yields the body in small chunks instead of all at once
        f.write(chunk)
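For genuinely large files it is common to pass an explicit chunk_size to iter_content and skip empty keep-alive chunks. A sketch along those lines (the output file name is just a placeholder):

import requests

response = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000.jpg', stream=True)

with open('c.jpg', 'wb') as f:
    for chunk in response.iter_content(chunk_size=1024):  # read roughly 1 KB at a time
        if chunk:  # filter out keep-alive chunks
            f.write(chunk)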
4. JSON parsing
import requests
import json

response = requests.get('http://httpbin.org/get')

res1 = json.loads(response.text)  # the roundabout way
res2 = response.json()            # get the parsed JSON directly
print(res1 == res2)               # True
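httpbin echoes back whatever query string it receives, which makes it handy for checking that params and response.json() work together. A quick sketch with arbitrary example parameters:

import requests

response = requests.get('http://httpbin.org/get', params={'name': 'egon', 'age': 18})
data = response.json()  # parsed into a Python dict
print(data['args'])     # httpbin echoes the query args back as strings, e.g. {'age': '18', 'name': 'egon'}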
II. Advanced Usage
1. SSL certificate verification
# Certificate verification (most sites are HTTPS)
import requests

response = requests.get('https://www.12306.cn')
# For an HTTPS request, the certificate is checked first; if it is invalid,
# an error is raised and the program terminates.

# Improvement 1: suppress the error, but a warning is still printed
import requests

response = requests.get('https://www.12306.cn', verify=False)  # skip verification; warns, returns 200
print(response.status_code)

# Improvement 2: suppress both the error and the warning
import requests
from requests.packages import urllib3

urllib3.disable_warnings()  # silence the warning
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)

# Improvement 3: supply a client certificate
# Many HTTPS sites can be accessed without a certificate; in most cases it is optional.
# Zhihu, Baidu and the like work either way.
# Some sites require one: only designated users who hold the certificate may access them.
import requests

response = requests.get('https://www.12306.cn',
                        cert=('/path/server.crt', '/path/key'))
print(response.status_code)
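verify does not have to be a boolean: it can also point at a CA bundle file, and it can be set once on a Session so it applies to every request. A sketch assuming a hypothetical bundle path:

import requests

session = requests.session()
session.verify = '/path/to/ca_bundle.pem'  # hypothetical CA bundle path, used by every request on this session
# session.verify = False                   # or disable verification session-wide
response = session.get('https://www.12306.cn')
print(response.status_code)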
2. Using proxies
# Official docs: http://docs.python-requests.org/en/master/user/advanced/#proxies
# Proxy setup: the request is sent to the proxy first, and the proxy forwards it
# on your behalf (getting an IP banned is a common occurrence).
import requests

proxies = {
    # 'http': 'http://egon:123@localhost:9743',  # proxy with credentials: user and password come before the @
    'http': 'http://localhost:9743',
    'https': 'https://localhost:9743',
}
response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)

# SOCKS proxies are supported too; install with: pip install requests[socks]
import requests

proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port',
}
response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)
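Proxies can also be attached to a Session so that every request made through it is routed the same way. A sketch using the same placeholder proxy addresses as above:

import requests

session = requests.session()
session.proxies.update({
    'http': 'http://localhost:9743',    # placeholder proxy addresses
    'https': 'https://localhost:9743',
})
response = session.get('https://www.12306.cn')
print(response.status_code)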
3. Timeout settings
# Two forms of timeout: a float or a tuple
# timeout=0.1         # a single value applies to both connecting and reading
# timeout=(0.1, 0.2)  # 0.1 is the connection timeout, 0.2 the read timeout
import requests

response = requests.get('https://www.baidu.com', timeout=0.0001)
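A timeout this small will almost certainly fail, so in practice the request is wrapped in a try/except for requests.exceptions.Timeout. A minimal sketch:

import requests
from requests.exceptions import Timeout

try:
    response = requests.get('https://www.baidu.com', timeout=(0.1, 0.2))  # (connect timeout, read timeout)
    print(response.status_code)
except Timeout:
    print('request timed out')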
III. Example: Scraping Job Listings from Lagou
This example consists of two main steps: first pass the login check (logging in is required for the résumé-submission step), then search for positions that match the given criteria and automatically submit a résumé to each one. As follows:
import requests
import re

# 1. Log in
session = requests.session()

# Step 1: GET the login page
response = session.get("http://passport.lagou.com/login/login.html",
                       headers={
                           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
                       )
# Pull the anti-forgery values needed for the login request out of the returned page
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", response.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", response.text, re.S)[0]
print(X_Anti_Forge_Token, X_Anti_Forge_Code)

# Step 2: POST the login form
response = session.post("http://passport.lagou.com/login/login.json",
                        headers={
                            'Referer': 'http://passport.lagou.com/login/login.html',
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
                            'X-Anit-Forge-Code': X_Anti_Forge_Code,
                            'X-Anit-Forge-Token': X_Anti_Forge_Token,
                            'X-Requested-With': 'XMLHttpRequest'
                        },
                        data={
                            'isValidate': True,
                            'username': '18829916141',
                            'password': 'f19ae09c9070aa9906ec6b287ed86583'
                            # Submitting the correct password with a wrong username is one way to
                            # capture the hashed form of the password from the browser traffic
                        }
                        )

# Step 3: GET to confirm the login state
session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
            headers={
                'Referer': 'https://passport.lagou.com/login/login.html',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
            }
            )

# 2. Scrape the positions
from urllib.parse import urlencode

# URL-encode the keyword and build the Referer-style URL
params = {'kd': '爬虫工程师'}
res = urlencode(params, encoding='utf-8').split('=')[-1]
# Encoded result: kd=%E7%88%AC%E8%99%AB%E5%B7%A5%E7%A8%8B%E5%B8%88
print(res)
url = "https://www.lagou.com/jobs/list_" + res

# Step 1: POST the search conditions and get the result set
# (Lagou's job data comes from an AJAX POST fired by the page that was fetched with GET)
response = session.post("https://www.lagou.com/jobs/positionAjax.json",
                        headers={
                            'Referer': url,
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                        },
                        params={
                            'gj': '5-10年',
                            'xl': '本科',
                            'px': 'default',
                            'yx': '25k-50k',
                            'city': '北京'
                        })
print(response.json())
result = response.json()['content']['positionResult']['result']

for company_info in result:
    fullname = company_info['companyFullName']
    emp_num = company_info['companySize']
    salary = company_info['salary']
    workyear = company_info['workYear']
    positionName = company_info['positionName']
    positionId = company_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)  # detail page URL for the position

    # Request the detail page
    response = session.get(detail_url,
                           headers={
                               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                           })
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", response.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", response.text, re.S)[0]

    # Submit the résumé
    session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     'Referer': detail_url,
                     'X-Anit-Forge-Code': X_Anti_Forge_Code,
                     'X-Anit-Forge-Token': X_Anti_Forge_Token,
                     'X-Requested-With': 'XMLHttpRequest',
                 },
                 data={
                     'positionId': positionId,
                     'type': 1,
                     'force': True
                 }
                 )
    print('Résumé submitted:', detail_url)
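Logging in on every run is wasteful; one option is to dump the session's cookies to disk after a successful login and load them back next time. A sketch of that idea, assuming a hypothetical cookies.json file and using the requests.utils helpers:

import json
import requests

COOKIE_FILE = 'cookies.json'  # hypothetical file name

def save_cookies(session):
    # convert the session's CookieJar into a plain dict and write it out
    with open(COOKIE_FILE, 'w') as f:
        json.dump(requests.utils.dict_from_cookiejar(session.cookies), f)

def load_cookies(session):
    # read the dict back and turn it into a CookieJar on the session
    with open(COOKIE_FILE) as f:
        session.cookies = requests.utils.cookiejar_from_dict(json.load(f))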