Using the requests library (a staple for web scraping)
The Requests library
Requests is a library rewritten on top of urllib, with a simpler API.
Example:
import requests

response = requests.get('http://www.baidu.com')  # GET request
print(response.status_code, response.url, response.cookies, response.text, sep='\n')
import requests

response = requests.post('http://httpbin.org/post')  # POST request
print(response.text)  # .text always returns a string
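The other HTTP verbs follow the same calling pattern; a quick sketch against httpbin's matching echo endpoints:

import requests

# put/delete/head are called exactly like get/post
print(requests.put('http://httpbin.org/put', data={'k': 'v'}).status_code)
print(requests.delete('http://httpbin.org/delete').status_code)
print(requests.head('http://httpbin.org/get').status_code)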
GET request with parameters
import requests

data = {
    'name': 'abc',
    'age': 15
}
response = requests.get('http://httpbin.org/get', params=data)  # for a POST, pass the parameters as data=data
print(response.text)
# Or splice the parameters directly into the URL
import requests

response = requests.get('http://httpbin.org/get?name=adas&age=12')
print(response.text)
Converting the response to JSON
import requests
import json

response = requests.get('http://httpbin.org/get')
print(response.json())
print(json.loads(response.text))  # equivalent to the line above
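Worth noting: .json() raises a ValueError when the body is not valid JSON, so defensive code may want to catch it. A small sketch, using httpbin's /html endpoint (which returns HTML, not JSON):

import requests

response = requests.get('http://httpbin.org/html')  # returns HTML, not JSON
try:
    data = response.json()
except ValueError:  # raised when the body cannot be parsed as JSON
    data = None
print(data)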
Fetching binary data
import requests

response = requests.get('https://weibo.com/favicon.ico')
print(response.content)
with open('weibo.ico', 'wb') as f:
    f.write(response.content)
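For large files, reading the whole body into memory via .content can be wasteful. A minimal sketch of a chunked download using stream=True and iter_content (the chunk size here is an arbitrary choice):

import requests

# Stream the body instead of buffering it all at once
response = requests.get('https://weibo.com/favicon.ico', stream=True)
with open('weibo.ico', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):  # 8 KB per chunk
        f.write(chunk)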
Adding HTTP headers
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
}
response = requests.get('https://zhihu.com/', headers=headers)
print(response.status_code)
print(response.text)
POST request with parameters
import requests

data = {
    'name': 'wang',
    'age': 88
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
}
response = requests.post('http://httpbin.org/post', data=data, headers=headers)
print(response.text)
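data= sends a form-encoded body. If the endpoint expects a JSON body instead, requests can serialize a dict for you via the json= keyword, which also sets the Content-Type header; a small sketch:

import requests

payload = {'name': 'wang', 'age': 88}
response = requests.post('http://httpbin.org/post', json=payload)  # sends application/json
print(response.json()['json'])  # httpbin echoes the parsed JSON body back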
Some attributes of the response
import requests

response = requests.get('http://www.baidu.com')
print(response.headers)
print(response.text)
print(response.status_code)
print(response.content)
print(response.cookies)
# ... the rest are omitted
Checking the response status code
import requests

response = requests.get('http://www.baidu.com')
# if response.status_code == requests.codes.ok:
if response.status_code == 200:
    print('ok')
else:
    print('error')
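Instead of comparing the code by hand, requests can raise an exception for error responses; a sketch using raise_for_status:

import requests
from requests.exceptions import HTTPError

response = requests.get('http://www.baidu.com')
try:
    response.raise_for_status()  # raises HTTPError for 4xx/5xx status codes
    print('ok')
except HTTPError as e:
    print('error:', e)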
File upload
import requests

files = {
    'file': open('weibo.ico', 'rb')
}
response = requests.post('http://httpbin.org/post', files=files)
print(response.text)
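If you need to control the uploaded filename or MIME type, files also accepts a (filename, fileobj, content_type) tuple; a minimal sketch:

import requests

files = {
    # (filename, file object, content type)
    'file': ('weibo.ico', open('weibo.ico', 'rb'), 'image/x-icon')
}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)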
Getting cookies
import requests

response = requests.get('http://www.baidu.com')
cook = response.cookies
print(type(cook))
for key, value in cook.items():
    print(key, '=', value)
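Cookies can also be sent with a request via the cookies= keyword; a quick sketch against httpbin, which echoes back the cookies it received:

import requests

cookies = {'name': '123'}
response = requests.get('http://httpbin.org/cookies', cookies=cookies)
print(response.text)  # httpbin echoes the received cookies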
Maintaining a session
This is the basis for simulating a login.
import requests

s = requests.Session()  # create a Session object; requests made through it share state
s.get('http://httpbin.org/cookies/set/name/123')  # sets a cookie on the session
response = s.get('http://httpbin.org/cookies')  # the cookie persists within the session
print(response.text)
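A typical login flow with a Session looks like the sketch below; the URL and form field names are hypothetical and depend entirely on the target site:

import requests

s = requests.Session()
# Hypothetical login endpoint and form fields; adjust for the real site
login_data = {'username': 'user', 'password': '123'}
s.post('http://example.com/login', data=login_data)
# Later requests reuse the session cookies set during login
response = s.get('http://example.com/profile')
print(response.status_code)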
Certificate verification
import requests
from requests.packages import urllib3

urllib3.disable_warnings()  # suppress the InsecureRequestWarning
response = requests.get('https://www.12306.cn', verify=False)  # skip certificate verification
print(response.status_code)
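Rather than disabling verification outright, verify can also point at a CA bundle file; the path below is hypothetical:

import requests

# verify accepts a path to a CA bundle instead of True/False (path is hypothetical)
response = requests.get('https://www.12306.cn', verify='/path/to/ca-bundle.pem')
print(response.status_code)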
Proxy IPs
import requests

proxies = {
    'http': 'http://47.89.10.103:80/'
}
response = requests.get('http://www.geogle.com', proxies=proxies)
print(response.text)
Proxy IPs that require a password
import requests

proxies = {
    'http': 'http://user:password@47.89.10.103:80'  # credentials go as user:password@host:port
}
response = requests.get('http://www.geogle.com', proxies=proxies)
print(response.text)
Timeout settings
import requests
from requests.exceptions import ReadTimeout

try:
    response = requests.get('https://taobao.com', timeout=0.1)
except ReadTimeout as e:
    print('timeout')
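timeout can also be a (connect, read) tuple to bound the two phases separately; catching the broader Timeout covers both connect and read timeouts. A small sketch:

import requests
from requests.exceptions import Timeout

try:
    # 3 s to establish the connection, 10 s to receive the response
    response = requests.get('https://taobao.com', timeout=(3, 10))
    print(response.status_code)
except Timeout:
    print('timeout')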
Authentication (for sites that require a login before they can be viewed)
import requests
from requests.auth import HTTPBasicAuth  # note: HTTPBasicAuth lives in requests.auth

response = requests.get('http://115.44.48.789:8888', auth=HTTPBasicAuth('user', '123'))
print(response.status_code)
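requests also accepts a plain (user, password) tuple as shorthand for HTTP Basic auth:

import requests

# A tuple defaults to HTTP Basic auth, equivalent to HTTPBasicAuth('user', '123')
response = requests.get('http://115.44.48.789:8888', auth=('user', '123'))
print(response.status_code)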
Exception handling
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException

try:
    response = requests.get('http://www.baidu.com', timeout=0.5)
    print(response.status_code)
except ReadTimeout:
    print('timeout')
except ConnectionError:
    print('connection error')
except RequestException:  # base class for all requests exceptions
    print('request error')