python3爬虫之requests库基本使用
官方文档链接(中文)
https://2.python-requests.org/zh_CN/latest/
requests 基于 urllib3 ,python编写。
安装 pip install requests (python3)
anaconda 版本 用pip安装 要在 anaconda prompt 里打入安装命令
提示Requirement already satisfied: requests in xxxxxx 表示 已经安装了
import requests

# Basic GET request against Baidu's homepage.
r = requests.get('http://www.baidu.com')

# The response object type (requests.models.Response)
print(type(r))
# HTTP status code of the response
print(r.status_code)
# r.text is already a str -- no decode() step needed (unlike urllib)
print(type(r.text))
# Page source as text
print(r.text)
# Cookies sent back by the server
print(r.cookies)
requests.get
import requests

# Query-string parameters are passed as a dict via `params`;
# requests URL-encodes them as ?name=germey&age=22.
payload = {'name': 'germey', 'age': 22}
r = requests.get('http://httpbin.org/get', params=payload)
print(r.text)
输出结果:
{ "args": { "age": "22", "name": "germey" }, "headers": { "Accept": "*/*", "Accept-Encoding": "gzip, deflate", "Host": "httpbin.org", "User-Agent": "python-requests/2.21.0" }, "origin": "xxx.xxx.xxx.xxx, xxx.xxx.xxx.xxx", "url": "https://httpbin.org/get?name=germey&age=22" }
可以看到,在原始网址后增加了
?name=germey&age=22
问号后的便是params数据
解析json
import requests
import json

# Demonstrate parsing a JSON response body.
r = requests.get('http://httpbin.org/get')
print(type(r.text))
# The next two calls produce the same dict:
# response.json() is a shortcut for json.loads(response.text)
print(r.json())
print(json.loads(r.text))
print(type(r.json()))
输出结果:
<class 'str'> {'args': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.21.0'}, 'origin': 'xxx.xxx.xxx.xxx', 'url': 'https://httpbin.org/get'} {'args': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.21.0'}, 'origin': 'xxx.xxx.xxx.xxx', 'url': 'https://httpbin.org/get'} <class 'dict'>
可以看到用json解析后的为字典类型
获取二进制文件
# Download a binary file and save it to disk.
import requests

r = requests.get('https://github.com/favicon.ico')
# response.content holds the raw bytes of the body
print(type(r.content))

# Open mode 'wb': w = write, b = binary (r would be read).
# The with-statement closes the file automatically on exit, so the
# explicit f.close() the original called inside the block is redundant
# and has been removed.
with open('favicon.ico', 'wb') as f:
    f.write(r.content)  # write the downloaded bytes to favicon.ico
headers
import requests

# With a browser User-Agent header, Zhihu answers 200.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0'
}
r = requests.get('https://www.zhihu.com/explore', headers=headers)
print(r.status_code)

# Without the header the same request is rejected with 400.
r = requests.get('https://www.zhihu.com/explore')
print(r.status_code)
User-Agent表示的是 浏览器的客户端信息
http://www.useragentstring.com/
中可以查询
# Common attributes of a Response object.
r = requests.get('http://www.baidu.com')
for attr in ('status_code', 'headers', 'cookies', 'url', 'history'):
    print(getattr(r, attr))
# Upload a file as multipart/form-data via the `files` argument.
# The original opened the file inline and never closed it (handle leak);
# the with-statement guarantees it is closed after the request.
with open('favicon.ico', 'rb') as icon:
    response = requests.post('http://httpbin.org/post', files={'file': icon})
print(response.text)
# Inspect the cookies the server set on the response.
import requests

resp = requests.get('http://www.baidu.com')
print(resp.cookies)
# RequestsCookieJar supports dict-style iteration.
for name, val in resp.cookies.items():
    print(name + '=' + val)
# Miscellaneous: proxy and timeout settings.
# Pick ONE of the proxy dict styles below, as appropriate.
import requests
from requests.exceptions import ReadTimeout

# HTTP proxy requiring basic auth.
# (The original had an unterminated string literal here -- `...xxxx/: }` --
# which was a syntax error; fixed to a properly closed URL string.)
proxies = {
    "http": "http://user:password@xxx.xxx.xxx.xxx:xxxx/",
}
# SOCKS5 proxy (needs the requests[socks] extra installed).
proxies = {
    "http": "socks5://xxx.xxx.xxx.xxx:xxxx",
    "https": "socks5://xxx.xxx.xxx.xxx:xxxx",
}
# Plain HTTP/HTTPS proxy.
proxies = {
    "http": "http://xxx.xxx.xxx.xxx:xxxx",
    "https": "https://xxx.xxx.xxx.xxx:xxxx",
}

# `timeout` is in seconds; if the server takes longer, the request is
# aborted and a ReadTimeout exception is raised.
r = requests.get('url', proxies=proxies, timeout=1)

# Handle the timeout explicitly with try/except.
try:
    r = requests.get('url', proxies=proxies, timeout=1)
    print(r.status_code)
except ReadTimeout:
    print('timeout')
# HTTP Basic authentication.
import requests
from requests.auth import HTTPBasicAuth

r = requests.get('http://xxx.xxx.xxx.xxx:xxxx')
print(r.status_code)  # 401 means the resource requires authentication

# The original was missing the closing ')' of this requests.get call
# (syntax error); fixed. The tuple form auth=('user', 'passwd') is
# shorthand for auth=HTTPBasicAuth('user', 'passwd').
r = requests.get('http://xxx.xxx.xxx.xxx:xxxx', auth=('user', 'passwd'))
print(r.status_code)