II. Crawlers: the requests module, the urllib module, and request/response parameters
I. Using the requests module, and request/response parameters
https://www.cnblogs.com/wupeiqi/articles/6283017.html
1. requests GET request parameters
import requests

url = 'http://httpbin.org/get?name=bob'   # request URL; parameters can be appended after the "?"
params = {'name': 'nick', 'age': '18'}    # query params; they coexist with the ones already on the URL with no priority -- a duplicated key ends up as a list of values
cookies = {'xxx': '111', 'yyy': '222'}    # cookie values; if headers also carries a Cookie, the one in headers is used
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",  # without a User-Agent the request is detected as a Python client
    "Cookie": 'aaa=aaa;bbb=bbb'
}
proxies = {'http': 'http://ip:port'}      # proxy address in this fixed format; use an "https" key for HTTPS
timeout = 0.5                             # timeout: the request raises an error if it takes longer than this
allow_redirects = True                    # used with redirects: whether to follow them

res = requests.get(url=url, headers=headers, params=params, cookies=cookies,
                   timeout=timeout, allow_redirects=allow_redirects)
print(res.text)

{
  "args": {
    "age": "18",
    "name": [
      "bob",
      "nick"
    ]
  },
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Cookie": "aaa=aaa;bbb=bbb",
    "Host": "httpbin.org",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
  },
  "origin": "117.172.254.245, 117.172.254.245",
  "url": "https://httpbin.org/get?name=bob&name=nick&age=18"
}
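As a quick check of the duplicate-key rule noted in the comments above, res.url shows both values of name surviving in the final query string (a minimal sketch, assuming httpbin.org is reachable):

import requests

res = requests.get('http://httpbin.org/get?name=bob', params={'name': 'nick', 'age': '18'})
print(res.url)  # http://httpbin.org/get?name=bob&name=nick&age=18 -- both "name" values kept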
2. requests POST request parameters
import requests

url = 'http://httpbin.org/post'   # request URL
data = {
    'name': 'nick',               # form data
    'age': '18',
}
json = {"sex": 'man'}             # JSON-formatted data; when data is also sent, the "json" field in the response stays null
files = {'file': open('aa', 'rt', encoding='utf8')}   # file data
cookies = {
    'xxx': 'xxx',
    'yyy': 'yyy'
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    "Cookie": 'aaa=aaa;bbb=bbb'
}
timeout = 0.5             # timeout
allow_redirects = True    # whether to follow redirects

res = requests.post(url=url, headers=headers, data=data, cookies=cookies, json=json, files=files)
print(res.text)

{
  "args": {},
  "data": "",
  "files": {
    "file": "1111111111111111111111111111\u5a03\u5a03\u8ba4\u4e3a\u4eba"
  },
  "form": {
    "age": "18",
    "name": "nick"
  },
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Content-Length": "356",
    "Content-Type": "multipart/form-data; boundary=e4ee34734e2325fdc6fa1eb84d070882",
    "Cookie": "aaa=aaa;bbb=bbb",
    "Host": "httpbin.org",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
  },
  "json": null,
  "origin": "117.172.254.245, 117.172.254.245",
  "url": "https://httpbin.org/post"
}
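To see the data-vs-json interplay described above, send the same payload with json= only; the response's json field is then populated instead of null. A minimal sketch, assuming httpbin.org is reachable:

import requests

res = requests.post('http://httpbin.org/post', json={'sex': 'man'})   # no data=, so the body is serialized as JSON
print(res.json()['json'])                    # {'sex': 'man'}
print(res.request.headers['Content-Type'])   # application/json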
3. The requests.request method
import requests

requests.request(method, url, **kwargs)   # method is the HTTP verb, url the request address; requests.get and requests.post are essentially wrappers that call requests.request internally
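Since requests.get just delegates to requests.request, the two calls below build the same request; a small sketch assuming httpbin.org is reachable:

import requests

r1 = requests.get('http://httpbin.org/get', params={'a': '1'})
r2 = requests.request(method='GET', url='http://httpbin.org/get', params={'a': '1'})
print(r1.url == r2.url)  # True -- requests.get is a thin wrapper around requests.request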
4. Response attributes
import requests

r = requests.get(...)
r.url               # the requested URL
r.text              # the response body as text
r.encoding = 'gbk'  # set the encoding, used to fix garbled text
r.content           # the response body as bytes
r.json()            # equivalent to json.loads(r.text); raises an error if the body is not JSON
r.status_code       # the response status code
r.headers           # the response headers
r.cookies           # the cookies
r.history           # when redirects occurred: [response object 1, response object 2, ...]
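Two of the attributes above deserve care: r.encoding must be set before reading r.text, and r.json() raises on a non-JSON body. A minimal sketch, assuming httpbin.org is reachable:

import requests

r = requests.get('http://httpbin.org/get')
r.encoding = r.apparent_encoding  # guess the real charset from the body before touching r.text
try:
    data = r.json()               # json is a method -- call it with ()
except ValueError:                # raised when the body is not valid JSON
    data = None
print(r.status_code, data)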
5. Requests that save cookies automatically
session = requests.session()
r = session.get(...)   # cookies are stored on the session and sent automatically on the next request

# Extra: persisting cookies to a local file
import http.cookiejar as cookiejar
import requests

session = requests.session()
session.cookies = cookiejar.LWPCookieJar()
session.cookies.load(filename='cookie.txt')   # load cookies
res = session.get('http://www.baidu.com')
session.cookies.save(filename='cookie.txt')   # save cookies
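One caveat with the snippet above: session.cookies.load() raises FileNotFoundError on the first run, before cookie.txt exists. A hedged sketch that guards against that (the file name and URL are just the ones from the example):

import os
import http.cookiejar as cookiejar
import requests

session = requests.session()
session.cookies = cookiejar.LWPCookieJar()
if os.path.exists('cookie.txt'):
    session.cookies.load(filename='cookie.txt', ignore_discard=True)  # ignore_discard also keeps session cookies
res = session.get('http://www.baidu.com')
session.cookies.save(filename='cookie.txt', ignore_discard=True)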
import requests

session = requests.Session()
i1 = session.get(url="http://dig.chouti.com/help/service")
i2 = session.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "8615131255089",
        'password': "xxooxxoo",
        'oneMonth': ""
    }
)
i3 = session.post(
    url="http://dig.chouti.com/link/vote?linksId=8589523"
)
print(i3.text)
# 1. Methods
requests.get
requests.post
requests.put
requests.delete
...
requests.request(method='POST')

# 2. Parameters
2.1  url
2.2  headers
2.3  cookies
2.4  params
2.5  data -- the request body as a form
     requests.post(..., data={'user': 'alex', 'pwd': '123'})
     # raw request: POST /index HTTP/1.1\r\nhost:c1.com\r\n\r\nuser=alex&pwd=123

2.6  json -- the request body as JSON
     requests.post(..., json={'user': 'alex', 'pwd': '123'})
     # raw request: POST /index HTTP/1.1\r\nhost:c1.com\r\nContent-Type:application/json\r\n\r\n{"user":"alex","pwd":"123"}

2.7  proxies -- proxy servers
     # without authentication
     proxy_dict = {
         "http": "61.172.249.96:80",
         "https": "http://61.185.219.126:3128",
     }
     ret = requests.get("https://www.proxy360.cn/Proxy", proxies=proxy_dict)

     # with proxy authentication
     from requests.auth import HTTPProxyAuth
     proxy_dict = {
         'http': '77.75.105.165',
         'https': '77.75.106.165'
     }
     auth = HTTPProxyAuth('username', 'password')
     r = requests.get("http://www.google.com", data={'xxx': 'ffff'},
                      proxies=proxy_dict, auth=auth)
     print(r.text)

2.8  files -- file upload
     file_dict = {
         'f1': open('xxxx.log', 'rb')
     }
     requests.request(
         method='POST',
         url='http://127.0.0.1:8000/test/',
         files=file_dict
     )

2.9  auth -- HTTP authentication
     # Internally, the username and password are encoded and sent to the server in a request header:
     #   "user:password"  ->  base64("user:password")
     #   request header: Authorization: "Basic base64('user:password')"
     from requests.auth import HTTPBasicAuth, HTTPDigestAuth
     ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
     print(ret.text)

2.10 timeout
     # ret = requests.get('http://google.com/', timeout=1)        # one value: applies to both connect and read
     # ret = requests.get('http://google.com/', timeout=(5, 1))   # tuple: (connect timeout, read timeout)

2.11 allow_redirects
     ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
     print(ret.text)

2.12 stream -- downloading large responses
     from contextlib import closing
     with closing(requests.get('http://httpbin.org/get', stream=True)) as r1:
         # process the response here, chunk by chunk
         for i in r1.iter_content():
             print(i)

2.13 cert -- client-side certificates
     # Baidu, Tencent => no certificate needed (the system handles it)
     # custom certificate:
     requests.get('http://127.0.0.1:8000/test/', cert="xxxx/xxx/xxx.pem")
     requests.get('http://127.0.0.1:8000/test/', cert=("xxxx/xxx/xxx.pem", "xxx.xxx.xx.key"))

2.14 verify=False -- skip server certificate verification
     requests.get('https://127.0.0.1:8000/test/', verify=False)
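Building on 2.12, a sketch of streaming a large body straight to disk instead of printing it; the httpbin URL, file name, and chunk size are illustrative choices, not requirements:

import requests

with requests.get('http://httpbin.org/bytes/102400', stream=True) as r:
    r.raise_for_status()
    with open('download.bin', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):  # read 8 KB at a time instead of loading the whole body
            f.write(chunk)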
II. Using the urllib module, and request/response parameters
https://www.cnblogs.com/meipu/p/11181754.html
https://www.cnblogs.com/zhangxinqi/p/9170312.html
urllib mainly comprises the following modules:

urllib.request       the request module
urllib.error         the exception-handling module
urllib.parse         the URL-parsing module
urllib.robotparser   the robots.txt-parsing module
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.read().decode('utf-8'))
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))

print("#" * 80)

from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
form = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(form), encoding='utf8')   # urlopen needs the body as bytes
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
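The module list above also names urllib.error, which the snippet does not exercise. A minimal sketch of the usual handling order (HTTPError first, since it subclasses URLError), using an httpbin status endpoint as a stand-in:

from urllib import request, error

try:
    response = request.urlopen('http://httpbin.org/status/404', timeout=3)
except error.HTTPError as e:   # the server answered with an error status
    print(e.code, e.reason)
except error.URLError as e:    # network-level failure: DNS, refused connection, timeout...
    print(e.reason)
else:
    print(response.status)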