1.1 自动携带cookie 的session对象
-res.cookies
-转成字典 res.cookies.get_dict()
import requests

# requests.session() returns a Session object that stores every cookie a
# response sets and sends it back on all later requests — ideal for a
# login-then-browse flow like this one.
session = requests.session()

header = {
    'Referer': 'http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2F',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}
data = {
    'username': '616564099@qq.com',
    'password': 'lqz123',
    'captcha': 'xxxx',
    'remember': 1,
    'ref': ' http://www.aa7a.cn/',
    'act': 'act_login',
}

# Log in: the server's Set-Cookie lands in the session's cookie jar.
res = session.post('http://www.aa7a.cn/user.php', data=data, headers=header)
# The follow-up GET carries the login cookie automatically.
res1 = session.get('http://www.aa7a.cn/')
1.2 响应Response
# Anatomy of a Response object.
respone = requests.get('http://www.aa7a.cn/' )
print (type (respone))  # <class 'requests.models.Response'>
from requests.models import Response  # the class shown by type() above
print (respone.text)          # body decoded to str
print (respone.content)       # raw body bytes
print (respone.status_code)   # HTTP status code, e.g. 200
print (respone.headers)       # response headers
print (respone.cookies)       # cookies the server set (a cookie jar object)
print (respone.cookies.get_dict())  # cookie jar as a plain dict
print (respone.cookies.items())     # cookie jar as (name, value) pairs
print (respone.url)           # final URL of this response
print (respone.history)       # list of redirect responses that led here
print (respone.encoding)      # charset used to decode .text
关闭:response.close()
respone.iter_content()  # stream the body in chunks instead of holding it all in memory
1.3下载图片/视频到本地
# Download binary media (an image, then a video), streaming each to disk.
# Bug fix: previously the image response was overwritten by the video
# request before it was ever written to a file — a dead download.
res=requests.get('http://pic.imeitou.com/uploads/allimg/220520/5-220520095649.jpg' )
with open ('5-220520095649.jpg' ,'wb' ) as f:
    for line in res.iter_content(chunk_size=1024 ):
        f.write(line)

res=requests.get('https://vd2.bdstatic.com/mda-pfbcdfzec56w6bkn/1080p/cae_h264/1686576386781438049/mda-pfbcdfzec56w6bkn.mp4' )
with open ('母猪的产后护理.mp4' ,'wb' ) as f:
    # iter_content yields 1 KiB chunks, so large files never sit fully in memory
    for line in res.iter_content(chunk_size=1024 ):
        f.write(line)
1.4 编码问题
直接打印res.text 字符串形式-----》从网络过来是二进制----》转成字符串涉及到编码---》默认以utf-8 ,---》现在会自动识别页面的编码,自动转成对应的
# Force the charset when requests guesses wrong (e.g. a GBK-encoded page).
res.encoding='gbk'
print (res.text)  # .text is re-decoded using the encoding set above
1.5 解析json
import requests
import json

# Two equivalent ways to decode a JSON response body.
response = requests.get('http://httpbin.org/get')
res1 = json.loads(response.text)  # manual: decode the text yourself
res2 = response.json()            # shortcut: requests decodes it for you
print(res1 == res2)               # both yield the same dict
# POST with a raw urlencoded string body: requests cannot infer the
# Content-Type from a plain str, so the header must be set by hand.
res=requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword' ,data='cname=&pid=&keyword=%E5%91%A8%E6%B5%A6&pageIndex=1&pageSize=10' ,headers={
    'Content-Type' :'application/x-www-form-urlencoded; charset=UTF-8'
})
# Same endpoint with a dict body: requests urlencodes the dict and sets the
# Content-Type header itself (this response replaces the one above).
res = requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword' , data={
    'cname' : '' ,
    'pid' : '' ,
    'keyword' : '周浦' ,
    'pageIndex' : 1 ,
    'pageSize' : 10 ,
})
# 'Table1' is the key holding the store list in the JSON response.
for item in res.json()['Table1' ]:
    print ('餐厅名字:%s,餐厅地址:%s' %(item['storeName' ],item['addressDetail' ]))
1.6 ssl认证(了解)
# (1) Default: certificate verification is on; a bad cert raises an SSL error.
import requests
respone=requests.get('https://www.12306.cn' )
# (2) verify=False skips certificate checks (urllib3 prints a warning).
import requests
respone=requests.get('https://www.12306.cn' ,verify=False )
print (respone.status_code)
# (3) Same, but silence the InsecureRequestWarning first.
import requests
from requests.packages import urllib3
urllib3.disable_warnings()
respone=requests.get('https://www.12306.cn' ,verify=False )
print (respone.status_code)
# (4) Present a client certificate (mutual TLS): cert=(cert_file, key_file).
import requests
respone=requests.get('https://www.12306.cn' ,
                     cert=('/path/server.crt' ,
                           '/path/key' ))
print (respone.status_code)
1.7 使用代理(重要)
import requests

# Route requests through a proxy. Dict keys are URL schemes.
# Bug fix: the original dict literal listed 'http' twice — in Python the
# later duplicate key silently wins, so the authenticated proxy entry was
# dead code. Keep one 'http' entry; the alternative form stays as a comment.
proxies = {
    'http' : 'http://egon:123@localhost:9743' ,   # with credentials user:pass@host:port
    # 'http' : 'http://localhost:9743' ,          # anonymous form of the same proxy
    'https' : 'https://localhost:9743' ,
}
respone=requests.get('https://www.12306.cn' ,
                     proxies=proxies)
print (respone.status_code)
# SOCKS5 proxy support requires the extra dependency: pip install requests[socks]
import requests
proxies = {
    'http' : 'socks5://user:pass@host:port' ,   # placeholder credentials/host
    'https' : 'socks5://user:pass@host:port'
}
respone=requests.get('https://www.12306.cn' ,
                     proxies=proxies)
print (respone.status_code)
1.8 超时设置
# timeout: raise if the server does not answer within the given seconds.
import requests
respone=requests.get('https://www.baidu.com' ,
                     timeout=0.0001 )  # deliberately tiny so the timeout fires
1.9 异常处理
import requests
# Explicit imports instead of the original `from requests.exceptions import *`:
# wildcard imports hide where names come from and pollute the namespace.
from requests.exceptions import ReadTimeout, RequestException

try :
    # Tiny timeout on purpose, to drive execution into the except branches.
    r=requests.get('http://www.baidu.com' ,timeout=0.00001 )
except ReadTimeout:
    # Server accepted the connection but the body arrived too slowly.
    print ('===:' )
except RequestException:
    # Base class of every requests error (connection errors, connect-phase
    # timeouts, ...), so anything not caught above lands here.
    print ('Error' )
1.10 上传文件
import requests

# Upload a file as multipart/form-data.
# Bug fixes: the URL must include a scheme — bare '127.0.0.1:8000' makes
# requests raise MissingSchema — and the file handle should be closed
# deterministically, so open it in a with-block.
with open ('a.jpg' ,'rb' ) as fp:
    files={'file' :fp}
    respone=requests.post('http://127.0.0.1:8000' ,files=files)
print (respone.status_code)
http 和 https区别
https://zhuanlan.zhihu.com/p/561907474
'''
https 是 http+ssl/tls 他们的端口一个是443一个是80
https 保证了传输过程中数据的安全,可以防止中间人的攻击
'''
2 代理池搭建
# Quick check that a request can be routed through a pool proxy.
import requests
proxies = {
    # NOTE(review): this proxy value lacks a scheme — presumably treated as
    # plain http by requests; confirm against the requests proxies docs.
    'http' : '104.193.88.77:80' ,
}
respone=requests.get('http://127.0.0.1:8000/' ,proxies=proxies)
print (respone)
-1 使用爬虫技术,爬取网上免费的代理
-2 爬完回来做验证,如果能用,存到redis中
python proxyPool.py schedule
-3 使用flask启动服务,对外开放了几个接口,向某个接口发请求,就能随机获取一个代理
python proxyPool.py server
1 从git拉取开源代码
git clone https://github.com/jhao104/proxy_pool.git
2 使用pycharm打开,创建虚拟环境
mkvirtualenv -p python3 pool
3 配置项目使用虚拟环境
4 修改项目配置文件
DB_CONN = 'redis://127.0.0.1:6379/2'
HTTP_URL = "http://www.baidu.com"
HTTPS_URL = "https://www.baidu.com"
5 启动调度程序---》爬取网站,验证,存到redis
python proxyPool.py schedule
6 启动web程序(flask写的)
python proxyPool.py server
7 向 http://192.168.1.252:5010/get/?type=http 地址发送请求就可以随机获取代理ip
2.1 django后端获取客户端的ip
# Fetch one random proxy from the local proxy_pool service, then use it.
import requests
res = requests.get('http://192.168.1.252:5010/get/?type=http' ).json()['proxy' ]
proxies = {
    'http' : res,
}
print (proxies)
# NOTE(review): per the section title, the target presumably echoes the
# caller's IP (a Django view), which would show whether the proxy was
# actually applied — confirm against that server's code.
respone = requests.get('http://139.155.203.196:8080/' , proxies=proxies)
print (respone.text)
3 爬取某视频网站
https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0
import requests
import re
import os

# Scrape a Pear Video category page, recover each clip's real mp4 URL,
# and download it. videoStatus.jsp requires a Referer header, and its
# srcUrl embeds a fake token that must be swapped for 'cont-<video id>'.
res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0' )
video_list = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">' , res.text)
print (video_list)
# Bug fix: open('./video/...') fails unless the directory already exists.
os.makedirs('./video' , exist_ok=True )
for video in video_list:
    url = 'https://www.pearvideo.com/' + video
    header = {
        # Anti-leech check: without a Referer the status API rejects the call.
        'Referer' : url
    }
    video_id = video.split('_' )[-1 ]
    video_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.8273125965736401' % video_id
    res1 = requests.get(video_url, headers=header).json()
    real_mp4_url = res1['videoInfo' ]['videos' ]['srcUrl' ]
    # srcUrl's last path segment starts with a bogus token; the real file
    # uses 'cont-<video id>' in that position instead.
    real_mp4_url = real_mp4_url.replace(real_mp4_url.split('/' )[-1 ].split('-' )[0 ], 'cont-%s' % video_id)
    print (real_mp4_url)
    res2 = requests.get(real_mp4_url)
    with open ('./video/%s.mp4' % video, 'wb' ) as f:
        for line in res2.iter_content():
            f.write(line)
https://video.pearvideo.com/mp4/adshort/20181106/1688703103822-13189302_adpkg-ad_hd.mp4
https://video.pearvideo.com/mp4/adshort/20181106/cont-1470647-13189302_adpkg-ad_hd.mp4
url = 'https://video.pearvideo.com/mp4/adshort/20181106/1688703103822-13189302_adpkg-ad_hd.mp4'  # sample raw srcUrl before the 'cont-<id>' substitution