# 爬虫
# 编码问题:
import requests
# Fetch the Baidu home page. A timeout keeps the script from hanging
# forever if the server never answers.
res = requests.get('https://www.baidu.com', timeout=10)
# Set the encoding explicitly so that res.text is decoded as UTF-8.
res.encoding = 'utf-8'
print(res.text)  # response body
# Write the file as UTF-8 as well; relying on the platform default encoding
# can raise UnicodeEncodeError for characters it cannot represent.
with open("a.html", 'w', encoding='utf-8') as f:
    f.write(res.text)
# GET 请求携带参数
# GET request carrying query parameters and custom request headers.
res = requests.get(
    'https://www.baidu.com/s',
    # `params` is passed to requests as the URL query parameters.
    params={'wd': "泰迪"},
    # Request header information.
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    },
    # Avoid blocking indefinitely on an unresponsive server.
    timeout=10,
)
res.encoding = 'utf-8'  # specify the encoding used to decode res.text
print(res.text)  # response body
# Write the file as UTF-8 to match the decoded text instead of relying on
# the platform default encoding.
with open("a.html", 'w', encoding='utf-8') as f:
    f.write(res.text)
# POST 请求
# params 是拼接在 GET 请求 URL 后面的查询参数
# data 是 POST 请求 body(请求体)中的内容
# 模拟登录
# Step 1: request headers sent with the login requests below.
# The User-Agent is a desktop Chrome browser string; the Referer is the
# site's login page URL with a `ref` parameter pointing at a goods page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Referer': 'http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2Fgoods-4210.html',
}
# Step 2: submit the login form with a POST request.
# `data` is sent as the request body; `headers` is the dict defined above.
# NOTE(review): 'username' and 'password' look like redacted placeholders;
# fill in real credentials before running.
res = requests.post(
    'http://www.aa7a.cn/user.php',
    headers=headers,
    data={
        'username': '@qq.com',
        'password': '',
        'captcha': 'sdfdswa',
        'remember': 1,
        'ref': 'http://www.aa7a.cn/goods-4210.html',
        'act': 'act_login',
    },
    # Avoid blocking indefinitely on an unresponsive server.
    timeout=10,
)
# Step 3: if the login succeeded, the cookies are available on the response
# object; extract them as a plain dict.
cookie = res.cookies.get_dict()
# Step 4: send a GET request to the home page with those cookies to verify
# whether the login succeeded.
res = requests.get('http://www.aa7a.cn/', headers=headers, cookies=cookie, timeout=10)
# NOTE(review): presumably the home page shows the account name only when
# logged in; confirm this string matches the username submitted above.
if 'a1632079340@qq.com' in res.text:
    print('登陆成功')
else:
    print('登陆失败')
# 爬取视频
import re
# Fetch one page of the category listing and extract the detail-page links.
res = requests.get(
    'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0',
    timeout=10,
)
reg_text = '<a href="(.*?)" class="vervideo-lilink actplay">'
obj = re.findall(reg_text, res.text)
print(obj)
for url in obj:
    # The captured hrefs are joined onto the site root to form the page URL.
    url = 'https://www.pearvideo.com/' + url
    res1 = requests.get(url, timeout=10)
    # Look for the video file address embedded in the detail page.
    obj1 = re.findall('srcUrl="(.*?)"', res1.text)
    print(obj1)
    if not obj1:
        # No video address found on this page; skip it instead of crashing
        # with an IndexError on obj1[0].
        continue
    print(obj1[0])
    # Use the part after the last '-' in the video URL as the local file name.
    name = obj1[0].rsplit('-', 1)[1]
    print(name)
    res2 = requests.get(obj1[0], timeout=10)
    print(res2)
    with open(name, 'wb') as f:
        # Write the binary body in chunks; the default chunk size of
        # iter_content() is a single byte, which is needlessly slow.
        for line in res2.iter_content(chunk_size=8192):
            f.write(line)