昨日回顾
爬虫的全过程:
1发送请求(请求库)
——requests模块
——selenium模块
2获取响应数据(服务器返回)
3解析并提取数据(解析库)
——bs4
——Xpath
4保存数据(存储库)
——MongoDB
1,3,4需要手动写
爬取梨视频:
1分析网站的视频源地址
2通过requests往视频源地址发送请求
今日内容:
requests模块详细使用
selenium模块
爬取梨视频
import requests
import re  # regular-expression module

# re.findall(pattern, text, flags)
# re.S (DOTALL): lets '.' match newlines too, so one pattern can span lines
home_page = 'https://www.pearvideo.com/'
response = requests.get(url=home_page)
print(response.status_code)
import requests

# quick connectivity check: print the HTTP status of the bilibili home page
bili_response = requests.get(url='https://www.bilibili.com/')
print(bili_response.status_code)
import uuid
#导入
import requests
import re
import uuid
# Crawler in three steps
# Step 1: send the request
def get_page(url):
    """Fetch *url* with a plain GET and return the requests.Response object."""
    return requests.get(url)
# Step 2: parse data
# Parse the home page and collect the detail-page id of every video
def parse_index(text):
    """Extract video ids from *text* and return the list of detail-page URLs.

    NOTE(review): the pattern matches pearvideo-style markup while the URL
    below points at bilibili — presumably leftover from switching target
    sites; confirm which site is intended.
    """
    video_ids = re.findall('<a href="video_(.*?)"', text, re.S)
    return ['https://www.bilibili.com/video/av_' + vid for vid in video_ids]
# Parse a detail page and pull out the real video source URL
def parse_detail(text):
    """Return the first video source URL embedded in *text*.

    Bug fix: the original pattern 'srcUrl="(.*?)' was missing the closing
    double quote, so the lazy group always matched the empty string and the
    function returned '' for every page. The trailing '"' anchors the group
    to the full URL.

    Raises IndexError if *text* contains no srcUrl="..." occurrence.
    """
    return re.findall('srcUrl="(.*?)"', text, re.S)[0]
# Step 3: save data
def save_moive(movie_url):
    """Download the video at *movie_url* and write it to a random-named .mp4.

    The file name is a fresh UUID so repeated downloads never collide.
    (Function name keeps the original 'moive' spelling — callers use it.)
    """
    resp = requests.get(movie_url)
    filename = f'{uuid.uuid4()}.mp4'
    with open(filename, 'wb') as out:
        out.write(resp.content)
        out.flush()
if __name__ == '__main__':
    # Orchestrate the three steps: fetch the index page, parse out the
    # detail URLs, then fetch each detail page and save its video.
    index_response = get_page(url='https://www.bilibili.com/')
    for detail_url in parse_index(index_response.text):
        detail_response = get_page(url=detail_url)
        movie_url = parse_detail(detail_response.text)
        print(movie_url)
        save_moive(movie_url)
# Visit Zhihu carrying request headers (without a User-Agent the site
# rejects the request).
import requests

# request-headers dict
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
}
topics_url = 'https://www.zhihu.com/topics'
response = requests.get(url=topics_url, headers=headers)
print(response.status_code)

# dump the page so it can be inspected in a browser
with open('zhihu.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
# NOTE(review): abandoned draft of the Douban homework that follows — it was
# syntactically invalid (unterminated string literal on base_url, missing ':'
# on the for statement) and made the whole file unparseable, so it is
# commented out here; the working version appears below.
# base_url = 'http://
# n = 0
# for line in range(10)
#     print()
#     url
作业
# Homework: scrape the Douban Top-250 movie list (10 pages x 25 movies).
import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
# 'start' is the zero-based offset of the first movie on the page
base_url = 'https://movie.douban.com/top250?start={}&filter='

for page in range(10):
    # offset computed directly from the page index
    # (replaces the old running counter and its leftover debug print of type(n))
    url = base_url.format(page * 25)
    print(url)
    # 1. request one page of the Top-250 list
    response = requests.get(url, headers=headers)
    # 2. regex-extract per movie: detail url, poster url, title, director,
    #    release info, rating, vote count, tagline.
    # NOTE(review): movies without a <span class="inq"> tagline are silently
    # skipped by this pattern — confirm whether that is acceptable.
    movie_content_list = re.findall(
        '<div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>',
        response.text,
        re.S)
    for movie_content in movie_content_list:
        # unpack one movie's fields
        detail_url, movie_jpg, name, daoyan, timer, point, num, desc = movie_content
        data = f'电影名称:{name}, 详情页url:{detail_url}, 图片url:{movie_jpg}, 导演: {daoyan} 上映时间: {timer}评分: {point}, 评价人数: {num} 简介:{desc}\n'
        print(data)
        # 3. append the record to douban.txt
        with open('douban.txt', 'a', encoding='utf-8') as f:
            f.write(data)