day4

Review of yesterday

The full scraping workflow (a minimal end-to-end sketch follows this list):
1. Send a request (request libraries)
——requests module
——selenium module
2. Get the response data (returned by the server)
3. Parse and extract the data (parsing libraries)
——bs4
——XPath
4. Save the data (storage libraries)
——MongoDB
Steps 1, 3, and 4 are the parts we write by hand.
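
As a refresher, here is a minimal end-to-end sketch of those four steps with requests for fetching, bs4 for parsing, and MongoDB (via pymongo) for storage. The URL, the extracted field, and the database/collection names are placeholders of my own, not from the lesson.

import requests
from bs4 import BeautifulSoup
import pymongo

# 1. Send a request (requests here; selenium is the browser-driven alternative)
response = requests.get('https://example.com/')  # placeholder URL

# 2. The response data is whatever the server returned
html = response.text

# 3. Parse and extract data (bs4 here; XPath via lxml would also work)
soup = BeautifulSoup(html, 'html.parser')
titles = [a.get_text(strip=True) for a in soup.find_all('a')]

# 4. Save the data (MongoDB via pymongo; db/collection names are made up)
client = pymongo.MongoClient('localhost', 27017)
if titles:
    client['spider_demo']['links'].insert_many([{'title': t} for t in titles])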

Scraping Pear Video:
1. Analyze the site to find the video source URL
2. Send a request to the video source URL with requests

Today's content:
Detailed use of the requests module
The selenium module


Scraping Pear Video

import requests
import re  # regular expression module


# re.findall(pattern, text, flags): the third argument takes flags, not a string
# re.S: let '.' also match newlines, so a pattern can match across the whole text


response = requests.get(url='https://www.pearvideo.com/')
print(response.status_code)
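
A quick illustration of what re.S changes (the sample text is made up): without the flag, '.' stops at a line break; with it, one pattern can match across lines.

import re

text = '<a href="video_\n123">clip</a>'  # made-up sample with a newline inside the id

print(re.findall('href="(.*?)"', text))        # [] - without re.S, '.' will not cross '\n'
print(re.findall('href="(.*?)"', text, re.S))  # ['video_\n123'] - re.S lets '.' match '\n'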


import requests

response = requests.get(url='https://www.bilibili.com/')
print(response.status_code)

# Imports
import requests
import re
import uuid

# The three steps of scraping
# 1. Send the request
def get_page(url):
    response = requests.get(url)
    return response

# 2. Parse the data
# Parse the home page to get the IDs of the video detail pages
def parse_index(text):
    res = re.findall('<a href="video_(.*?)"', text, re.S)
    detail_url_list = []
    for m_id in res:
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        detail_url_list.append(detail_url)
    return detail_url_list

# Parse a detail page to get the video source URL
def parse_detail(text):
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url

# 3. Save the data
def save_movie(movie_url):
    response = requests.get(movie_url)
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()

if __name__ == '__main__':
    index_res = get_page(url='https://www.pearvideo.com/')
    detail_url_list = parse_index(index_res.text)
    for detail_url in detail_url_list:
        detail_res = get_page(url=detail_url)
        movie_url = parse_detail(detail_res.text)
        print(movie_url)
        save_movie(movie_url)
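
save_movie above loads the whole video into memory before writing it out. For large files, requests' documented stream=True / iter_content API downloads in chunks instead; a sketch (the 8192-byte chunk size is an arbitrary choice):

import uuid
import requests

def save_movie_streamed(movie_url):
    # stream=True defers downloading the body until we iterate over it
    response = requests.get(movie_url, stream=True)
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)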
# Visit Zhihu carrying request-header parameters:
import requests

# Request-header dictionary
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
}
response = requests.get(url='https://www.zhihu.com/topics', headers=headers)
print(response.status_code)
with open('zhihu.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
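
Without that user-agent header, Zhihu sees a script-like client and refuses the request, which is the point of this snippet. If several requests need the same headers, a requests.Session can carry them automatically; a sketch using the same URL:

import requests

session = requests.Session()
# Headers set on the session are sent with every request it makes
session.headers.update({
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
})

print(session.get('https://www.zhihu.com/topics').status_code)   # with the UA header
print(requests.get('https://www.zhihu.com/topics').status_code)  # bare request, likely rejected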




Homework
import requests
import re


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
base_url = 'https://movie.douban.com/top250?start={}&filter='

n = 0
for line in range(10):
    url = base_url.format(n)
    n += 25
    print(url)

    # 1. Send a request to the Douban TOP250 page and get the response data
    response = requests.get(url, headers=headers)

    # print(response.text)

    # 2. Extract the data with a regular expression:
    # detail-page url, poster url, title, director, release info, rating, vote count, one-line quote
    movie_content_list = re.findall(
        # regex rule (a simpler earlier version is kept below for reference;
        # 导演 and 人评价 are literal Chinese text on the Douban page)
        # '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
        '<div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>',

        # text to parse
        response.text,

        # match mode: let '.' cross newlines
        re.S)

    for movie_content in movie_content_list:
        # unpack one movie per tuple
        detail_url, movie_jpg, name, daoyan, timer, point, num, desc = movie_content
        data = f'Title: {name}, detail page: {detail_url}, poster: {movie_jpg}, director: {daoyan}, release info: {timer}, rating: {point}, votes: {num}, quote: {desc}\n'
        print(data)

        # 3. Save the data: append each movie's info to a file
        with open('douban.txt', 'a', encoding='utf-8') as f:
            f.write(data)
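
If structured output is preferred over the flat douban.txt, the same tuples can go into a CSV with the standard library. A sketch with made-up sample rows shaped like what the re.findall above returns (the file name and column order are my choice):

import csv

# made-up rows in the shape (detail_url, poster, title, director, release info, rating, votes, quote)
movie_content_list = [
    ('https://movie.douban.com/subject/1/', 'https://img.example/1.jpg',
     'Example Movie', ' Someone ', ' 1994 ', '9.7', '1000000', 'A made-up quote'),
]

with open('douban.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for detail_url, movie_jpg, name, daoyan, timer, point, num, desc in movie_content_list:
        writer.writerow([name, detail_url, movie_jpg, daoyan.strip(), timer.strip(), point, num, desc])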


