# 8.梨视频数据的爬取1.py — Pear Video crawler, part 1

import re
import requests
from lxml import etree

# Browser-like User-Agent header sent with every request made by this script.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# Listing page whose video entries are crawled.
url = "https://www.pearvideo.com/category_1"

# Fetch the listing page source. A timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() stops us from
# parsing an HTTP error page as if it were the listing.
listing_response = requests.get(url, headers=headers, timeout=10)
listing_response.raise_for_status()
page_text = listing_response.text

# Parse the HTML into an element tree.
tree = etree.HTML(page_text)

# Locate the <li> elements that hold the individual video entries.
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')

# For every video entry on the listing page: read its detail-page link and
# title, pull the real media URL out of the detail page, and save the video
# to "<title>.mp4" in the current directory.
#
# The detail page carries the player configuration in an inline script, e.g.:
#   var contId="1653941",liveStatusUrl="liveStatus.jsp",...,ldUrl="",
#   srcUrl="https://video.pearvideo.com/mp4/adshort/20200220/cont-1653941-14928648_adpkg-ad_hd.mp4",
#   vdoUrl=srcUrl,skinRes="//www.pearvideo.com/domain/skin",...
# so the media URL is the text between `srcUrl="` and `",vdoUrl`.
# re.S makes `.` match newlines too, i.e. the page is searched as one string.
# Compiled once here instead of on every loop iteration.
SRC_URL_PATTERN = re.compile(r'srcUrl="(.*?)",vdoUrl', re.S)

# Characters that are not allowed in file names (Windows set, plus "/" which
# is invalid everywhere) and line breaks; they are replaced with "_".
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n]')

# Seconds to wait on a stalled connection before giving up.
REQUEST_TIMEOUT = 10

for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    titles = li.xpath('./div/a/div[2]/text()')
    if not hrefs or not titles:
        # This <li> has no link or no title text; there is nothing to fetch.
        continue

    detail_url = "https://www.pearvideo.com/" + hrefs[0]
    # The title becomes the file name, so strip characters open() rejects.
    title = UNSAFE_FILENAME_CHARS.sub("_", titles[0]) + '.mp4'

    # Detail page source.
    detail_response = requests.get(
        detail_url, headers=headers, timeout=REQUEST_TIMEOUT
    )
    detail_response.raise_for_status()
    detail_page_text = detail_response.text

    # Extract the dynamically loaded media URL from the inline script.
    match = SRC_URL_PATTERN.search(detail_page_text)
    if match is None:
        print("No srcUrl found on " + detail_url + "; skipping")
        continue
    video_url = match.group(1)

    # Download the video; fail loudly rather than saving an error body as .mp4.
    video_response = requests.get(
        video_url, headers=headers, timeout=REQUEST_TIMEOUT
    )
    video_response.raise_for_status()
    video_data = video_response.content

    with open(title, "wb") as fp:
        fp.write(video_data)
# posted @ 2020-02-22 16:26  干it的小张  阅读(355)  评论(0)  编辑  收藏  举报