8.梨视频数据的爬取1.py

import re
import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

url = "https://www.pearvideo.com/category_1"

# Site root that the relative detail-page links are appended to.
BASE_URL = "https://www.pearvideo.com/"

# Seconds to wait on connect/read before a request is abandoned.
REQUEST_TIMEOUT = 15

# Bytes written to disk per chunk while streaming a video.
CHUNK_SIZE = 64 * 1024

# The detail page carries the video address inside an inline script, e.g.:
#   ...,sdUrl="",ldUrl="",srcUrl="https://video.pearvideo.com/mp4/....mp4",vdoUrl=srcUrl,...
# so capture (non-greedily) whatever sits between `srcUrl="` and `",vdoUrl`.
# re.S lets `.` cross line breaks in case the script is wrapped.
SRC_URL_PATTERN = re.compile(r'srcUrl="(.*?)",vdoUrl', re.S)

# Characters that are illegal in Windows file names, plus control whitespace.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def extract_video_url(detail_page_text):
    """Return the first ``srcUrl`` value found in a detail page, or None.

    Args:
        detail_page_text: HTML source of a video detail page.

    Returns:
        The captured video address, or ``None`` when the page does not
        contain the expected ``srcUrl="...",vdoUrl`` fragment.
    """
    match = SRC_URL_PATTERN.search(detail_page_text)
    return match.group(1) if match else None


def safe_filename(title):
    """Turn a video title into a usable ``.mp4`` file name.

    Surrounding whitespace is stripped and every character matched by
    ``UNSAFE_FILENAME_CHARS`` is replaced with ``_``. An empty title falls
    back to ``video`` so the result is never just ``.mp4``.
    """
    cleaned = UNSAFE_FILENAME_CHARS.sub("_", title.strip())
    return (cleaned or "video") + ".mp4"


def build_detail_url(href):
    """Append a list-page ``href`` to the site root without doubling ``/``."""
    return BASE_URL + href.lstrip("/")


def fetch_text(page_url):
    """GET ``page_url`` and return the body as text.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def download_video(video_url, filename):
    """Stream ``video_url`` into ``filename`` in ``CHUNK_SIZE`` pieces.

    Streaming avoids holding the whole video in memory at once.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            4xx/5xx response.
    """
    with requests.get(
        video_url, headers=headers, stream=True, timeout=REQUEST_TIMEOUT
    ) as response:
        response.raise_for_status()
        with open(filename, "wb") as fp:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                fp.write(chunk)


def main():
    """Download every video linked from the category list page at ``url``."""
    # Fetch and parse the list page.
    tree = etree.HTML(fetch_text(url))

    # Each <li> under the list container describes one video.
    li_list = tree.xpath('//*[@id="listvideoListUl"]/li')

    for li in li_list:
        hrefs = li.xpath('./div/a/@href')
        titles = li.xpath('./div/a/div[2]/text()')
        if not hrefs or not titles:
            # Entry without the expected link/title markup: skip it rather
            # than abort the whole run with an IndexError.
            continue

        detail_url = build_detail_url(hrefs[0])
        try:
            video_url = extract_video_url(fetch_text(detail_url))
            if video_url is None:
                print("No srcUrl found, skipping:", detail_url)
                continue
            download_video(video_url, safe_filename(titles[0]))
        except requests.RequestException as exc:
            # One failed video should not stop the remaining downloads.
            print("Request failed, skipping:", detail_url, "-", exc)


if __name__ == "__main__":
    main()

posted @ 2020-02-22 16:26 干it的小张阅读(355) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

干it的小张

8.梨视频数据的爬取1.py

公告