Crawling all videos on the Pearvideo homepage with requests

Steps for crawling Pearvideo:

  • 1. Crawl the Pearvideo homepage and collect the links to all of its detail pages
    - url: https://www.pearvideo.com/

    - 1) Send a request to url and get the homepage's HTML text

    - 2) Parse out a detail page's relative path, e.g. video_1637593, and join it with the homepage URL (see the sketch after this list)
        - The result of the join is the video detail-page url:
        - detail_url: https://www.pearvideo.com/ + video_1637593

  • 2. Send a request to the video detail page, then parse out the real video url --> the url ending in .mp4
    - 1) Send a request to detail_url and get its HTML text
    - 2) Parse the real video url out of the detail page
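
A minimal sketch of the join in sub-step 2) of step 1, using the example id from the list above (the variable names here are illustrative and do not appear in the examples below):

base_url = 'https://www.pearvideo.com'
video_id = '1637593'                          # what the regex video_(.*?) captures
detail_url = base_url + '/video_' + video_id
print(detail_url)                             # https://www.pearvideo.com/video_1637593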

Example 1:

Single-threaded synchronous crawling

import requests
import re
import uuid


# 1. Send the request
def get_html(url):
    res = requests.get(url)
    return res

# 2. Parse the data
# Parse the homepage and extract the ids of all video detail pages
def parse_html(res):
    # Collect the id of every video linked from the homepage
    video_id_data = re.findall('<a href="video_(.*?)"', res.text, re.S)
    """
    Example of the anchor tags being matched:
    <a href="video_(.*?)" class="actwapslide-link">
    """
    return video_id_data

# 3. Request the video detail page and parse out the real video link
def parse_detail(video_detail_url):

    detail_html = requests.get(video_detail_url)
    video_url = re.findall('srcUrl="(.*?)"', detail_html.text, re.S)[0]
    """
    Example of the srcUrl field in the detail page:
    srcUrl="https://video.pearvideo.com/mp4/adshort/20191231/cont-1637727-14751751_adpkg-ad_hd.mp4"
    """
    return video_url

# 4. Save the data
def save_video(video_url):
    print('Start saving video')
    res_video = requests.get(video_url)

    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        # Stream the body in chunks (iter_content defaults to reading 1 byte at a time)
        for chunk in res_video.iter_content(chunk_size=1024):
            f.write(chunk)
    print('Finished saving video')


if __name__ == '__main__':
    url = 'https://www.pearvideo.com'
    res = get_html(url)
    # 1. Parse the Pearvideo homepage and extract the id of every video detail page
    video_id_data = parse_html(res)
    for video_id in video_id_data:
        video_detail_url = url + '/video_' + video_id
        # 2. Request the video detail page and parse out the real video url
        video_url = parse_detail(video_detail_url)
        # 3. Save the video
        save_video(video_url)
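
In this version every step runs sequentially: each detail page is requested and each .mp4 file downloaded one after another, so the total runtime grows with the number of videos. Example 2 below overlaps those requests with a thread pool.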

Example 2:

Multi-threaded asynchronous crawling

import requests
import re
import uuid
from concurrent.futures import ThreadPoolExecutor

# Create a thread pool with at most 50 worker threads
pool = ThreadPoolExecutor(50)


# 1. Send the request
def get_html(url):
    res = requests.get(url)
    return res

# 2. Parse the data
# Parse the homepage and extract the ids of all video detail pages
def parse_html(res):
    # Collect the id of every video linked from the homepage
    video_id_data = re.findall('<a href="video_(.*?)"', res.text, re.S)
    """
    Example of the anchor tags being matched:
    <a href="video_(.*?)" class="actwapslide-link">
    """
    return video_id_data

# 3. Take the completed get_html task and parse the video link out of the detail page
def parse_detail(res):  # res is a Future object, not a response

    # The callback receives the Future; .result() returns the response that get_html returned
    detail_html = res.result()
    video_url = re.findall('srcUrl="(.*?)"', detail_html.text, re.S)[0]
    """
    Example of the srcUrl field in the detail page:
    srcUrl="https://video.pearvideo.com/mp4/adshort/20191231/cont-1637727-14751751_adpkg-ad_hd.mp4"
    """
    # Asynchronously submit a task that downloads the real video data and saves it
    pool.submit(save_video, video_url)


# 4. Save the data
def save_video(video_url):
    print('Start saving video')
    res_video = requests.get(video_url)

    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        # Stream the body in chunks (iter_content defaults to reading 1 byte at a time)
        for chunk in res_video.iter_content(chunk_size=1024):
            f.write(chunk)
    print('Finished saving video')


if __name__ == '__main__':
    import time
    url = 'https://www.pearvideo.com'
    res = get_html(url)
    # 1. Parse the Pearvideo homepage and extract the id of every video detail page
    video_id_data = parse_html(res)
    for video_id in video_id_data:
        video_detail_url = url + '/video_' + video_id
        time.sleep(0.1)  # short pause between submissions
        # Submit the tasks concurrently; add_done_callback hands the finished get_html future to parse_detail
        pool.submit(get_html, video_detail_url).add_done_callback(parse_detail)
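
Because the callback wiring above can be confusing, here is a minimal, self-contained sketch (separate from the crawler; work and on_done are made-up names) of what add_done_callback hands to the callback:

from concurrent.futures import ThreadPoolExecutor

def work(x):
    return x * 2

def on_done(future):            # the callback receives a Future, not the return value
    print(future.result())      # .result() unwraps it --> prints 6

with ThreadPoolExecutor(2) as demo_pool:
    demo_pool.submit(work, 3).add_done_callback(on_done)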