Python 爬虫 梨视频批量下载

进入主页面
  • 获取视频详情页链接
    p1

    p2

点击链接,进入详情页
  • 获取视频地址
    • 若不点击播放,仅能从html中获取封面地址
      p3
    • 打开网络抓包工具,刷新页面,从ajax请求,发现response携带mp4链接
      p4
获取视频数据,保存到本地(用视频标题命名可能会出错,因此代码中改用视频 ID 命名)
获取更多分页
  • 在首页面利用鼠标下滑展开更多视频
  • 抓包发现下滑时会发出ajax请求,更改其中的start值即可获得不同分页,解析response获取新页面的视频链接
  • 创建headers,params,进行get请求
    p5
创建进程池(multiprocessing.Pool)加快下载
import requests
from lxml import etree
import time
import os
from multiprocessing import Pool

# Output directory for downloaded videos; main() creates it if missing.
# Keep the trailing slash: download_video() appends the file name directly.
filepath = 'D:/PYT/video/'
def get_page_text(url, timeout=10):
    """Fetch *url* with a browser-like User-Agent and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status (4xx/5xx).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly instead of handing an error page to the HTML parser.
    response.raise_for_status()
    return response.text

def get_video_page_url(page_text):
    """Extract the detail-page links from a category page's HTML.

    Every ``<li class="categoryem">`` element contributes the first
    ``./div/a/@href`` it contains. Each href is printed as found and is
    returned prefixed with the site root.

    Args:
        page_text: HTML source of the category page.

    Returns:
        A list of absolute detail-page URLs, in document order.

    Raises:
        IndexError: If a matching ``<li>`` has no ``div/a`` link.
    """
    site_root = 'https://www.pearvideo.com/'
    entries = etree.HTML(page_text).xpath('//li[@class="categoryem"]')
    links = []
    print('url list:\n')
    for entry in entries:
        href = entry.xpath('./div/a/@href')[0]
        links.append(site_root + href)
        print(href)

    return links

def get_video_url(url):
    """Resolve the downloadable mp4 address for a video detail page.

    The content id is the text after the last ``_`` in *url*. It is sent to
    the site's ``videoStatus.jsp`` endpoint, with the detail page as Referer,
    and the ``srcUrl`` field of the JSON answer is rewritten into the final
    address.

    Args:
        url: Detail-page URL whose last ``_``-separated part is the content id.

    Returns:
        The rewritten mp4 URL (also printed).

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON lacks ``videoInfo.videos.srcUrl``.
    """
    headers = {
        'Referer': url,

        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
    }
    cont_id = url.split('_')[-1]
    status_url = 'https://www.pearvideo.com/videoStatus.jsp'
    params = {
        'contId': cont_id,
        # NOTE(review): looks like a random cache-busting value; a fixed one
        # is sent here -- confirm the server does not validate it.
        'mrd': '0.38177687506833946'
    }
    # A timeout keeps one stalled request from blocking the caller forever.
    response = requests.get(url=status_url, headers=headers, params=params, timeout=10)
    response.raise_for_status()
    src_url = response.json()['videoInfo']['videos']['srcUrl']

    # The last path component before the first '-' in srcUrl is swapped for
    # 'cont-<content id>' to obtain the address that is actually downloaded.
    placeholder = src_url.split('-')[0].split('/')[-1]
    mp4_url = src_url.replace(placeholder, 'cont-' + cont_id)
    print(mp4_url)
    return mp4_url


def get_title(url):
    """Return the headline text of the video detail page at *url*.

    The page is fetched with a browser-like User-Agent and the first text
    node of the ``<h1>`` inside ``<div class="box-left clear-mar">`` is
    returned.

    Raises:
        IndexError: If the page has no such heading.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
    }
    response = requests.get(url=url, headers=request_headers)
    document = etree.HTML(response.text)
    headings = document.xpath('//div[@class="box-left clear-mar"]/h1/text()')
    return headings[0]

def load_more_page_urls(base_page_url, no):
    """Fetch one "load more" batch of the category and return its video links.

    Args:
        base_page_url: Category page URL; it is sent as the Referer header.
        no: Zero-based batch number (int or numeric string). The request's
            ``start`` offset is ``12 * no``.

    Returns:
        A list of absolute detail-page URLs found in the returned fragment.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
        IndexError: If a list item has no ``vervideo-bd`` link.
    """
    headers = {
        'Referer': base_page_url,

        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
    }
    load_url = 'https://www.pearvideo.com/category_loading.jsp?'
    params = {
        'reqType': '5',
        # NOTE(review): the category id is hard-coded and is not derived from
        # base_page_url -- keep the two in sync when crawling another category.
        'categoryId': '135',
        # presumably 12 videos are served per batch -- TODO confirm
        'start': str(12 * int(no)),
        'mrd': '0.5547564352582317'
    }
    # A timeout keeps one stalled request from blocking the crawl forever.
    response = requests.get(url=load_url, headers=headers, params=params, timeout=10)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    tree = etree.HTML(response.text)
    li_list = tree.xpath('//li[@class="categoryem"]')
    video_urls = []
    print('url list:')
    for li in li_list:
        url = li.xpath('./div[@class="vervideo-bd"]/a/@href')[0]
        video_urls.append('https://www.pearvideo.com/' + url)
        print(url)
    return video_urls


def get_video_data(url, timeout=30):
    """Download the resource at *url* and return its raw bytes.

    Args:
        url: Direct address of the video file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The complete response body as ``bytes``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Without this check an HTTP error body would be returned and end up
    # saved on disk as if it were a video.
    response.raise_for_status()
    return response.content

def creatPath(path):
    """Create directory *path*, including missing parents, if it is absent.

    ``exist_ok=True`` makes the call safe when the directory already exists
    and closes the gap between a separate existence check and the creation.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)
def download_video(url):
    """Download the video behind one detail page into ``filepath``.

    The file is named ``video_<content id>.mp4``, where the content id is the
    text after the last ``_`` in *url*.

    Args:
        url: Detail-page URL of the video.
    """
    # Short pause before every download; presumably to go easy on the server.
    time.sleep(0.5)
    down_url = get_video_url(url)
    title = 'video_' + url.split('_')[-1]
    video_data = get_video_data(down_url)
    # The context manager guarantees the file is flushed and closed even if
    # the write fails.
    with open(filepath + title + '.mp4', 'wb') as fp:
        fp.write(video_data)
    print("...download over..." + title)
def main():
    """Crawl the first pages of category 135 and download every video.

    Page 0 is read from the category's HTML page; later pages come from the
    "load more" endpoint. Each page's videos are downloaded by a pool of
    four worker processes before the next page is requested.
    """
    creatPath(filepath)
    # Number of pages to crawl.
    pagenum = 5
    base_page_url = 'https://www.pearvideo.com/category_135'
    # The context manager releases the worker processes when the crawl ends,
    # whether it finishes normally or raises. pool.map() blocks until every
    # download of the batch is done, so no work is cut short on exit.
    with Pool(4) as pool:
        for page_no in range(pagenum):
            if page_no == 0:
                video_page_urls = get_video_page_url(get_page_text(base_page_url))
            else:
                video_page_urls = load_more_page_urls(base_page_url, page_no)
            pool.map(download_video, video_page_urls)

# Run the crawler only when executed as a script. The guard also keeps
# multiprocessing workers that re-import this module (spawn start method)
# from launching the crawl again.
if __name__ == '__main__':
    main()

地址

posted @ 2021-08-03 15:40  w0000  阅读(157)  评论(0)  编辑  收藏  举报