用 multiprocessing.dummy 多线程爬取梨视频的例子

# _*_ coding:utf-8 _*_
"""
@FileName   :2.梨视频数据爬取.py
@CreateTime :2020/8/26 0026 15:26
@Author     : Lurker Zhang
@E-mail     : 289735192@qq.com
@Desc.      : 爬取梨视频音乐版块视频,地址:https://www.pearvideo.com/popular_59
"""
import atexit

from setting.config import *
from lxml import etree
import requests
import json
import re
import time
import os
from multiprocessing.dummy import Pool


def main():
    """Entry point: collect video ids from the listing pages in parallel,
    then download every collected video in parallel.

    Reads/extends the module-level ``all_id_list``; ``id_list`` is consulted
    by the download workers to skip already-downloaded videos.
    """
    # Total videos to fetch is n * 12 (each listing page returns 12 items).
    n = 1
    global all_id_list, id_list
    # Thread pools (multiprocessing.dummy). The context manager guarantees
    # the pool is terminated even if a worker raises; the original pools
    # were never closed or joined.
    with Pool(4) as get_id_pool:
        # map() blocks until all listing pages are parsed.
        get_id_pool.map(get_id_list, range(0, n * 12, 12))
    with Pool(4) as down_video_pool:
        down_video_pool.map(down_video, all_id_list)


@atexit.register
def renew_id_list():
    """Print the run summary and persist the downloaded-video id list.

    Registered with atexit so the record survives even an aborted run.
    """
    print('采集完成,本地成功下载{0}个视频,失败{1}视频。'.format(total_success, total_fail))
    # Persist the ids of already-downloaded videos so future runs skip them.
    record_path = "../depository/pear_video/down_id.json"
    with open(record_path, 'w', encoding='utf-8') as record_file:
        json.dump(id_list, record_file)


def get_id_list(start):
    """
    Fetch one listing page and append the 12 video ids it contains to the
    module-level ``all_id_list``.

    :param start: offset of the first video on the listing page (multiple of 12)
    :return: None (extends ``all_id_list`` in place; the original docstring
             wrongly claimed a list was returned)
    """
    global all_id_list
    # Typo fixed: the progress message used to read "strart".
    print("解析start={}的12个视频ID".format(start))
    url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=59&start={}'.format(start)
    # The endpoint returns an HTML fragment holding 12 entries beginning at `start`.
    video_list_page = requests.get(url=url, headers=headers).text
    tree = etree.HTML(video_list_page)
    # Each href looks like "video_<id>"; keep only the id part. The loop
    # variable is named `href` so it no longer shadows `url` above.
    # NOTE: `list += iterable` is a single extend call, atomic under
    # CPython's GIL, so concurrent worker threads do not corrupt the list.
    all_id_list += [href.split('_')[1] for href in tree.xpath('/html/body/li/div/a/@href')]


def down_video(data_id):
    """
    Download one video by id and save it as ``<path><title>.mp4``.

    Ids already present in ``id_list`` are skipped; a successful download
    appends the id to ``id_list`` and increments ``total_success``.

    :param data_id: the pearvideo id to download
    :return: None normally; 0 on a parse or write failure (early exit)
    """
    global path, id_list, total_fail, total_success
    print("开始下载保存data_id={}的视频".format(data_id))
    url = "https://www.pearvideo.com/video_{}".format(data_id)
    if data_id in id_list:
        # NOTE(review): skips are counted as "failures", so the atexit
        # summary lumps already-downloaded videos in with real failures.
        total_fail += 1
        print(data_id, "已经下载过了,跳过!")
    else:
        video_preview_page = requests.get(url=url, headers=headers).text
        # The real video URL is injected by JavaScript on the preview page,
        # so it is extracted with regexes rather than XPath.
        url_ex = 'srcUrl="(.*?)"'
        title_ex = '"video-tt">(.*?)</h1>'
        try:
            video_url = re.findall(url_ex, video_preview_page)[0]
            video_title = re.findall(title_ex, video_preview_page)[0]
        except IndexError:
            # findall() found no match: page layout changed or the video
            # was removed. (Narrowed from a blanket `except Exception`.)
            return 0
        # Download the actual video bytes.
        video_content = requests.get(url=video_url, headers=headers).content
        try:
            # NOTE(review): a title containing an OS-invalid character
            # (e.g. "/") makes this open() fail; consider sanitizing.
            with open(path + video_title + ".mp4", 'wb') as fp:
                fp.write(video_content)
        except OSError:
            # Filesystem failure only (narrowed from `except Exception`).
            return 0
        else:
            id_list.append(data_id)
            total_success += 1


if __name__ == '__main__':
    # Load the ids of previously downloaded videos; create an empty record
    # on first run so the read below always succeeds.
    down_id_file = '../depository/pear_video/down_id.json'
    if not os.path.exists(down_id_file):
        with open(down_id_file, 'w', encoding="utf-8") as fp:
            json.dump([], fp)
    with open(down_id_file, "r", encoding="utf-8") as fp:
        id_list = json.load(fp)
    # Save videos under a per-day directory.
    path = '../depository/pear_video/' + time.strftime('%Y%m%d', time.localtime()) + '/'
    # makedirs instead of mkdir: also creates missing parent directories,
    # and exist_ok avoids the check-then-create race of the original.
    os.makedirs(path, exist_ok=True)
    # Counters for this run's summary (printed by the atexit hook).
    total_success = 0
    total_fail = 0
    all_id_list = []  # ids queued for download in this run
    main()

 

 

posted @ 2020-08-26 18:43  lurkerzhang  阅读(201)  评论(0编辑  收藏  举报