爬虫视频案例、图片案例

梨视频案例

import requests

from bs4 import BeautifulSoup

# Fetch one proxy address from the local proxy pool service.
res = requests.get('http://127.0.0.1:10000/get/').json()

# Pick the scheme based on whether the pooled proxy supports HTTPS.
if res['https']:
    http = 'https'
else:
    http = 'http'
# BUG fix: the original used '//:' which produced a malformed proxy URL such as
# 'https//:1.2.3.4:80'; the correct scheme separator is '://'.
proxies = {http: http + '://' + res['proxy']}

# Fetch one page of the pear-video category listing (start=72 is the paging
# offset) through the pooled proxy.
response = requests.get(
    'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=72',
    proxies=proxies,
)
soup = BeautifulSoup(response.text, 'html.parser')

# Every video card carries its caption in a div.vervideo-title element.
div_list = soup.find_all(name='div', class_='vervideo-title')
title_list = [card.text for card in div_list]

import re

# Pull every detail-page link (e.g. 'video_1234567') off the listing HTML.
video_list = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', response.text)


def _resolve_mp4(link):
    # The status endpoint is anti-leech protected: it requires a Referer
    # header pointing at the video's own detail page.
    video_id = link.split('_')[1]
    header = {'Referer': 'https://www.pearvideo.com/%s' % link}
    res = requests.get(
        'https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.6668916974684385' % video_id,
        proxies=proxies, headers=header)
    real_url = res.json()['videoInfo']['videos'].get('srcUrl')
    # The returned srcUrl hides the real file behind a fake timestamp segment;
    # replacing that segment with 'cont-<id>' yields the playable mp4 URL.
    fake_segment = real_url.split('-')[0].split('/')[-1]
    return real_url.replace(fake_segment, 'cont-%s' % video_id)


url_list = [_resolve_mp4(link) for link in video_list]

# Pair each resolved mp4 URL with its caption for the download stage.
url_list = zip(url_list, title_list)


def download_mp4(count, url, title):
    """Download one mp4 at *url* into ./video/<title>.mp4.

    count: ordinal used only for progress messages.
    """
    # BUG fix: without stream=True the whole body is buffered in memory before
    # iteration, and iter_content() with no chunk_size yields ONE BYTE per
    # iteration — extremely slow. Stream and write in 64 KiB chunks instead.
    res = requests.get(url, stream=True)
    print('第%s下载正在执行>>>>>>>>>>>>>>>>>>>>>>>>>>>' % count)
    with open('./video/%s.mp4' % title, 'wb') as f:
        for chunk in res.iter_content(chunk_size=1024 * 64):
            f.write(chunk)

    print('第%s个视频下载完毕>>>>>>>>>>>>>>>>>>>>>>>' % count)


from threading import Thread
import time

# Hand-rolled bounded "pool": start a thread per (url, title) pair, but never
# let more than thread_num of them run at once.
thread_list = []
thread_num = 10
for i, pair in enumerate(url_list):
    print(i, pair)
    mp4_url, caption = pair
    worker = Thread(target=download_mp4, args=(i, mp4_url, caption))
    thread_list.append(worker)
    worker.start()
    # Throttle: once the pool is over capacity, poll every 3s, dropping
    # finished threads until there is room again.
    while len(thread_list) > thread_num:
        thread_list = [t for t in thread_list if t.is_alive()]
        time.sleep(3)

图片案例

import requests
from bs4 import BeautifulSoup

# Fetch one proxy address from the local proxy pool service.
res = requests.get('http://127.0.0.1:10000/get/').json()

if res['https']:
    http = 'https'
else:
    http = 'http'

# BUG fix: the original used '//:' which produced a malformed proxy URL such as
# 'https//:1.2.3.4:80'; the correct scheme separator is '://'.
proxies = {http: http + '://' + res['proxy']}
url_list = []       # image URLs queued for download
title_list = []     # matching filename stems
count = 2           # first listing page to fetch (site paging starts at 2)
img_count = 0       # number of images queued
# NOTE(review): 'hedaer' is a typo for 'header' but is kept — later code
# references this exact name.
hedaer = {
    'cookie': '__yjs_duid=1_a808921201046d8b029ff69055981a291669291524918; Hm_lvt_c59f2e992a863c2744e1ba985abaea6c=1669291526,1669336248; zkhanecookieclassrecord=,54,; Hm_lpvt_c59f2e992a863c2744e1ba985abaea6c=1669336289'}
try:
    # Walk listing pages index_2.html .. index_62.html.
    while count < 63:
        res = requests.get('https://pic.netbian.com/4kmeinv/index_%s.html' % count, proxies=proxies, headers=hedaer)
        soup = BeautifulSoup(res.text, 'html.parser')
        img_list = soup.find_all(name='img')
        count += 1
        for i in img_list:
            url = i.attrs['src']
            # Only thumbnails under /uploads are real gallery images.
            if url.startswith('/uploads'):
                # BUG fix: src already begins with '/', so the base must not end
                # with one (the original produced 'https://pic.netbian.com//uploads...').
                real_url = 'https://pic.netbian.com' + url
                title = real_url.split('/')[-1].split('-')[0]
                title_list.append(title)
                url_list.append(real_url)
                # BUG fix: moved inside the filter so the printed count matches
                # the number of images actually queued for download.
                img_count += 1
except Exception:
    # BUG fix: narrowed from a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit. Any request/parse failure just ends the
    # crawl with whatever was collected so far (best-effort, as before).
    count += 1
print('共加载页面%s页,即将加载图片%s张' % (count, img_count))
url_list = zip(url_list, title_list)


def download_img(count, url, title):
    """Download one image at *url* into ./img/<title>.png.

    count: ordinal used only for progress messages. Uses the module-level
    `proxies` and `hedaer` (cookie header) so the CDN serves the file.
    """
    # BUG fix: without stream=True the whole body is buffered in memory before
    # iteration, and iter_content() with no chunk_size yields ONE BYTE per
    # iteration — extremely slow. Stream and write in 64 KiB chunks instead.
    res = requests.get(url, proxies=proxies, headers=hedaer, stream=True)
    print('第%s图片下载正在执行>>>>>>>>>>>>>>>>>>>>>>>>>>>' % count)
    with open('./img/%s.png' % title, 'wb') as f:
        for chunk in res.iter_content(chunk_size=1024 * 64):
            f.write(chunk)
    print('第%s个图片下载完毕>>>>>>>>>>>>>>>>>>>>>>>' % count)


from threading import Thread
import time

# Hand-rolled bounded "pool": one download thread per (url, title) pair,
# capped at thread_num simultaneously alive threads.
thread_list = []
thread_num = 8
for i, pair in enumerate(url_list):
    img_url, stem = pair
    worker = Thread(target=download_img, args=(i, img_url, stem))
    thread_list.append(worker)
    worker.start()
    # Throttle: once over capacity, poll every 3s and drop finished threads
    # until there is room for the next one.
    while len(thread_list) > thread_num:
        thread_list = [t for t in thread_list if t.is_alive()]
        time.sleep(3)

posted @ 2022-11-24 18:17  懒羊羊A  阅读(27)  评论(0)    收藏  举报