Pear Video case
import requests
from bs4 import BeautifulSoup
# Fetch one proxy (ip:port) from the local proxy pool
res = requests.get('http://127.0.0.1:10000/get/').json()
if res['https']:
    http = 'https'
else:
    http = 'http'
proxies = {http: http + '://' + res['proxy']}
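# requests expects proxies as a scheme -> proxy-URL mapping, e.g.
# {'https': 'https://114.233.70.231:9000'} (address illustrative). A quick,
# optional sanity check before scraping (the test URL is an assumption):
# requests.get('https://www.baidu.com', proxies=proxies, timeout=5)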
response = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=72',
                        proxies=proxies)
soup = BeautifulSoup(response.text, 'html.parser')
div_list = soup.find_all(name='div', class_='vervideo-title')
title_list = []
for i in div_list:
    title = i.text
    title_list.append(title)
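# Titles become filenames in download_mp4 below; i.text may carry stray
# whitespace or newlines, so a title.strip() can be worth adding.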
import re
video_list = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', response.text)
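# Equivalent extraction with the soup already built above (a sketch, not part
# of the original script):
# video_list = [a.attrs['href'] for a in soup.find_all('a', class_='vervideo-lilink')]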
url_list = []
for i in video_list:
    video_id = i.split('_')[1]
    # videoStatus.jsp checks the Referer, so point it at the video's own page
    header = {'Referer': 'https://www.pearvideo.com/%s' % i}
    res = requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.6668916974684385' % video_id,
                       proxies=proxies, headers=header)
    real_url = res.json()['videoInfo']['videos'].get('srcUrl')
    # Swap the fake segment before the first '-' for 'cont-<video_id>' to get the playable address
    mp4_url = real_url.replace(real_url.split('-')[0].split('/')[-1], 'cont-%s' % video_id)
    url_list.append(mp4_url)
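# Worked example of the rewrite above (all values illustrative):
#   srcUrl:   https://video.pearvideo.com/mp4/adshort/20190626/1561522933832-14162945_adpkg-ad_hd.mp4
#   real mp4: https://video.pearvideo.com/mp4/adshort/20190626/cont-1565846-14162945_adpkg-ad_hd.mp4
# i.e. the fake timestamp segment '1561522933832' becomes 'cont-<video_id>'.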
url_list = zip(url_list, title_list)
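# zip() returns a one-shot iterator in Python 3: url_list can now be
# traversed only once, which the single enumerate() loop below relies on.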
def download_mp4(count, url, title):
    res = requests.get(url)
    print('Download %s in progress >>>>>>>>>>>>>>>>>>>>>>>>>>>' % count)
    with open('./video/%s.mp4' % title, 'wb') as f:
        for i in res.iter_content():
            f.write(i)
    print('Video %s finished downloading >>>>>>>>>>>>>>>>>>>>>>>' % count)
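# Note: iter_content() with no chunk_size yields one byte per iteration;
# iter_content(chunk_size=1024) writes the same bytes far faster (the same
# applies to download_img in the image case below).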
from threading import Thread
import time
thread_list = []
thread_num = 10
for i, url in enumerate(url_list):
    # Spawn a download thread for each new URL
    print(i, url)
    t = Thread(target=download_mp4, args=(i, url[0], url[1]))
    # Add it to the pool and start it
    thread_list.append(t)
    t.start()
    # When the pool is full, wait for running threads to finish
    while len(thread_list) > thread_num:
        # Drop threads that have already finished
        thread_list = [x for x in thread_list if x.is_alive()]
        time.sleep(3)
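The is_alive() polling above amounts to a hand-rolled thread pool capped at thread_num running threads. The standard library gives the same throttling with less bookkeeping; a minimal sketch, assuming the url_list pairs and the download_mp4 function defined above:

from concurrent.futures import ThreadPoolExecutor

# Submissions beyond max_workers simply queue; the with-block joins all
# workers on exit instead of sleep-polling.
with ThreadPoolExecutor(max_workers=10) as pool:
    for i, (mp4_url, title) in enumerate(url_list):
        pool.submit(download_mp4, i, mp4_url, title)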
Image case
import requests
from bs4 import BeautifulSoup
res = requests.get('http://127.0.0.1:10000/get/').json()
if res['https']:
    http = 'https'
else:
    http = 'http'
proxies = {http: http + '://' + res['proxy']}
url_list = []
title_list = []
count = 2
img_count = 0
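# Cookie below was copied from a browser session (assumption: the site may
# block cookie-less clients; refresh this value from your own browser if
# requests start failing).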
header = {
    'cookie': '__yjs_duid=1_a808921201046d8b029ff69055981a291669291524918; Hm_lvt_c59f2e992a863c2744e1ba985abaea6c=1669291526,1669336248; zkhanecookieclassrecord=,54,; Hm_lpvt_c59f2e992a863c2744e1ba985abaea6c=1669336289'}
try:
    while count < 63:
        res = requests.get('https://pic.netbian.com/4kmeinv/index_%s.html' % count,
                           proxies=proxies, headers=header)
        soup = BeautifulSoup(res.text, 'html.parser')
        img_list = soup.find_all(name='img')
        count += 1
        for i in img_list:
            url = i.attrs['src']
            if url.startswith('/uploads'):
                real_url = 'https://pic.netbian.com' + url
                title = real_url.split('/')[-1].split('-')[0]
                title_list.append(title)
                url_list.append(real_url)
                img_count += 1
except Exception:
    # Any request/parse error lands here and ends the pagination loop
    count += 1
print('Crawled listing pages up to index %s; queueing %s images' % (count, img_count))
url_list = zip(url_list, title_list)
def download_img(count, url, title):
    res = requests.get(url, proxies=proxies, headers=header)
    print('Image download %s in progress >>>>>>>>>>>>>>>>>>>>>>>>>>>' % count)
    with open('./img/%s.png' % title, 'wb') as f:
        for i in res.iter_content():
            f.write(i)
    print('Image %s finished downloading >>>>>>>>>>>>>>>>>>>>>>>' % count)
from threading import Thread
import time
thread_list = []
thread_num = 8
for i, url in enumerate(url_list):
    # Spawn a download thread for each new URL
    t = Thread(target=download_img, args=(i, url[0], url[1]))
    # Add it to the pool and start it
    thread_list.append(t)
    t.start()
    # When the pool is full, wait for running threads to finish
    while len(thread_list) > thread_num:
        # Drop threads that have already finished
        thread_list = [x for x in thread_list if x.is_alive()]
        time.sleep(3)
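The bare except above wraps the whole while loop, so the first failed page ends pagination for good. Guarding each page separately keeps the crawl going; a minimal sketch under the same parsing assumptions, where scrape_page is a hypothetical helper, not part of the original script:

import requests
from bs4 import BeautifulSoup

def scrape_page(page, proxies, headers):
    # Return the image URLs on one listing page, or [] if that page fails.
    try:
        res = requests.get('https://pic.netbian.com/4kmeinv/index_%s.html' % page,
                           proxies=proxies, headers=headers, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
        return ['https://pic.netbian.com' + img.attrs['src']
                for img in soup.find_all(name='img')
                if img.attrs.get('src', '').startswith('/uploads')]
    except Exception:
        return []  # skip the bad page, keep paginating

# Usage: url_list = [u for page in range(2, 63) for u in scrape_page(page, proxies, header)]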