import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import redis
import requests

from setting import PAGE, CATEGORY_ID, START, MAIN_URL, DETAIL_URL
from myredis import POOL
class CrawlVideo:
    """Crawl video listing pages and download the videos they link to.

    Listing pages (``MAIN_URL``) are scanned for ``video_<digits>`` ids, each
    id's detail page (``DETAIL_URL``) is fetched to obtain a title and a
    source URL, and the source URL is saved to ``<title prefix>.mp4`` in the
    current working directory.  Links that were saved successfully are
    recorded in Redis so that later runs skip them.
    """

    # Worker pool for detail-page requests and downloads.  It is a class
    # attribute, so every instance shares the same pool.
    pools = ThreadPoolExecutor(100)

    # Seconds passed to ``requests`` as the timeout; without one a stalled
    # server would block a worker thread forever.
    REQUEST_TIMEOUT = 30
    # Number of bytes written per chunk while streaming a video to disk.
    CHUNK_SIZE = 64 * 1024
    # Number of leading title characters used as the file name.
    NAME_LENGTH = 3

    _VIDEO_ID_RE = re.compile(r'<a href="(video_\d+)"')
    _TITLE_RE = re.compile(r'<title>(.*?)</title>')
    _SRC_URL_RE = re.compile(r'srcUrl="(.*?)"')
    # Characters that are path separators or are rejected in file names on
    # common platforms; titles come from remote HTML and are not trusted.
    _UNSAFE_NAME_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    def __init__(self, page=PAGE):
        """Create a crawler.

        Args:
            page: Offset step between consecutive listing requests; also
                used by ``get_page_num`` to decide how many listing pages
                are needed.
        """
        self.page = page
        # Filled by ``get_detail`` with {"title": ..., "video_link": ...}.
        self.video_info_dic_list = []
        self.conn = redis.Redis(connection_pool=POOL)

    def async_download(self, video_dic):
        """Download one video unless Redis says it was already saved.

        Args:
            video_dic: Mapping with the keys ``"video_link"`` and ``"title"``.

        The link is stored in Redis (as both key and value) only after the
        whole body has been written, so a failed download is retried on the
        next run.  Network and file errors are printed, not raised.
        """
        video_link = video_dic["video_link"]
        if self.conn.get(video_link):
            return

        raw_name = video_dic["title"][:self.NAME_LENGTH]
        video_name = self._UNSAFE_NAME_RE.sub("_", raw_name)
        try:
            # Stream to disk instead of buffering the whole video in memory.
            with requests.get(video_link, stream=True,
                              timeout=self.REQUEST_TIMEOUT) as response:
                if response.status_code != 200:
                    return
                with open("%s.mp4" % video_name, "wb") as f:
                    for chunk in response.iter_content(self.CHUNK_SIZE):
                        f.write(chunk)
        except (requests.RequestException, OSError) as e:
            print(e)
            return
        self.conn.set(video_link, video_link)

    def download_video(self, category_id=CATEGORY_ID, start=START, num=PAGE):
        """Crawl up to ``num`` video ids and schedule their downloads.

        Args:
            category_id: Value substituted into ``MAIN_URL``.
            start: Offset of the first listing request.
            num: Maximum number of videos to process.

        Each download is submitted to the pool as soon as its detail page
        has been fetched and parsed.  Detail pages that cannot be fetched or
        parsed are skipped, so the method always terminates.
        """
        crawl_ids_list = self.crawl_videolist(category_id, start, num)
        print(len(crawl_ids_list))
        detail_futures = [
            self.pools.submit(self._fetch_detail, video_addr)
            for video_addr in crawl_ids_list
        ]
        for future in as_completed(detail_futures):
            video_dic = future.result()
            if video_dic is not None:
                self.pools.submit(self.async_download, video_dic)

    def get_video_ids(self, category_id, start):
        """Return the video ids found on one listing page.

        Args:
            category_id: Value substituted into ``MAIN_URL``.
            start: Offset substituted into ``MAIN_URL``.

        Returns:
            A list of ``video_<digits>`` strings; an empty list when the
            request fails (the error is printed).
        """
        main_url = MAIN_URL.format(category_id, start)
        try:
            response = requests.get(main_url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(e)
            return []
        return self._VIDEO_ID_RE.findall(response.text)

    def crawl_videolist(self, category_id, start, num):
        """Collect the ids of individual videos; requests can be made from
        the returned list.

        Args:
            category_id: Value substituted into ``MAIN_URL``.
            start: Offset of the first listing request; advanced by
                ``self.page`` after every request.
            num: Maximum number of ids to return.

        Returns:
            A list with at most ``num`` video ids (empty when ``num <= 0``).
        """
        crawl_ids_list = []
        for _ in range(self.get_page_num(num)):
            crawl_ids_list.extend(self.get_video_ids(category_id, start))
            start += self.page
        return crawl_ids_list[:max(num, 0)]

    def get_detail(self, obj):
        """Future callback: parse a detail page and record its video info.

        Args:
            obj: A completed future whose result is the detail-page response.

        On success a ``{"title", "video_link"}`` dict is appended to
        ``self.video_info_dic_list``.  A failed request is printed and a
        page without title or source URL is ignored.
        """
        try:
            response = obj.result()
        except requests.RequestException as e:
            print(e)
            return
        dic = self._parse_detail(response)
        if dic is not None:
            self.video_info_dic_list.append(dic)

    def async_request(self, url, video_addr):
        """Fetch ``url`` formatted with ``video_addr`` and return the response."""
        return requests.get(url.format(video_addr),
                            timeout=self.REQUEST_TIMEOUT)

    def get_video_info(self, video_id_list):
        """Request the detail page of every id in the background.

        Each response is handed to ``get_detail``, which appends the parsed
        info to ``self.video_info_dic_list``.  ``download_video`` does not
        rely on this method; it is kept for callers that want the parsed
        details collected in that list.

        Args:
            video_id_list: Iterable of video ids.

        Returns:
            The list of submitted futures, so callers can wait on them.
        """
        futures = []
        try:
            for video_addr in video_id_list:
                future = self.pools.submit(
                    self.async_request, DETAIL_URL, video_addr)
                future.add_done_callback(self.get_detail)
                futures.append(future)
        except RuntimeError as e:
            # submit() raises RuntimeError once the pool has been shut down.
            print(e)
        return futures

    def get_page_num(self, num):
        """Return how many listing pages are needed to cover ``num`` videos.

        This is ``num / self.page`` rounded up, and 0 when ``num <= 0``.
        """
        if num <= 0:
            return 0
        # Negated floor division rounds up without going through floats.
        return int(-(-num // self.page))

    def _fetch_detail(self, video_addr):
        """Fetch and parse one detail page; return its info dict or None."""
        try:
            response = self.async_request(DETAIL_URL, video_addr)
        except requests.RequestException as e:
            print(e)
            return None
        return self._parse_detail(response)

    def _parse_detail(self, response):
        """Extract title and source URL from a detail page, or return None."""
        html = response.text
        title_match = self._TITLE_RE.search(html)
        link_match = self._SRC_URL_RE.search(html)
        if title_match is None or link_match is None:
            return None
        return {
            "title": title_match.group(1),
            "video_link": link_match.group(1),
        }
if __name__ == "__main__":
    # Run the sample crawl only when this file is executed as a script, so
    # importing the module for CrawlVideo does not start network requests.
    crawl = CrawlVideo()
    crawl.download_video(start=1, num=2)