Scraping Pear Video with a thread pool

Requirement: scrape the video data from Pear Video (pearvideo.com).

https://www.pearvideo.com/category_4

The code is as follows:

%%time
# ^ Jupyter cell magic to time the whole cell; it must be the first line of the cell,
#   and should be removed when running this as a plain .py script
import requests
from lxml import etree
import re
import random
from multiprocessing.dummy import Pool  # thread pool

# Instantiate a thread pool object; the argument is the number of threads to open
pool = Pool(10)

# Scrape the video data from Pear Video
url = 'https://www.pearvideo.com/category_4'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text

# Parse out the detail-page URL and the video title
tree = etree.HTML(page_text)
# Locate the tags that hold the video details
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
video_url_list = []
for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    detail_text = requests.get(url=detail_url, headers=headers).text
    # Parse out the video URL.
    # Where is srcUrl? Open the detail page with developer tools: Network -> Doc -> video -> mp4
    exp = 'srcUrl="(.*?)",'
    video_url = re.findall(exp, detail_text, re.S)[0]
    video_url_list.append(video_url)

# Use the thread pool to download the video data concurrently
video_data_list = pool.map(lambda link: requests.get(url=link, headers=headers).content, video_url_list)

# saveData (defined in the next snippet) must already be in scope here
pool.map(saveData, video_data_list)
pool.close()
pool.join()
Crawler code
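To make the regex step concrete, here is a small self-contained illustration of my own; the detail_text string below is made up and only mimics the shape of the inline JavaScript that the developer-tools path above points to.

import re

# Made-up snippet mimicking the inline JS on a detail page; the real page is much longer
detail_text = 'var contId="123",srcUrl="https://video.pearvideo.com/mp4/demo.mp4",vdoUrl=srcUrl;'

exp = 'srcUrl="(.*?)",'
video_url = re.findall(exp, detail_text, re.S)[0]
print(video_url)  # https://video.pearvideo.com/mp4/demo.mp4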

 

import random

def saveData(data):
    name = str(random.randint(1, 10000)) + '.mp4'
    with open(name, 'wb') as fp:
        fp.write(data)
        print(name + ' downloaded successfully')
saveData function
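As a variation of my own (not the post's code): each worker can both download and save one video, so only one pool.map pass is needed and the whole result list never has to sit in memory; it also sidesteps the chance that random.randint hands two videos the same filename. The name download_and_save, the URL-tail filename, and the empty video_url_list placeholder are assumptions for this sketch.

import requests
from multiprocessing.dummy import Pool  # thread pool

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}

def download_and_save(link):
    # Download one mp4 and write it to disk, named after the last URL segment
    data = requests.get(url=link, headers=headers).content
    name = link.split('/')[-1]
    with open(name, 'wb') as fp:
        fp.write(data)
    print(name + ' downloaded successfully')

video_url_list = []  # fill with the mp4 links parsed as in the crawler code above

pool = Pool(10)
pool.map(download_and_save, video_url_list)
pool.close()
pool.join()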

 

posted @ 2018-12-12 20:53  北伽