爬虫基础-线程池
线程池基本语法
import time
from multiprocessing.dummy import Pool


def get_page(name):
    """Simulate downloading one page: log start, wait 2 seconds, log success.

    ``name`` is any printable label for the fake download job.
    Returns None — the work is the prints and the artificial delay.
    (Original parameter was named ``str``, shadowing the builtin; renamed.)
    """
    print('正在下载: ', name)
    time.sleep(2)  # pretend the download takes 2 seconds
    print('下载成功: ', name)


# Guarded so importing this module does not kick off the demo.
if __name__ == '__main__':
    start_time = time.time()
    name_list = ['xiaoming', 'aa', 'bb', 'cc']
    # 4 worker threads: all 4 fake downloads run concurrently,
    # so the batch finishes in ~2s instead of ~8s sequentially.
    pool = Pool(4)
    # 将列表中每一个元素交给get_page进行处理,返回值是个列表
    pool.map(get_page, name_list)
    pool.close()  # no more tasks will be submitted
    pool.join()   # wait for all workers before reading the clock
    end_time = time.time()
    print('%d second' % (end_time - start_time))
线程池方式爬取梨视频内容
import requests
from lxml import etree
import re
from multiprocessing.dummy import Pool

# Browser-like User-Agent so the site serves the normal desktop page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

# 对url发请求,解析视频详情页的url和视频名称
# Fetch the category listing page, then for each <li> resolve the
# detail page and extract the direct .mp4 URL embedded in inline JS.
url = 'https://www.pearvideo.com/category_5'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')

urls = []  # list of {'name': ..., 'url': ...} dicts, one per video
for li in li_list:
    # (fixed typo: original local was spelled ``deatil_url``)
    detail_url = 'https://www.pearvideo.com/' + li.xpath('.//a[@class="vervideo-lilink actplay"]/@href')[0]
    detail_name = li.xpath('.//a/div[@class="vervideo-title"]/text()')[0] + '.mp4'
    # 对详情页的url发起请求
    detail_page = requests.get(url=detail_url, headers=headers).text
    # Raw string so ``\.`` is a regex escape, not a Python string escape.
    ex = r'srcUrl="(.*\.mp4?)",vdoUrl=srcUrl'
    matches = re.findall(ex, detail_page)
    if not matches:
        # NOTE(review): the site may have moved srcUrl behind an AJAX call;
        # skip entries whose detail page no longer embeds the URL instead
        # of crashing with IndexError.
        continue
    # 创建字典
    urls.append({
        'name': detail_name,
        'url': matches[0],
    })
def get_video(dic):
    """Download one video described by ``dic`` ({'name': ..., 'url': ...})
    and save it under that name in the current directory.

    Relies on the module-level ``headers`` for the HTTP request.
    """
    url = dic['url']
    print(dic['name'], '正在下载')
    video_content = requests.get(url, headers=headers).content
    # ``with`` guarantees the handle is closed even if the write fails.
    with open(dic['name'], 'wb') as fp:
        fp.write(video_content)
    # BUG FIX: the original printed the global ``detail_name`` left over
    # from the scraping loop (always the last video's title); report the
    # name of the video this call actually downloaded.
    print(dic['name'], '下载完成')
# Fan the download jobs out over a small thread pool.
pool = Pool(2)               # two downloads in flight at a time
pool.map(get_video, urls)    # blocks until every dict in ``urls`` is handled
pool.close()                 # no further task submissions
pool.join()                  # wait for the worker threads to exit