爬虫-多进程抓取斗图表情
import os
import time
from multiprocessing import Pool
from urllib.parse import urlsplit

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}

# Seconds to wait for the server before giving up on a single request.
# Without a timeout, requests blocks forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Directory (relative to the working directory) where images are stored.
IMG_DIR = 'img'


def get_img_src(page):
    """
    Crawl the listing pages and yield the image URLs found on each one.

    Pages are fetched sequentially, from 1 up to and including ``page``,
    with a one-second pause before each request. A page that cannot be
    fetched is reported and skipped, so one bad page does not abort the
    whole crawl.

    :param page: number of the last listing page to fetch (pages start at 1)
    :return: generator yielding, per successfully fetched page, the list of
        ``data-original`` attribute values of the matching ``<img>`` tags
    """
    for i in range(1, page + 1):
        url = f'https://www.pkdoutu.com/photo/list/?page={i}'
        print(f'开始抓取第{i}页数据')
        time.sleep(1)
        try:
            res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            res.raise_for_status()
        except requests.RequestException as exc:
            print(f'第{i}页抓取失败: {exc}')
            continue
        # Let requests guess the charset from the body before decoding.
        res.encoding = res.apparent_encoding
        tree = etree.HTML(res.text)
        if tree is None:
            # Nothing parseable came back; report an empty page.
            yield []
            continue
        # Collect the image addresses.
        yield tree.xpath('//img[@referrerpolicy="no-referrer"]/@data-original')


def _img_name_from_url(url):
    """Return the local file name for ``url``: the part of the last path
    segment after its final underscore (or the whole segment if it has none).
    """
    # Work on the last path segment only, so the result never contains a
    # path separator or a query string.
    segment = os.path.basename(urlsplit(url).path)
    return segment.split('_')[-1]


def download_img(url):
    """
    Download one image and save it under the ``img`` directory.

    Failed requests (network error, timeout, non-2xx status) are reported
    and skipped so that an error page is never written to disk as an image.

    :param url: address of the image to download
    :return: None
    """
    time.sleep(0.1)
    try:
        img_res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        img_res.raise_for_status()
    except requests.RequestException as exc:
        print(f'下载失败 {url}: {exc}')
        return
    img_name = _img_name_from_url(url)
    if not img_name:
        print(f'无法从地址解析文件名: {url}')
        return
    # exist_ok avoids the check-then-create race between worker processes.
    os.makedirs(IMG_DIR, exist_ok=True)
    with open(os.path.join(IMG_DIR, img_name), 'wb') as f:
        f.write(img_res.content)


def _report_worker_error(exc):
    """Print an exception raised inside a pool worker (runs in the parent)."""
    print(f'下载任务出错: {exc!r}')


if __name__ == '__main__':
    pool = Pool(10)
    try:
        # get_img_src is a generator that yields one list of URLs per page.
        for url_list in get_img_src(10):
            for url in url_list:
                # Queue the download on the process pool; error_callback
                # surfaces worker exceptions that apply_async would
                # otherwise swallow silently.
                pool.apply_async(
                    download_img,
                    args=(url,),
                    error_callback=_report_worker_error,
                )
    finally:
        pool.close()  # stop accepting new tasks
        pool.join()   # wait for queued downloads to finish
    print('抓取结束')
效果展示: