# Python thread-pool crawler
import re
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from requests import adapters  # kept from original; no longer referenced directly

from spiders.index_url_title_collection import index_title_script
from spiders.script import headers

# Suppress the InsecureRequestWarning caused by verify=False in spider().
requests.packages.urllib3.disable_warnings()


class IndexKeyword(object):
    """Thread-pool crawler that collects product keywords for a search keyword.

    run() builds the page URLs, fetches them concurrently, extracts the
    ``productKeywords`` field from each response body, and returns the
    accumulated rows.
    """

    def __init__(self, keyword):
        self.headers = headers
        self.keyword = keyword    # search keyword
        self.data_list = []       # accumulated result rows (list of str lists)

    def spider(self, url):
        """Fetch *url* with up to 3 retries.

        :param url: page URL to download
        :return: decoded response body, or '' when all retries fail
        """
        for _ in range(3):
            try:
                # timeout keeps a worker thread from hanging forever;
                # verify=False matches the site's broken certificate setup.
                response = requests.get(url, headers=self.headers,
                                        verify=False, timeout=10)
                return response.content.decode()
            except (requests.exceptions.RequestException, UnicodeDecodeError):
                # Covers connection errors, timeouts, SSL errors and bad
                # encodings — the original only caught JSONDecodeError and
                # SSLError, so network failures crashed the future.
                continue
        # All retries exhausted: return a str (the original returned {} here,
        # which made re.findall() in save_content_list raise TypeError).
        return ''

    def save_content_list(self, data):
        """Extract the productKeywords value from *data* and store it.

        :param data: raw response body (str); '' is silently ignored
        """
        matches = re.findall(r'productKeywords\",\"value\":\"(.*?)\"', data)
        if matches:
            self.data_list.append(matches[0].split(','))

    def run(self, start_page, end_page, section_id):
        """Crawl pages [start_page, end_page] concurrently.

        :param start_page: first page number
        :param end_page: last page number
        :param section_id: site section identifier passed to the URL builder
        :return: self.data_list with one entry per page that matched
        """
        begin = time.time()
        url_list = index_title_script('proc_url', self.keyword,
                                      start_page, end_page, section_id)
        print(url_list)
        with ThreadPoolExecutor(max_workers=8) as pool:
            # First submit() argument is the callable, the rest are its args.
            futures = [pool.submit(self.spider, url) for url in url_list]
            for future in as_completed(futures):
                self.save_content_list(future.result())
        print(time.time() - begin)
        return self.data_list