Multi-process and Multi-thread Crawler Example
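The crawler below scrapes 4K anime wallpaper thumbnails from pic.netbian.com and times three download strategies against each other: a plain serial loop, one process per image, and one thread per image.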
import os.path
import time
from multiprocessing import Process
from threading import Thread

import requests
from lxml import etree
from fake_useragent import UserAgent


class BaseSpider(object):
    def __init__(self):
        # Build the list of index-page URLs up front
        self.url_list = self.create_url_list()
        # Randomize the User-Agent to look less like a script
        self.headers = {
            'User-Agent': UserAgent().random
        }
        self.BASE_DIR = os.path.dirname(__file__)
        # Directory where downloaded images are saved
        self.file_name_path = self.create_file_name()

    def create_url_list(self):
        # Page 1 has no suffix; pages 2-9 follow the index_{i}.html pattern
        url_list = []
        for i in range(1, 10):
            if i == 1:
                url_list.append('https://pic.netbian.com/4kdongman/')
            else:
                url_list.append(f'https://pic.netbian.com/4kdongman/index_{i}.html')
        return url_list

    def get_tree(self, page_text):
        # Parse the HTML into an lxml tree for XPath queries
        tree = etree.HTML(page_text)
        return tree

    def get_page_text(self, url, encoding='gbk'):
        # The site serves GBK-encoded pages, so set the encoding explicitly
        response = requests.get(url, headers=self.headers)
        response.encoding = encoding
        return response.text

    def create_file_name(self, path='img'):
        # Create the image directory next to this script if it does not exist
        file_name_path = os.path.join(self.BASE_DIR, path)
        os.makedirs(file_name_path, exist_ok=True)
        return file_name_path


class SpiderImg(BaseSpider):

    def __init__(self):
        super().__init__()

    # Note: using a staticmethod object directly as a decorator inside the
    # class body requires Python 3.10+; on older versions, move timer() to
    # module level instead.
    @staticmethod
    def timer(func):
        # Simple timing decorator used to compare the download strategies
        def inner(*args, **kwargs):
            start_time = time.time()
            res = func(*args, **kwargs)
            print(f"{func.__name__} | total time :>>>> {time.time() - start_time} s")
            return res

        return inner

    def spider_index_tree(self):
        tree_list = []
        for url in self.url_list:
            # Fetch and parse every index page
            page_text = self.get_page_text(url=url)
            tree = self.get_tree(page_text=page_text)
            tree_list.append(tree)
        return tree_list

    def __get_tree_data(self, tree):
        img_data_list = []
        li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_list:
            # alt holds the title; src is a relative path that needs the site prefix
            img_title = li.xpath('./a/img/@alt')[0]
            img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
            img_data_list.append({'img_title': img_title, 'img_src': img_src})
        return img_data_list

    def spider_index_img_data(self):
        img_data_list = []
        tree_list = self.spider_index_tree()
        for tree in tree_list:
            img_list = self.__get_tree_data(tree=tree)
            # extend() merges each page's records into one flat list
            img_data_list.extend(img_list)
        return img_data_list

    def download(self, img_src, img_title):
        # Send the same headers here, or the site may reject the image request
        response = requests.get(url=img_src, headers=self.headers)
        file_path = os.path.join(self.file_name_path, f'{img_title}.png')
        with open(file_path, mode='wb') as fp:
            # Write in chunks; bare iter_content() would yield one byte at a time
            for data in response.iter_content(chunk_size=1024):
                fp.write(data)
        print(f"Image :>>>> {img_title} saved successfully!")

    @timer
    def download_normal(self):
        # Serial baseline: download the images one by one
        img_data_list = self.spider_index_img_data()
        for img_data in img_data_list:
            img_title = img_data.get('img_title')
            img_src = img_data.get('img_src')
            self.download(img_src=img_src, img_title=img_title)

    @timer
    def download_process(self):
        # One process per image; heavyweight, but sidesteps the GIL entirely
        img_data_list = self.spider_index_img_data()
        task_list = [
            Process(target=self.download, args=(img_data.get('img_src'), img_data.get('img_title')))
            for img_data in img_data_list
        ]
        for task in task_list:
            task.start()
        for task in task_list:
            task.join()

    @timer
    def download_thread(self):
        # One thread per image; threads suit this I/O-bound workload well
        img_data_list = self.spider_index_img_data()
        task_list = [
            Thread(target=self.download, args=(img_data.get('img_src'), img_data.get('img_title')))
            for img_data in img_data_list
        ]
        for task in task_list:
            task.start()
        for task in task_list:
            task.join()


if __name__ == '__main__':
    spider = SpiderImg()
    # Swap in download_normal() or download_process() to compare timings
    spider.download_thread()

# list.extend() demo: extend() appends every element of another list in place
# num_list_new = []
# num_list_new.extend([1, 2, 3, 4])
# num_list_new.extend([7, 8, 9, 10])
# print(num_list_new)  # [1, 2, 3, 4, 7, 8, 9, 10]
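Starting one thread or process per image stops scaling once the page count grows. A pool caps concurrency and reuses workers; the sketch below is a hedged variant built on the standard library's concurrent.futures.ThreadPoolExecutor (the download_pool name and the worker count of 8 are illustrative assumptions, not part of the original class).

# Minimal pool-based sketch, assuming the SpiderImg class defined above.
# download_pool and max_workers=8 are assumed for illustration.
from concurrent.futures import ThreadPoolExecutor


def download_pool(spider, max_workers=8):
    img_data_list = spider.spider_index_img_data()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for img_data in img_data_list:
            pool.submit(spider.download, img_data.get('img_src'), img_data.get('img_title'))
    # Leaving the with-block waits for every submitted download to finish


# Usage: download_pool(SpiderImg())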
Coroutine Example
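The same crawler rebuilt on asyncio and aiohttp: parsing is unchanged, but every image download becomes a task scheduled concurrently on a single event loop.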
import asyncio
import os
import time

import aiohttp
from fake_useragent import UserAgent
from lxml import etree

headers = {
    'User-Agent': UserAgent().random
}
BASE_DIR = os.path.dirname(__file__)


def create_file_name(path='img'):
    file_name_path = os.path.join(BASE_DIR, path)
    os.makedirs(file_name_path, exist_ok=True)
    return file_name_path


file_name_path = create_file_name()


async def create_url_list():
    # Same pagination rule: page 1 has no suffix, later pages use index_{i}.html
    url_list = []
    for i in range(1, 10):
        if i == 1:
            url_list.append('https://pic.netbian.com/4kdongman/')
        else:
            url_list.append(f'https://pic.netbian.com/4kdongman/index_{i}.html')
    return url_list


async def get_tree(page_text):
    # lxml parsing is synchronous; the async wrapper just keeps the call style uniform
    tree = etree.HTML(page_text)
    return tree


async def get_page_text(tag_url, encoding='gbk'):
    async with aiohttp.ClientSession() as session:
        # ssl=False skips certificate verification for this site
        async with session.get(url=tag_url, headers=headers, ssl=False) as response:
            # Honor the encoding parameter instead of hard-coding 'gbk'
            page_text = await response.text(encoding=encoding)
            return page_text


async def spider_index_tree():
    tree_list = []
    url_list = await create_url_list()
    for url in url_list:
        # Index pages are fetched one after another; only the downloads run concurrently
        page_text = await get_page_text(tag_url=url)
        tree = await get_tree(page_text=page_text)
        tree_list.append(tree)
    return tree_list


async def get_tree_data(tree):
    img_data_list = []
    li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    for li in li_list:
        # alt holds the title; src is a relative path that needs the site prefix
        img_title = li.xpath('./a/img/@alt')[0]
        img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_data_list.append({'img_title': img_title, 'img_src': img_src})
    return img_data_list


async def spider_index_img_data():
    img_data_list = []
    tree_list = await spider_index_tree()
    for tree in tree_list:
        img_list = await get_tree_data(tree=tree)
        # Merge each page's records into one flat list
        img_data_list.extend(img_list)
    return img_data_list


async def download(img_src, img_title):
    async with aiohttp.ClientSession() as session:
        async with session.get(url=img_src, headers=headers, ssl=False) as response:
            # Read the whole image body before writing it to disk
            data_all = await response.read()
            file_path = os.path.join(file_name_path, f'{img_title}.png')
            with open(file_path, mode='wb') as fp:
                fp.write(data_all)
            print(f"Image :>>>> {img_title} saved successfully!")


async def main():
    img_data_list = await spider_index_img_data()
    # One task per image; all downloads are scheduled on the event loop at once
    task_list = [
        asyncio.create_task(download(img_src=img_data.get('img_src'), img_title=img_data.get('img_title')))
        for img_data in img_data_list
    ]
    await asyncio.wait(task_list)


if __name__ == '__main__':
    start_time = time.time()
    asyncio.run(main())
    print(f"Total time :>>>> {time.time() - start_time} s")
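Scheduling one task per image fires every request at the same time, which can trip the site's rate limiting. The hedged sketch below caps concurrency with asyncio.Semaphore and reuses a single ClientSession; download_limited, main_limited, and the limit of 10 are illustrative assumptions, not part of the original script.

# Bounded-concurrency sketch, assuming the helpers defined above.
# download_limited, main_limited, and limit=10 are assumed names/values.
async def download_limited(session, sem, img_src, img_title):
    async with sem:  # at most `limit` downloads in flight at once
        async with session.get(url=img_src, headers=headers, ssl=False) as response:
            data_all = await response.read()
    file_path = os.path.join(file_name_path, f'{img_title}.png')
    with open(file_path, mode='wb') as fp:
        fp.write(data_all)


async def main_limited(limit=10):
    img_data_list = await spider_index_img_data()
    sem = asyncio.Semaphore(limit)
    # One shared session avoids opening a new connection pool per download
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(
            download_limited(session, sem, d.get('img_src'), d.get('img_title'))
            for d in img_data_list
        ))


# Usage: replace asyncio.run(main()) with asyncio.run(main_limited())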