Web Scraper Examples

Multiprocessing and multithreading scraper example

import os.path
import time
from multiprocessing import Process
from threading import Thread

import requests
from lxml import etree
from fake_useragent import UserAgent


class BaseSpider(object):
    def __init__(self):
        self.url_list = self.create_url_list()
        # self.url_list = ['https://pic.netbian.com/4kdongman/']
        self.headers = {
            'User-Agent': UserAgent().random
        }
        self.BASE_DIR = os.path.dirname(__file__)
        self.file_name_path = self.create_file_name()

    # Build the list of listing-page URLs; page 1 has no index suffix
    def create_url_list(self):
        url_list = []
        for i in range(1, 10):
            if i == 1:
                index_url = 'https://pic.netbian.com/4kdongman/'
            else:
                index_url = f'https://pic.netbian.com/4kdongman/index_{i}.html'
            url_list.append(index_url)
        return url_list

    def get_tree(self, page_text):
        tree = etree.HTML(page_text)
        return tree

    def get_page_text(self, url, encoding='gbk'):
        # pic.netbian.com serves GBK-encoded pages
        response = requests.get(url, headers=self.headers)
        response.encoding = encoding
        return response.text

    def create_file_name(self, path='img'):
        file_name_path = os.path.join(self.BASE_DIR, path)
        os.makedirs(file_name_path, exist_ok=True)
        return file_name_path
class SpiderImg(BaseSpider):
    def __init__(self):
        super().__init__()

    # Timing decorator. Note: applying a staticmethod as a decorator inside
    # the class body ("@timer" below) requires Python 3.10+, where staticmethod
    # objects became directly callable.
    @staticmethod
    def timer(func):
        def inner(*args, **kwargs):
            start_time = time.time()
            res = func(*args, **kwargs)
            print(f"{func.__name__} | total time :>>>> {time.time() - start_time} s")
            return res
        return inner
    def spider_index_tree(self):
        tree_list = []
        for url in self.url_list:
            # Fetch each listing page and parse it into an element tree
            page_text = self.get_page_text(url=url)
            tree = self.get_tree(page_text=page_text)
            tree_list.append(tree)
        return tree_list

    def __get_tree_data(self, tree):
        img_data_list = []
        li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_list:
            # Each <li> holds an <a><img> carrying the title and thumbnail path
            img_title = li.xpath('./a/img/@alt')[0]
            img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
            img_data_list.append({'img_title': img_title, 'img_src': img_src})
        return img_data_list

    def spider_index_img_data(self):
        img_data_list = []
        tree_list = self.spider_index_tree()
        for tree in tree_list:
            img_list = self.__get_tree_data(tree=tree)
            # img_list is a list of dicts; extend() flattens them into one list
            img_data_list.extend(img_list)
        return img_data_list
    def download(self, img_src, img_title):
        response = requests.get(url=img_src)
        file_path = os.path.join(self.file_name_path, f'{img_title}.png')
        with open(file_path, mode='wb') as fp:
            # Stream in chunks; without chunk_size, iter_content() yields single bytes
            for data in response.iter_content(chunk_size=1024):
                fp.write(data)
        print(f"Image :>>>> {img_title} saved!")

    @timer
    def download_normal(self):
        img_data_list = self.spider_index_img_data()
        for img_data in img_data_list:
            img_title = img_data.get('img_title')
            img_src = img_data.get('img_src')
            self.download(img_src=img_src, img_title=img_title)

    @timer
    def download_process(self):
        img_data_list = self.spider_index_img_data()
        task_list = [
            Process(target=self.download, args=(img_data.get('img_src'), img_data.get('img_title')))
            for img_data in img_data_list
        ]
        for task in task_list:
            task.start()
        for task in task_list:
            task.join()

    @timer
    def download_thread(self):
        img_data_list = self.spider_index_img_data()
        task_list = [
            Thread(target=self.download, args=(img_data.get('img_src'), img_data.get('img_title')))
            for img_data in img_data_list
        ]
        for task in task_list:
            task.start()
        for task in task_list:
            task.join()
if __name__ == '__main__':
    spider = SpiderImg()
    # spider.download_normal()   # download_normal | total time :>>>> 31.3393292427063 s
    # spider.download_process()  # download_process | total time :>>>> 34.51722550392151 s
    spider.download_thread()     # download_thread | total time :>>>> 15.272460699081421 s
# A quick reminder of the list.extend() behaviour that spider_index_img_data() relies on:
# num_list_new = []
# num_list_new.extend([1, 2, 3, 4])
# num_list_new.extend([7, 8, 9, 10])
# print(num_list_new)  # [1, 2, 3, 4, 7, 8, 9, 10]
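
The recorded timings above fit an I/O-bound workload: threads overlap the network waits cheaply, while spawning a whole process per image costs more than it saves, so download_process even loses to the sequential loop. One thread per image also stops scaling as the image count grows. Here is a minimal sketch of a bounded variant using the standard library's ThreadPoolExecutor; the download_pool function and the max_workers value are illustrative names introduced here, not part of the original post:

from concurrent.futures import ThreadPoolExecutor


def download_pool(spider, max_workers=8):
    # Hypothetical variant: reuse a fixed pool of worker threads
    # instead of spawning one thread per image.
    img_data_list = spider.spider_index_img_data()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for img_data in img_data_list:
            pool.submit(spider.download,
                        img_src=img_data.get('img_src'),
                        img_title=img_data.get('img_title'))
    # Leaving the with-block waits for every queued download to finish.


# Usage, given the SpiderImg class defined above:
# spider = SpiderImg()
# download_pool(spider, max_workers=8)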

Coroutine (asyncio) example

import asyncio
import os
import time

import aiohttp
from fake_useragent import UserAgent
from lxml import etree

headers = {
    'User-Agent': UserAgent().random
}
BASE_DIR = os.path.dirname(__file__)


def create_file_name(path='img'):
    file_name_path = os.path.join(BASE_DIR, path)
    os.makedirs(file_name_path, exist_ok=True)
    return file_name_path


file_name_path = create_file_name()


async def create_url_list():
    url_list = []
    for i in range(1, 10):
        if i == 1:
            index_url = 'https://pic.netbian.com/4kdongman/'
        else:
            index_url = f'https://pic.netbian.com/4kdongman/index_{i}.html'
        url_list.append(index_url)
    return url_list


async def get_tree(page_text):
    tree = etree.HTML(page_text)
    return tree


async def get_page_text(tag_url, encoding='gbk'):
    async with aiohttp.ClientSession() as session:
        # ssl=False is a common workaround for "ssl error"-style failures
        async with session.get(url=tag_url, headers=headers, ssl=False) as response:
            page_text = await response.text(encoding=encoding)
    return page_text
async def spider_index_tree():
    tree_list = []
    url_list = await create_url_list()
    # url_list = ['https://pic.netbian.com/4kdongman/']
    for url in url_list:
        # Fetch each listing page and parse it into an element tree
        page_text = await get_page_text(tag_url=url)
        tree = await get_tree(page_text=page_text)
        tree_list.append(tree)
    return tree_list


async def get_tree_data(tree):
    img_data_list = []
    li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    for li in li_list:
        # Each <li> holds an <a><img> carrying the title and thumbnail path
        img_title = li.xpath('./a/img/@alt')[0]
        img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_data_list.append({'img_title': img_title, 'img_src': img_src})
    return img_data_list
async def spider_index_img_data():
    img_data_list = []
    tree_list = await spider_index_tree()
    for tree in tree_list:
        img_list = await get_tree_data(tree=tree)
        # img_list is a list of dicts; extend() flattens them into one list
        img_data_list.extend(img_list)
    return img_data_list


async def download(img_src, img_title):
    async with aiohttp.ClientSession() as session:
        async with session.get(url=img_src, headers=headers, ssl=False) as response:
            data_all = await response.read()
    file_path = os.path.join(file_name_path, f'{img_title}.png')
    with open(file_path, mode='wb') as fp:
        fp.write(data_all)
    print(f"Image :>>>> {img_title} saved!")
async def main():
    img_data_list = await spider_index_img_data()
    # Create a Task per image so the downloads run concurrently
    task_list = [
        asyncio.create_task(download(img_src=img_data.get('img_src'), img_title=img_data.get('img_title')))
        for img_data in img_data_list
    ]
    # Wait for every task to finish
    await asyncio.wait(task_list)


if __name__ == '__main__':
    start_time = time.time()
    # Start the event loop
    asyncio.run(main())
    print(f"Total time :>>>> {time.time() - start_time} s")
    # Total time :>>>> 6.5860209465026855 s
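
One caveat with main() above: it creates a task, and therefore an open socket, for every image at once. If that trips server rate limits, a semaphore can bound the concurrency. A minimal sketch building on the download() and spider_index_img_data() coroutines above; download_limited, main_limited, and max_concurrency are names introduced here for illustration, not from the original post:

async def download_limited(semaphore, img_src, img_title):
    # At most max_concurrency downloads hold the semaphore at any moment
    async with semaphore:
        await download(img_src=img_src, img_title=img_title)


async def main_limited(max_concurrency=8):
    # Create the semaphore inside the running loop to stay compatible
    # with Python versions before 3.10
    semaphore = asyncio.Semaphore(max_concurrency)
    img_data_list = await spider_index_img_data()
    # gather() also propagates exceptions, unlike the bare asyncio.wait() above
    await asyncio.gather(*[
        download_limited(semaphore, img_data.get('img_src'), img_data.get('img_title'))
        for img_data in img_data_list
    ])


# asyncio.run(main_limited(max_concurrency=8))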