Python Multithreading, Thread Pool, and Coroutine Crawlers
Multithreaded producer-consumer crawler
Crawler threads fetch pages and push the HTML onto a queue; parser threads consume that queue and write the extracted (href, title) tuples to a file.
import queue
import requests
from bs4 import BeautifulSoup
import threading
import time
import random

def craw(url):
    r = requests.get(url=url)
    return r.text

def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    # article title links on the cnblogs front page
    links = soup.find_all("a", class_="post-item-title")
    return [(link["href"], link.get_text()) for link in links]

def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    while True:
        url = url_queue.get()
        html = craw(url)
        html_queue.put(html)
        print(threading.current_thread().name, url)
        time.sleep(random.randint(1, 2))

def do_parse(html_queue: queue.Queue, f_out):
    while True:
        html = html_queue.get()
        results = parse(html)
        for result in results:
            f_out.write(str(result) + "\n")
        print(threading.current_thread().name, html_queue.qsize())
        time.sleep(1)

if __name__ == '__main__':
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    for url in ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 25)]:
        url_queue.put(url)
    # producer threads: fetch pages and push the HTML onto html_queue
    for idx in range(3):
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw-{idx}")
        t.start()
    # consumer threads: parse the HTML and write results to a file
    file = open("02.data.txt", "w")
    for idx in range(2):
        d = threading.Thread(target=do_parse, args=(html_queue, file), name=f"parse-{idx}")
        d.start()
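One caveat with this version: the worker threads loop forever, so the script never exits on its own and the output file is never closed explicitly. Below is a minimal shutdown sketch, not part of the original, which assumes the same craw() and parse() functions are defined in the same file; it marks the workers as daemon threads and pairs Queue.task_done() with Queue.join() so the main thread can wait for both queues to drain and then exit.

import queue
import threading

def fetch_worker(url_q: queue.Queue, html_q: queue.Queue):
    while True:
        url = url_q.get()
        try:
            html_q.put(craw(url))       # craw() as defined in the example above
        finally:
            url_q.task_done()           # pairs with url_q.join() in the main thread

def parse_worker(html_q: queue.Queue, f_out):
    while True:
        html = html_q.get()
        try:
            for result in parse(html):  # parse() as defined in the example above
                f_out.write(str(result) + "\n")
        finally:
            html_q.task_done()

if __name__ == "__main__":
    url_queue, html_queue = queue.Queue(), queue.Queue()
    for i in range(1, 25):
        url_queue.put("https://www.cnblogs.com/#p{}".format(i))
    for idx in range(3):
        threading.Thread(target=fetch_worker, args=(url_queue, html_queue),
                         daemon=True, name=f"craw-{idx}").start()
    with open("02.data.txt", "w") as f_out:
        for idx in range(2):
            threading.Thread(target=parse_worker, args=(html_queue, f_out),
                             daemon=True, name=f"parse-{idx}").start()
        url_queue.join()    # every URL fetched and its HTML queued
        html_queue.join()   # every page parsed; daemon threads end with the process

Another common option is to push one sentinel value (for example None) per worker after all URLs are queued and have each worker exit when it sees the sentinel.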
Thread pool crawler
The same craw/parse functions, driven by concurrent.futures.ThreadPoolExecutor: pool.map() for the fetch phase, and pool.submit() plus as_completed() for the parse phase.
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup

spider_url = ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 25)]

def craw(url):
    r = requests.get(url=url)
    return r.text

def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all("a", class_="post-item-title")
    return [(link["href"], link.get_text()) for link in links]

# craw: pool.map keeps the results in the same order as spider_url
with ThreadPoolExecutor() as pool:
    htmls = pool.map(craw, spider_url)
    htmls = list(zip(spider_url, htmls))
    for k, v in htmls:
        print(k, len(v))

# parse: pool.submit returns a Future; as_completed yields them as they finish
with ThreadPoolExecutor() as pool:
    futures = {}
    for url, html in htmls:
        future = pool.submit(parse, html)
        futures[future] = url
    # for k, v in futures.items():
    #     print(v, k.result())
    for future in as_completed(futures):
        print(futures[future], future.result())
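For I/O-bound crawling, the pool's benefit is easiest to see with a side-by-side timing. The rough sketch below is not from the original post: timed() is a hypothetical helper, it assumes the craw() function and spider_url list above are in scope, and the actual speedup depends on the network and on max_workers.

import time
from concurrent.futures import ThreadPoolExecutor

def timed(label, fn):
    # hypothetical helper: run fn() once and report the wall-clock time
    start = time.time()
    fn()
    print(label, "took", round(time.time() - start, 2), "seconds")

def pool_fetch():
    # fetch all pages with 10 worker threads
    with ThreadPoolExecutor(max_workers=10) as pool:
        return list(pool.map(craw, spider_url))

timed("single thread", lambda: [craw(url) for url in spider_url])
timed("thread pool (10 workers)", pool_fetch)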
Coroutine crawler
A single-threaded asyncio + aiohttp version; an asyncio.Semaphore limits the crawler to 10 requests in flight at a time.
import asyncio
import aiohttp

spider_url = ["https://www.cnblogs.com/taozhengquan/p/14966535.html"] * 50

# the semaphore caps the number of concurrent requests
semaphore = asyncio.Semaphore(10)

async def async_craw(url):
    async with semaphore:
        print("craw url:", url)
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                result = await resp.text()
                print(url, len(result))

loop = asyncio.get_event_loop()
tasks = [
    loop.create_task(async_craw(item)) for item in spider_url
]
loop.run_until_complete(asyncio.wait(tasks))
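asyncio.get_event_loop() outside a running loop is deprecated on newer Python versions (3.10+). A minimal alternative sketch, assuming the same URL list and a 10-way semaphore, drives the crawl with asyncio.run() and shares one aiohttp.ClientSession instead of opening a new session per request.

import asyncio
import aiohttp

spider_url = ["https://www.cnblogs.com/taozhengquan/p/14966535.html"] * 50

async def fetch(session: aiohttp.ClientSession, sem: asyncio.Semaphore, url: str):
    async with sem:                      # at most 10 requests in flight
        async with session.get(url) as resp:
            text = await resp.text()
            print(url, len(text))

async def main():
    sem = asyncio.Semaphore(10)
    # one shared session reuses connections across all requests
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(fetch(session, sem, url) for url in spider_url))

asyncio.run(main())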