Python Crawler 3 - Multiprocessing, Multithreading, and Coroutines
Multiprocessing and Multithreading
from multiprocessing import Process
import threading

def process_worker():
    for i in range(200):
        print(f"Process worker {i}")

def thread_worker():
    for i in range(200):
        print(f"Thread worker {i}")

if __name__ == '__main__':
    # Create and start a process
    p = Process(target=process_worker)
    p.start()
    # p.join()  # Blocks the main thread (or process) until the started process finishes.

    # Create and start a thread
    t = threading.Thread(target=thread_worker)
    t.start()
    # t.join()  # Blocks the main thread (or process) until the started thread finishes.

    for i in range(200):
        print(f"Main process {i}")
Thread Pools and Process Pools
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import time

def io_task(n):
    print(f'IO Task {n} starting')
    time.sleep(2)
    print(f'IO Task {n} finished')

def cpu_task(n):
    print(f'CPU Task {n} starting')
    result = sum(i*i for i in range(10**6))
    print(f'CPU Task {n} finished with result {result}')

if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=2) as thread_pool, ProcessPoolExecutor(max_workers=2) as process_pool:
        # Submit io_task four times (i = 0 to 3), passing the loop index i as the argument.
        thread_futures = [thread_pool.submit(io_task, i) for i in range(4)]
        # Submit cpu_task four times (i = 0 to 3), passing the loop index i as the argument.
        process_futures = [process_pool.submit(cpu_task, i) for i in range(4)]
    print("Main thread continues")
Douban Example
import requests
from lxml import etree
import re
import csv
from concurrent.futures import ThreadPoolExecutor

csvfile = open("douban4.csv", mode="w", encoding="utf-8", newline="")  # newline="" avoids blank rows on Windows
csvwriter = csv.writer(csvfile)

def download_one_page(url):
    # Fetch the page source with request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    }
    resp = requests.get(url, headers=headers)
    html = etree.HTML(resp.text)
    # Select every movie entry in the list with XPath
    lis = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
    for li in lis:
        name = li.xpath("./div/div[2]/div[1]/a/span[1]/text()")[0]      # title
        year = li.xpath("./div/div[2]/div[2]/p[1]/text()[2]")[0]        # year
        rating = li.xpath("./div/div[2]/div[2]/div/span[2]/text()")[0]  # rating
        year = re.findall(r'\b\d{4}\b', year)[0]  # extract the four-digit year
        # Write one row to the CSV file
        csvwriter.writerow([name, year, rating])
    print(url, "extraction finished")

if __name__ == '__main__':
    with ThreadPoolExecutor(3) as thread_pool:
        for i in range(10):
            thread_pool.submit(download_one_page, f'http://movie.douban.com/top250?start={i*25}&filter=')
    csvfile.close()
    print("over")
Coroutines
A coroutine is a construct for concurrent programming that lets you keep doing other work while waiting for some operation to finish. Coroutines are especially useful for I/O-bound tasks such as network requests or file reads, because they can switch to another task while one is waiting to complete.
Example
import asyncio

async def say(what, when):  # the async keyword defines a coroutine
    await asyncio.sleep(when)  # the await keyword suspends the coroutine at a blocking call
    print(what)

async def main():
    # Run several coroutines concurrently
    await asyncio.gather(
        say("First", 2),
        say("Second", 1),
        say("Third", 3)
    )

if __name__ == '__main__':
    asyncio.run(main())  # run the coroutine
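gather() is not the only way to run coroutines concurrently. asyncio.create_task wraps a coroutine in a Task that starts running right away, which is useful when you want to start work early and await it later. A small sketch reusing the say coroutine from above:
import asyncio

async def say(what, when):
    await asyncio.sleep(when)
    print(what)

async def main():
    # The tasks start running as soon as they are created...
    first = asyncio.create_task(say("First", 2))
    second = asyncio.create_task(say("Second", 1))
    # ...and we only block here, waiting for both to finish.
    await first
    await second

if __name__ == '__main__':
    asyncio.run(main())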
Downloading Images Asynchronously
import asyncio
import aiohttp
import os

urls = [
    "https://p0.ssl.qhimgs1.com/sdr/400__/t01c3a6a8abd00f35a4.jpg",
    "https://p2.ssl.qhimgs1.com/sdr/400__/t01f2bf1a12bb90de88.jpg",
    "https://p1.ssl.qhimgs1.com/sdr/400__/t0101cc32a15d91a3f7.jpg"
]

async def aiodownload(url):
    name = url.rsplit("/", 1)[1]  # extract the file name from the URL
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            with open(name, mode="wb") as f:
                f.write(await resp.content.read())  # reading the body is asynchronous, so it needs await
    print(name, "over")

async def main():
    tasks = [aiodownload(url) for url in urls]
    await asyncio.gather(*tasks)  # use asyncio.gather() instead of asyncio.wait()

if __name__ == '__main__':
    # Use the SelectorEventLoop on Windows
    if os.name == 'nt':
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    try:
        asyncio.run(main())  # run the coroutine
    except RuntimeError as e:
        if str(e) == "Event loop is closed":
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(main())
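The downloader above opens a new ClientSession for every image. aiohttp recommends sharing one session across requests so connections can be reused; a minimal sketch of that variant (the download and main names are my own, and the urls list from above is assumed):
import asyncio
import aiohttp

async def download(session, url):
    name = url.rsplit("/", 1)[1]
    async with session.get(url) as resp:
        data = await resp.read()  # shortcut for resp.content.read()
    with open(name, mode="wb") as f:
        f.write(data)
    print(name, "over")

async def main(urls):
    # One session shared by every download keeps the connection pool warm.
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(download(session, u) for u in urls))
It would be started with asyncio.run(main(urls)), using the urls list defined above.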
Asynchronously Scraping Journey to the West from Baidu
import requests
import asyncio
import aiohttp
import aiofiles
import os
import json

async def aiodownload(cid, b_id, title):
    # Build the request URL
    data = {
        "book_id": b_id,
        "cid": f"{b_id}|{cid}",
        "need_bookinfo": 1
    }
    data = json.dumps(data)
    url = f"https://dushu.baidu.com/api/pc/getChapterContent?data={data}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # Fetch the response asynchronously
            dic = await resp.json()
            if 'data' in dic and 'novel' in dic['data'] and 'content' in dic['data']['novel']:
                content = dic['data']['novel']['content']
                # Write the chapter to a file asynchronously
                async with aiofiles.open(os.path.join('novel', title + '.txt'), mode="w", encoding='utf-8') as f:
                    await f.write(content)
            else:
                print(f"Failed to get content for chapter {title}")

# Fetch the catalog synchronously, then run the chapter downloads as coroutines
async def getCatalog(url, b_id):
    resp = requests.get(url)
    dic = resp.json()
    tasks = []
    # Walk the catalog and queue one coroutine per chapter
    for item in dic['data']['novel']['items']:
        title = item['title']
        cid = item['cid']
        # Prepare the async task
        tasks.append(aiodownload(cid, b_id, title))
    # Run all downloads concurrently
    await asyncio.gather(*tasks)

if __name__ == '__main__':
    b_id = "4306063500"
    url = f'https://dushu.baidu.com/api/pc/getCatalog?data={{"book_id":"{b_id}"}}'
    # Create the novel directory
    if not os.path.exists('novel'):
        os.makedirs('novel')
    # Use the SelectorEventLoop on Windows
    if os.name == 'nt':
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    try:
        asyncio.run(getCatalog(url, b_id))
    except RuntimeError as e:
        if str(e) == "Event loop is closed":
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(getCatalog(url, b_id))
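Every chapter above also opens its own ClientSession, and all chapters are requested at once; for a long catalog that can mean hundreds of simultaneous requests. A common way to cap concurrency is asyncio.Semaphore, sketched here with a limit of 10 (the fetch_chapter and fetch_all names are made up for illustration):
import asyncio
import aiohttp

async def fetch_chapter(session, sem, url):
    # The semaphore admits at most 10 coroutines past this point at a time.
    async with sem:
        async with session.get(url) as resp:
            return await resp.json()

async def fetch_all(urls):
    sem = asyncio.Semaphore(10)  # at most 10 requests in flight
    # One shared session for all chapter downloads
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_chapter(session, sem, u) for u in urls))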