Python Crawler 3: Multiprocessing, Multithreading, and Coroutines

Multiprocessing and Multithreading

from multiprocessing import Process
import threading

def process_worker():
    for i in range(200):
        print(f"Process worker {i}")

def thread_worker():
    for i in range(200):
        print(f"Thread worker {i}")

if __name__ == '__main__':
    # Create and start a process
    p = Process(target=process_worker)
    p.start()
    # p.join() # Blocks the caller until the process finishes executing.

    # Create and start a thread
    t = threading.Thread(target=thread_worker)
    t.start()
    # t.join() # Blocks the caller until the thread finishes executing.

    for i in range(200):
        print(f"Main process {i}")

Thread Pools and Process Pools

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import time

def io_task(n):
    print(f'IO Task {n} starting')
    time.sleep(2)
    print(f'IO Task {n} finished')

def cpu_task(n):
    print(f'CPU Task {n} starting')
    result = sum(i*i for i in range(10**6))
    print(f'CPU Task {n} finished with result {result}')

if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=2) as thread_pool, ProcessPoolExecutor(max_workers=2) as process_pool:
        thread_futures = [thread_pool.submit(io_task, i) for i in range(4)]     # submit io_task four times (i = 0..3)
        process_futures = [process_pool.submit(cpu_task, i) for i in range(4)]  # submit cpu_task four times (i = 0..3)

    print("Main thread continues")

Douban Example

import requests
from lxml import etree
import re
import csv
import threading
from concurrent.futures import ThreadPoolExecutor

# Open the output file once; newline="" avoids blank lines in the CSV on Windows
csvfile = open("douban4.csv", mode="w", encoding="utf-8", newline="")
csvwriter = csv.writer(csvfile)
# The writer is shared across threads, so guard writes with a lock
write_lock = threading.Lock()


def download_one_page(url):
    # Fetch the page source with a browser-like User-Agent header
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    }
    resp = requests.get(url, headers=headers)

    html = etree.HTML(resp.text)
    # Use XPath to select every movie <li> in the Top 250 list
    lis = html.xpath('//*[@id="content"]/div/div[1]/ol/li')

    for li in lis:
        name = li.xpath("./div/div[2]/div[1]/a/span[1]/text()")[0]      # title
        year = li.xpath("./div/div[2]/div[2]/p[1]/text()[2]")[0]        # year line
        rating = li.xpath("./div/div[2]/div[2]/div/span[2]/text()")[0]  # rating
        year = re.findall(r'\b\d{4}\b', year)[0]  # extract the 4-digit year
        # Write one CSV row per movie; the lock keeps rows from different threads from interleaving
        with write_lock:
            csvwriter.writerow([name, year, rating])
    print(url, "done extracting")



if __name__ == '__main__':

    with ThreadPoolExecutor(3) as thread_pool:
        for i in range(10):
            thread_pool.submit(download_one_page, f'http://movie.douban.com/top250?start={i*25}&filter=')
    csvfile.close()  # the with-block has already waited for every task, so it is safe to close
    print("over")

Coroutines

A coroutine is a construct for concurrent programming that lets you do other work while waiting for an operation to finish. Coroutines are especially useful for I/O-bound tasks such as network requests or file reads, because they can switch to another task while one is waiting.

Example

import asyncio

async def say(what, when):      # the async keyword defines a coroutine
    await asyncio.sleep(when)   # await suspends the coroutine at a blocking call
    print(what)

async def main():
    # Run multiple coroutines concurrently
    await asyncio.gather(
        say("First", 2),
        say("Second", 1),
        say("Third", 3)
    )

if __name__ == '__main__':
    asyncio.run(main())  # run the top-level coroutine
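An equivalent sketch using asyncio.create_task, for when you want a handle to each running coroutine instead of gathering them in one call:

import asyncio

async def say(what, when):
    await asyncio.sleep(when)
    print(what)

async def main():
    # create_task schedules the coroutine on the event loop right away
    first = asyncio.create_task(say("First", 2))
    second = asyncio.create_task(say("Second", 1))
    await first   # both tasks are already running; await just waits for them
    await second

if __name__ == '__main__':
    asyncio.run(main())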

Asynchronously Downloading Images

import asyncio
import aiohttp
import os

urls = [
    "https://p0.ssl.qhimgs1.com/sdr/400__/t01c3a6a8abd00f35a4.jpg",
    "https://p2.ssl.qhimgs1.com/sdr/400__/t01f2bf1a12bb90de88.jpg",
    "https://p1.ssl.qhimgs1.com/sdr/400__/t0101cc32a15d91a3f7.jpg"
]


async def aiodownload(url):
    name = url.rsplit("/", 1)[1]  # extract the file name from the URL
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            with open(name, mode="wb") as f:
                f.write(await resp.content.read())  # reading the body is asynchronous, so it needs await
    print(name, "over")


async def main():
    tasks = [aiodownload(url) for url in urls]
    await asyncio.gather(*tasks)  # use asyncio.gather() instead of asyncio.wait()


if __name__ == '__main__':
    # Use SelectorEventLoop on Windows
    if os.name == 'nt':
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

    try:
        asyncio.run(main())  # run the coroutine
    except RuntimeError as e:
        if str(e) == "Event loop is closed":
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(main())
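Opening a new ClientSession per URL works but wastes connections; aiohttp recommends reusing one session. A minimal variant under that design (the same urls list is assumed; run with asyncio.run(main(urls))):

import asyncio
import aiohttp

async def download(session, url):
    name = url.rsplit("/", 1)[1]
    async with session.get(url) as resp:
        data = await resp.read()  # read the whole body as bytes
    with open(name, mode="wb") as f:
        f.write(data)

async def main(urls):
    # One shared session lets aiohttp pool and reuse connections
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(download(session, url) for url in urls))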

Asynchronously Scraping Journey to the West from Baidu

import requests
import asyncio
import aiohttp
import aiofiles
import os
import json

async def aiodownload(cid, b_id, title):
    # Build the request URL
    data = {
        "book_id": b_id,
        "cid": f"{b_id}|{cid}",
        "need_bookinfo": 1
    }
    data = json.dumps(data)
    url = f"https://dushu.baidu.com/api/pc/getChapterContent?data={data}"
    
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # Fetch the response JSON asynchronously
            dic = await resp.json()

            if 'data' in dic and 'novel' in dic['data'] and 'content' in dic['data']['novel']:
                content = dic['data']['novel']['content']
                # Write the chapter to a text file asynchronously
                async with aiofiles.open(os.path.join('novel', title + '.txt'), mode="w", encoding='utf-8') as f:
                    await f.write(content)
            else:
                print(f"Failed to get content for chapter {title}")

# Fetch the catalog synchronously; requests.get blocks the event loop here (an async variant is sketched below)
async def getCatalog(url, b_id):
    resp = requests.get(url)
    dic = resp.json()
    tasks = []
    # Walk the catalog and queue one download per chapter
    for item in dic['data']['novel']['items']:
        title = item['title']
        cid = item['cid']

        # Prepare the coroutine for this chapter
        tasks.append(aiodownload(cid, b_id, title))

    # Run all chapter downloads concurrently
    await asyncio.gather(*tasks)

if __name__ == '__main__':
    b_id = "4306063500"
    url = f'https://dushu.baidu.com/api/pc/getCatalog?data={{"book_id":"{b_id}"}}'

    # Create the novel output directory if it does not exist
    if not os.path.exists('novel'):
        os.makedirs('novel')

    # Use SelectorEventLoop on Windows
    if os.name == 'nt':
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    try:
        asyncio.run(getCatalog(url, b_id))
    except RuntimeError as e:
        if str(e) == "Event loop is closed":
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(getCatalog(url, b_id))
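As noted above, getCatalog fetches the catalog with a blocking requests.get inside a coroutine, which stalls the event loop for that one call. A fully asynchronous variant is a small change (a sketch; content_type=None tells aiohttp to parse the body as JSON even if the API serves it as text/plain):

import aiohttp

async def get_catalog_async(url):
    # Drop-in replacement for the blocking requests.get call in getCatalog
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.json(content_type=None)  # skip the strict Content-Type check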