Python crawler: examples using a thread pool vs. coroutines

Background: scrape information on Douban's Top 250 movies.

Using a thread pool

import re
from concurrent.futures import ThreadPoolExecutor
import requests

# Fetch the name, director, rating, and vote count of each movie in Douban's Top 250
def getDoubanRource(url):
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0"}
    res = requests.get(url, headers=header)
    # Grab the page source
    pagesource = res.text
    # Pre-compile the regular expression object
    obj = re.compile(r'<span class="title">(?P<filmname>.*?)</span>.*?<p class="">\s*(?P<director>.*?)'
                     r' .*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?<span>(?P<person>.*?)评价</span>', re.S)
    reptile_res = obj.finditer(pagesource)
    # Use a raw string for the Windows path (\d is an invalid escape) and an explicit encoding
    with open(r"D:\dir_ytj\dome1.csv", mode="a", encoding="utf-8") as f:
        for item in reptile_res:
            filmname = item.group("filmname")
            director = item.group("director")
            score = item.group("score")
            person = item.group("person")
            f.write(f"{filmname},{director},{score},{person}\n")
        print(url, "done")

if __name__ == '__main__':
    # 10 pages of 25 movies each; submit one page-scraping task per worker
    with ThreadPoolExecutor(10) as t:
        for i in range(10):
            t.submit(getDoubanRource, f"https://movie.douban.com/top250?start={25*i}&filter=")

    print("All information collected")

Using coroutines

import asyncio
import sys, io
import re
import requests

# Re-wrap stdout so Chinese output displays correctly in the Windows console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Fetch the name, director, rating, and vote count of each movie in Douban's Top 250
async def writeCsv(filmname, director, score, person):
    # Note: open()/write() are blocking calls and run on the event loop thread
    with open(r"D:\dir_ytj\dome2.csv", mode="a", encoding="utf-8") as f:
        f.write(f"{filmname},{director},{score},{person}\n")


async def getDoubanRource(url):
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0"}
    res = requests.get(url, headers=header)  # blocking request; see the aiohttp sketch below
    # Grab the page source
    pagesource = res.text
    # Pre-compile the regular expression object
    obj = re.compile(r'<span class="title">(?P<filmname>.*?)</span>.*?<p class="">\s*(?P<director>.*?)'
                     r' .*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?<span>(?P<person>.*?)评价</span>', re.S)
    reptile_res = obj.finditer(pagesource)
    tasks = []
    for item in reptile_res:
        filmname = item.group("filmname")
        director = item.group("director")
        score = item.group("score")
        person = item.group("person")
        tasks.append(asyncio.create_task(writeCsv(filmname, director, score, person)))

    # asyncio.wait() no longer accepts bare coroutines (removed in Python 3.11),
    # so each coroutine is wrapped in a Task above (asyncio.gather(*coros) also works)
    await asyncio.wait(tasks)


if __name__ == '__main__':
    print("Collecting page data......")
    for i in range(10):
        asyncio.run(getDoubanRource(f"https://movie.douban.com/top250?start={25*i}&filter="))
    print("Collection complete")

Related links:

A quick introduction to the Scrapy crawler framework

posted @ 2022-02-19 18:08  yetangjian