Python crawler: an example with a thread pool and one with coroutines
Background: scraping information (title, director, rating, number of raters) from Douban Movie Top 250.
Using a thread pool
import re
from concurrent.futures import ThreadPoolExecutor
import requests

# Scrape the title, director, rating, and number of raters for Douban Movie Top 250
def getDoubanRource(url):
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0"}
    res = requests.get(url, headers=header)
    # Get the page source
    pagesource = res.text
    # Precompile the regular expression object
    # (the director field on the page is terminated by a literal &nbsp; entity)
    obj = re.compile(r'<span class="title">(?P<filmname>.*?)</span>.*?<p class="">\s*(?P<director>.*?)'
                     r'&nbsp;.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
                     r'.*?<span>(?P<person>.*?)评价</span>', re.S)
    reptile_res = obj.finditer(pagesource)
    # Explicit encoding so Chinese titles are written consistently on Windows
    with open(r"D:\dir_ytj\dome1.csv", mode="a", encoding="utf-8") as f:
        for item in reptile_res:
            filmname = item.group("filmname")
            director = item.group("director")
            score = item.group("score")
            person = item.group("person")
            f.write(f"{filmname},{director},{score},{person}\n")
    print(url, "collected")

if __name__ == '__main__':
    # Ten pages of 25 films each; the with block waits for all submitted tasks
    with ThreadPoolExecutor(10) as t:
        for i in range(10):
            t.submit(getDoubanRource, f"https://movie.douban.com/top250?start={25*i}&filter=")
    print("All information recorded")
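One caveat with this pattern: an exception raised inside a worker (a timeout, a parse failure) is captured in the Future returned by submit and silently discarded if the Future is never inspected. A minimal sketch of an alternative main block, assuming the getDoubanRource function defined above; as_completed comes from the same standard-library module:

from concurrent.futures import ThreadPoolExecutor, as_completed

if __name__ == '__main__':
    urls = [f"https://movie.douban.com/top250?start={25*i}&filter=" for i in range(10)]
    with ThreadPoolExecutor(10) as t:
        # Keep the Future for each URL so failures can be traced back to a page
        futures = {t.submit(getDoubanRource, url): url for url in urls}
        for fut in as_completed(futures):
            try:
                fut.result()  # re-raises any exception from the worker thread
            except Exception as e:
                print(futures[fut], "failed:", e)
    print("All information recorded")

The with block still waits for every task before the final print runs; the only change is that errors now surface instead of vanishing.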
Using coroutines
import asyncio
import sys, io
import re
import requests

# Re-wrap stdout so Chinese output displays correctly in a Windows console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Scrape the title, director, rating, and number of raters for Douban Movie Top 250
async def writeCsv(filmname, director, score, person):
    # Ordinary blocking file I/O wrapped in a coroutine; see the note below
    with open(r"D:\dir_ytj\dome2.csv", mode="a", encoding="utf-8") as f:
        f.write(f"{filmname},{director},{score},{person}\n")

async def getDoubanRource(url):
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0"}
    res = requests.get(url, headers=header)  # requests is synchronous and blocks the event loop
    # Get the page source
    pagesource = res.text
    # Precompile the regular expression object
    obj = re.compile(r'<span class="title">(?P<filmname>.*?)</span>.*?<p class="">\s*(?P<director>.*?)'
                     r'&nbsp;.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
                     r'.*?<span>(?P<person>.*?)评价</span>', re.S)
    reptile_res = obj.finditer(pagesource)
    tasks = []
    for item in reptile_res:
        filmname = item.group("filmname")
        director = item.group("director")
        score = item.group("score")
        person = item.group("person")
        # Wrap each coroutine in a Task: since Python 3.11, asyncio.wait
        # no longer accepts bare coroutine objects
        tasks.append(asyncio.create_task(writeCsv(filmname, director, score, person)))
    if tasks:
        await asyncio.wait(tasks)

if __name__ == '__main__':
    print("Collecting page information......")
    for i in range(10):
        asyncio.run(getDoubanRource(f"https://movie.douban.com/top250?start={25*i}&filter="))
    print("Collection finished")
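Note that this version gains no real concurrency: requests.get is a blocking call, so the event loop stalls during each download, and the ten pages are fetched one after another because each asyncio.run call finishes before the next starts. A minimal sketch of a genuinely asynchronous variant, assuming the third-party aiohttp package is installed (pip install aiohttp); the output filename dome3.csv is hypothetical:

import asyncio
import re
import aiohttp

HEADER = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0"}
# Same pattern as in the examples above
OBJ = re.compile(r'<span class="title">(?P<filmname>.*?)</span>.*?<p class="">\s*(?P<director>.*?)'
                 r'&nbsp;.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
                 r'.*?<span>(?P<person>.*?)评价</span>', re.S)

async def fetch_page(session, url):
    # Non-blocking GET: while this download waits, the loop runs the other fetches
    async with session.get(url, headers=HEADER) as resp:
        return await resp.text()

async def main():
    async with aiohttp.ClientSession() as session:
        # Launch all ten page downloads concurrently and wait for them together
        pages = await asyncio.gather(*(
            fetch_page(session, f"https://movie.douban.com/top250?start={25*i}&filter=")
            for i in range(10)))
    with open("dome3.csv", mode="a", encoding="utf-8") as f:  # hypothetical filename
        for pagesource in pages:
            for item in OBJ.finditer(pagesource):
                f.write(",".join(item.group("filmname", "director", "score", "person")) + "\n")

if __name__ == '__main__':
    asyncio.run(main())

With aiohttp the ten downloads overlap in flight, which is where coroutines actually pay off for I/O-bound scraping.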
Author: yetangjian
Source: https://www.cnblogs.com/yetangjian/p/15913293.html
Copyright of this article belongs to the author and cnblogs. Reposting is welcome, but this statement must be retained and a prominent link to the original article provided. For questions, email yetangjian@outlook.com.