Implementing a distributed crawler with the Sanic framework
bee_server.py
from sanic import Sanic
from sanic import response

from urlpool import UrlPool

# Initialize the url pool; adjust the configuration later as needed
urlpool = UrlPool(__file__)

# Seed the pool with a starting URL
urlpool.add('https://news.sina.com.cn/')

app = Sanic(__name__)


@app.listener('after_server_stop')
async def cache_urlpool(app, loop):
    # Let the pool clean up its state before the process exits
    global urlpool
    print('caching urlpool after_server_stop')
    del urlpool
    print('bye!')


@app.route('/task')
async def task_get(request):
    # Hand out up to `count` URLs to a requesting crawler client
    count = request.args.get('count', 10)
    try:
        count = int(count)
    except (TypeError, ValueError):
        count = 10
    urls = urlpool.pop(count)
    return response.json(urls)


@app.route('/task', methods=['POST', ])
async def task_post(request):
    # Receive a crawl result: record URL status and absorb newly found URLs
    result = request.json
    urlpool.set_status(result['url'], result['status'])
    if result['url_real'] != result['url']:
        # The request was redirected; record the final URL as well
        urlpool.set_status(result['url_real'], result['status'])
    if result['newurls']:
        print('receive URLs:', len(result['newurls']))
        for url in result['newurls']:
            urlpool.add(url)
    return response.text('ok')


if __name__ == '__main__':
    app.run(
        host='127.0.0.1',
        port=8080,
        debug=False,
        access_log=False,
        workers=1,
    )
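bee_server.py imports UrlPool from a urlpool module that the post does not show. The server only relies on three methods: add(), pop() and set_status(). Below is a minimal in-memory sketch of that interface for readers who want to run the server; the attribute names, the {url: metadata} shape returned by pop(), and the status handling are assumptions for illustration, not the real implementation (which would normally persist state and handle retry limits).

urlpool.py (hypothetical minimal stand-in)

import time


class UrlPool:

    def __init__(self, name):
        self.name = name
        self.waiting = set()   # URLs not yet handed out
        self.pending = {}      # URLs handed out, awaiting a status report
        self.done = set()      # URLs already crawled successfully

    def add(self, url):
        # Skip URLs we have already seen
        if url in self.done or url in self.pending:
            return
        self.waiting.add(url)

    def pop(self, count):
        # Hand out up to `count` URLs as a {url: metadata} mapping;
        # the value slot (0 here) is where a real pool keeps per-URL info
        urls = {}
        while self.waiting and len(urls) < count:
            url = self.waiting.pop()
            self.pending[url] = time.time()
            urls[url] = 0
        return urls

    def set_status(self, url, status):
        # A 200 marks the URL as done; anything else goes back to waiting
        self.pending.pop(url, None)
        if status == 200:
            self.done.add(url)
        else:
            self.waiting.add(url)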
bee_client.py
import aiohttp
import json
import asyncio
import traceback
import time


class CrawlerClient:

    def __init__(self):
        self._workers = 0
        self.workers_max = 10
        self.server_host = 'localhost'
        self.server_port = 8080
        self.loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.queue = asyncio.Queue()

    async def get_urls(self):
        # Only ask the server for as many URLs as we have spare capacity
        count = self.workers_max - self.queue.qsize()
        if count <= 0:
            print('no need to get urls this time')
            return
        url = 'http://%s:%s/task?count=%s' % (
            self.server_host, self.server_port, count)
        try:
            async with self.session.get(url, timeout=3) as response:
                if response.status not in [200, 201]:
                    return
                jsn = await response.text()
                urls = json.loads(jsn)
                msg = 'get_urls() to get [%s] but got [%s], @%s' % (
                    count, len(urls), time.strftime('%Y-%m-%d %H:%M:%S'))
                print(msg)
                for lv in urls.items():
                    await self.queue.put(lv)
                print()
        except Exception:
            traceback.print_exc()
        return

    async def send_result(self, result):
        '''
        result = {
            'url': url,
            'url_real': response.url,
            'status': status,
            'newurls': newurls,
        }
        '''
        url = 'http://%s:%s/task' % (self.server_host, self.server_port)
        try:
            async with self.session.post(url, json=result, timeout=3) as response:
                # We only care whether the POST succeeded; the body is ignored
                response.status
        except Exception:
            traceback.print_exc()
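CrawlerClient only fetches tasks and reports results; the downloading loop itself is not part of the post. The sketch below shows one plausible way to wire the pieces together, appended to bee_client.py. crawl_one() and run() are hypothetical helpers, and the link-extraction step is left as a placeholder.

# Appended to bee_client.py; reuses the imports and CrawlerClient above

async def crawl_one(client, url):
    # Download one page and report the outcome back to bee_server
    status, url_real, newurls = 599, url, []
    try:
        async with client.session.get(url, timeout=10) as response:
            status = response.status
            url_real = str(response.url)
            html = await response.text(errors='ignore')
            # Link extraction omitted: parse `html` and fill newurls here
    except Exception:
        traceback.print_exc()
    result = {
        'url': url,
        'url_real': url_real,
        'status': status,
        'newurls': newurls,
    }
    await client.send_result(result)


async def run():
    client = CrawlerClient()
    while True:
        await client.get_urls()
        while not client.queue.empty():
            url, _ = await client.queue.get()
            await crawl_one(client, url)
        await asyncio.sleep(1)


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(run())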