Implementing a distributed crawler with the Sanic framework
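bee_server.py maintains a shared URL pool and exposes it over HTTP: crawler clients GET /task to receive a batch of URLs to crawl, and POST /task to report back statuses and newly discovered links, so any number of bee_client.py workers can share one central pool.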

bee_server.py

from sanic import Sanic
from sanic import response
from urlpool import UrlPool

# Initialize the urlpool; this part may need to be modified later
urlpool = UrlPool(__file__)

# Seed the pool with a starting URL
urlpool.add('https://news.sina.com.cn/')
app = Sanic(__name__)

@app.listener("after_server_stop")
async def cache_urlpool(app, loop):
    # Release (and, if UrlPool supports it, persist) the pool on shutdown
    global urlpool
    print("caching urlpool after_server_stop")
    del urlpool
    print("bye!")

@app.route("/task")
async def task_get(request):
    count = request.args.get("count",10)
    try:
        count = int(count)
    except:
        count = 10
    urls = urlpool.pop(count)
    return response.json(urls)

@app.route("/task",methods=["POST",])
async def task_post(requrst):
    result = requrst.json()
    urlpool.set_status(result['url'],result['status'])
    if result['url_real'] != result['url']:
        urlpool.set_status(result["url_real"],result["status"])
    if result["newurls"]:
        print("receive URLs:",len(result["newurls"]))
        for url in result["newurls"]:
            urlpool.add(url)
    return response.text("ok")

if __name__ == '__main__':
    app.run(
        host='127.0.0.1',
        port=8080,
        debug=False,
        access_log=False,
        workers=1
    )
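The server imports UrlPool from urlpool.py, which is not shown in this post. Below is a minimal in-memory sketch of the interface bee_server.py relies on; only the add(), pop() and set_status() methods are taken from the code above, while the fields, de-duplication and the omitted persistence logic are hypothetical stand-ins.

# urlpool.py -- a minimal, in-memory sketch (hypothetical; a real UrlPool
# would typically add retry limits and caching to disk)
import time
from urllib.parse import urlparse

class UrlPool:
    def __init__(self, pool_name):
        self.name = pool_name
        self.waiting = {}   # url -> host, not yet handed out
        self.pending = {}   # url -> timestamp, handed out but no result yet
        self.done = set()   # successfully crawled URLs

    def add(self, url):
        # Ignore URLs we have already seen
        if url in self.done or url in self.waiting or url in self.pending:
            return
        self.waiting[url] = urlparse(url).netloc

    def pop(self, count):
        # Hand out up to `count` URLs as a {url: host} mapping,
        # which is what bee_client.py iterates with urls.items()
        out = {}
        for url in list(self.waiting)[:count]:
            out[url] = self.waiting.pop(url)
            self.pending[url] = time.time()
        return out

    def set_status(self, url, status_code):
        # Record the crawl result reported back by a client
        self.pending.pop(url, None)
        if status_code == 200:
            self.done.add(url)
        # non-200 handling (retries, failure counts) is omitted in this sketch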

bee_client.py

import aiohttp
import json
import asyncio
import traceback
import time

class CrawlerClient:
    def __init__(self):
        self._workers = 0
        self.workers_max = 10
        self.server_host = "localhost"
        self.server_port = 8080
        self.loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.queue = asyncio.Queue()

    async def get_url(self):
        # Ask the server for as many URLs as there are free worker slots
        count = self.workers_max - self.queue.qsize()
        if count <= 0:
            print("no need to get urls this time")
            return

        url = "http://%s:%s/task?count=%s" % (self.server_host, self.server_port, count)
        try:
            async with self.session.get(url, timeout=3) as response:
                if response.status not in [200, 201]:
                    return
                jsn = await response.text()
                urls = json.loads(jsn)
                msg = 'get_url() asked for [%s] and got [%s], @%s' % (
                    count, len(urls), time.strftime('%Y-%m-%d %H:%M:%S'))
                print(msg)
                # The server returns a JSON object, so queue each (url, host) pair
                for kv in urls.items():
                    await self.queue.put(kv)
                print()
        except Exception:
            traceback.print_exc()
            return

    async def send_result(self, result):
        '''
        Report a crawl result back to the server, e.g.:
        result = {
            'url': url,
            'url_real': response.url,
            'status': status,
            'newurls': newurls,
        }
        '''
        url = "http://%s:%s/task" % (self.server_host, self.server_port)
        try:
            async with self.session.post(url, json=result, timeout=3) as response:
                # We only care whether the POST went through
                response.status
        except Exception:
            traceback.print_exc()
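bee_client.py only defines the client; the loop that drives it is not shown in the post. A rough sketch of how get_url() and send_result() might be wired together follows; fetch() and the find_urls() link extractor are hypothetical helpers, and a real project's downloading and parsing logic would go in their place.

# A hypothetical driver loop for CrawlerClient -- not part of the original code
import re

async def fetch(session, url):
    # Download one page; return (real URL after redirects, status, html)
    async with session.get(url, timeout=10) as resp:
        html = await resp.text(errors='ignore')
        return str(resp.url), resp.status, html

def find_urls(html):
    # Crude link extraction, good enough for a sketch
    return re.findall(r'href="(https?://[^"]+)"', html)

async def work(client):
    while True:
        await client.get_url()                 # refill the local queue from the server
        while not client.queue.empty():
            url, _host = await client.queue.get()
            try:
                url_real, status, html = await fetch(client.session, url)
                newurls = find_urls(html) if status == 200 else []
            except Exception:
                url_real, status, newurls = url, 599, []
            await client.send_result({
                'url': url,
                'url_real': url_real,
                'status': status,
                'newurls': newurls,
            })
        await asyncio.sleep(1)                 # back off a little when the queue is empty

if __name__ == '__main__':
    client = CrawlerClient()
    client.loop.run_until_complete(work(client))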
