scrapy中deferred的回调

Posted on 2018-02-28 16:15  王将军之武库  阅读(1362)  评论(0编辑  收藏  举报
def _next_request_from_scheduler(self, spider):#在引擎中处理一个请求
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)#生成了一个deferred对象
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        return d

def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

      def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)#下载器fetch
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld
在HTTP11处理器中
class HTTP11DownloadHandler(object):

def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
            fail_on_dataloss=self._fail_on_dataloss)
        return agent.download_request(request)

class ScrapyAgent(object):

def download_request(self, request):
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = TxHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            headers.removeHeader(b'Proxy-Authorization')
        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        elif method == b'POST':
            # Setting Content-Length: 0 even for POST requests is not a
            # MUST per HTTP RFCs, but it's common behavior, and some
            # servers require this, otherwise returning HTTP 411 Length required
            #
            # RFC 7230#section-3.3.2:
            # "a Content-Length header field is normally sent in a POST
            # request even when the value is 0 (indicating an empty payload body)."
            #
            # Twisted < 17 will not add "Content-Length: 0" by itself;
            # Twisted >= 17 fixes this;
            # Using a producer with an empty-string sends `0` as Content-Length
            # for all versions of Twisted.
            bodyproducer = _RequestBodyProducer(b'')
        else:
            bodyproducer = None
        start_time = time()
        d = agent.request(
            method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d


class _ResponseReader(protocol.Protocol):

    def __init__(self, finished, txresponse, request, maxsize, warnsize,
                 fail_on_dataloss):
        self._finished = finished
        self._txresponse = txresponse
        self._request = request
        self._bodybuf = BytesIO()
        self._maxsize  = maxsize
        self._warnsize  = warnsize
        self._fail_on_dataloss = fail_on_dataloss
        self._fail_on_dataloss_warned = False
        self._reached_warnsize = False
        self._bytes_received = 0

    def dataReceived(self, bodyBytes):#读取数据,放到缓冲
        # This maybe called several times after cancel was called with buffered
        # data.
        if self._finished.called:
            return

        self._bodybuf.write(bodyBytes)
        self._bytes_received += len(bodyBytes)

        if self._maxsize and self._bytes_received > self._maxsize:
            logger.error("Received (%(bytes)s) bytes larger than download "
                         "max size (%(maxsize)s).",
                         {'bytes': self._bytes_received,
                          'maxsize': self._maxsize})
            # Clear buffer earlier to avoid keeping data in memory for a long
            # time.
            self._bodybuf.truncate(0)
            self._finished.cancel()

        if self._warnsize and self._bytes_received > self._warnsize and not self._reached_warnsize:
            self._reached_warnsize = True
            logger.warning("Received more bytes than download "
                           "warn size (%(warnsize)s) in request %(request)s.",
                           {'warnsize': self._warnsize,
                            'request': self._request})

    def connectionLost(self, reason):#连接完成后调用,也即响应已经到达。
        if self._finished.called:
            return

        body = self._bodybuf.getvalue()
        if reason.check(ResponseDone):#self._finished是deferred对象
            self._finished.callback((self._txresponse, body, None))#回调了,
            return

        if reason.check(PotentialDataLoss):
            self._finished.callback((self._txresponse, body, ['partial']))
            return

        if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
            if not self._fail_on_dataloss:
                self._finished.callback((self._txresponse, body, ['dataloss']))
                return

            elif not self._fail_on_dataloss_warned:
                logger.warn("Got data loss in %s. If you want to process broken "
                            "responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
                            " -- This message won't be shown in further requests",
                            self._txresponse.request.absoluteURI.decode())
                self._fail_on_dataloss_warned = True

        self._finished.errback(reason)

 

Copyright © 2025 王将军之武库
Powered by .NET 9.0 on Kubernetes