def _next_request_from_scheduler(self, spider):
    # Engine side: take one request from the scheduler and process it.
    slot = self.slot
    request = slot.scheduler.next_request()
    if not request:
        return
    d = self._download(request, spider)  # returns a Deferred
    d.addBoth(self._handle_downloader_output, request, spider)
    d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.remove_request(request))
    d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.nextcall.schedule())
    d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    return d

def _download(self, request, spider):
    slot = self.slot
    slot.add_request(request)

    def _on_success(response):
        assert isinstance(response, (Response, Request))
        if isinstance(response, Response):
            response.request = request  # tie request to response received
            logkws = self.logformatter.crawled(request, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            self.signals.send_catch_log(signal=signals.response_received,
                                        response=response, request=request,
                                        spider=spider)
        return response

    def _on_complete(_):
        slot.nextcall.schedule()
        return _

    dwld = self.downloader.fetch(request, spider)  # hand off to the downloader
    dwld.addCallbacks(_on_success)
    dwld.addBoth(_on_complete)
    return dwld
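The whole engine loop is held together by Twisted Deferred chaining: addBoth fires on both success and failure, while the addErrback placed right after each step only catches errors raised by that step. A minimal standalone sketch of the pattern (not Scrapy code; fake_download and the messages are made up for illustration):

from twisted.internet import defer

def fake_download():
    # Hypothetical stand-in for self._download(): returns an
    # already-fired Deferred so the demo runs synchronously.
    d = defer.Deferred()
    d.callback('response body')
    return d

d = fake_download()
# addBoth runs whether the previous step succeeded or failed ...
d.addBoth(lambda result: print('handle downloader output:', result))
# ... and the errback right after it only fires if that step itself failed.
d.addErrback(lambda f: print('error while handling output:', f))
d.addBoth(lambda _: print('remove request from slot'))
d.addBoth(lambda _: print('schedule the next engine tick'))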
In the HTTP11 download handler:

class HTTP11DownloadHandler(object):

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
            fail_on_dataloss=self._fail_on_dataloss)
        return agent.download_request(request)


class ScrapyAgent(object):

    def download_request(self, request):
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = TxHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            headers.removeHeader(b'Proxy-Authorization')
        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        elif method == b'POST':
            # Setting Content-Length: 0 even for POST requests is not a
            # MUST per HTTP RFCs, but it's common behavior, and some
            # servers require it, otherwise returning HTTP 411 Length Required.
            #
            # RFC 7230#section-3.3.2:
            # "a Content-Length header field is normally sent in a POST
            # request even when the value is 0 (indicating an empty payload body)."
            #
            # Twisted < 17 will not add "Content-Length: 0" by itself;
            # Twisted >= 17 fixes this.
            # Using a producer with an empty string sends `0` as Content-Length
            # for all versions of Twisted.
            bodyproducer = _RequestBodyProducer(b'')
        else:
            bodyproducer = None
        start_time = time()
        d = agent.request(
            method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d


class _ResponseReader(protocol.Protocol):

    def __init__(self, finished, txresponse, request, maxsize, warnsize,
                 fail_on_dataloss):
        self._finished = finished
        self._txresponse = txresponse
        self._request = request
        self._bodybuf = BytesIO()
        self._maxsize = maxsize
        self._warnsize = warnsize
        self._fail_on_dataloss = fail_on_dataloss
        self._fail_on_dataloss_warned = False
        self._reached_warnsize = False
        self._bytes_received = 0

    def dataReceived(self, bodyBytes):
        # Read incoming data and append it to the buffer.
        # This may be called several times after cancel was called, with
        # buffered data.
        if self._finished.called:
            return

        self._bodybuf.write(bodyBytes)
        self._bytes_received += len(bodyBytes)

        if self._maxsize and self._bytes_received > self._maxsize:
            logger.error("Received (%(bytes)s) bytes larger than download "
                         "max size (%(maxsize)s).",
                         {'bytes': self._bytes_received,
                          'maxsize': self._maxsize})
            # Clear the buffer early to avoid keeping data in memory for a
            # long time.
            self._bodybuf.truncate(0)
            self._finished.cancel()

        if self._warnsize and self._bytes_received > self._warnsize and not self._reached_warnsize:
            self._reached_warnsize = True
            logger.warning("Received more bytes than download "
                           "warn size (%(warnsize)s) in request %(request)s.",
                           {'warnsize': self._warnsize,
                            'request': self._request})

    def connectionLost(self, reason):
        # Called once the connection is closed, i.e. the response has fully
        # arrived (or was cut short).
        if self._finished.called:
            return

        body = self._bodybuf.getvalue()
        if reason.check(ResponseDone):
            # self._finished is a Deferred: fire its callback chain here.
            self._finished.callback((self._txresponse, body, None))
            return

        if reason.check(PotentialDataLoss):
            self._finished.callback((self._txresponse, body, ['partial']))
            return

        if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
            if not self._fail_on_dataloss:
                self._finished.callback((self._txresponse, body, ['dataloss']))
                return

            elif not self._fail_on_dataloss_warned:
                logger.warn("Got data loss in %s. If you want to process broken "
                            "responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
                            " -- This message won't be shown in further requests",
                            self._txresponse.request.absoluteURI.decode())
                self._fail_on_dataloss_warned = True

        self._finished.errback(reason)
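On the user side, the knobs this code reads are easy to reach: download_maxsize and download_warnsize are looked up on the spider via getattr() in HTTP11DownloadHandler.download_request, and the dataloss branch in connectionLost is governed by the DOWNLOAD_FAIL_ON_DATALOSS setting mentioned in its log message. A small illustrative spider (the name, URL and limits are placeholders, not Scrapy defaults):

import scrapy

class SizeLimitedSpider(scrapy.Spider):
    name = 'size_limited'
    start_urls = ['https://example.com/']  # placeholder URL
    download_maxsize = 10 * 1024 * 1024    # cancel responses over 10 MiB
    download_warnsize = 1 * 1024 * 1024    # warn once past 1 MiB
    custom_settings = {
        # Per the log message in connectionLost: keep truncated bodies
        # instead of erroring out when the server closes early.
        'DOWNLOAD_FAIL_ON_DATALOSS': False,
    }

    def parse(self, response):
        # Flags such as 'partial' or 'dataloss' set by the handler should
        # surface on the response object.
        self.logger.info('flags: %s', response.flags)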