Engine.py (core): _handle_downloader_output hands every downloader result to the scraper via self.scraper.enqueue_scrape(response, request, spider):
def _handle_downloader_output(self, response, request, spider):
    assert isinstance(response, (Request, Response, Failure)), response
    # downloader middleware can return requests (for example, redirects)
    if isinstance(response, Request):
        self.crawl(response, spider)
        return
    # response is a Response or Failure
    d = self.scraper.enqueue_scrape(response, request, spider)
    d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                        exc_info=failure_to_exc_info(f),
                                        extra={'spider': spider}))
    return d
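To see why that isinstance check must admit a Request, here is a minimal sketch of a downloader middleware whose process_response turns a redirect into a fresh Request. It illustrates the mechanism only; the class name and logic are made up for this example and this is not Scrapy's actual RedirectMiddleware.

# Illustrative only -- not Scrapy's real RedirectMiddleware.
from scrapy import Request

class TinyRedirectMiddleware:
    def process_response(self, request, response, spider):
        if response.status in (301, 302) and b'Location' in response.headers:
            location = response.headers[b'Location'].decode('latin1')
            # Returning a Request here makes _handle_downloader_output call
            # self.crawl(...) instead of handing the response to the scraper.
            return request.replace(url=response.urljoin(location))
        return response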
Scraper.py (core): enqueue_scrape queues the (response, request) pair on the scraper slot and returns a Deferred that fires once the response has been scraped:
def enqueue_scrape(self, response, request, spider):
    slot = self.slot
    dfd = slot.add_response_request(response, request)

    def finish_scraping(_):
        slot.finish_response(response, request)
        self._check_if_closing(spider, slot)
        self._scrape_next(spider, slot)
        return _

    dfd.addBoth(finish_scraping)
    dfd.addErrback(
        lambda f: logger.error('Scraper bug processing %(request)s',
                               {'request': request},
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
    self._scrape_next(spider, slot)
    return dfd
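enqueue_scrape leans on the scraper's Slot for all of its bookkeeping. The simplified sketch below (trimmed from the Scrapy 1.x code this walkthrough follows; the max_active_size backpressure field is omitted) shows what add_response_request, next_response_request_deferred and finish_response actually do:

from collections import deque

from twisted.internet import defer
from scrapy.http import Response

class Slot:
    """Scraper slot: one queue of downloaded responses per open spider."""
    MIN_RESPONSE_SIZE = 1024

    def __init__(self):
        self.queue = deque()      # (response, request, deferred) triples
        self.active = set()       # requests currently being scraped
        self.active_size = 0      # approximate bytes held by queued/active responses
        self.itemproc_size = 0    # items currently in the item pipeline
        self.closing = None

    def add_response_request(self, response, request):
        # The returned Deferred is the one enqueue_scrape hangs
        # finish_scraping on; it fires once _scrape completes.
        deferred = defer.Deferred()
        self.queue.append((response, request, deferred))
        if isinstance(response, Response):
            self.active_size += max(len(response.body), self.MIN_RESPONSE_SIZE)
        else:
            self.active_size += self.MIN_RESPONSE_SIZE
        return deferred

    def next_response_request_deferred(self):
        response, request, deferred = self.queue.popleft()
        self.active.add(request)
        return response, request, deferred

    def finish_response(self, response, request):
        self.active.remove(request)
        if isinstance(response, Response):
            self.active_size -= max(len(response.body), self.MIN_RESPONSE_SIZE)
        else:
            self.active_size -= self.MIN_RESPONSE_SIZE

Note that enqueue_scrape triggers _scrape_next twice: once synchronously after queueing, and again from finish_scraping, so the queue keeps draining as each scrape completes.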
Both of those paths end in self._scrape_next(spider, slot), which drains the slot queue and feeds each queued response to the spider:
def _scrape_next(self, spider, slot):
    while slot.queue:
        response, request, deferred = slot.next_response_request_deferred()
        self._scrape(response, request, spider).chainDeferred(deferred)

def _scrape(self, response, request, spider):
    """Handle the downloaded response or failure through the spider
    callback/errback"""
    assert isinstance(response, (Response, Failure))
    dfd = self._scrape2(response, request, spider)  # returns spider's processed output
    dfd.addErrback(self.handle_spider_error, request, response, spider)
    dfd.addCallback(self.handle_spider_output, request, response, spider)
    return dfd

def _scrape2(self, request_result, request, spider):
    """Handle the different cases of request's result being a Response or a
    Failure"""
    if not isinstance(request_result, Failure):  # not a failure: run it through the spider middleware
        return self.spidermw.scrape_response(
            self.call_spider, request_result, request, spider)
    else:
        # FIXME: don't ignore errors in spider middleware
        dfd = self.call_spider(request_result, request, spider)
        return dfd.addErrback(
            self._log_download_errors, request_result, request, spider)

def call_spider(self, result, request, spider):
    result.request = request
    dfd = defer_result(result)
    dfd.addCallbacks(request.callback or spider.parse, request.errback)
    return dfd.addCallback(iterate_spider_output)
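call_spider stays compact because two helpers do the heavy lifting. Sketched below under simplifying assumptions (the real versions live in scrapy.utils.defer and scrapy.utils.spider): defer_result routes a Failure into the errback chain and anything else into the callback chain, and iterate_spider_output normalizes whatever the callback returned into an iterable.

from twisted.internet import defer
from twisted.python import failure

def defer_result(result):
    # A Failure fires the errback chain (so request.errback runs);
    # anything else fires the callback chain (so request.callback or
    # spider.parse runs).
    if isinstance(result, defer.Deferred):
        return result
    if isinstance(result, failure.Failure):
        return defer.fail(result)
    return defer.succeed(result)

def iterate_spider_output(result):
    # Normalize the callback's return value -- None, a single item/Request,
    # or an iterable -- into something handle_spider_output can loop over.
    # Dicts, strings and bytes count as single values, not iterables.
    if result is None:
        return []
    if hasattr(result, '__iter__') and not isinstance(result, (dict, str, bytes)):
        return result
    return [result]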
def handle_spider_output(self, result, request, response, spider):
    if not result:
        return defer_succeed(None)
    it = iter_errback(result, self.handle_spider_error,
                      request, response, spider)
    dfd = parallel(it, self.concurrent_items,
                   self._process_spidermw_output, request, response, spider)
    return dfd
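The parallel() helper is what enforces the CONCURRENT_ITEMS limit. A sketch of it, matching the scrapy.utils.defer implementation in the Scrapy versions this walkthrough appears to cover: it shares one generator of work among count cooperative consumers, so at most count outputs are processed at any moment.

from twisted.internet import defer, task

def parallel(iterable, count, callable, *args, **named):
    # All `count` coiterate loops pull from the same generator, so each
    # element is processed exactly once and at most `count` calls to
    # `callable` are in flight at a time.
    coop = task.Cooperator()
    work = (callable(elem, *args, **named) for elem in iterable)
    return defer.DeferredList([coop.coiterate(work) for _ in range(count)])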
def _process_spidermw_output(self, output, request, response, spider):
    """Process each Request/Item (given in the output parameter) returned
    from the given spider
    """
    if isinstance(output, Request):  # a Request: feed it back to the engine to keep crawling
        self.crawler.engine.crawl(request=output, spider=spider)
    elif isinstance(output, (BaseItem, dict)):  # an item: send it through the item pipeline to be saved
        self.slot.itemproc_size += 1
        dfd = self.itemproc.process_item(output, spider)
        dfd.addBoth(self._itemproc_finished, output, response, spider)
        return dfd
    elif output is None:
        pass
    else:
        typename = type(output).__name__
        logger.error('Spider must return Request, BaseItem, dict or None, '
                     'got %(typename)r in %(request)s',
                     {'request': request, 'typename': typename},
                     extra={'spider': spider})
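To tie the branches together, here is a minimal spider whose callback exercises each accepted output type (the spider name and selectors are illustrative, not part of the walkthrough): the dict takes the item-pipeline branch, the Request goes back to the engine, and the implicit None at the end of the generator is silently ignored.

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'  # hypothetical example spider
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            # dict output -> _process_spidermw_output feeds the item pipeline
            yield {'text': quote.css('span.text::text').extract_first()}
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page:
            # Request output -> _process_spidermw_output calls engine.crawl
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)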