celery 调用 scrapy

celery 调用 scrapy

需求

  1. 如同调用函数般调用.
  2. 定时或实时获取数据.

实现

from crochet import setup, wait_for
from scrapy.crawler import CrawlerProcess

class Crawler(CrawlerProcess):
    """CrawlerProcess whose crawl deferred fires with the spider's ``result``.

    ``_crawl`` is overridden so that the deferred returned for a crawl
    carries ``crawler.spider.result`` instead of the crawl's own outcome.
    This lets a caller get the scraped data back directly, without going
    through a separate storage backend.  Spiders run through this class
    must therefore collect their output in a ``result`` attribute; if that
    is not needed, this subclass is not needed either.

    Example::

        crawler = Crawler(settings={})
        result = crawler.crawl(BingSpider, params1='倚天屠龙记', params2=['苏有朋'])
        crawler.start()
        print(result.result)
    """

    def _crawl(self, crawler, *args, **kwargs):
        """Start ``crawler`` and return a deferred firing with the spider result.

        Args:
            crawler: the crawler object to run; it is tracked in
                ``self.crawlers`` while the crawl is active.
            *args, **kwargs: forwarded unchanged to ``crawler.crawl``.

        Returns:
            The deferred from ``crawler.crawl`` with ``_done`` attached, so
            it fires with ``crawler.spider.result`` once the crawl ends.
        """
        self.crawlers.add(crawler)
        d = crawler.crawl(*args, **kwargs)
        self._active.add(d)

        def _done(outcome):
            # Overridden part: hand the collected result back to the caller
            # instead of relying on some other storage mechanism.
            self.crawlers.discard(crawler)
            self._active.discard(d)
            spider = getattr(crawler, 'spider', None)
            self.bootstrap_failed |= not spider
            if spider is None:
                # No spider was created, so there is nothing to read a
                # result from.  Pass the original outcome through rather
                # than raising AttributeError and hiding what went wrong.
                return outcome
            # The spider is expected to have stored its output in `result`.
            return spider.result

        return d.addBoth(_done)
		
def run(spider, *args, **kwargs):
    """Run a spider like a plain function call and return what it collected.

    Args:
        spider: the spider class to crawl with.
        *args, **kwargs: arguments passed on to the spider via
            ``Crawler.crawl``.  Two keywords are consumed here and are NOT
            forwarded:

            timeout: value handed to ``wait_for(timeout=...)``;
                defaults to ``60 * 10``.
            settings: mapping of special Scrapy settings (for example a
                proxy configuration); defaults to an empty dict.

    Returns:
        Whatever the crawl deferred produced by ``Crawler.crawl`` resolves
        to, as returned by the ``wait_for``-wrapped call.
    """
    timeout = kwargs.pop('timeout', 60 * 10)
    # Special settings, e.g. configuring a proxy.
    overrides = kwargs.pop('settings', None)
    # Work on a copy so the caller's mapping is not modified by the
    # defaults applied below; an explicit ``settings=None`` is treated the
    # same as omitting the argument.
    settings = {} if overrides is None else overrides.copy()
    # There is no need to enable the telnet console.
    settings.setdefault('TELNETCONSOLE_ENABLED', False)
    settings['LOG_LEVEL'] = 'ERROR'
    # setup() has to be called here, before the wrapped call is made.
    setup()

    @wait_for(timeout=timeout)
    def _run():
        crawler = Crawler(settings=settings)
        return crawler.crawl(spider, *args, **kwargs)

    return _run()
	

能如同函数一般调用归功于crochet这个库, 更多使用方式可以查看官方文档.

使用 Demo

class BingSpider(spider):
    """Demo spider that records the URL of every response it parses.

    Results are accumulated in ``self.result`` so that ``Crawler`` can hand
    them back to the caller once the crawl has finished.

    NOTE(review): ``spider`` is presumably Scrapy's ``Spider`` base class;
    confirm the import, it is not shown in this demo.
    """

    # Spider name used by ``__init__`` via the base class; the original demo
    # referenced an undefined ``name`` variable instead.
    name = 'bing'
    # ... other class attributes omitted in this demo

    def __init__(self, params1=None, params2=None, **kwargs):
        """Store the search parameters and create the empty result list.

        Args:
            params1: first search parameter; used to build the first request.
            params2: second search parameter; stored for later use.
            **kwargs: passed through to the base class initializer.
        """
        self.params1 = params1
        self.params2 = params2
        self.result = []
        super().__init__(**kwargs)

    def start_requests(self):
        """Yield the initial request(s) for the crawl."""
        # NOTE(review): `make_requests` is not defined in the visible demo;
        # presumably a helper from the omitted part -- verify.
        yield self.make_requests(self.params1)
        # ... further requests omitted in this demo

    def parse(self, response, **kwargs):
        """Record the URL of ``response`` in ``self.result``."""
        # ... extraction logic omitted in this demo
        self.result.append(response.url)

    # Backward-compatible alias for the original, misspelled method name.
    parase = parse
			 
			 
		   

from celery import shared_task


@shared_task(bind=True)
def search(self, key, *attr):
    """Celery task: crawl with ``BingSpider`` for ``key`` and return the result.

    Args:
        self: the bound task instance (``bind=True``); used to publish the
            progress state and also passed to ``run`` as ``callback``.
        key: search keyword, given to the spider as ``params1``.
        *attr: extra search attributes, given to the spider as ``params2``.

    Returns:
        The value returned by ``run`` for this crawl.
    """
    # The state name is kept exactly as it was, since clients may already
    # be checking for this literal value.
    self.update_state(state="RUNING", meta={'msg': '进行数据采集'})
    # The original passed an undefined name `works` here; the keyword to
    # search for is the `key` argument.
    return run(BingSpider, params1=key, params2=attr, callback=self)
posted @ 2022-11-17 09:31  apuyuseng  阅读(121)  评论(0编辑  收藏  举报