Calling Scrapy from Celery
Requirements
- Invoke a crawl the same way you would call an ordinary function.
- Fetch data on a schedule or on demand (in real time).
Implementation
from crochet import setup, wait_for
from scrapy.crawler import CrawlerProcess


class Crawler(CrawlerProcess):
    '''
    crawler = Crawler(settings={})
    result = crawler.crawl(BingSpider, params1='倚天屠龙记', params2=['苏有朋'])
    crawler.start()
    print(result.result)
    '''

    def _crawl(self, crawler, *args, **kwargs):
        self.crawlers.add(crawler)
        d = crawler.crawl(*args, **kwargs)
        self._active.add(d)

        def _done(*args, **kwargs):
            """Overridden so the crawl result is handed back directly,
            instead of going through a separate storage backend."""
            self.crawlers.discard(crawler)
            self._active.discard(d)
            self.bootstrap_failed |= not getattr(crawler, 'spider', None)
            # The spider collects its output in `spider.result`; if you do not
            # need the return value, this whole class can be dropped.
            return crawler.spider.result

        return d.addBoth(_done)
def run(spider, *args, **kwargs):
    '''
    spider is the spider class itself;
    *args and **kwargs are the arguments passed through to the spider.
    '''
    timeout = kwargs.pop('timeout', 60 * 10)
    # Extra settings, e.g. a proxy configuration
    settings = kwargs.pop('settings', {})
    # There is no need to enable TELNETCONSOLE_ENABLED
    settings.setdefault('TELNETCONSOLE_ENABLED', False)
    settings['LOG_LEVEL'] = 'ERROR'
    # setup() must be called here, before wait_for is used
    setup()

    @wait_for(timeout=timeout)
    def _run():
        crawler = Crawler(settings=settings)
        result = crawler.crawl(spider, *args, **kwargs)
        return result

    return _run()
Being able to invoke a crawl like an ordinary function is thanks to the crochet library; see its official documentation for more usage details.
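For context, here is a minimal, self-contained sketch of the same crochet pattern, independent of Scrapy; the delayed_value function and the 5-second timeout are purely illustrative:

from crochet import setup, wait_for
from twisted.internet import reactor, task

setup()  # start the Twisted reactor in a background thread

@wait_for(timeout=5)
def delayed_value(value):
    # The wrapped function runs in the reactor thread and returns a Deferred;
    # wait_for blocks the caller until it fires (or raises crochet.TimeoutError).
    return task.deferLater(reactor, 1, lambda: value)

print(delayed_value(42))  # blocks for about one second, then prints 42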
Usage demo
import scrapy


class BingSpider(scrapy.Spider):
    name = 'bing'
    ....

    def __init__(self, params1=None, params2=None, **kwargs):
        self.params1 = params1
        self.params2 = params2
        self.result = []
        super().__init__(**kwargs)

    def start_requests(self):
        # build the search request from params1 (helper elided)
        yield self.make_requests(self.params1)
        ....

    def parse(self, response, **kwargs):
        ....
        self.result.append(response.url)
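With a spider like the one above, run() can then be called like a plain blocking function (the argument values are the ones from the docstring example, and the explicit timeout is optional):

urls = run(BingSpider, params1='倚天屠龙记', params2=['苏有朋'], timeout=120)
print(urls)  # the list collected in BingSpider.result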
from celery import shared_task


@shared_task(bind=True)
def search(self, key, *attr):
    self.update_state(state="RUNNING", meta={'msg': 'collecting data'})
    # callback is simply passed through to the spider as a keyword argument
    result = run(BingSpider, params1=key, params2=attr, callback=self)
    return result
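To cover the "fetch on a schedule" requirement, the task can also be driven by Celery beat. A sketch, assuming the task lives in a module named tasks, with an arbitrary 30-minute interval and a Redis broker:

from celery import Celery

app = Celery('crawler', broker='redis://localhost:6379/0')  # broker URL is an assumption

app.conf.beat_schedule = {
    'search-every-30-minutes': {
        'task': 'tasks.search',           # dotted path to the shared_task above (assumed module)
        'schedule': 30 * 60,              # run every 30 minutes
        'args': ('倚天屠龙记', '苏有朋'),   # key plus *attr
    },
}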