A mini Scrapy built on Twisted
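The module below re-implements the core moving parts of Scrapy on top of Twisted: Request and HttpResponse carry data between the downloader and the spider; Scheduler is a plain FIFO queue of pending requests; ExecutionEngine pulls requests off the queue, downloads them with getPage while keeping at most five in flight, and feeds each result back to the spider's parse callback; Crawler wires one spider to one engine; CrawlerProcess gathers every crawl Deferred into a DeferredList and runs the reactor once; Command plays the role of the command-line entry point.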
import importlib
import types
from queue import Queue

from twisted.web.client import getPage   # downloader: returns a Deferred; the socket is removed automatically once the page arrives (getPage is deprecated in newer Twisted releases)
from twisted.internet import reactor     # event loop: it can stop once every socket object has been removed
from twisted.internet import defer       # defer.Deferred: a "socket-like" placeholder that sends no request and must be fired (removed) by hand


class Request(object):
    """Wraps one user request: the url plus the callback that will parse its response."""

    def __init__(self, url, callback):
        self.url = url
        self.callback = callback


class HttpResponse(object):
    """Bundles the downloaded content with its Request, so the parse callback can reach everything as response.xxx."""

    def __init__(self, content, request):
        self.content = content
        self.request = request


class Scheduler(object):
    """Task scheduler: a FIFO queue of pending requests."""

    def __init__(self):
        self.q = Queue()

    def open(self):
        pass

    def next_request(self):
        try:
            req = self.q.get(block=False)
        except Exception:
            req = None
        return req

    def enqueue_request(self, req):
        self.q.put(req)

    def size(self):
        return self.q.qsize()


class ExecutionEngine(object):
    """Engine: does all the scheduling."""

    def __init__(self):
        self._close = None
        self.scheduler = None
        self.max = 5          # max number of requests in flight
        self.crawling = []

    def get_response_callback(self, content, request):
        """Called when a download finishes: hand the response to parse() and deal with whatever it returns."""
        self.crawling.remove(request)          # the request is done, drop it from the in-flight list
        response = HttpResponse(content, request)
        result = request.callback(response)    # this is the call to the spider's parse()
        if isinstance(result, types.GeneratorType):
            # parse() yielded new Request objects, so put them back on the queue
            for req in result:
                self.scheduler.enqueue_request(req)

    def _next_request(self):
        """Pull requests from the scheduler and start downloading them."""
        if self.scheduler.size() == 0 and len(self.crawling) == 0:
            self._close.callback(None)         # nothing queued and nothing in flight: the crawl is over
            return
        while len(self.crawling) < self.max:
            req = self.scheduler.next_request()
            if not req:                        # queue is empty for now (a try/except would work just as well)
                return
            self.crawling.append(req)
            d = getPage(req.url.encode('utf-8'))
            d.addCallback(self.get_response_callback, req)
            d.addCallback(lambda _: reactor.callLater(0, self._next_request))

    @defer.inlineCallbacks
    def open_spider(self, start_requests):
        """Put the initial requests into the scheduler, then start pulling tasks."""
        self.scheduler = Scheduler()
        while True:
            try:
                req = next(start_requests)
                self.scheduler.enqueue_request(req)
            except StopIteration:
                break
        # The decorator requires at least one yield or Twisted raises an error; yielding None has
        # no effect here, the point is simply to call scheduler.open().
        yield self.scheduler.open()
        reactor.callLater(0, self._next_request)

    @defer.inlineCallbacks
    def start(self):
        """Create the Deferred that keeps this crawl alive until _next_request fires it."""
        self._close = defer.Deferred()
        yield self._close


class Crawler(object):
    """Wraps the scheduler and the engine for the user and hands the initial spider over to the engine."""

    def _create_engine(self):
        """Create the engine object."""
        return ExecutionEngine()

    def _create_spider(self, spider_cls_path):
        """Create a spider object from its dotted class path."""
        module_path, cls_name = spider_cls_path.rsplit('.', maxsplit=1)   # module path, class name
        m = importlib.import_module(module_path)                          # import by reflection
        cls = getattr(m, cls_name)
        return cls()

    @defer.inlineCallbacks
    def crawl(self, spider_cls_path):
        """Create the engine and the spider, feed the start requests to open_spider, then yield the engine's _close Deferred."""
        engine = self._create_engine()
        spider = self._create_spider(spider_cls_path)
        start_requests = iter(spider.start_requests())
        yield engine.open_spider(start_requests)   # equivalent to writing those yields inline here
        yield engine.start()                       # i.e. yield self._close


class CrawlerProcess(object):
    """Uses Crawler to build the spiders and starts the event loop."""

    def __init__(self):
        self._active = set()

    def crawl(self, spider_cls_path):
        # Build one Crawler per spider, keeping "create the crawl" separate from "add it to the loop".
        crawler = Crawler()
        d = crawler.crawl(spider_cls_path)
        self._active.add(d)

    def start(self):
        """Start the reactor exactly once, no matter how many spiders there are."""
        dd = defer.DeferredList(self._active)
        dd.addBoth(lambda _: reactor.stop())   # stop the loop once every crawl Deferred has fired
        reactor.run()


class Command(object):
    def run(self):
        crawl_process = CrawlerProcess()
        spider_cls_path_list = ['spider.chouti.ChoutiSpider', ]
        for spider_cls_path in spider_cls_path_list:
            crawl_process.crawl(spider_cls_path)
        crawl_process.start()


if __name__ == '__main__':
    cmd = Command()
    cmd.run()
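The class path 'spider.chouti.ChoutiSpider' used by Command, together with the import in the spider module, implies a project layout roughly like the one below; the file name engine.py for the framework module is an assumption taken from that import:

engine.py              # the framework module above
spider/
    __init__.py
    chouti.py          # ChoutiSpider, shown next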
from engine import Request   # assumes the framework module above is saved as engine.py at the project root


class ChoutiSpider(object):
    name = 'chouti'

    def start_requests(self):
        start_url = ['https://www.baidu.com', 'https://www.bing.com']
        for url in start_url:
            yield Request(url, self.parse)

    def parse(self, response):
        print(response)
        # yield Request('https://www.baidu.com', callback=self.parse)
        # A yielded Request goes through the same cycle again:
        # 1. the finished request is removed from the in-flight (crawling) list
        # 2. the engine collects whatever parse() yields
        # 3. the new requests are enqueued and pulled from the queue on the next pass
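As a minimal sketch of that chained-request path in get_response_callback(), a spider's parse callback can itself yield further Request objects; the engine enqueues them and the crawl keeps going. The spider name, module name, and second callback below are purely illustrative:

from engine import Request   # same assumption as above: the framework lives in engine.py


class ExampleSpider(object):
    name = 'example'

    def start_requests(self):
        yield Request('https://www.baidu.com', self.parse_index)

    def parse_index(self, response):
        print('index page size:', len(response.content))
        # Requests yielded here go back through the Scheduler and are
        # downloaded on a later pass of ExecutionEngine._next_request().
        yield Request('https://www.bing.com', self.parse_page)

    def parse_page(self, response):
        print('second page size:', len(response.content))

To try it, save it as spider/example.py (hypothetical name) and add 'spider.example.ExampleSpider' to spider_cls_path_list in Command.run(). Running python engine.py from the project root then drives the whole cycle: the start requests are enqueued, at most five downloads are in flight at a time, each finished page is wrapped in an HttpResponse and handed to its callback, and once the queue is empty and nothing is in flight the engine fires _close, the DeferredList fires, and the reactor stops.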