27. miniscrapy: a first look at the Scrapy source

Basic usage

Issue several getPage requests, collect their Deferreds into a DeferredList, and stop the reactor once every request has finished.

from twisted.web.client import getPage, defer
from twisted.internet import reactor


# Basic usage

def all_done(contents):
    # once all crawls have finished, stop the event loop
    reactor.stop()


def callback(contents):
    # runs automatically as each crawl returns its result
    print(contents)


deferred_list = list()

url_list = ['http://www.bing.com', 'http://www.baidu.com']
for url in url_list:
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(callback)
    deferred_list.append(deferred)

dlist = defer.DeferredList(deferred_list)
dlist.addBoth(all_done)

reactor.run()
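Note that getPage has been deprecated for a long time and is removed in recent Twisted releases. Below is a minimal sketch of the same fan-out with the current Agent API; the helper name fetch_body is my own, not part of the original post.

from twisted.internet import defer, reactor
from twisted.web.client import Agent, readBody


def fetch_body(url):
    # Agent.request returns a Deferred that fires with a response object;
    # readBody turns that into a Deferred firing with the raw body bytes
    agent = Agent(reactor)
    d = agent.request(b'GET', url.encode('utf8'))
    d.addCallback(readBody)
    return d


def all_done(arg):
    reactor.stop()


deferred_list = list()
for url in ['http://www.bing.com', 'http://www.baidu.com']:
    deferred = fetch_body(url)
    deferred.addCallback(print)
    deferred_list.append(deferred)

defer.DeferredList(deferred_list).addBoth(all_done)
reactor.run()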

Decorator-based, version 1

Wrap each crawl in a generator decorated with defer.inlineCallbacks; yielding a Deferred suspends the generator until that Deferred fires.

from twisted.web.client import getPage, defer
from twisted.internet import reactor

# Decorator-based, version 1
def all_done(arg):
    reactor.stop()


def onedone(response):
    print(response)

# Three essentials: the decorator, a Deferred object, and yield
@defer.inlineCallbacks
def task(url):
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(onedone)
    yield deferred


deferred_list = list()

url_list = ['http://www.bing.com', 'http://www.baidu.com']

for url in url_list:
    deferred = task(url)
    deferred_list.append(deferred)

dlist = defer.DeferredList(deferred_list)
dlist.addBoth(all_done)

reactor.run()
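On Python 3.5+ the same pattern can be written as a coroutine instead of an inlineCallbacks generator; defer.ensureDeferred turns the coroutine into a Deferred. A sketch, assuming a Twisted version that still ships getPage and supports awaiting Deferreds (16.4 or later):

from twisted.web.client import getPage
from twisted.internet import defer, reactor


async def task(url):
    # await plays the role that yield plays inside an inlineCallbacks generator
    body = await getPage(bytes(url, encoding='utf8'))
    print(body)


deferred_list = [defer.ensureDeferred(task(url))
                 for url in ['http://www.bing.com', 'http://www.baidu.com']]
defer.DeferredList(deferred_list).addBoth(lambda _: reactor.stop())
reactor.run()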

Decorator-based, version 2

A single decorated generator yields two Deferreds in turn, so the second request starts only after the first one completes.

from twisted.web.client import getPage, defer
from twisted.internet import reactor


# Decorator-based, version 2

def all_done(arg):
    reactor.stop()


def onedone(response):
    print(response)


@defer.inlineCallbacks
def task():
    deferred_1 = getPage(bytes('http://www.baidu.com', encoding='utf8'))
    deferred_1.addCallback(onedone)
    yield deferred_1

    deferred_2 = getPage(bytes('http://www.bing.com', encoding='utf8'))
    deferred_2.addCallback(onedone)
    yield deferred_2


ret = task()
ret.addBoth(all_done)

reactor.run()
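Because each yield suspends the generator until its Deferred fires, version 2 fetches the two pages one after the other. If they should run concurrently, a DeferredList can be yielded instead; a quick sketch, not from the original post:

from twisted.web.client import getPage, defer
from twisted.internet import reactor


@defer.inlineCallbacks
def task():
    # start both requests first, then wait for the pair together
    d1 = getPage(bytes('http://www.baidu.com', encoding='utf8'))
    d2 = getPage(bytes('http://www.bing.com', encoding='utf8'))
    # DeferredList fires with a list of (success, result) tuples
    results = yield defer.DeferredList([d1, d2])
    for success, value in results:
        print(success, value[:60] if success else value)


task().addBoth(lambda _: reactor.stop())
reactor.run()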

Decorator-based, version 3

Yield a Deferred that is never fired to keep the generator, and with it the event loop, alive forever.

from twisted.web.client import getPage, defer
from twisted.internet import reactor


# Decorator-based, version 3: an everlasting loop
def all_done(arg):
    reactor.stop()


def onedone(response):
    print(response)


@defer.inlineCallbacks
def task():
    deferred_1 = getPage(bytes('http://www.bing.com', encoding='utf8'))
    deferred_1.addCallback(onedone)
    yield deferred_1

    stop_deferred = defer.Deferred()  # a Deferred that never fires on its own
    # stop_deferred.callback(None)  # firing it by hand would let the generator finish
    yield stop_deferred


ret = task()
ret.addBoth(all_done)

reactor.run()  # run() starts the event loop
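The commented-out callback line is the escape hatch: whoever holds stop_deferred can fire it later and let the generator finish. A small sketch that fires it from a timer (the two-second delay is arbitrary):

from twisted.internet import defer, reactor


@defer.inlineCallbacks
def task():
    stop_deferred = defer.Deferred()
    # fire the Deferred two seconds after the reactor starts;
    # until then the generator stays suspended on the yield
    reactor.callLater(2, stop_deferred.callback, None)
    yield stop_deferred
    print('stop_deferred fired, task finished')


task().addBoth(lambda _: reactor.stop())
reactor.run()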

Decorator-based, version 4

Track in-flight URLs in running_list and fire stop_deferred once the list empties, so the loop stops exactly when the work is done.

from twisted.web.client import defer, getPage
from twisted.internet import reactor

# Decorator-based, version 4: stop the event loop once the work is done

running_list = list()
stop_deferred = None


def all_done(arg):
    reactor.stop()


def onedone(response, url):
    print(response)
    running_list.remove(url)


def check_empty(response):
    if not running_list:
        stop_deferred.callback(None)


@defer.inlineCallbacks
def task(url):
    # create stop_deferred before yielding; check_empty runs as a callback
    # of the page request and would otherwise find stop_deferred still None
    global stop_deferred
    stop_deferred = defer.Deferred()

    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(onedone, url)
    deferred.addCallback(check_empty)
    yield deferred
    yield stop_deferred


running_list.append('http://www.baidu.com')
ret = task('http://www.baidu.com')
ret.addBoth(all_done)

reactor.run()

Decorator-based, version 5

The same bookkeeping moved into an ExecutionEngine class, whose open_spider/close_spider pair foreshadows Scrapy's real engine.

from twisted.web.client import getPage, defer
from twisted.internet import reactor


class ExecutionEngine(object):

    def __init__(self):
        # created up front so check_empty can fire it even if the page
        # finishes before close_spider gets to run
        self.stop_deferred = defer.Deferred()
        self.running_list = list()

    def one_done(self, response, url):
        print(response)
        self.running_list.remove(url)

    def check_empty(self, response):
        if not self.running_list:
            self.stop_deferred.callback(None)

    @defer.inlineCallbacks
    def open_spider(self, url):
        deferred = getPage(bytes(url, encoding='utf8'))
        deferred.addCallback(self.one_done, url)
        deferred.addCallback(self.check_empty)
        yield deferred

    @defer.inlineCallbacks
    def close_spider(self, url):
        yield self.stop_deferred


@defer.inlineCallbacks
def task(url):
    engine = ExecutionEngine()
    engine.running_list.append(url)
    yield engine.open_spider(url)
    yield engine.close_spider(url)


def all_done(arg):
    reactor.stop()


if __name__ == "__main__":
    ret = task("http://www.bing.com")
    ret.addBoth(all_done)
    reactor.run()
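Splitting the bookkeeping into running_list plus stop_deferred pays off once several URLs share one engine. A hedged sketch of such a driver, not in the original post; it reuses the classes above and relies on stop_deferred being created in __init__ so it exists before any page completes:

@defer.inlineCallbacks
def task(url_list):
    engine = ExecutionEngine()
    engine.running_list.extend(url_list)
    for url in url_list:
        engine.open_spider(url)  # not yielded, so the requests run concurrently
    # close_spider yields stop_deferred, which check_empty fires
    # once the last URL has been removed from running_list
    yield engine.close_spider(None)


if __name__ == "__main__":
    ret = task(['http://www.bing.com', 'http://www.baidu.com'])
    ret.addBoth(all_done)
    reactor.run()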

miniscrapy: a first look at the Scrapy source

Putting the pieces together: Request and Response are plain data carriers, Scheduler is a FIFO queue of pending requests, ExecutionEngine pulls from the scheduler under a concurrency cap, Crawler wires a spider to an engine, and CrawlerProcess runs many crawlers on a single reactor, the same division of labour as in Scrapy itself.

from twisted.web.client import getPage, defer
from twisted.internet import reactor
import queue
import types


class Request(object):

    def __init__(self, url, callback=None):
        # callback defaults to None so Spider.start_requests can omit it;
        # the engine then falls back to spider.parse
        self.url = url
        self.callback = callback


class Response(object):

    def __init__(self, body, request):
        self.body = body
        self.request = request
        self.url = request.url

    @property
    def text(self):
        return self.body.decode('utf8')


class Scheduler(object):

    def __init__(self, engine):
        self.q = queue.Queue()
        self.engine = engine

    def enqueue_request(self, request):
        self.q.put(request)

    def next_request(self):
        try:
            req = self.q.get(block=False)
        except Exception as e:
            req = None
        return req

    def size(self):
        return self.q.qsize()


class ExecutionEngine(object):

    def __init__(self):
        self._closewait = None
        self.running = True
        self.start_requests = None
        self.scheduler = Scheduler(self)
        self.inprogress = set()

    def check_empty(self, response):
        if not self.running:
            self._closewait.callback(None)

    def _next_request(self):
        while self.start_requests:
            try:
                request = next(self.start_requests)
            except StopIteration:
                self.start_requests = None
            else:
                self.scheduler.enqueue_request(request)
        while len(self.inprogress) < 5 and self.scheduler.size() > 0:  # concurrency cap: at most 5 requests in flight
            request = self.scheduler.next_request()
            if not request:
                break
            self.inprogress.add(request)
            d = getPage(bytes(request.url, encoding='utf8'))
            d.addBoth(self._handle_downloader_output, request)
            d.addBoth(lambda x, req: self.inprogress.remove(req), request)
            d.addBoth(lambda x: self._next_request())
        if len(self.inprogress) == 0 and self.scheduler.size() == 0:
            self._closewait.callback(None)

    def _handle_downloader_output(self, body, request):
        # build a Response, run the request's callback (or the spider's
        # default parse), and enqueue any new Request objects it yields
        response = Response(body, request)
        func = request.callback or self.spider.parse
        gen = func(response)
        if isinstance(gen, types.GeneratorType):
            for req in gen:
                self.scheduler.enqueue_request(req)

    @defer.inlineCallbacks
    def start(self):
        self._closewait = defer.Deferred()
        yield self._closewait

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests):
        self.start_requests = start_requests
        self.spider = spider
        yield None
        reactor.callLater(0, self._next_request)


class Crawler(object):

    def __init__(self, spider_cls):
        self.spider_cls = spider_cls
        self.spider = None
        self.engine = None

    @defer.inlineCallbacks
    def crawl(self):
        self.engine = ExecutionEngine()
        self.spider = self.spider_cls()
        start_requests = iter(self.spider.start_requests())
        yield self.engine.open_spider(self.spider, start_requests)
        yield self.engine.start()


class CrawlerProcess(object):

    def __init__(self):
        self._active = set()
        self.crawlers = set()

    def crawl(self, spider_cls, *args, **kwargs):
        crawler = Crawler(spider_cls)

        self.crawlers.add(crawler)
        d = crawler.crawl(*args, **kwargs)
        self._active.add(d)
        return d

    def start(self):
        d = defer.DeferredList(self._active)
        d.addBoth(self._stop_reactor)
        reactor.run()

    def _stop_reactor(self, _=None):
        reactor.stop()


class Spider(object):

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url)


class BaiduSpider(Spider):
    name = 'baidu'
    start_urls = [
        'http://www.baidu.com'
    ]

    def parse(self, response):
        print(response.text)


class BingSpider(Spider):
    name = 'bing'
    start_urls = [
        'http://www.bing.com'
    ]

    def parse(self, response):
        print(response.text)


if __name__ == "__main__":
    spider_cls_list = [BaiduSpider, BingSpider]
    crawler_process = CrawlerProcess()
    for spider_cls in spider_cls_list:
        crawler_process.crawl(spider_cls)
    crawler_process.start()
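For comparison, the real Scrapy entry point looks almost the same: CrawlerProcess, crawl() and start() below are the actual Scrapy API, while the spider is just an illustrative counterpart of the toy one above.

import scrapy
from scrapy.crawler import CrawlerProcess


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    start_urls = ['http://www.baidu.com']

    def parse(self, response):
        print(response.text)


process = CrawlerProcess()
process.crawl(BaiduSpider)
process.start()  # starts the Twisted reactor and blocks until every crawl finishes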

 
