scrapy之downloader执行流程

Posted on 2018-03-06 23:49  王将军之武库  阅读(327)  评论(0)  编辑  收藏  举报

# Module-level alias for Twisted's client.Agent class; ScrapyAgent below
# stores it as its `_Agent` class attribute.
Agent = client.Agent

class ScrapyAgent(object):
    """Choose an HTTP agent for a request and issue the request through it.

    The agent classes are held as class attributes and instantiated via
    ``self`` in ``_get_agent``.
    """

    # Twisted's client.Agent class; used when the request has no proxy.
    _Agent = Agent
    # Used for proxied requests that do not go through a tunnel.
    _ProxyAgent = ProxyAgent
    # Used for https requests sent through a proxy.
    _TunnelingAgent = TunnelingAgent

    def __init__(
            self,
            contextFactory=None,
            connectTimeout=10,
            bindAddress=None,
            pool=None,
            maxsize=0,
            warnsize=0,
            fail_on_dataloss=True):
        """Record the connection and download settings on the instance.

        Nothing is created or connected here; every argument is stored
        unchanged on a private attribute of the same name.

        :param contextFactory: passed as ``contextFactory`` to the agents
            built in ``_get_agent``.
        :param connectTimeout: fallback timeout used by ``download_request``
            when the request carries no ``download_timeout`` meta value.
        :param bindAddress: fallback bind address used by ``_get_agent``
            when the request carries no ``bindaddress`` meta value.
        :param pool: passed as ``pool`` to the agents built in ``_get_agent``.
        :param maxsize: stored as ``_maxsize``.
        :param warnsize: stored as ``_warnsize``.
        :param fail_on_dataloss: stored as ``_fail_on_dataloss``.
        """
        # Settings consumed when an agent is built for a request.
        self._contextFactory = contextFactory
        self._connectTimeout = connectTimeout
        self._bindAddress = bindAddress
        self._pool = pool

        # Settings that are only stored here; their consumers are not part
        # of the visible portion of this class.
        self._maxsize = maxsize
        self._warnsize = warnsize
        self._fail_on_dataloss = fail_on_dataloss

        # No response has been seen yet.
        self._txresponse = None

    def _get_agent(self, request, timeout):
        """Return the agent that should carry ``request``.

        Without a ``proxy`` entry in ``request.meta`` a plain ``_Agent`` is
        built.  With a proxy, an https target gets a ``_TunnelingAgent``
        unless the proxy parameters contain ``noconnect``; every other
        proxied request gets a ``_ProxyAgent`` over a TCP endpoint to the
        proxy.

        :param request: the request being downloaded; its ``meta``, ``url``
            and ``headers`` are read.
        :param timeout: connect timeout handed to the agent or endpoint.
        :return: an instance of ``_Agent``, ``_ProxyAgent`` or
            ``_TunnelingAgent``.
        """
        # A per-request bind address wins over the instance-wide one.
        local_address = request.meta.get('bindaddress') or self._bindAddress
        proxy_url = request.meta.get('proxy')

        if not proxy_url:
            # Direct connection: no proxy configured for this request.
            return self._Agent(
                reactor,
                contextFactory=self._contextFactory,
                connectTimeout=timeout,
                bindAddress=local_address,
                pool=self._pool,
            )

        _, _, proxy_host, proxy_port, proxy_params = _parse(proxy_url)
        target_scheme = _parse(request.url)[0]
        proxy_host = to_unicode(proxy_host)
        skip_tunnel = b'noconnect' in proxy_params
        needs_tunnel = target_scheme == b'https' and not skip_tunnel

        if not needs_tunnel:
            proxy_endpoint = TCP4ClientEndpoint(
                reactor, proxy_host, proxy_port,
                timeout=timeout, bindAddress=local_address)
            return self._ProxyAgent(proxy_endpoint)

        # The proxy credentials travel with the tunnel configuration.
        proxy_auth = request.headers.get(b'Proxy-Authorization', None)
        tunnel_conf = (proxy_host, proxy_port, proxy_auth)
        return self._TunnelingAgent(
            reactor,
            tunnel_conf,
            contextFactory=self._contextFactory,
            connectTimeout=timeout,
            bindAddress=local_address,
            pool=self._pool,
        )

    def download_request(self, request):
        """Issue ``request`` through an agent and wire up the callback chain.

        The timeout is the request's ``download_timeout`` meta value, falling
        back to the instance's connect timeout.  The Deferred returned by
        ``agent.request`` gets latency, body-ready and body-done callbacks,
        and is scheduled for cancellation after ``timeout`` seconds.

        NOTE(review): this listing is truncated partway through the method,
        so the final callback registration and the return value are not
        visible here.
        """
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        # Drop the URL fragment before sending.
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = TxHeaders(request.headers)
        # For a tunneling agent the Proxy-Authorization header is removed
        # from the request headers (_get_agent already put its value into
        # the tunnel configuration).
        if isinstance(agent, self._TunnelingAgent):
            headers.removeHeader(b'Proxy-Authorization')
        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        elif method == b'POST':
            # Setting Content-Length: 0 even for POST requests is not a
            # MUST per HTTP RFCs, but it's common behavior, and some
            # servers require this, otherwise returning HTTP 411 Length required
            #
            # RFC 7230#section-3.3.2:
            # "a Content-Length header field is normally sent in a POST
            # request even when the value is 0 (indicating an empty payload body)."
            #
            # Twisted < 17 will not add "Content-Length: 0" by itself;
            # Twisted >= 17 fixes this;
            # Using a producer with an empty-string sends `0` as Content-Length
            # for all versions of Twisted.
            bodyproducer = _RequestBodyProducer(b'')
        else:
            bodyproducer = None
        start_time = time()
        # Hand the request to the agent's request() method.
        d = agent.request(
            method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        # Cancel the Deferred if it has not completed after `timeout` seconds.
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        # NOTE(review): the listing is cut off in the middle of the next
        # statement.
        d.addBoth(self._cb_timeout, req
class Agent(_AgentBase):
    """
    An agent that turns each request URI into an endpoint by asking an
    endpoint factory, then issues the request over that endpoint.

    The regular constructor wraps the HTTPS policy, connect timeout and bind
    address in a L{_StandardEndpointFactory}; L{usingEndpointFactory} takes a
    ready-made endpoint factory instead.
    """

    def __init__(self, reactor,
                 contextFactory=BrowserLikePolicyForHTTPS(),
                 connectTimeout=None, bindAddress=None,
                 pool=None):
        """
        Build the standard endpoint factory and initialize the agent.

        @param reactor: passed to the endpoint factory and to the base class.
        @param contextFactory: the HTTPS policy.  If it does not provide
            L{IPolicyForHTTPS}, a C{DeprecationWarning} is issued and the
            object is wrapped in L{_DeprecatedToCurrentPolicyForHTTPS}.
        @param connectTimeout: forwarded to L{_StandardEndpointFactory}.
        @param bindAddress: forwarded to L{_StandardEndpointFactory}.
        @param pool: forwarded to the base class initializer.
        """
        policy = contextFactory
        if not IPolicyForHTTPS.providedBy(policy):
            message = (
                repr(policy) +
                " was passed as the HTTPS policy for an Agent, but it does "
                "not provide IPolicyForHTTPS.  Since Twisted 14.0, you must "
                "pass a provider of IPolicyForHTTPS."
            )
            # stacklevel=2 attributes the warning to whoever called Agent().
            warnings.warn(message, category=DeprecationWarning, stacklevel=2)
            policy = _DeprecatedToCurrentPolicyForHTTPS(policy)

        standardFactory = _StandardEndpointFactory(
            reactor, policy, connectTimeout, bindAddress)
        self._init(reactor, standardFactory, pool)


    @classmethod
    def usingEndpointFactory(cls, reactor, endpointFactory, pool=None):
        """
        Alternate constructor: create an L{Agent} that uses the given
        endpoint factory to work out how to connect to the server.

        C{__init__} is bypassed, so no HTTPS policy check takes place.

        @param reactor: forwarded to the base class initializer.
        @param endpointFactory: object whose C{endpointForURI} is called for
            every request.
        @param pool: forwarded to the base class initializer.
        @return: the new agent instance.
        """
        instance = cls.__new__(cls)
        instance._init(reactor, endpointFactory, pool)
        return instance


    def _init(self, reactor, endpointFactory, pool):
        """
        Shared initialization used by both constructors: run the base class
        initializer, then remember the endpoint factory.
        """
        _AgentBase.__init__(self, reactor, pool)
        self._endpointFactory = endpointFactory


    def _getEndpoint(self, uri):
        """
        Ask the endpoint factory for the endpoint matching C{uri}.

        @param uri: the parsed request URI.
        @return: whatever the factory's C{endpointForURI} returns.
        """
        factory = self._endpointFactory
        return factory.endpointForURI(uri)


    def request(self, method, uri, headers=None, bodyProducer=None):
        """
        Issue a request to the server indicated by the given C{uri}.

        An existing connection from the connection pool may be used or a new
        one may be created.

        I{HTTP} and I{HTTPS} schemes are supported in C{uri}.

        @return: the result of C{_requestWithEndpoint}, or a failed Deferred
            wrapping L{SchemeNotSupported} if no endpoint can be built.

        @see: L{twisted.web.iweb.IAgent.request}
        """
        target = URI.fromBytes(uri)
        try:
            endpoint = self._getEndpoint(target)
        except SchemeNotSupported:
            # Failure() must be created inside the handler so that it picks
            # up the exception currently being handled.
            return defer.fail(Failure())

        # Key computed from the scheme/host/port triple of the target URI.
        poolKey = (target.scheme, target.host, target.port)
        return self._requestWithEndpoint(
            poolKey, endpoint, method, target,
            headers, bodyProducer, target.originForm)
# Excerpt from class HTTPConnectionPool(object): obtain a connection from
# the pool.
def getConnection(self, key, endpoint):
    """
    Return a cached idle connection for C{key}, or open a new one.

    Cached connections for C{key} are taken from the front of the list one
    at a time.  Each one has its pending timeout cancelled and forgotten; the
    first one whose state is C{"QUIESCENT"} is returned, and any other is
    simply dropped.  If none qualifies, a new connection is requested.

    @param key: lookup key into the connection cache (presumably the
        C{(scheme, host, port)} tuple built by the agent -- confirm against
        the caller).
    @param endpoint: passed to C{_newConnection} whenever a fresh connection
        has to be made.
    @return: C{defer.succeed(connection)} for a cached connection, otherwise
        whatever C{_newConnection(key, endpoint)} returns.
    """
    # Try to get cached version:
    connections = self._connections.get(key)
    while connections:
        connection = connections.pop(0)
        # Cancel timeout:
        self._timeouts[connection].cancel()
        del self._timeouts[connection]
        if connection.state == "QUIESCENT":  # the connection is idle
            if self.retryAutomatically:
                def newConnection():
                    # Factory handed to the retrying wrapper so that it can
                    # obtain a fresh connection for the same key/endpoint.
                    return self._newConnection(key, endpoint)

                connection = _RetryingHTTP11ClientProtocol(
                    connection, newConnection)
            return defer.succeed(connection)  # success

    # Nothing usable was cached.
    return self._newConnection(key, endpoint)

 

Copyright © 2024 王将军之武库
Powered by .NET 9.0 on Kubernetes