scrapy

1、Basic commands:

Global commands (can be run anywhere, with or without a project):
    scrapy startproject myproject   # create a project (myproject is the project name, not the spider name)
    cd myproject
    scrapy genspider baidu www.baidu.com
        # creates the spider file baidu.py and registers the spider name "baidu"
        # www.baidu.com is the allowed domain; it is optional, but if set the spider only crawls URLs under that domain
        # a project can hold several spiders; list them with scrapy list
    scrapy settings
        # run inside a project directory, this prints that project's settings
    scrapy runspider baidu.py
        # takes the path to the spider file (not the project)
        # runs a standalone Python file, no project needed
    scrapy shell http://www.baidu.com
        # interactive debugging, e.g. checking whether selector rules are correct: response.status / response.text / response.body
        # response.xpath(...)
        # (see the short shell transcript after this list)
    scrapy view http://www.baidu.com   (or view(response) inside the shell)
        # downloads the page locally and opens it in a browser, which helps tell which data comes from AJAX requests
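A short scrapy shell session, as a sketch (the output values shown are only illustrative):

    $ scrapy shell http://www.baidu.com
    >>> response.status
    200
    >>> response.xpath('//title/text()').extract_first()
    '百度一下,你就知道'
    >>> view(response)      # saves the response to a temp file and opens it in the browser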
Project-only commands:
    scrapy crawl baidu --nolog   (baidu is the spider name)
        # run the spider; a project is required, and make sure ROBOTSTXT_OBEY = False in the settings file
    scrapy check
        # check the spiders in the project for errors (contract checks)
    scrapy list
        # list the spider names contained in the project
    scrapy parse http://www.baidu.com/ --callback parse
        # verify that the given callback handles the URL correctly
    scrapy bench
        # scrapy bench runs a benchmark, showing how many pages per minute can be crawled
    Official documentation:
        https://docs.scrapy.org/en/latest/topics/commands.html

2、custom_settings

custom_settings is a dict of configuration values that override the project-level settings while the spider runs. It must be defined as a class attribute (placed before parse), because the settings are loaded before the class is instantiated; the spider-level custom_settings then take precedence over settings.py.

import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from ..items import * 
from scrapy.http.cookies import CookieJar

class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["xxx", ]
    start_urls = ["xxxx", ]
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            "User-Agent": ...,
            "Accept-Language": ...,
        }
    }
    cookie_dict = {}
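A quick way to see the override, as a sketch (assuming the spider above; self.settings is the spider's settings attribute, and inside any callback it already reflects custom_settings):

    def parse(self, response):
        # self.settings merges settings.py with custom_settings;
        # for the keys it defines, the spider-level custom_settings wins
        print(self.settings.getdict("DEFAULT_REQUEST_HEADERS"))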

3、start_requests: two ways to write it

Way 1: yield each Request directly

def start_requests(self):
    for url in self.start_urls:
        yield Request(url=url, callback=self.parse)

Way 2: return a list of Requests

def start_requests(self):
    req_list = []
    for url in self.start_urls:
        req_list.append(Request(url=url, callback=self.parse2))
    return req_list  # Scrapy turns the return value (a list, or any iterable) into an iterator internally.

4、Cookie handling

4.1、POST requests

from scrapy.http import Request 

req = Request(
    url='http://dig.chouti.com/login',
    method='POST',
    body='phone=8613121758648&password=woshiniba&oneMonth=1',
    headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
    cookies={},
        # alternatively: meta={'cookiejar': True} or cookies=self.cookie_dict
    callback=self.parse_check_login,
            )

4.2、Handling cookies manually

def parse(self, response):

    print(response.headers.getlist('Set-Cookie'))    # raw cookies

    from scrapy.http.cookies import CookieJar        # parsed cookies
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    print(self.cookie_dict)
    req = Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8615131255089&password=pppppppp&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login
        )
    yield req

def check_login(self, response):
    yield Request(
            url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),  # link_id: id of the link to upvote, obtained elsewhere
            method='POST',
            cookies=self.cookie_dict,
            callback=self.do_favor
        )

def do_favor(self, response):
    print(response.text)

4.3、Handling cookies automatically: meta={'cookiejar': True}

The settings file controls whether cookie handling is allowed:
    # Disable cookies (enabled by default)
    # COOKIES_ENABLED = False   # around line 36 of settings.py
import scrapy
from scrapy.http import Request

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/',]

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url,callback=self.parse_index,meta={'cookiejar':True})
            
    def parse_index(self,response):
        req = Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8613121758648&password=woshiniba&oneMonth=1',
            callback=self.parse_check_login,
            meta={'cookiejar': True}
        )
        yield req

    def parse_check_login(self,response):
        # print(response.text)
        yield Request(
            url='https://dig.chouti.com/link/vote?linksId=19440976',
            method='POST',
            callback=self.parse_show_result,
            meta={'cookiejar': True},
        )

    def parse_show_result(self,response):
        print(response.text)

4.4、Other cookie usage

Simulating a browser login

start_requests() returns the initial requests for the spider; these requests take the place of (and override) the ones generated from start_urls.

Request() issues a GET request; you can set the url, cookies, and the callback.

FormRequest.from_response() submits a form via POST. Its first (required) argument is the response object that carries the previous response's cookies; other arguments include cookies, url, the form data, and so on.

yield Request() hands a new request back to the engine to be crawled.


Cookie handling when sending requests (a condensed sketch follows this list):
meta={'cookiejar': 1} turns cookie recording on; put it on the first Request().
meta={'cookiejar': response.meta['cookiejar']} reuses the cookies of the previous response; put it on the FormRequest.from_response() that POSTs the login form.
meta={'cookiejar': True} uses the now-authorized cookies to visit pages that require login.
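A condensed sketch of the three usages; the URLs, form fields and callback names are placeholders, and the complete working example appears further below:

    def start_requests(self):
        # 1) first request: turn cookie recording on
        yield Request('http://example.com/login_page', meta={'cookiejar': 1}, callback=self.login)

    def login(self, response):
        # 2) POST the form, carrying the cookies recorded from the previous response
        yield FormRequest.from_response(response,
                                        formdata={'user': 'xxx', 'passwd': 'xxx'},
                                        meta={'cookiejar': response.meta['cookiejar']},
                                        callback=self.after_login)

    def after_login(self, response):
        # 3) use the authorized cookies for pages that need a login
        yield Request('http://example.com/user_center', meta={'cookiejar': True}, callback=self.parse_user)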

 

Getting cookies in Scrapy:

Request cookies:
    Cookie = response.request.headers.getlist('Cookie')
    print(Cookie)

Response cookies:
    Cookie2 = response.headers.getlist('Set-Cookie')
    print(Cookie2)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request,FormRequest

class PachSpider(scrapy.Spider):                            # a spider class must inherit from scrapy.Spider
    name = 'pach'                                           # spider name
    allowed_domains = ['edu.iqianyue.com']                  # domain to crawl
    # start_urls = ['http://edu.iqianyue.com/index_user_login.html']   # only suitable for requests that need no login, since cookies etc. cannot be set here

    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'}  # browser user agent

    def start_requests(self):       # use start_requests() instead of start_urls
        """First request the login page with cookie recording turned on, so we receive cookies; set the callback."""
        return [Request('http://edu.iqianyue.com/index_user_login.html', meta={'cookiejar': 1}, callback=self.parse)]

    def parse(self, response):      # parse callback

        data = {                    # login fields, matching what packet capture shows
            'number': 'adc8868',
            'passwd': '279819',
            'submit': ''
            }

        # response cookies
        Cookie1 = response.headers.getlist('Set-Cookie')   # the cookies the backend set on the first visit to the login page
        print(Cookie1)

        print('logging in')
        """Second request: POST the form, carrying the cookies, the browser user agent and the login fields, to get the cookies authorized."""
        return [FormRequest.from_response(response,
                                          url='http://edu.iqianyue.com/index_user_login',   # the real POST address
                                          meta={'cookiejar': response.meta['cookiejar']},
                                          headers=self.header,
                                          formdata=data,
                                          callback=self.next,
                                          )]

    def next(self, response):
        a = response.body.decode("utf-8")   # inspect the response after logging in
        # print(a)
        """After logging in, request a page that requires login (e.g. the user center), carrying the authorized cookies."""
        yield Request('http://edu.iqianyue.com/index_user_index.html', meta={'cookiejar': True}, callback=self.next2)

    def next2(self, response):
        # request cookies
        Cookie2 = response.request.headers.getlist('Cookie')
        print(Cookie2)

        body = response.body                        # page content as bytes
        unicode_body = response.body_as_unicode()   # page content as str

        a = response.xpath('/html/head/title/text()').extract()  # title of the user-center page
        print(a)

5、xpath

    # from scrapy.selector import HtmlXPathSelector
    # hxs = HtmlXPathSelector(response=response)
    # div_tag = hxs.xpath("//div[@id='content-list']/div[@class='item']")

    # you can call response.xpath() directly; there is no need for HtmlXPathSelector
    div_tag = response.xpath("//div[@id='content-list']/div[@class='item']")

    for div in div_tag:
        a_text = div.xpath(".//div[@class='part1']/a[1]/text()").extract_first()
        a_href = div.xpath(".//div[@class='part1']/a[1]/@href").extract_first(default="not found")
        print(a_text, a_href)

        from ..items import *
        item = XianglongItem(title=a_text, href=a_href)
        yield item

        pages = response.xpath().extract()   # xpath for the pagination links (omitted here)
        for page in pages:
            page_url = "xxxxx" + page
            yield Request(url=page_url, callback=self.parse)
        # this keeps crawling indefinitely; set DEPTH_LIMIT = 1 in settings to limit the depth
response.xpath(".//div[@class='part1']/a[1]/@href").extract_first(default="not found")

6、Pipelines: process an item, or drop it with DropItem

settings.py configuration (the dict values are priorities; lower numbers run first):
    ITEM_PIPELINES = {
        "<project>.pipelines.TextPipeline": 300,
        "<project>.pipelines.MongoPipeline": 400,
    }
    MONGO_URL = "localhost"
    MONGO_DB = "quotes"
from scrapy.exceptions import DropItem

class TextPipeline(object):
    def __init__(self):
        self.limit = 50

    def process_item(self, item, spider):
        if item["text"]:
            if len(item["text"]) > self.limit:
                item["text"] = item["text"][0:self.limit].rstrip() + "..."
            return item
        else:
            raise DropItem("Missing Text")
import pymongo
from scrapy.exceptions import DropItem

class MongoPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # usually used to read settings; the same hook can also be used in spider.py

        return cls(
                mongo_url=crawler.settings.get("MONGO_URL"),
                mongo_db=crawler.settings.get("MONGO_DB"),
            )
            # the values fetched here are passed as the mongo_url / mongo_db
            # parameters of __init__, which receives them above

    def open_spider(self, spider):

        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        name = item.__class__.__name__            # could be used as the collection name
        self.db["quotes"].insert(dict(item))      # insert_one() in pymongo 3+
        return item

    def close_spider(self, spider):
        self.client.close()
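The comment in from_crawler above notes that the same hook can also be used in spider.py; a minimal sketch, reusing the MONGO_URL setting as an example:

    import scrapy

    class MySpider(scrapy.Spider):
        name = "my"

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
            # read project settings while the spider is being built
            spider.mongo_url = crawler.settings.get("MONGO_URL")
            return spider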
import json
from scrapy.exceptions import DropItem

class JsonPipLine(object):
    def open_spider(self, spider):
        self.file = open("items.jl", "w")

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
from scrapy.exceptions import DropItem

class DropPipLine(object):
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item["id"] in self.ids_seen:
            raise DropItem("duplicate item: %s" % item["id"])
        else:
            self.ids_seen.add(item["id"])
            return item

7、dupefilter

7.1、How the default dupefilter works (source walkthrough)

  By default Scrapy uses scrapy.dupefilter.RFPDupeFilter for request deduplication.

  dupefilter.py (source). Note the return value: True means the request has already been seen; False means it has not.

Default settings:
    DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
    DUPEFILTER_DEBUG = False
    JOBDIR = "/root/"   # directory for the seen-requests log; the final path is /root/requests.seen
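JOBDIR can also be passed per run on the command line; a sketch (the path is just an example, chouti is a spider name used elsewhere in these notes):

    scrapy crawl chouti -s JOBDIR=/root/
    # /root/requests.seen then records the fingerprints of requests already made,
    # so an interrupted crawl can be resumed without refetching them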

  RFPDupeFilter inherits from BaseDupeFilter and uses request_fingerprint to identify each request:

class RFPDupeFilter(BaseDupeFilter):
    def __init__(self, path=None, debug=False):
        self.fingerprints = set()         # a set of fingerprints

    def request_seen(self, request):      # the key method
        fp = self.request_fingerprint(request)   # fingerprint of the request
        if fp in self.fingerprints:
            return True                   # already seen: the request will not be scheduled
        self.fingerprints.add(fp)         # not seen yet: record it
        return False
        # (the scrapy-redis variant does the same with a Redis set:
        #  added = self.server.sadd(self.key, fp); return added == 0
        #  added == 0 means the fingerprint already existed, so the request is not queued again)

    def request_fingerprint(self, request):   # compute the fingerprint
        return request_fingerprint(request)

  Aside: request_fingerprint builds a unique identifier for a request. The two URLs below differ only in parameter order, so they produce the same fingerprint:

    http://www.oldboyedu.com?id=1&age=2
    http://www.oldboyedu.com?age=2&id=1

    from scrapy.utils.request import request_fingerprint
    from scrapy.http import Request

    u1 = Request(url='http://www.oldboyedu.com?id=1&age=2')
    u2 = Request(url='http://www.oldboyedu.com?age=2&id=1')

    result1 = request_fingerprint(u1)
    result2 = request_fingerprint(u2)
    print(result1,result2)

7.2、Writing your own dupefilter

class MyDupeFilter(BaseDupeFilter):
    def __init__(self):
        self.visited_url = set()

    @classmethod
    def from_settings(cls, settings):
        """
        Called when the filter is created.
        :param settings:
        :return:
        """
        return cls()

    def request_seen(self, request):
        """
        Check whether the current request has already been seen.
        :param request:
        :return: True if it has been seen before; False if not.
        """
        if request.url in self.visited_url:
            return True
        self.visited_url.add(request.url)
        return False

    def open(self):
        """
        Called when the crawl starts.
        :return:
        """
        print('open replication')

    def close(self, reason):
        """
        Called when the crawl finishes.
        :param reason:
        :return:
        """
        print('close replication')

    def log(self, request, spider):
        """
        Log duplicate requests.
        :param request:
        :param spider:
        :return:
        """
        print('repeat', request.url)
Basic structure
settings.py configuration:
    DUPEFILTER_CLASS = 'xxxxxxx.dupe.MyDupeFilter'
from scrapy.dupefilter import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

class MyDupeFilter(BaseDupeFilter):

    def __init__(self):
        self.record = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        print('deduplicating:', request.url)
        ident = request_fingerprint(request)
        if ident in self.record:
            print('already visited', request.url, ident)
            return True
        self.record.add(ident)
        return False
        # why return True/False? because scheduler.py checks the return value (see section 8)

    def open(self):  # can return a deferred
        pass

    def close(self, reason):  # can return a deferred
        pass
dupe.py (create this file in the project package, matching the DUPEFILTER_CLASS path above)

8、scheduler

# Request objects are kept either in an internally maintained in-memory queue: self.q = deque()
# or in a queue maintained on disk (file-based)

from scrapy.core.scheduler import Scheduler

def enqueue_request(self, request):
    if not request.dont_filter and self.df.request_seen(request):
        self.df.log(request, self.spider)
        return False
    dqok = self._dqpush(request)
    if dqok: pass       # pushed to the disk queue
    else: pass          # otherwise falls back to the memory queue
    return True

# How dont_filter is interpreted:
# request.dont_filter = False (the default):
#     self.df.request_seen(request) is consulted:
#       - True  (already visited): the if is true, return False  -> the request is dropped
#       - False (not visited yet): the if is false, dqok = self._dqpush(request) -> the request is queued
# request.dont_filter = True:
#     the if is always false, so dqok = self._dqpush(request) -> every request goes into the scheduler

def next_request(self):
    pass
Excerpt from scrapy/core/scheduler.py (simplified)
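Deduplication can be bypassed per request with dont_filter; a one-line sketch (the URL is just an example):

    # dont_filter=True tells the scheduler to skip the dupefilter check for this request,
    # so it is enqueued even if an identical request was already seen
    yield Request(url='https://dig.chouti.com/', callback=self.parse, dont_filter=True)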

9、Downloader middleware: request headers, proxies, certificates, cookies

9.1、How do we attach request headers to every request the spider sends?

Option 1: add the header on each Request object (a sketch follows).
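A minimal sketch of Option 1; the User-Agent string is only an example, and any header can be set the same way:

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url,
                          headers={'User-Agent': 'Mozilla/5.0 ...'},   # per-request headers go here
                          callback=self.parse)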
Option 2: a downloader middleware.

Configuration:
    DOWNLOADER_MIDDLEWARES = {
       'xianglong.middlewares.UserAgentDownloaderMiddleware': 543,
    }

The middleware class:

class UserAgentDownloaderMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        return s

    def process_request(self, request, spider):
        request.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"

        # return None                           # continue with the remaining middlewares' process_request
        # from scrapy.http import Request
        # return Request(url='www.baidu.com')   # put a new request back into the scheduler; the current request is not processed further
        # from scrapy.http import HtmlResponse  # run every process_response, starting from the last middleware
        # return HtmlResponse(url='www.baidu.com', body=b'asdfuowjelrjaspdoifualskdjf;lajsdf')

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass
Option 3: the built-in downloader middleware; just set USER_AGENT in the settings file:
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

9.2、How do we add a proxy in Scrapy?

Scrapy ships a proxy middleware: scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware

Method 1: use the built-in proxy support; set the proxy through environment variables before the requests go out:
import os
import scrapy
from scrapy.http import Request

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def start_requests(self):
        os.environ['HTTP_PROXY'] = "http://192.168.11.11"

        for url in self.start_urls:
            yield Request(url=url,callback=self.parse)

    def parse(self, response):
        print(response)
Method 2: a custom downloader middleware that sets request.meta['proxy'] (and a Proxy-Authorization header for authenticated proxies):
import random
import base64
import six
def to_bytes(text, encoding=None, errors='strict'):
    """Return the binary representation of `text`. If `text`
    is already a bytes object, return it as-is."""
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)

class MyProxyDownloaderMiddleware(object):
    def process_request(self, request, spider):
        proxy_list = [
            {'ip_port': '111.11.228.75:80', 'user_pass': 'xxx:123'},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
            {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
        ]
        proxy = random.choice(proxy_list)
        if proxy['user_pass']:
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
        else:
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
Configuration:
    DOWNLOADER_MIDDLEWARES = {
       # 'xiaohan.middlewares.MyProxyDownloaderMiddleware': 543,
    }

9.3、Certificates

There are two cases for HTTPS access:
1. The target site uses a trusted certificate (supported by default):
    DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
    DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"

2. The target site uses a custom certificate:
# settings.py
    DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
    DOWNLOADER_CLIENTCONTEXTFACTORY = "<project_name>.https.MySSLFactory"
    
# https.py
    from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
    from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)
    
    class MySSLFactory(ScrapyClientContextFactory):
        def getCertificateOptions(self):
            from OpenSSL import crypto
            v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
            v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
            return CertificateOptions(
                privateKey=v1,   # PKey object
                certificate=v2,  # X509 object
                verify=False,
                method=getattr(self, 'method', getattr(self, '_ssl_method', None))
            )
            
            
Other related classes:
        scrapy.core.downloader.handlers.http.HttpDownloadHandler
        scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
        scrapy.core.downloader.contextfactory.ScrapyClientContextFactory

10、Spider middleware (middlewares.py)

class XiaohanSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.
    def __init__(self):
        pass
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s

    # Runs after each download finishes, before the parse callback is executed.
    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        print('process_spider_input',response)
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        print('process_spider_output',response)
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    # Triggered when the spider starts, the first time start_requests runs (executed only once).
    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).

        print('process_start_requests')
        for r in start_requests:
            yield r
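The middleware still has to be registered before it runs; a sketch, assuming the class lives in xiaohan/middlewares.py (the priority 543 is just the template default):

    # settings.py
    SPIDER_MIDDLEWARES = {
       'xiaohan.middlewares.XiaohanSpiderMiddleware': 543,
    }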

11、Extensions and signals

11.1、A plain extension

extends.py 
    class MyExtension(object):
        def __init__(self):
            pass

        @classmethod
        def from_crawler(cls, crawler):
            obj = cls()
            return obj
Configuration:
    EXTENSIONS = {
        'xiaohan.extends.MyExtension':500,
    }

11.2、Extension + signals

from scrapy import signals

class MyExtension(object):
    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        obj = cls()
        # when the spider opens, every function connected to the spider_opened signal is triggered: xxxxxxxxxxx1
        crawler.signals.connect(obj.xxxxxxxxxxx1, signal=signals.spider_opened)
        # when the spider closes, every function connected to the spider_closed signal is triggered: uuuuuuuuuu
        crawler.signals.connect(obj.uuuuuuuuuu, signal=signals.spider_closed)
        return obj

    def xxxxxxxxxxx1(self, spider):
        print('open')

    def uuuuuuuuuu(self, spider):
        print('close')

Configuration (around line 62 of settings.py):
    EXTENSIONS = {
        'xiaohan.extends.MyExtension': 500,
    }
extends.py (create a new file in the same directory as settings.py; the file name can also be extensions.py; its content is shown above)
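A more concrete sketch with real handler names, assuming we simply want to count scraped items; item_scraped and spider_closed are standard Scrapy signals:

    from scrapy import signals

    class ItemCounterExtension(object):
        def __init__(self):
            self.count = 0

        @classmethod
        def from_crawler(cls, crawler):
            obj = cls()
            crawler.signals.connect(obj.item_scraped, signal=signals.item_scraped)
            crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
            return obj

        def item_scraped(self, item, response, spider):
            self.count += 1        # called once per item that passes the pipelines

        def spider_closed(self, spider):
            print('items scraped:', self.count)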

12、Custom commands

Create a commands folder in the project and put a crawlall.py inside it (the file name is the name of the custom command). Add COMMANDS_MODULE = "sp3.commands" to settings.py. Then run the command from the project directory: scrapy crawlall (directly in cmd). The layout is sketched below.
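A sketch of that layout, assuming the project is called sp3 as in the text (the commands package needs an __init__.py so it can be imported; crawlall.py contains the Command class shown next):

    sp3/
        scrapy.cfg
        sp3/
            settings.py          # add: COMMANDS_MODULE = "sp3.commands"
            spiders/
            commands/
                __init__.py
                crawlall.py      # the file name becomes the command: scrapy crawlall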

from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings

class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        spider_list = self.crawler_process.spiders.list()
        
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()
def run(self, args, opts):
    # run() ultimately drives CrawlerProcess.crawl and CrawlerProcess.start
    from scrapy.crawler import CrawlerProcess
    """
        self.crawler_process holds the running crawlers: _active = {d,}
    """
    self.crawler_process.crawl('chouti', **opts.__dict__)
    self.crawler_process.crawl('cnblogs', **opts.__dict__)
    #
    self.crawler_process.start()

 
