06. scrapy的Request对象

class scrapy.http.Request(url[, callback, method='GET', headers, body, cookies, meta, encoding='utf-8', priority=0, dont_filter=False, errback])

 
参数详解:
  1. url : 目标请求地址
  2. callback : 指定响应返回后调用的回调函数, 默认调用 parse 方法
  3. method : 指定http方法, 默认为GET
  4. meta : request.meta 可以传一些键值对
  5. body : 请求正文, 二进制内容
  6. headers : http请求头
  7. cookies: 附带在请求中要一起发出的cookies对象
  8. encoding : 当前请求的编码方式, 默认为'utf-8'
  9. priority : 设置请求的优先级, 默认为0, 这个优先级是scheduler在线程中用于定义处理请求的顺序
  10. dont_filter : 默认为False, 设置为True则不过滤请求
  11. errback: 当请求发生任何异常时就会调用此回调函数
      

 

 

import scrapy 
from  scrapy.linkextractors import LinkExtractor

class DeepInSpider(scrapy.Spider):
    """Demo spider: extracts every link from a page, deduplicates them,
    and yields one follow-up Request per link, routing 'detail' pages
    to a dedicated callback."""

    name = 'example.com'
    start_urls = ['https://www.baidu.com']

    def parse(self, response):
        """Extract links from *response* and schedule one request each.

        Fixes vs. original: `senn` typo (NameError), unterminated string
        in `link.contains('detail)`, nonexistent `.contains` method,
        passing a Link object instead of a URL string to Request, and a
        duplicate yield per link.
        """
        link_extractor = LinkExtractor()
        seen = set()  # URLs already scheduled, to avoid duplicate requests

        extracted = link_extractor.extract_links(response)
        # Deduplicate by URL (a Link object is not a stable dedup key).
        links = [link for link in extracted if link.url not in seen]

        for link in links:
            print(link.url)
            seen.add(link.url)
            # Route detail pages to parse_detail; callback=None falls
            # back to scrapy's default (self.parse).
            cb = self.parse_detail if 'detail' in link.url else None
            # Request's url argument must be a string, not a Link object.
            yield scrapy.Request(url=link.url, callback=cb)

    def parse_detail(self, response):
        """Parse a detail page (placeholder)."""
        pass

 

posted @ 2019-10-17 17:51  眼镜儿  阅读(109)  评论(0编辑  收藏  举报