Crawling multiple URLs

# -*- coding: utf-8 -*-
import scrapy
from qiubai.items import QiubaiItem


class QiushibaiSpider(scrapy.Spider):
    name = 'qiushibai'
    # allowed_domains = ['www.qiushibaike.com/text/']
    start_urls = ['http://www.qiushibaike.com/text/']
    url = "https://www.qiushibaike.com/text/page/%d/"
    page = 1
    def parse(self, response):
        # xpath is recommended for parsing out the target content
        # (the framework integrates an xpath parsing interface);
        # here we grab each joke's text and its author
        div_list = response.xpath('//div[@id="content-left"]/div')

        # data_list = []  # old approach: collect parsed results in a plain list
        for div in div_list:

            # content matched by xpath is stored in Selector objects;
            # extract() pulls the stored data values out of a Selector
            # author = div.xpath("./div/a[2]/h2/text()").extract()[0]
            # extract_first() is equivalent to extract()[0]
            author = div.xpath("./div/a[2]/h2/text()").extract_first()
            content = div.xpath('.//div[@class="content"]/span/text()').extract_first()


            # store the parsed values in an item object
            item = QiubaiItem()
            item["author"] = author
            item["content"] = content
            # hand the item over to the item pipeline
            yield item
        # paginate: pages 2 through 14 (page 1 was fetched via start_urls)
        if self.page <= 13:
            self.page += 1
            print("crawling page %d" % self.page)
            # the redundant format() wrapper is dropped; % formatting already returns a string
            new_url = self.url % self.page
            # yield a new Request; callback routes its response back into this same parse method
            yield scrapy.Request(url=new_url, callback=self.parse)

        # (old approach: data_list.append(data) inside the loop, then
        # return data_list at the end instead of yielding each item)
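
The spider imports QiubaiItem from qiubai.items. That file is not shown in the post; a minimal sketch consistent with the two fields set above might look like this (an assumption, not the author's original):

# -*- coding: utf-8 -*-
# qiubai/items.py -- a minimal sketch (this file is not shown in the post)
import scrapy


class QiubaiItem(scrapy.Item):
    # only the two fields the spider actually sets are assumed here
    author = scrapy.Field()
    content = scrapy.Field()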

Key point: use yield together with callback. Yielding each item sends it to the pipeline, while yielding a scrapy.Request with callback=self.parse schedules the next page and routes its response back into the same parse method, so one method handles every page.
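
Since every yielded item is handed to the item pipeline, a hypothetical minimal pipeline that writes each author/content pair to a text file could look like the following; the class name, the output file qiubai.txt, and the writing logic are assumptions for illustration, not taken from the post:

# -*- coding: utf-8 -*-
# qiubai/pipelines.py -- a hypothetical sketch, not from the original post


class QiubaiPipeline(object):
    def open_spider(self, spider):
        # runs once when the spider starts: open the output file
        self.fp = open("qiubai.txt", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # runs for every yielded item: write one "author:content" line
        self.fp.write("%s:%s\n" % (item["author"], item["content"]))
        return item

    def close_spider(self, spider):
        # runs once when the spider finishes: close the file
        self.fp.close()

To activate it, add ITEM_PIPELINES = {"qiubai.pipelines.QiubaiPipeline": 300} to settings.py, then run the spider with scrapy crawl qiushibai.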
