# -*- coding: utf-8 -*-
import scrapy

from quotetutorial.items import QuotetutorialItem


# Most of the extraction logic for this project lives in this spider.
class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    # Crawl the listing page and extract each quote.
    def parse(self, response):
        # print(response.text)  # uncomment to dump the raw page source

        quotes = response.css('.col-md-8 .quote')
        for quote in quotes:
            item = QuotetutorialItem()
            # extract_first() returns a single value, similar to find_one
            # (on recent Scrapy versions, .get() is the equivalent)
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            # extract() returns all matches, similar to find_all
            # (.getall() is the modern equivalent)
            tags = quote.css('.tags .tag::text').extract()
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        # Follow the next-page link to continue crawling
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        if next_page is not None:
            # The href is relative, so join it with the base URL first
            url = response.urljoin(next_page)
            # Pass the parse method itself as the callback (do not call it),
            # so Scrapy re-enters parse() on the next page
            yield scrapy.Request(url=url, callback=self.parse)


# Saving the scraped items (feed exports):
# scrapy crawl quotes -o quotes.json
# scrapy crawl quotes -o quotes.jl
# scrapy crawl quotes -o quotes.csv
# scrapy crawl quotes -o quotes.xml
# scrapy crawl quotes -o ftp://user:pass@ftp.example.com/path/quotes.csv
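
# Instead of passing -o on every run, feed exports can also be configured
# once in the project's settings.py. A minimal sketch, assuming Scrapy 2.1+
# (the filename 'quotes.json' is just an example, not part of this project):
#
#     FEEDS = {
#         'quotes.json': {'format': 'json', 'encoding': 'utf8'},
#     }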
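
# For reference: the QuotetutorialItem imported above is declared in
# quotetutorial/items.py. A minimal sketch of that file, assuming it defines
# exactly the three fields this spider populates:
#
#     import scrapy
#
#     class QuotetutorialItem(scrapy.Item):
#         text = scrapy.Field()    # the quote text
#         author = scrapy.Field()  # the quote's author name
#         tags = scrapy.Field()    # list of tag strings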