Basic Usage of Scrapy
Crawl target: http://quotes.toscrape.com
Single page
# -*- coding: utf-8 -*- import scrapy class QuoteSpider(scrapy.Spider): name = 'quote' allowed_domains = ['quotes.toscrape.com'] start_urls = ['http://quotes.toscrape.com/'] """ 知识点 1. text()获取标签的text 2. @属性 获取属性的值 3. extract()查找多个 extract_first() 查找一个 """ def parse(self, response): # print(response.text) quotes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]') # print(quotes)'' for quote in quotes: print('=' * 20) # print(quote) # extract_first() 查找一个 text = quote.xpath('.//span[@class="text"]/text()').extract_first() print(text) author = quote.xpath('.//span/small[@class="author"]/text()').extract_first() print(author) # extract()查找多个 tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract() print(tags)
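The spider above only prints the results. A more idiomatic approach is to yield item dictionaries so Scrapy can collect and export them; recent Scrapy versions also provide get()/getall() as aliases for extract_first()/extract(). Below is a minimal sketch of the same extraction logic rewritten this way (the spider name and field names are illustrative):

# Sketch: yield items instead of printing; get()/getall() replace extract_first()/extract()
import scrapy


class QuoteItemSpider(scrapy.Spider):
    name = 'quote_items'  # hypothetical name, distinct from the spider above
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.xpath('//div[@class="col-md-8"]/div[@class="quote"]'):
            yield {
                'text': quote.xpath('.//span[@class="text"]/text()').get(),
                'author': quote.xpath('.//span/small[@class="author"]/text()').get(),
                'tags': quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').getall(),
            }

Run it from the project directory with "scrapy crawl quote_items -o quotes.json" to write the yielded items to a JSON file.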
All pages
# -*- coding: utf-8 -*- import scrapy class QuoteSpider(scrapy.Spider): name = 'quote' allowed_domains = ['quotes.toscrape.com'] start_urls = ['http://quotes.toscrape.com/'] """ 知识点 1. text()获取标签的text 2. @属性 获取属性的值 3. extract()查找多个 extract_first() 查找一个 4. response.urljoin() url拼接 5. scrapy.Request(url=_next, callback=self.parse) 回调 """ def parse(self, response): # print(response.text) quotes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]') # print(quotes)'' for quote in quotes: print('=' * 20) # print(quote) # extract_first() 查找一个 text = quote.xpath('.//span[@class="text"]/text()').extract_first() print(text) author = quote.xpath('.//span/small[@class="author"]/text()').extract_first() print(author) # extract()查找多个 tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract() print(tags) print('>' * 40) next_url = response.xpath('//div[@class="col-md-8"]/nav/ul[@class="pager"]/li[@class="next"]/a/@href').extract_first() print(next_url) # 拼接url _next = response.urljoin(next_url) print(_next) # callback 回调函数 yield scrapy.Request(url=_next, callback=self.parse)
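Since Scrapy 1.4, response.follow() accepts relative URLs directly, so the urljoin() step can be dropped. A minimal sketch of just the pagination part rewritten with it (the rest of parse() stays the same):

# Sketch: pagination with response.follow(), which resolves relative URLs itself
    def parse(self, response):
        # ... extract the quotes on the current page as above ...
        next_url = response.xpath('//li[@class="next"]/a/@href').get()
        if next_url is not None:
            # response.follow() builds the absolute URL and the Request in one step
            yield response.follow(next_url, callback=self.parse)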
Supplement
from scrapy import Spider, FormRequest

# FormRequest submits a form (POST request):
#   url      - the form action URL
#   formdata - a dict of form field names and values
#   callback - the method that parses the response
FormRequest(url='...', formdata={'field': 'value'}, callback=self.parse)
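FormRequest is typically used for logging in. The sketch below assumes the quotes.toscrape.com login page and illustrative field names; FormRequest.from_response() is often preferable because it copies hidden fields (such as a CSRF token) from the form on the page:

# Sketch: a hypothetical login spider using FormRequest.from_response()
import scrapy
from scrapy import FormRequest


class LoginSpider(scrapy.Spider):
    name = 'login_example'  # hypothetical name
    start_urls = ['http://quotes.toscrape.com/login']

    def parse(self, response):
        # from_response() fills in hidden form fields from the page automatically
        yield FormRequest.from_response(
            response,
            formdata={'username': 'user', 'password': 'pass'},  # illustrative credentials
            callback=self.after_login,
        )

    def after_login(self, response):
        # Check whether the login succeeded before continuing the crawl
        self.logger.info('Login response status: %s', response.status)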