精通scrapy04使用Item封装数据
Item和Field
● Item基类 自定义数据类(如BookItem)的基类。
● Field类 用来描述自定义数据类包含哪些字段(如name、price 等)。
接下来,我们改写第1章 example 项目中的代码,使用 Item 和 Field 定义 BookItem 类,用其封装爬取到的书籍信息。项目目录下的 items.py 文件供用户实现各种自定义的数据类;我们在 items.py 中实现 BookItem,代码如下:
# Define here the models for your scraped items
#
from scrapy import Item,Field
class BookItem(Item):
    """Container for one scraped book record.

    Fields are populated by the spider through dict-style access,
    e.g. ``item['name'] = ...``.
    """
    name = Field()           # book title
    price = Field()          # listed price text
    review_rating = Field()  # star-rating word taken from the CSS class
    review_num = Field()     # number of reviews
    upc = Field()            # universal product code
    stock = Field()          # units available
更改爬虫方法
#@Time :2021/03/05
#@Author:Moyu
import scrapy
from ..items import BookItem
from scrapy.linkextractors import LinkExtractor
class BookSpider(scrapy.Spider):
    """Crawl books.toscrape.com and yield one BookItem per book detail page."""

    # Unique identifier for this spider.
    name = "books"
    allowed_domains = ["books.toscrape.com"]
    # Starting point of the crawl.
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Parse a catalogue listing page.

        Follows every book's detail link (inside <article class="product_pod">)
        to parse_book, then follows the "next" pagination link back to parse.
        """
        # Each book's detail link lives under <article class="product_pod"> <h3>.
        le = LinkExtractor(restrict_css='article.product_pod h3')
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_book)

        # Pagination: the next-page link sits in ul.pager > li.next.
        le = LinkExtractor(restrict_css='ul.pager li.next')
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            yield scrapy.Request(next_url, callback=self.parse)

        # --- Earlier (pre-LinkExtractor) implementation, kept for reference ---
        # for sel in response.css('article.product_pod'):
        #     book = BookItem()
        #     # The title is in the article > h3 > a element's title attribute.
        #     book['name'] = sel.xpath('./h3/a/@title').extract_first()
        #     book['price'] = sel.css('p.price_color::text').extract_first()
        #     yield book
        # # The next page's URL is inside ul.pager > li.next > a.
        # next_url = response.css('ul.pager li.next a::attr(href)').extract_first()
        # if next_url:
        #     # Found a next page: resolve to an absolute URL and request it.
        #     next_url = response.urljoin(next_url)
        #     yield scrapy.Request(next_url, callback=self.parse)

    def parse_book(self, response):
        """Parse a single book detail page into a BookItem."""
        book = BookItem()

        sel = response.css('div.product_main')
        book['name'] = sel.xpath('./h1/text()').extract_first()
        book['price'] = sel.css('p.price_color::text').extract_first()
        book['review_rating'] = sel.css(
            'p.star-rating::attr(class)').re_first('star-rating ([A-Za-z]+)')

        # Product-details table: UPC is the first row, availability the
        # second-to-last row, and the review count the last row.
        sel = response.css('table.table.table-striped')
        book['upc'] = sel.xpath('(.//tr)[1]/td/text()').extract_first()
        # Raw string so that \( and \d are regex escapes rather than invalid
        # string escapes (which raise SyntaxWarning on Python 3.12+).
        book['stock'] = sel.xpath(
            '(.//tr)[last()-1]/td/text()').re_first(r'\((\d+) available\)')
        book['review_num'] = sel.xpath('(.//tr)[last()]/td/text()').extract_first()

        yield book
以上介绍了 LinkExtractor、extract_first() 与 re_first() 的用法。
博观而约取,厚积而薄发。