# JDBOOK
# -*- coding: utf-8 -*-
import scrapy
from BOOK.items import BookItem
import json
from copy import deepcopy
class BookSpider(scrapy.Spider):
    """Crawl JD.com book categories, listing pages and per-book prices.

    Request flow:
        ``parse`` (category index page)
        -> ``parse_book_info`` (one listing page; also follows pagination)
        -> ``parse_price`` (price endpoint), which yields the finished item.

    The partially filled item travels between callbacks in
    ``request.meta['key']``.
    """

    name = 'book'
    # Domains the spider is allowed to request; the price endpoint used in
    # parse_book_info lives on p.3.cn.
    allowed_domains = ['jd.com', 'p.3.cn']
    start_urls = ['https://book.jd.com/booksort.html']
    # Incremented once per book handled in parse_book_info and compared
    # against a fixed threshold there to stop following "next page" links.
    index = 0

    def parse(self, response):
        """Parse the category index and request each sub-category listing.

        Yields:
            scrapy.Request: one request per sub-category, carrying a deep copy
            of the item (big_name, small_name, small_link) in ``meta['key']``.
        """
        dt_list = response.xpath('//*[@id="booksort"]/div[2]/dl/dt')
        # Only the first top-level category is processed
        # (presumably a limit kept for test runs -- confirm before widening).
        for dt in dt_list[:1]:
            item = BookItem()
            item['big_name'] = dt.xpath('.//a/text()').extract_first()

            # The sibling element right after each <dt> holds the
            # sub-category entries; again only the first one is processed.
            em_list = dt.xpath('./following-sibling::*[1]/em')
            for em in em_list[:1]:
                href = em.xpath('./a/@href').extract_first()
                if href is None:
                    # Without a link there is nothing to request; skipping
                    # avoids a TypeError from concatenating 'https:' + None.
                    self.logger.warning(
                        'Sub-category without a link under %r; skipping',
                        item['big_name'],
                    )
                    continue

                item['small_name'] = em.xpath('./a/text()').extract_first()
                # The href is prefixed with the scheme to form a full URL.
                item['small_link'] = 'https:' + href

                # The item is reused across iterations, so each request gets
                # its own deep copy.
                yield scrapy.Request(
                    item['small_link'],
                    callback=self.parse_book_info,
                    meta={'key': deepcopy(item)},
                )

    def parse_book_info(self, response):
        """Parse one listing page and request the price of each book.

        Yields:
            scrapy.Request: a price request per book (item copy in
            ``meta['key']``), followed by at most one request for the next
            listing page.
        """
        # Item carrying the category fields set by ``parse``.
        item = response.meta['key']
        # Price endpoint; the book's SKU id is substituted into ``skuIds``.
        price_link = 'https://p.3.cn/prices/mgets?ext=11000000&pin=&type=1&area=1_72_4137_0&skuIds={}'

        book_list = response.xpath('//*[@id="plist"]/ul/li')
        # Only the first book of the page is processed
        # (presumably a limit kept for test runs -- confirm before widening).
        for book in book_list[:1]:
            # Counted for every book visited, including ones skipped below,
            # so the pagination cap keeps its original meaning.
            self.index += 1

            item['book_img_src'] = book.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
            item['book_name'] = book.xpath('.//div[@class="p-name"]/a/em/text()').extract_first()
            item['book_auth'] = book.xpath('.//span[@class="p-bi-name"]/span/a/text()').extract_first()
            item['book_store'] = book.xpath('.//span[@class="p-bi-store"]/a/text()').extract_first()
            item['book_time'] = book.xpath('.//span[@class="p-bi-date"]/text()').extract_first()

            sku_id = book.xpath('./div/@data-sku').extract_first()
            if sku_id is None:
                # Formatting the URL with None would request "skuIds=None",
                # which cannot produce a usable price.
                self.logger.warning(
                    'Book %r on %s has no data-sku; skipping price request',
                    item['book_name'],
                    response.url,
                )
                continue

            # The price is fetched with a separate request; the item is
            # deep-copied because it is mutated again on the next iteration.
            yield scrapy.Request(
                price_link.format(sku_id),
                callback=self.parse_price,
                meta={'key': deepcopy(item)},
            )

        # Pagination: follow the "next page" link until the counter exceeds 3.
        # NOTE(review): the counter is shared by all categories and counts
        # books rather than pages -- confirm this is the intended cap.
        next_url = response.xpath('//*[@id="J_bottomPage"]/span[1]/a[10]/@href').extract_first()
        if next_url is not None and self.index <= 3:
            yield response.follow(
                next_url,
                callback=self.parse_book_info,
                meta={'key': item},
            )

    def parse_price(self, response):
        """Read the price from the price response and emit the finished item.

        The body is parsed as JSON and the ``'op'`` field of its first element
        is stored as ``book_price``. If the body cannot be decoded or does not
        have that shape, a warning is logged and no item is yielded.

        Yields:
            The item from ``meta['key']`` with ``book_price`` filled in.
        """
        item = response.meta['key']
        try:
            # Decode once and reuse the text for logging and parsing.
            body_text = response.body.decode()
            self.logger.debug('Price response: %s', body_text)
            price = json.loads(body_text)[0]['op']
        except (ValueError, IndexError, KeyError, TypeError) as exc:
            # ValueError also covers UnicodeDecodeError and JSONDecodeError.
            self.logger.warning(
                'Could not read price from %s: %r', response.url, exc
            )
            return

        item['book_price'] = price
        # Hand the completed item over to the item pipelines.
        yield item