Crawlers: storing data as JSON via pipelines and items

1. books.py

import scrapy


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/catalogue/page-1.html']

    def parse(self, response):
        # response is a scrapy.http.HtmlResponse; each <li> under
        # the ol with class "row" holds one book
        book_lis = response.xpath("//div/ol[@class='row']/li")
        for book_li in book_lis:
            book_name = book_li.xpath(".//h3/a/@title").get()
            # the price text can be split across several text nodes,
            # so collect them all and join into one string
            book_price = "".join(
                book_li.xpath(".//div[@class='product_price']"
                              "/p[@class='price_color']/text()").getall()
            )
            yield {"book_name": book_name, "book_price": book_price}
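One step worth spelling out: Scrapy only routes yielded items through a custom pipeline if that pipeline is enabled in settings.py. A minimal sketch, assuming the Scrapy project package is named books:

# settings.py
ITEM_PIPELINES = {
    # lower numbers run earlier when several pipelines are chained
    'books.pipelines.BooksPipeline': 300,
}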

2. pipelines.py

import json


class BooksPipeline(object):
    def open_spider(self, spider):
        # open the output file once, when the spider starts
        self.file = open("books_all.json", "w", encoding="utf-8")
        print("Spider started...")

    def close_spider(self, spider):
        self.file.close()
        print("Spider finished...")

    def process_item(self, item, spider):
        # dict() makes this work both for plain dicts and for the
        # BooksItem defined in section 5; ensure_ascii=False keeps
        # non-ASCII characters such as the £ sign readable
        item_json = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(item_json + "\n")
        return item
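As an aside, Scrapy's built-in feed exports can produce the same output without any custom pipeline; the format is inferred from the file extension (.jl for JSON lines, .json for a single JSON array):

scrapy crawl books -o books_all.jl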

3. Results

(The screenshots of the run and of the generated file are not reproduced here. Running scrapy crawl books produces books_all.json, with one JSON object per line of the form {"book_name": "...", "book_price": "£..."}.)

4. Mistakes made

(The screenshots documenting the errors hit along the way are not reproduced here.)

5. The professional way

Set up the items.py file:

from scrapy import Item, Field


class BooksItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    book_name = Field()
    book_price = Field()
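With BooksItem defined, the spider can yield items instead of plain dicts; since the pipeline above serializes with json.dumps(dict(item)), it handles both forms unchanged. A sketch of the reworked books.py, again assuming the project package is named books:

import scrapy
from books.items import BooksItem  # assumed project package name: books


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/catalogue/page-1.html']

    def parse(self, response):
        for book_li in response.xpath("//div/ol[@class='row']/li"):
            book_name = book_li.xpath(".//h3/a/@title").get()
            book_price = "".join(
                book_li.xpath(".//div[@class='product_price']"
                              "/p[@class='price_color']/text()").getall()
            )
            # an Item behaves like a dict but rejects undeclared fields,
            # which catches typos in field names early
            yield BooksItem(book_name=book_name, book_price=book_price)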


posted @ 2019-11-14 15:07  cfancy