爬虫 ——(50页)books
代码如下所示
1 import scrapy 2 from scrapy.selector.unified import SelectorList 3 from bookspider.items import BooksItem 4 class BooksSpider(scrapy.Spider): 5 name = 'books' 6 allowed_domains = ['books.toscrape.com'] 7 start_urls = ['http://books.toscrape.com/catalogue/page-1.html'] 8 url = "http://books.toscrape.com" 9 def parse(self, response): 10 # print('*') 11 # print(type(response)) # <class 'scrapy.http.response.html.HtmlResponse'> 12 # print('*') 13 14 Books_lis = response.xpath("//div/ol[@class='row']/li") 15 # print('*') 16 # print(type(Books_lis)) 17 # print('*') 18 for Books_li in Books_lis: 19 book_name = Books_li.xpath(".//h3/a/@title").getall() 20 book_price = Books_li.xpath(".//div[@class='product_price']/p[@class='price_color']/text()").getall() 21 22 book_price = "".join(book_price) 23 item = BooksItem(book_name=book_name,book_price=book_price) 24 # books_all = {"book_name":book_name,"book_price":book_price} 25 # yield books_all 26 yield item 27 28 next_url = response.xpath("//ul[@class = 'pager']/li[@class = 'next']/a/@href").get() 29 # 30 if next_url: 31 # 如果找到下一页的URL,就得到绝对路径,构造新的Request对象 32 next_url = response.urljoin(next_url) 33 yield scrapy.Request(next_url,callback=self.parse) 34 35 # if not next_url: #此方法报错 待解决 36 # return 37 # else: 38 # yield scrapy.Request(self.url + next_url, callback=self.parse)