scrapy 爬取当当网产品分类
# spider 部分（爬虫定义）
import scrapy
from Autopjt.items import AutopjtItem
from scrapy.http import Request


class AutospdSpider(scrapy.Spider):
    """Crawl dangdang.com category cid4007379 (pages 1-100) and yield
    one AutopjtItem per listing page, holding parallel lists of fields."""

    name = "autospd"
    allowed_domains = ["dangdang.com"]
    start_urls = ['http://category.dangdang.com/pg1-cid4007379.html']

    def start_requests(self):
        # Generate the 100 category pages exactly once. The original code
        # re-yielded the full range(1, 101) from every parse() call, which
        # produced ~100 duplicate requests per page and relied entirely on
        # Scrapy's dupefilter to discard them.
        for page in range(1, 101):
            url = 'http://category.dangdang.com/pg' + str(page) + '-cid4007379.html'
            yield Request(url, callback=self.parse)

    def parse(self, response):
        """Extract titles, prices, links, and review counts from one page.

        NOTE(review): the four xpath result lists are not guaranteed to be
        the same length (e.g. products without reviews) — the pipeline must
        tolerate unequal lengths.
        """
        item = AutopjtItem()
        item['name'] = response.xpath('//a[@name="itemlist-title"]/@title').extract()
        item['price'] = response.xpath('//span[@class="price_n"]/text()').extract()
        item['link'] = response.xpath('//a[@name="itemlist-title"]/@href').extract()
        item['comnum'] = response.xpath('//a[@name="itemlist-review"]/text()').extract()
        yield item
# pipeline 部分（数据持久化）
import codecs
import json


class AutopjtPipeline(object):
    """Write each scraped product as one JSON object per line (JSON Lines)."""

    def __init__(self):
        # 'w' text mode: the original used 'wb' together with encoding=,
        # which is contradictory for codecs.open.
        self.file = codecs.open('D:/mydata.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize the parallel field lists of one item to the file.

        zip() stops at the shortest list, so a page where some products
        lack a review link (comnum shorter than name) no longer raises
        IndexError as the original index-by-len(name) loop did.
        """
        rows = zip(item['name'], item['price'], item['comnum'], item['link'])
        for name, price, comnum, link in rows:
            goods = {'name': name, 'price': price,
                     'comnum': comnum, 'link': link}
            # ensure_ascii=False keeps Chinese titles human-readable.
            self.file.write(json.dumps(goods, ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        # Flush and release the output file when the spider finishes.
        self.file.close()
# item 部分（字段定义）
import scrapy


class AutopjtItem(scrapy.Item):
    """Container for one listing page of dangdang product data.

    Each field holds a list; the lists are parallel (index i of every
    field refers to the same product on the page).
    """

    name = scrapy.Field()     # product titles
    price = scrapy.Field()    # list prices
    link = scrapy.Field()     # product detail URLs
    comnum = scrapy.Field()   # review counts