爬取100页京东商品评论
# items.py
import scrapy


class InsistItem(scrapy.Item):
    """Container for one scraped product comment."""

    # Text of a single comment (filled from the ``content`` key of the API
    # response by the spider).
    comment = scrapy.Field()


# pipelines.py
import json


class InsistPipeline(object):
    """Write every scraped item to ``tencent.json``.

    Each item is serialized as one JSON object followed by ``",\\n"``, so the
    file is a comma-terminated list of objects, not a single valid JSON
    document.
    """

    def __init__(self):
        # The handle stays open for the whole crawl and is released in
        # ``close_spider``.
        # NOTE(review): the file is GBK-encoded; a comment containing a
        # character GBK cannot represent makes ``write`` raise
        # UnicodeEncodeError -- confirm GBK output is really required.
        self.f = open('tencent.json', 'w', encoding='gbk')

    def process_item(self, item, spider):
        """Serialize *item* to the output file and pass it on unchanged.

        Args:
            item: The scraped item (an ``Item`` object).
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, so later pipeline stages can still process it.
        """
        # ensure_ascii=False keeps the Chinese text readable instead of
        # emitting \uXXXX escapes.
        content = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.f.write(content)
        return item

    def close_spider(self, spider):
        """Flush and close the output file when the spider finishes.

        Without this hook the handle opened in ``__init__`` is never closed
        explicitly.
        """
        self.f.close()


# tengxun.py
import scrapy
from insist.items import InsistItem
import json


class TengxunSpider(scrapy.Spider):
    """Crawl the paginated comment API for one product and yield the comments.

    Pages are requested one after another: each parsed page schedules the
    next one until ``max_pages`` pages have been requested.
    """

    name = 'tengxun'
    allowed_domains = ['sclub.jd.com']
    # start_urls = ['https://item.jd.com/4432058.html']
    # Comment endpoint; the page index is appended to this URL.
    baseURL = 'https://sclub.jd.com/comment/productPageComments.action?productId=4432058&score=0&sortType=5&pageSize=10&isShadowSku=0&rid=0&fold=1&page='
    # Index of the page requested most recently (the first page is 0).
    offset = 0
    # Total number of pages to crawl, i.e. page indices 0 .. max_pages - 1.
    max_pages = 100
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Yield one ``InsistItem`` per comment, then request the next page.

        Args:
            response: Response of the comment endpoint; its body is decoded
                as GBK and parsed as JSON.

        Yields:
            ``InsistItem`` objects for the comments on this page, followed by
            a ``scrapy.Request`` for the next page while pages remain.
        """
        com = json.loads(response.body.decode('gbk'))
        # NOTE(review): a response without a 'comments' key raises KeyError
        # here and ends the pagination chain -- confirm that is acceptable.
        for co in com['comments']:
            item = InsistItem()
            item['comment'] = co['content']
            yield item

        # Stop once page index max_pages - 1 has been requested. The previous
        # ``offset < 100`` check also fetched page index 100, i.e. 101 pages.
        if self.offset + 1 < self.max_pages:
            self.offset += 1
            yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)