items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class DouyuItem(scrapy.Item):
    """Scraped record for one Douyu room: streamer name, image URL and local image path."""

    # Streamer display name; the spider copies it from the API entry's 'nickname' key.
    nickname = scrapy.Field()
    # Remote image URL; the spider copies it from the API entry's 'vertical_src' key.
    imagelink = scrapy.Field()
    # Local path of the downloaded image; filled in by the image pipeline.
    imagePath = scrapy.Field()
Spider file: meinvspider.py
# -*- coding: utf-8 -*-
import scrapy
import json
from ..items import DouyuItem
class DouyumeinvSpider(scrapy.Spider):
    """Spider that pages through the Douyu vertical-room API, 20 rooms per request.

    Each API entry becomes one ``DouyuItem`` carrying the streamer nickname and
    the image URL. Pagination continues until the API returns a page with no
    usable entries.
    """

    name = 'douyumeinv'
    allowed_domains = ['capi.douyucdn.cn']
    # API endpoint; the numeric offset is appended to build each page URL.
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Yield one item per room on this page, then request the next page.

        Args:
            response: The API response whose body is a JSON document with a
                ``data`` entry (expected to be a list of room dicts).

        Yields:
            ``DouyuItem`` objects for every room, followed by a
            ``scrapy.Request`` for the next page when this page had entries.
        """
        # Convert the JSON payload to Python objects; the 'data' entry should be a list.
        data = json.loads(response.text).get('data')
        if not data or not isinstance(data, list):
            # An empty page (or a non-list payload such as an error message)
            # means there is nothing more to fetch. Without this guard the
            # spider would keep requesting ever-larger offsets forever.
            return

        for each in data:
            item = DouyuItem()
            item['nickname'] = each['nickname']
            item['imagelink'] = each['vertical_src']
            yield item

        # Advance by the page size used in the URL (limit=20) and fetch the next page.
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
pipelines.py
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy,os
from scrapy.utils.project import get_project_settings
from scrapy.pipelines.images import ImagesPipeline
class DouyuPipeline(ImagesPipeline):
    """Image pipeline that downloads each item's image and renames it after the streamer.

    The downloaded file is moved to ``<IMAGES_STORE>/<nickname>.jpg`` and that
    path is recorded on the item as ``imagePath``.
    """

    # Image storage directory, read from the project settings.
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image URL."""
        image_url = item["imagelink"]
        yield scrapy.Request(image_url)

    def item_completed(self, result, item, info):
        """Rename the downloaded image to ``<nickname>.jpg`` and record its path.

        Args:
            result: Iterable of ``(ok, info_dict)`` pairs, one per requested
                image; ``info_dict["path"]`` is the stored path relative to
                ``IMAGES_STORE`` when ``ok`` is true.
            item: The item whose image was requested.
            info: Pipeline bookkeeping object (unused here).

        Returns:
            The item, with ``imagePath`` set to the renamed file's path.

        Raises:
            DropItem: If no image was downloaded successfully for this item.
        """
        # Local import keeps this block self-contained without touching the
        # file-level import lines.
        from scrapy.exceptions import DropItem

        image_path = [x["path"] for ok, x in result if ok]
        if not image_path:
            # Previously this case crashed with IndexError on image_path[0].
            raise DropItem("No image downloaded for %s" % item["imagelink"])

        source = os.path.join(self.IMAGES_STORE, image_path[0])
        target = os.path.join(self.IMAGES_STORE, item["nickname"] + ".jpg")
        # os.replace overwrites an existing target on every platform, whereas
        # os.rename fails on Windows when the destination already exists
        # (e.g. on a re-run or when two rooms share a nickname).
        os.replace(source, target)
        # Store the real file path, including the ".jpg" suffix, so that
        # imagePath actually points at the renamed file.
        item["imagePath"] = target
        return item
settings.py
Custom settings
# Directory where images are stored; DouyuPipeline reads this value through
# get_project_settings(). NOTE(review): this is a machine-specific absolute
# Windows path — adjust it for the environment the crawler runs in.
IMAGES_STORE = r'C:\Users\xxxx\Desktop\test\heima\day04scrapy\douyu\images'