items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DouyuItem(scrapy.Item):
    """Container for one Douyu room record scraped by the spider."""

    # Streamer nickname; the spider copies it from the record's 'nickname'.
    nickname = scrapy.Field()
    # Image URL; the spider copies it from the record's 'vertical_src'.
    imagelink = scrapy.Field()
    # Location of the downloaded image on disk; set by the image pipeline.
    imagePath = scrapy.Field()
爬虫文件:meinvspider.py

# -*- coding: utf-8 -*-
import scrapy
import json
from ..items import DouyuItem
class DouyumeinvSpider(scrapy.Spider):
    """Spider that pages through the Douyu vertical-room JSON endpoint.

    Every response is parsed as JSON; one ``DouyuItem`` is yielded per entry
    of its ``data`` list, then the next page is requested. Paging stops as
    soon as a response carries no usable ``data`` list.
    """

    name = 'douyumeinv'
    allowed_domains = ['capi.douyucdn.cn']
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Yield one item per room record, then schedule the next page.

        Args:
            response: The downloaded page; its text is decoded as JSON.

        Yields:
            ``DouyuItem`` objects with ``nickname`` and ``imagelink`` set,
            followed by a ``scrapy.Request`` for the next offset.
        """
        # Convert the JSON payload into Python objects; 'data' should be a list.
        payload = json.loads(response.text)
        data = payload.get('data') if isinstance(payload, dict) else None

        # An empty, missing or non-list 'data' means there is nothing more to
        # page through; without this guard the spider would request new
        # offsets forever.
        if not isinstance(data, list) or not data:
            return

        for each in data:
            item = DouyuItem()
            item['nickname'] = each['nickname']
            item['imagelink'] = each['vertical_src']
            yield item

        # Advance by the page size, which must match 'limit=20' in ``url``.
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)



pipelines.py


# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy,os
from scrapy.utils.project import get_project_settings
from scrapy.pipelines.images import ImagesPipeline

class DouyuPipeline(ImagesPipeline):
    """Image pipeline that downloads each item's image and renames it.

    The downloaded file is renamed to ``<nickname>.jpg`` inside the
    ``IMAGES_STORE`` directory and the resulting path is stored on the item.
    """

    # Read the storage directory configured in the project settings.
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield the download request for the item's ``imagelink`` URL."""
        image_url = item["imagelink"]
        yield scrapy.Request(image_url)

    def item_completed(self, result, item, info):
        """Rename the downloaded image after the nickname and record its path.

        Args:
            result: Iterable of ``(ok, data)`` pairs; ``data["path"]`` is read
                from every pair whose ``ok`` flag is true.
            item: The item being processed; ``nickname`` is read and
                ``imagePath`` is written.
            info: Unused here.

        Returns:
            The item, with ``imagePath`` pointing at the renamed file.

        Raises:
            DropItem: If no image was downloaded successfully.
        """
        # Local import keeps this block self-contained within the file.
        from scrapy.exceptions import DropItem

        image_path = [x["path"] for ok, x in result if ok]
        if not image_path:
            # Previously this case crashed with IndexError on image_path[0].
            raise DropItem("Image download failed for %s" % item["nickname"])

        source = os.path.join(self.IMAGES_STORE, image_path[0])
        target = os.path.join(self.IMAGES_STORE, item["nickname"] + ".jpg")
        # os.replace overwrites an existing target on every platform, whereas
        # os.rename raises on Windows when the file already exists.
        # NOTE(review): nicknames containing path separators or characters
        # that are illegal in file names would still fail here - confirm
        # whether sanitising is needed.
        os.replace(source, target)

        # Store the real file location, including the '.jpg' suffix.
        item["imagePath"] = target
        return item

settings.py
自定义字段
# Directory for downloaded images; DouyuPipeline reads this value through
# get_project_settings().get('IMAGES_STORE').
IMAGES_STORE = r'C:\Users\xxxx\Desktop\test\heima\day04scrapy\douyu\images'
posted on 2018-10-27 14:02  简简单单的小神  阅读(81)  评论(0)  编辑  收藏  举报