python Scrapy -5 抓斗鱼
import scrapy


class DouyuItem(scrapy.Item):
    """Container for one scraped Douyu room entry."""

    # Streamer nickname.
    nickname = scrapy.Field()
    # URL of the image to download.
    imagelink = scrapy.Field()
    # Local path of the stored image.
    imagePath = scrapy.Field()
# -*- coding: utf-8 -*-
import json

import scrapy

from douyu.items import DouyuItem


class DouyumeinvSpider(scrapy.Spider):
    """Page through the Douyu vertical-room API and emit one item per room.

    Each response is a JSON document whose "data" entry is a list of room
    records. The spider keeps requesting the next page (offset advanced by
    the page size) until a page comes back without any records.
    """

    name = "douyumeinv"
    allowed_domains = ["capi.douyucdn.cn"]

    # Offset of the page currently being requested; advanced in parse().
    offset = 0
    # The page size (limit=20) is baked into the URL; the offset is appended.
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Yield a DouyuItem per room on this page, then request the next page.

        Args:
            response: the Scrapy response holding the JSON payload.

        Yields:
            DouyuItem objects with "nickname" and "imagelink" filled in,
            followed by a Request for the next page when this page had data.
        """
        # Decode the JSON payload; the "data" section is expected to be a list.
        data = json.loads(response.text).get("data")

        # Stop paginating once the API returns no rooms (or an error payload
        # where "data" is not a list); otherwise the crawl would never end.
        if not isinstance(data, list) or not data:
            return

        for each in data:
            item = DouyuItem()
            item["nickname"] = each["nickname"]
            item["imagelink"] = each["vertical_src"]
            yield item

        # Advance by the page size used in the URL (limit=20).
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
import os

import scrapy
from scrapy.exceptions import DropItem
# Imported under an alias so the subclass below does not shadow its own base.
from scrapy.pipelines.images import ImagesPipeline as _BaseImagesPipeline
from scrapy.utils.project import get_project_settings


class ImagesPipeline(_BaseImagesPipeline):
    """Download each item's image and rename the file after the nickname."""

    # Read the value configured in the project settings file.
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image URL."""
        image_url = item["imagelink"]
        yield scrapy.Request(image_url)

    def item_completed(self, result, item, info):
        """Rename the downloaded image to "<nickname>.jpg" and record its path.

        Args:
            result: iterable of (ok, info) pairs, one per media request; for
                successful downloads the info mapping has a "path" key.
            item: the item whose image was requested.
            info: pipeline media info (unused here).

        Returns:
            The item, with "imagePath" set to the renamed file's path.

        Raises:
            DropItem: if no image was downloaded successfully.
        """
        image_path = [x["path"] for ok, x in result if ok]
        if not image_path:
            # Without this guard a failed download crashed with IndexError.
            raise DropItem("Image download failed for %s" % item["imagelink"])

        # Nicknames come from the remote API; keep path separators out of
        # the file name so the file stays inside IMAGES_STORE.
        safe_name = item["nickname"].replace("/", "_").replace("\\", "_")

        source = os.path.join(self.IMAGES_STORE, image_path[0])
        target = os.path.join(self.IMAGES_STORE, safe_name + ".jpg")
        os.rename(source, target)

        # Store the real location of the renamed file, including ".jpg".
        item["imagePath"] = target
        return item
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 上周热点回顾(3.3-3.9)