Scrapy: saving items to MySQL or MongoDB, and downloading and saving images
pipelines.py:

```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class Images360Pipeline(object):
    def process_item(self, item, spider):
        return item


# MongoDB
class MongoPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # insert_one replaces the deprecated Collection.insert()
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()


# MySQL
class MysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT')
        )

    def open_spider(self, spider):
        # Recent PyMySQL versions require keyword arguments here
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password,
                                  database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        data = dict(item)
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        # Table and column names cannot be bound as parameters, so they are
        # interpolated; the values themselves go through execute() safely.
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item


# Image download
class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Name the file after the last segment of the URL
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    # If the image download failed, drop the item so it is not saved to the
    # database. IMAGES_STORE sets the save directory, e.g. './images'
    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item

    def get_media_requests(self, item, info):
        yield Request(item['url'])
```
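The pipelines above expect the item class to expose a `collection` attribute (the MongoDB collection name), a `table` attribute (the MySQL table name), and a `url` field for the image request. A minimal items.py sketch along those lines follows; only `collection`, `table`, and `url` are actually required by the pipelines, while `title` and `thumb` are assumed example fields:

```python
# items.py -- a minimal sketch; `title` and `thumb` are assumed fields,
# only `collection`, `table`, and `url` are referenced by the pipelines.
from scrapy import Item, Field


class ImageItem(Item):
    collection = table = 'images'  # MongoDB collection / MySQL table name
    url = Field()
    title = Field()
    thumb = Field()
```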
settings.py configuration
```python
# Only part of settings.py is shown. ImagePipeline runs first:
# the lowest priority number executes earliest.
ITEM_PIPELINES = {
    'images360.pipelines.ImagePipeline': 300,
    'images360.pipelines.MongoPipeline': 301,
    'images360.pipelines.MysqlPipeline': 302,
}

MAX_PAGE = 50

MONGO_URL = 'localhost'
MONGO_DB = 'images360'

BOT_NAME = 'images360'

MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'images360'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
# pymysql expects an integer port, not a string
MYSQL_PORT = 3306

# Directory where downloaded images are saved
# (note the setting name is IMAGES_STORE, not IMAGE_STORE)
IMAGES_STORE = './images'

SPIDER_MODULES = ['images360.spiders']
NEWSPIDER_MODULE = 'images360.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'images360 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
```
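Because MysqlPipeline builds its INSERT statement from the item's keys, the `images` table must already exist with matching columns. A one-off setup sketch, assuming the example item fields above and the credentials from settings.py:

```python
# create_table.py -- one-off setup helper; the column names (url, title,
# thumb) mirror the assumed example item fields, and the credentials
# mirror settings.py. Adjust both to your actual schema.
import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='images360', port=3306)
cursor = db.cursor()
# Column names must match the Item field names, since MysqlPipeline
# uses dict(item) keys directly as column names.
cursor.execute(
    'CREATE TABLE IF NOT EXISTS images ('
    'url VARCHAR(255) NOT NULL PRIMARY KEY, '
    'title VARCHAR(255), '
    'thumb VARCHAR(255))'
)
db.close()
```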