Using the official component to download images, saving to a MySQL database, and saving to a MongoDB database
What this covers: how to use the official component to download images, and how to save items to a MySQL database.
The MySQL database must be created in advance, with a table whose columns match the fields defined in items.py.
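For reference, here is a minimal sketch of that setup using pymysql; the column types are assumptions (the original post does not give the table definition), so adjust them to your data.

import pymysql

# Create the images360 database and an images table whose columns mirror
# the fields declared in items.py (id, url, title, thumb).
db = pymysql.connect(host='127.0.0.1', user='root', password='root', port=3306)
cursor = db.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS images360 DEFAULT CHARACTER SET utf8')
cursor.execute(
    'CREATE TABLE IF NOT EXISTS images360.images ('
    'id VARCHAR(255) NOT NULL PRIMARY KEY, '
    'url VARCHAR(255), '
    'title VARCHAR(255), '
    'thumb VARCHAR(255))'
)
db.close()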
1. items.py file
from scrapy import Item, Field


class Images360Item(Item):
    # Shared name for the MongoDB collection and the MySQL table
    collection = table = 'images'
    id = Field()
    url = Field()
    title = Field()
    thumb = Field()
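As a point of reference, a hypothetical spider snippet showing how these fields might be populated; the spider and the JSON field names (imageid, qhimg_url, group_title, qhimg_thumb_url) are assumptions for illustration, not part of the original post.

import json

from scrapy import Spider

from images360.items import Images360Item


class ImagesSpider(Spider):
    name = 'images'

    def parse(self, response):
        # Parse the JSON response and fill one item per image entry
        result = json.loads(response.text)
        for image in result.get('list', []):
            item = Images360Item()
            item['id'] = image.get('imageid')
            item['url'] = image.get('qhimg_url')
            item['title'] = image.get('group_title')
            item['thumb'] = image.get('qhimg_thumb_url')
            yield item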
2. settings.py file
ITEM_PIPELINES = {
    'images360.pipelines.ImagesPipeline': 300,
    'images360.pipelines.MongoPipeline': 301,
    'images360.pipelines.MySQLPipeline': 302,
}

# MongoDB connection parameters
MONGO_URI = '127.0.0.1'
MONGO_DATABASE = 'images360'

# Local directory where downloaded images are stored
IMAGES_STORE = 'D:\\images360\\images'

# MySQL connection parameters
MYSQL_HOST = '127.0.0.1'
MYSQL_DATABASE = 'images360'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
MYSQL_PORT = 3306
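To confirm these custom settings are picked up, a quick sanity check (a sketch, assuming it is run from the project root where scrapy.cfg lives) can print them via Scrapy's project settings loader:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get('IMAGES_STORE'))
print(settings.getint('MYSQL_PORT'))
print(settings.get('ITEM_PIPELINES'))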
3. pipelines.py file
import pymongo
import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


# Download the item's image with the official ImagesPipeline
class ImagesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Name the saved file after the last segment of the image URL
        url = request.url
        filename = url.split('/')[-1]
        return filename

    def item_completed(self, results, item, info):
        # Drop the item if the image failed to download
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item

    def get_media_requests(self, item, info):
        yield Request(item['url'])


# Save items to MongoDB
class MongoPipeline(object):
    # Default collection name
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection parameters from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')  # database name
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # The collection name comes from the item class in items.py
        self.db[item.collection].insert_one(dict(item))
        return item


# Save items to MySQL
class MySQLPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection parameters from settings.py
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password, database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # Build a parameterized INSERT from the item's keys and values;
        # the table name comes from the item class in items.py
        data = dict(item)
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item
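After a crawl finishes, the stored records can be spot-checked; the sketch below assumes the connection parameters from settings.py above and that both databases are reachable locally.

import pymongo
import pymysql

# Count documents written to the MongoDB collection
mongo = pymongo.MongoClient('127.0.0.1')
print(mongo['images360']['images'].count_documents({}))

# Count rows written to the MySQL table
db = pymysql.connect(host='127.0.0.1', user='root', password='root',
                     database='images360', port=3306)
cursor = db.cursor()
cursor.execute('SELECT COUNT(*) FROM images')
print(cursor.fetchone()[0])
db.close()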