数据存储:MongoDB — Scrapy item pipeline that persists scraped items to a MongoDB collection.
from pymongo import MongoClient
import os

# Working directory at import time; unused here but kept in case other
# modules of the project import it.
base_dir = os.getcwd()


class MongoPipeline(object):
    """Scrapy item pipeline that saves scraped items to MongoDB.

    Connection parameters (URI, database name, credentials) are read from
    the project's settings.py via :meth:`from_crawler`.
    """

    # Name of the MongoDB collection the items are written into.
    collection = 'douban'

    def __init__(self, mongo_uri, db_name, db_user, db_pass):
        self.mongo_uri = mongo_uri
        self.db_name = db_name
        self.db_user = db_user
        self.db_pass = db_pass

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this alternate constructor, giving us access to the
        # crawler settings; pull the MongoDB URI, database name and
        # credentials out of settings.py.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            db_name=crawler.settings.get('DB_NAME'),
            db_user=crawler.settings.get('DB_USER'),
            db_pass=crawler.settings.get('DB_PASS'))

    def open_spider(self, spider):
        """Called when the spider starts: connect to the database.

        Credentials are handed to MongoClient directly instead of calling
        Database.authenticate(), which was deprecated in PyMongo 3.5 and
        removed in PyMongo 4.0.
        """
        self.client = MongoClient(
            self.mongo_uri,
            username=self.db_user,
            password=self.db_pass)
        self.zfdb = self.client[self.db_name]

    def close_spider(self, spider):
        """Called when the spider closes: release the database connection."""
        self.client.close()

    def process_item(self, item, spider):
        """Persist one item; return it so any later pipelines still run.

        insert_one() replaces Collection.insert(), which was deprecated in
        PyMongo 3.0 and removed in PyMongo 4.0.
        """
        self.zfdb[self.collection].insert_one({"title": item["title"].strip()})
        return item