pymongo
config.py (create this file)
MONGO_URL='localhost'
MONGO_DB='taobao'
MONGO_TABLE='product'
spider.py
import pymongo
from pymongo import MongoClient
from config import *
1. Connect
client = MongoClient('mongodb://root:123@localhost:27017/')
# or:
# client = MongoClient('localhost', 27017)
# client = MongoClient(MONGO_URL)
2. Select a database (use)
db = client['db2']  # equivalent to client.db2
# db = client[MONGO_DB]  # note the square brackets [ ]
3. List all collections in the database
print(db.list_collection_names())  # collection_names() is deprecated and was removed in pymongo 4.x
4. Create a collection
table_user = db['userinfo']  # equivalent to db.userinfo
5. Insert data
import datetime

user0 = {
    "_id": 1,
    "name": "egon",
    "birth": datetime.datetime.now(),
    "age": 10,
    'hobbies': ['music', 'read', 'dancing'],
    'addr': {
        'country': 'China',
        'city': 'BJ'
    }
}
user1 = {
    "_id": 2,
    "name": "alex",
    "birth": datetime.datetime.now(),
    "age": 10,
    'hobbies': ['music', 'read', 'dancing'],
    'addr': {
        'country': 'China',
        'city': 'weifang'
    }
}
def save_to_mongo(result):
    try:
        if db[MONGO_TABLE].insert_one(result):  # insert() is deprecated; insert_one() returns an InsertOneResult
            print("saved to database", result)
    except Exception:
        print("failed to save")

# res = table_user.insert_many([user0, user1]).inserted_ids
# print(res)
# print(table_user.count_documents({}))  # count() is deprecated
6. Query
print(table_user.find_one())
for item in table_user.find():
    print(item)
# print(table_user.find_one({"_id": {"$gte": 1}, "name": 'egon'}))
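find() also accepts a projection, and the returned cursor can be sorted and limited. A minimal sketch against the table_user collection created above (the filter values are only illustrative):

# return only name and age (hide _id), newest first, at most 10 documents
for item in table_user.find({'age': {'$gte': 10}},
                            {'_id': 0, 'name': 1, 'age': 1}).sort('birth', pymongo.DESCENDING).limit(10):
    print(item)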
7. Update
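The original notes leave this step empty; a minimal sketch using update_one / update_many on the table_user collection from above (field values are only illustrative):

# set a single field on one document
table_user.update_one({'_id': 2}, {'$set': {'age': 18}})
# modify every matching document
table_user.update_many({'addr.country': 'China'}, {'$push': {'hobbies': 'coding'}})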
8. Delete
table_user.delete_one({'_id': 2})  # remove() is deprecated; use delete_one() / delete_many()
9. Scrapy pipeline
class MongoPipeline(object):

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # upsert: insert the item if no document with this id exists, otherwise update it
        self.db[item.table_name].update_one({'id': item.get('id')},
                                            {'$set': dict(item)},
                                            upsert=True)
        return item
MONGO_URI = 'localhost'
MONGO_DATABASE = 'weibo'
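The pipeline only runs once it is registered in the project settings; a sketch, assuming the Scrapy project package is named weibo (adjust the dotted path to your own project):

# settings.py (hypothetical project package name 'weibo')
ITEM_PIPELINES = {
    'weibo.pipelines.MongoPipeline': 300,
}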
from pymongo import MongoClient


class CustomPipeline(object):

    def __init__(self, host, port, user, pwd, db, table):
        self.host = host
        self.port = port
        self.user = user
        self.pwd = pwd
        self.db = db
        self.table = table

    @classmethod
    def from_crawler(cls, crawler):
        """
        Scrapy first checks (via getattr) whether we defined from_crawler;
        if so, it calls it to build the instance.
        """
        HOST = crawler.settings.get('HOST')
        PORT = crawler.settings.get('PORT')
        USER = crawler.settings.get('USER')
        PWD = crawler.settings.get('PWD')
        DB = crawler.settings.get('DB')
        TABLE = crawler.settings.get('TABLE')
        return cls(HOST, PORT, USER, PWD, DB, TABLE)

    def open_spider(self, spider):
        """Runs once when the spider starts."""
        self.client = MongoClient('mongodb://%s:%s@%s:%s' % (self.user, self.pwd, self.host, self.port))

    def close_spider(self, spider):
        """Runs once when the spider closes."""
        self.client.close()

    def process_item(self, item, spider):
        # persist the item only if every field has a value
        d = dict(item)
        if all(d.values()):
            self.client[self.db][self.table].insert_one(d)  # save() was removed in pymongo 4.x
        return item
HOST="127.0.0.1" PORT=27017 USER="root" PWD="123" DB="amazon" TABLE="goods"