Storing crawler data in MongoDB
# Before doing anything else, start the MongoDB server, then create a new file named mongo_cache.py:
import pickle
import zlib
from datetime import datetime, timedelta

import requests
from pymongo import MongoClient
from bson.binary import Binary


class MongoCache(object):
    """MongoDB-backed cache for downloaded pages."""

    def __init__(self, client=None, expires=timedelta(days=30)):
        # Reuse a client if one is passed in; otherwise connect to the local server.
        self.client = client or MongoClient("localhost", 27017)
        self.db = self.client.cache
        # Index the timestamp field to speed up lookups, and make it a TTL index:
        # once a document is older than expireAfterSeconds, MongoDB deletes it automatically.
        self.db.webpage.create_index('timestamp',
                                     expireAfterSeconds=expires.total_seconds())

    def __setitem__(self, key, value):
        # Compress the pickled value and stamp it with the current time.
        record = {"result": Binary(zlib.compress(pickle.dumps(value))),
                  "timestamp": datetime.utcnow()}
        # upsert=True makes update_one insert when the key is missing and
        # update when it exists; $set overwrites the stored fields.
        self.db.webpage.update_one({"_id": key}, {'$set': record}, upsert=True)

    def __getitem__(self, item):
        # Look the page up by _id, using item as the key.
        record = self.db.webpage.find_one({"_id": item})
        if record:
            # Found: decompress and unpickle the stored result.
            return pickle.loads(zlib.decompress(record["result"]))
        else:
            raise KeyError(item + " does not exist")  # not found: raise KeyError

    def __contains__(self, item):
        try:
            self[item]  # delegates to __getitem__
        except KeyError:
            return False  # KeyError was raised: the key is not cached
        else:
            return True  # found it: the database already holds the downloaded content

    def clear(self):
        self.db.webpage.drop()


if __name__ == '__main__':
    mongo_cache = MongoCache()
    url = 'http://tieba.baidu.com/f?kw=猫&red_tag=1'
    response = requests.get(url)
    mongo_cache[url] = response.text
    print(mongo_cache[url])
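To check that the TTL index really was created with the expiry you asked for, pymongo's index_information() lists every index on the collection. A minimal sketch of that check, assuming the mongo_cache.py file above and a local mongod (the variable names here are mine, not from the original):

from datetime import timedelta
from mongo_cache import MongoCache

cache = MongoCache(expires=timedelta(days=30))
# The index created in __init__ is named 'timestamp_1' by default;
# its expireAfterSeconds entry is the TTL in seconds (2592000.0 for 30 days).
info = cache.db.webpage.index_information()
print(info['timestamp_1'].get('expireAfterSeconds'))

Note that MongoDB refuses to re-create an existing index with a different expiry, so if you change expires later, drop the index (or call clear()) first.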
# Then create another file to exercise the cache:
import requests

import mongo_cache

download_url = "http://tieba.baidu.com/f?kw=猫&red_tag=2"
download_response = requests.get(download_url)
m_cache = mongo_cache.MongoCache()
m_cache[download_url] = download_response.content
print(m_cache[download_url].decode('utf-8'))
print(download_url in m_cache)
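The point of __contains__ is to let a crawler skip pages it has already fetched. A minimal sketch of that pattern, assuming the files above (cached_download is a hypothetical helper of mine, not part of the original code):

import requests

import mongo_cache

m_cache = mongo_cache.MongoCache()

def cached_download(url):
    # Hypothetical helper: serve from MongoDB when possible,
    # otherwise fetch over HTTP and store the raw bytes for next time.
    if url in m_cache:       # calls MongoCache.__contains__
        return m_cache[url]  # cache hit: no network request
    body = requests.get(url).content
    m_cache[url] = body      # cache miss: store, then return
    return body

page = cached_download("http://tieba.baidu.com/f?kw=猫&red_tag=1")
print(len(page))

Because __setitem__ pickles whatever it is given, the same cache can hold either response.text (str) or response.content (bytes); just be consistent about which one you store.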