# MongoCache: a cache class to be used while crawling; downloaded pages are stored in a MongoDB database.
# coding=utf-8
"""MongoDB-backed cache for downloaded web pages.

Values are pickled and zlib-compressed before storage. A TTL index on the
``timestamp`` field makes MongoDB delete entries automatically once they
are older than the configured expiry.
"""
import pickle
import zlib
from datetime import datetime, timedelta, timezone

import requests
from bson.binary import Binary
from pymongo import MongoClient


class MongoCache(object):
    """Database cache with dict-like access (``cache[url] = content``)."""

    def __init__(self, client=None, expires=timedelta(days=30)):
        """
        :param client: an existing ``MongoClient`` to reuse; when ``None`` a
            client connected to localhost:27017 is created.  (The original
            code accepted this argument but ignored it — fixed.)
        :param expires: ``timedelta`` after which cached entries expire.
        """
        self.client = MongoClient("localhost", 27017) if client is None else client
        self.db = self.client.cache
        # TTL index: once a document's `timestamp` is older than
        # expireAfterSeconds, MongoDB deletes it automatically.  The index
        # also speeds up lookups; re-creating an existing index is a no-op.
        self.db.webpage.create_index(
            "timestamp", expireAfterSeconds=expires.total_seconds()
        )

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``, compressed, with a UTC timestamp.

        :param key: cache key, used as the document ``_id`` (e.g. a URL).
        :param value: any picklable object (typically response content).
        """
        # Compress the pickled value and stamp it with the current UTC time
        # (timezone-aware; datetime.utcnow() is deprecated).
        record = {
            "result": Binary(zlib.compress(pickle.dumps(value))),
            "timestamp": datetime.now(timezone.utc),
        }
        # upsert=True: insert when the key is new, overwrite otherwise.
        # $set replaces only the listed fields of the existing document.
        self.db.webpage.update_one({"_id": key}, {"$set": record}, upsert=True)

    def __getitem__(self, item):
        """Return the cached value for ``item``.

        :param item: the key used on insertion (e.g. a URL).
        :raises KeyError: when no document with this ``_id`` exists.
        """
        record = self.db.webpage.find_one({"_id": item})
        if record:
            # Reverse the storage transform: decompress, then unpickle.
            return pickle.loads(zlib.decompress(record["result"]))
        # str.format avoids a TypeError when `item` is not a string.
        raise KeyError("{} does not exist".format(item))

    def __contains__(self, item):
        """Return True when ``item`` is cached (EAFP via ``__getitem__``)."""
        try:
            self[item]
        except KeyError:
            return False
        return True

    def clear(self):
        """Drop the whole cache collection."""
        self.db.webpage.drop()


if __name__ == "__main__":
    mongo_cache = MongoCache()
    url = "http://www.51hei.com/bbs/dpj-135132-1.html"
    response = requests.get(url)
    mongo_cache[url] = response.content