Storing crawler data in MongoDB
# Before doing anything else, start the MongoDB server, then create a new file named mongo_cache.py:
import pickle
import zlib
from datetime import datetime, timedelta

import requests
from pymongo import MongoClient
from bson.binary import Binary


class MongoCache(object):
    """MongoDB-backed cache for downloaded pages."""

    def __init__(self, client=None, expires=timedelta(days=30)):
        # Reuse a client if one is passed in; otherwise connect to the local server.
        self.client = client or MongoClient("localhost", 27017)
        self.db = self.client.cache
        # Index the timestamp field to speed up lookups, and make it a TTL index:
        # once a document is older than expireAfterSeconds, MongoDB deletes it automatically.
        self.db.webpage.create_index('timestamp',
                                     expireAfterSeconds=expires.total_seconds())

    def __setitem__(self, key, value):
        # Compress the pickled value and stamp it with the current time.
        record = {"result": Binary(zlib.compress(pickle.dumps(value))),
                  "timestamp": datetime.utcnow()}
        # upsert=True makes update_one insert when the key is missing and
        # update when it exists; $set overwrites the stored fields.
        self.db.webpage.update_one({"_id": key}, {'$set': record}, upsert=True)

    def __getitem__(self, item):
        # Look the page up by _id, using item as the key.
        record = self.db.webpage.find_one({"_id": item})
        if record:
            # Found: decompress and unpickle the stored result.
            return pickle.loads(zlib.decompress(record["result"]))
        else:
            raise KeyError(item + " does not exist")  # not found: raise KeyError

    def __contains__(self, item):
        try:
            self[item]  # delegates to __getitem__
        except KeyError:
            return False  # KeyError was raised: the key is not cached
        else:
            return True  # found it: the database already holds the downloaded content

    def clear(self):
        self.db.webpage.drop()


if __name__ == '__main__':
    mongo_cache = MongoCache()
    url = 'http://tieba.baidu.com/f?kw=猫&red_tag=1'
    response = requests.get(url)
    mongo_cache[url] = response.text
    print(mongo_cache[url])
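To check that the TTL index really was created with the expiry you asked for, pymongo's index_information() lists every index on the collection. A minimal sketch of that check, assuming the mongo_cache.py file above and a local mongod (the variable names here are mine, not from the original):

from datetime import timedelta
from mongo_cache import MongoCache

cache = MongoCache(expires=timedelta(days=30))
# The index created in __init__ is named 'timestamp_1' by default;
# its expireAfterSeconds entry is the TTL in seconds (2592000.0 for 30 days).
info = cache.db.webpage.index_information()
print(info['timestamp_1'].get('expireAfterSeconds'))

Note that MongoDB refuses to re-create an existing index with a different expiry, so if you change expires later, drop the index (or call clear()) first.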
# Then create another file to exercise the cache:
import requests

import mongo_cache

download_url = "http://tieba.baidu.com/f?kw=猫&red_tag=2"
download_response = requests.get(download_url)
m_cache = mongo_cache.MongoCache()
m_cache[download_url] = download_response.content
print(m_cache[download_url].decode('utf-8'))
print(download_url in m_cache)
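The point of __contains__ is to let a crawler skip pages it has already fetched. A minimal sketch of that pattern, assuming the files above (cached_download is a hypothetical helper of mine, not part of the original code):

import requests

import mongo_cache

m_cache = mongo_cache.MongoCache()

def cached_download(url):
    # Hypothetical helper: serve from MongoDB when possible,
    # otherwise fetch over HTTP and store the raw bytes for next time.
    if url in m_cache:       # calls MongoCache.__contains__
        return m_cache[url]  # cache hit: no network request
    body = requests.get(url).content
    m_cache[url] = body      # cache miss: store, then return
    return body

page = cached_download("http://tieba.baidu.com/f?kw=猫&red_tag=1")
print(len(page))

Because __setitem__ pickles whatever it is given, the same cache can hold either response.text (str) or response.content (bytes); just be consistent about which one you store.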