(3) Adding a Cache to the Crawler

  Caching pages that have already been crawled means the next visit can be served locally, without downloading the page again. The implementation splits into two parts: a downloader and a cache.

1. Downloader:

  Downloader behaviour: before downloading, check whether the cache already holds a result for the url; only on a cache miss is the page fetched from the web, and the result is then added to the cache. The downloader code is as follows:

class Downloader(object):
    def __init__(self, user_agent=None, proxies=None, num_retries=3, delay=5, cache=None):
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.throttle = Throttle(delay)
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache is not None:
            try:
                result = self.cache[url]  # try to fetch the result from the cache first
            except KeyError:
                pass  # url is not cached yet
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    result = None  # cached server error: discard it and download again
        if result is None:
            self.throttle.wait(url)
            response = self.download(url, self.user_agent, self.proxies, self.num_retries)
            result = {'html': response.text, 'code': response.status_code}
            if self.cache is not None:
                self.cache[url] = result  # save the result to the cache
        return result['html']

    def download(self, url, user_agent, proxies, num_retries):
        response = requests.get(url, headers={'User-Agent': user_agent}, proxies=proxies)
        if response.status_code and 500 <= response.status_code < 600:  # retry on server-side (5xx) errors
            if num_retries > 0:
                response = self.download(url, user_agent, proxies, num_retries - 1)
        return response

# limit the download rate per domain
class Throttle(object):
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc  # extract the domain from the url
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
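
  The Downloader only assumes that its cache behaves like a dictionary: reading cache[url] raises KeyError on a miss, and writing cache[url] = result stores the page. As a minimal sketch (the URL is only a placeholder, and the imports from the complete listing further down are assumed), a plain dict therefore already works as an in-memory cache:

# Minimal sketch: any dict-like object can serve as the cache.
down = Downloader(cache={})           # plain dict used as an in-memory cache
html = down('https://example.com/')   # cache miss: the page is downloaded and stored
html = down('https://example.com/')   # cache hit: the page is returned without a new request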

  When crawling pages, the downloader is used like this:

def link_carwl(start_url, link_regex, max_depth=5, callback=None, user_agent=None, proxies=None, num_retries=3, delay=5, cache=None):
    url_queue = [start_url]
    seen = {start_url: 0}
    down = Downloader(user_agent=user_agent, proxies=proxies, num_retries=num_retries, delay=delay, cache=cache)
    while url_queue:
        url = url_queue.pop()
        html = down(url)
        if callback is not None:
            callback(url, html)
        depth = seen[url]
        if depth < max_depth:
            for link in get_links(html):
                if re.match(link_regex, link):
                    # urlparse.urljoin(url, link)  # link may be a relative path; see the sketch after this listing
                    if link not in seen:  # skip urls that have already been visited
                        seen[link] = depth + 1  # one level deeper than the current url
                        url_queue.append(link)

# extract all link urls from the html
def get_links(html):
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)  # ["\'] matches single or double quotes
    return webpage_regex.findall(html)
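
  Note that get_links returns the raw href values, which may be relative paths; the commented-out urlparse.urljoin call marks where they would need to be resolved against the page URL before they can match link_regex or be downloaded. A small sketch of that step (the relative path below is hypothetical):

# Sketch: resolving a relative href against the page it was found on (Python 2 urlparse).
import urlparse
absolute = urlparse.urljoin('https://nj.lianjia.com/ershoufang/', '/ershoufang/pg2/')
print absolute  # https://nj.lianjia.com/ershoufang/pg2/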

2. Cache:

  2.1 Disk cache

    In the downloader code above, the cache is accessed like a dictionary, with the url as key and the page content as value, so a cache class must implement the __getitem__() and __setitem__() methods. In addition, cached entries should have an expiry time, and the page content should be compressed to reduce the disk space required.

    Idea: hash the url and use the digest as the file name, then write the content to that file and read it back later. The implementation is as follows:

class DiskCache(object):
    def __init__(self, cache_dir='cache', expires=timedelta(days=30)):
        self.cache_dir = cache_dir
        self.expires = expires  # cache entries are valid for 30 days by default
        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)

    def url_to_path(self, url):  # hash the url and use the digest as the file name
        h = hashlib.md5()
        h.update(url)
        return h.hexdigest()

    def __getitem__(self, url):
        path = os.path.join(self.cache_dir, self.url_to_path(url))
        if os.path.exists(path):
            with open(path, 'rb') as f:
                result, timestamp = pickle.loads(zlib.decompress(f.read()))
            if datetime.utcnow() > timestamp + self.expires:  # check whether the cached entry has expired
                raise KeyError(url + ' has expired!')
            return result
        else:
            raise KeyError(url + ' does not exist!')

    def __setitem__(self, url, result):
        path = os.path.join(self.cache_dir, self.url_to_path(url))
        timestamp = datetime.utcnow()
        data = pickle.dumps((result, timestamp))  # store a timestamp so expiry can be checked on read
        with open(path, 'wb') as f:
            f.write(zlib.compress(data))  # compress to save disk space
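
  A quick way to check DiskCache in isolation is to use it like a dictionary; the entry below (URL and content are placeholders, and the imports from the complete listing are assumed) ends up as a compressed pickle file under the cache/ directory:

# Sketch: exercising DiskCache directly with placeholder data.
cache = DiskCache()
cache['https://example.com/'] = {'html': '<html>...</html>', 'code': 200}
print cache['https://example.com/']['code']  # 200, read back from the file on disk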

  The complete code using the downloader together with the disk cache:

#coding:utf-8

# add a cache to the crawler

import requests
import re
import urlparse
from datetime import datetime,timedelta
import time
import hashlib
import os
import pickle
import zlib



class Downloader(object):
    def __init__(self, user_agent=None, proxies=None, num_retries=3, delay=5, cache=None):
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.throttle = Throttle(delay)
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache is not None:
            try:
                result = self.cache[url]  # try to fetch the result from the cache first
            except KeyError:
                pass  # url is not cached yet
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    result = None  # cached server error: discard it and download again
        if result is None:
            self.throttle.wait(url)
            response = self.download(url, self.user_agent, self.proxies, self.num_retries)
            result = {'html': response.text, 'code': response.status_code}
            if self.cache is not None:
                self.cache[url] = result  # save the result to the cache
        return result['html']

    def download(self, url, user_agent, proxies, num_retries):
        response = requests.get(url, headers={'User-Agent': user_agent}, proxies=proxies)
        if response.status_code and 500 <= response.status_code < 600:  # retry on server-side (5xx) errors
            if num_retries > 0:
                response = self.download(url, user_agent, proxies, num_retries - 1)
        return response

# limit the download rate per domain
class Throttle(object):
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc  # extract the domain from the url
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()

class DiskCache(object):
    def __init__(self, cache_dir='cache', expires=timedelta(days=30)):
        self.cache_dir = cache_dir
        self.expires = expires  # cache entries are valid for 30 days by default
        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)

    def url_to_path(self, url):  # hash the url and use the digest as the file name
        h = hashlib.md5()
        h.update(url)
        return h.hexdigest()

    def __getitem__(self, url):
        path = os.path.join(self.cache_dir, self.url_to_path(url))
        if os.path.exists(path):
            with open(path, 'rb') as f:
                result, timestamp = pickle.loads(zlib.decompress(f.read()))
            if datetime.utcnow() > timestamp + self.expires:  # check whether the cached entry has expired
                raise KeyError(url + ' has expired!')
            return result
        else:
            raise KeyError(url + ' does not exist!')

    def __setitem__(self, url, result):
        path = os.path.join(self.cache_dir, self.url_to_path(url))
        timestamp = datetime.utcnow()
        data = pickle.dumps((result, timestamp))  # store a timestamp so expiry can be checked on read
        with open(path, 'wb') as f:
            f.write(zlib.compress(data))  # compress to save disk space

def link_carwl(start_url, link_regex, max_depth=5, callback=None, user_agent=None, proxies=None, num_retries=3, delay=5, cache=None):
    url_queue = [start_url]
    seen = {start_url: 0}
    down = Downloader(user_agent=user_agent, proxies=proxies, num_retries=num_retries, delay=delay, cache=cache)
    while url_queue:
        url = url_queue.pop()
        html = down(url)
        if callback is not None:
            callback(url, html)
        depth = seen[url]
        if depth < max_depth:
            for link in get_links(html):
                if re.match(link_regex, link):
                    # urlparse.urljoin(url, link)  # link may be a relative path
                    if link not in seen:  # skip urls that have already been visited
                        seen[link] = depth + 1  # one level deeper than the current url
                        url_queue.append(link)

# extract all link urls from the html
def get_links(html):
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)  # ["\'] matches single or double quotes
    return webpage_regex.findall(html)

if __name__ == '__main__':
    link_carwl('https://nj.lianjia.com/ershoufang/', r'https://nj.lianjia.com/ershoufang/.*', max_depth=1, cache=DiskCache())
    d = Downloader(cache=DiskCache())
    print d.cache['https://nj.lianjia.com/ershoufang/']['html']

  2.2 MongoDB cache

  With the disk cache, the number of files a single directory can hold is limited by the file system (65,535 for FAT32), and expired cache entries have to be cleaned up by hand. A database cache can store far more entries, clears out expired data automatically, and is also fairly simple to implement. As before, the url is the key and the page is the value; the implementation is as follows:

from datetime import datetime, timedelta
import pickle
import zlib
from pymongo import MongoClient
from bson.binary import Binary

class MongoCache(object):
    def __init__(self, client=None, expires=timedelta(days=30)):
        self.client = MongoClient('127.0.0.1', 27017) if client is None else client
        self.db = self.client.cache  # use (and create on first write) the cache database
        self.collection = self.db.webpage  # the webpage collection (a collection is roughly a table)
        # TTL index: MongoDB removes documents automatically once their timestamp is older than expires
        self.collection.create_index('timestamp', expireAfterSeconds=expires.total_seconds())

    def __getitem__(self, url):
        record = self.collection.find_one({'_id': url})
        if record:
            return pickle.loads(zlib.decompress(record['result']))
        else:
            raise KeyError(url + ' does not exist!')

    def __setitem__(self, url, result):
        # store the compressed pickle as binary data, plus a timestamp for the TTL index
        record = {'result': Binary(zlib.compress(pickle.dumps(result))), 'timestamp': datetime.utcnow()}
        self.collection.update({'_id': url}, {'$set': record}, upsert=True)
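
  MongoCache can be checked the same way. One caveat worth noting: MongoDB's TTL monitor only runs about once a minute, so expired documents disappear shortly after their expiry time rather than at the exact moment. A minimal sketch (placeholder URL and content; a local mongod on 127.0.0.1:27017 is assumed):

# Sketch: exercising MongoCache directly with placeholder data.
cache = MongoCache(expires=timedelta(days=30))
cache['https://example.com/'] = {'html': '<html>...</html>', 'code': 200}
print cache['https://example.com/']['code']  # 200, read back from MongoDB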

  The complete code using the downloader together with the MongoDB cache:

#coding:utf-8

# add a cache to the crawler

import requests
import re
import urlparse
from datetime import datetime,timedelta
import time
import hashlib
import os
import pickle
import zlib
from pymongo import MongoClient
from bson.binary import Binary



class Downloader(object):
    def __init__(self, user_agent=None, proxies=None, num_retries=3, delay=5, cache=None):
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.throttle = Throttle(delay)
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache is not None:
            try:
                result = self.cache[url]  # try to fetch the result from the cache first
            except KeyError:
                pass  # url is not cached yet
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    result = None  # cached server error: discard it and download again
        if result is None:
            self.throttle.wait(url)
            response = self.download(url, self.user_agent, self.proxies, self.num_retries)
            result = {'html': response.text, 'code': response.status_code}
            if self.cache is not None:
                self.cache[url] = result  # save the result to the cache
        return result['html']

    def download(self, url, user_agent, proxies, num_retries):
        response = requests.get(url, headers={'User-Agent': user_agent}, proxies=proxies)
        if response.status_code and 500 <= response.status_code < 600:  # retry on server-side (5xx) errors
            if num_retries > 0:
                response = self.download(url, user_agent, proxies, num_retries - 1)
        return response

# limit the download rate per domain
class Throttle(object):
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc  # extract the domain from the url
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()

class MongoCache(object):
    def __init__(self, client=None, expires=timedelta(days=30)):
        self.client = MongoClient('127.0.0.1', 27017) if client is None else client
        self.db = self.client.cache  # use (and create on first write) the cache database
        self.collection = self.db.webpage  # the webpage collection (a collection is roughly a table)
        # TTL index: MongoDB removes documents automatically once their timestamp is older than expires
        self.collection.create_index('timestamp', expireAfterSeconds=expires.total_seconds())

    def __getitem__(self, url):
        record = self.collection.find_one({'_id': url})
        if record:
            return pickle.loads(zlib.decompress(record['result']))
        else:
            raise KeyError(url + ' does not exist!')

    def __setitem__(self, url, result):
        # store the compressed pickle as binary data, plus a timestamp for the TTL index
        record = {'result': Binary(zlib.compress(pickle.dumps(result))), 'timestamp': datetime.utcnow()}
        self.collection.update({'_id': url}, {'$set': record}, upsert=True)

def link_carwl(start_url, link_regex, max_depth=5, callback=None, user_agent=None, proxies=None, num_retries=3, delay=5, cache=None):
    url_queue = [start_url]
    seen = {start_url: 0}
    down = Downloader(user_agent=user_agent, proxies=proxies, num_retries=num_retries, delay=delay, cache=cache)
    while url_queue:
        url = url_queue.pop()
        html = down(url)
        if callback is not None:
            callback(url, html)
        depth = seen[url]
        if depth < max_depth:
            for link in get_links(html):
                if re.match(link_regex, link):
                    # urlparse.urljoin(url, link)  # link may be a relative path
                    if link not in seen:  # skip urls that have already been visited
                        seen[link] = depth + 1  # one level deeper than the current url
                        url_queue.append(link)

# extract all link urls from the html
def get_links(html):
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)  # ["\'] matches single or double quotes
    return webpage_regex.findall(html)

if __name__ == '__main__':
    link_carwl('https://nj.lianjia.com/ershoufang/', r'https://nj.lianjia.com/ershoufang/.*', max_depth=1, cache=MongoCache())
    d = Downloader(cache=MongoCache())
    print d.cache['https://nj.lianjia.com/ershoufang/']['html']

 
