A Complete Web Crawler
# encoding=utf-8
import requests
from fake_useragent import UserAgent
from retrying import retry
import hashlib                  # message digests (md5, sha)
import queue                    # FIFO queue
import re                       # regular expressions
from urllib import robotparser  # parse the site's robots.txt file
from urllib.parse import urlparse, urljoin, urldefrag  # parse URLs
from threading import Thread    # multithreading
from datetime import datetime
import time
import mongo_cache
import random

MAX_DEP = 2  # maximum crawl depth


def get_robots(url):
    """
    Parse the robots.txt file.
    :param url:
    :return:
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, 'robots.txt'))
    rp.read()
    return rp


def save_url(html_content, url_str):
    """
    Save the downloaded content to disk.
    :param html_content:
    :param url_str:
    :return:
    """
    md5 = hashlib.md5()
    md5.update(html_content)
    # file_path = "./download/" + md5.hexdigest() + ".html"
    file_path = "./download/" + gen_html_name(url_str) + ".html"
    with open(file_path, "wb") as f:
        f.write(html_content)


def gen_html_name(url_str):
    """Derive a file name from the last segment of the URL path."""
    path = urlparse(url_str).path
    path_array = path.split('/')
    return path_array[len(path_array) - 1]


def extractor_url_lists(html_content):
    """
    Extract the other links contained in a page.
    :param html_content:
    :return:
    """
    url_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return url_regex.findall(html_content)


class CrawlerCommon(Thread):
    """
    A general-purpose crawler covering the basic crawling workflow and a few
    counter-measures against anti-crawling techniques.
    """
    def __init__(self, init_url):
        super(CrawlerCommon, self).__init__()
        __ua = UserAgent()                          # random User-Agent generator
        self.seed_url = init_url                    # seed URL the crawl starts from
        self.crawler_queue = queue.Queue()          # a FIFO queue gives BFS, a LIFO queue gives DFS
        self.crawler_queue.put(init_url)            # put the seed URL into the queue
        self.visited = {init_url: 0}                # the seed URL starts at crawl depth 0
        self.rp = get_robots(init_url)              # initialize the robots.txt parser
        self.headers = {"User-Agent": __ua.random}  # pick a random User-Agent
        self.link_regex = '(index|view)'            # filter condition for extracted URLs
        self.throttle = Throttle(5.0)               # download throttle: 5-second interval
        self.mcache = mongo_cache.MongoCache()      # initialize the MongoDB cache

    @retry(stop_max_attempt_number=3)
    def retry_download(self, url_str, data, method, proxies):
        """
        Download with retries, implemented with the retrying decorator.
        :param url_str:
        :param data:
        :param method:
        :param proxies:
        :return:
        """
        if method == "POST":
            result = requests.post(url_str, data=data, headers=self.headers, proxies=proxies)
        else:
            result = requests.get(url_str, headers=self.headers, timeout=3, proxies=proxies)
        assert result.status_code == 200  # assertion fails on non-200 responses, triggering a retry
        return result.content

    def download(self, url_str, data=None, method="GET", proxies={}):
        """
        The actual download method.
        """
        print("download url is ::::::", url_str)
        try:
            # random proxy selection could be added here
            result = self.retry_download(url_str, data, method, proxies)
        except Exception as e:  # Python 3 requires "except ... as e"
            print(e)
            result = None
        return result

    def nomalize(self, url_str):
        """
        Normalize an extracted link into an absolute URL.
        :param url_str:
        :return:
        """
        real_url, _ = urldefrag(url_str)
        return urljoin(self.seed_url, real_url)

    def save_result(self, html_content, url_str):
        """
        Save the result to the database, checking first whether the content already exists.
        :param html_content: downloaded binary content
        :param url_str: URL of the downloaded page
        :return: None
        """
        if url_str not in self.mcache:
            self.mcache[url_str] = html_content
        else:
            data_from_mongo = self.mcache[url_str]
            # initialize the md5 algorithm
            md5_func_mongo = hashlib.md5()
            md5_func_download = hashlib.md5()
            # md5 digest of the record stored in the database
            md5_func_mongo.update(data_from_mongo)
            mongo_md5_str = md5_func_mongo.hexdigest()
            # md5 digest of the downloaded data
            md5_func_download.update(html_content)
            download_md5_str = md5_func_download.hexdigest()
            # only update the database when the downloaded content differs from the stored copy
            if download_md5_str != mongo_md5_str:
                self.mcache[url_str] = html_content

    def run(self):
        """
        Main crawl loop.
        :return:
        """
        while not self.crawler_queue.empty():
            url_str = self.crawler_queue.get()
            # check the rules in robots.txt
            if self.rp.can_fetch(self.headers["User-Agent"], url_str):
                self.throttle.wait_url(url_str)
                depth = self.visited[url_str]
                if depth < MAX_DEP:
                    # download the page
                    html_content = self.download(url_str)
                    # store the page
                    if html_content is not None:
                        self.mcache[url_str] = html_content
                        save_url(html_content, url_str)
                        # extract all links from the page
                        url_list = extractor_url_lists(html_content.decode('utf-8'))
                        # keep only the links we want to crawl
                        filter_urls = [link for link in url_list if re.search('/(html)', link)]
                        for url in filter_urls:
                            # normalize the link
                            real_url = self.nomalize(url)
                            # skip links that have already been visited
                            if real_url not in self.visited:
                                # print("link is ::::::", real_url)
                                self.visited[real_url] = depth + 1
                                self.crawler_queue.put(real_url)
            else:
                print("robots.txt forbids downloading:", url_str)


class Throttle(object):
    """
    Download throttle.
    """
    def __init__(self, delay):
        self.domains = {}
        self.delay = delay

    def wait_url(self, url_str):
        domain_url = urlparse(url_str).netloc          # domain part of the URL (netloc)
        last_accessed = self.domains.get(domain_url)   # time this domain was last downloaded from
        if self.delay > 0 and last_accessed is not None:
            # subtract the last download time from the current time to get the interval between
            # the two downloads, then subtract that interval from the delay: sleep if the result
            # is positive, otherwise download the next link immediately
            sleep_interval = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_interval > 0:
                time.sleep(sleep_interval)
        self.domains[domain_url] = datetime.now()  # record the current time under the domain key


if __name__ == "__main__":
    crawler = CrawlerCommon("http://www.runoob.com/html/html5-intro.html")
    crawler.start()
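
The listing imports a custom mongo_cache module that is not shown in this section. The crawler only touches it through a dict-like interface: membership tests (url_str not in self.mcache), item reads, and item writes keyed by URL. Below is a minimal sketch of such a module built on pymongo; the database and collection names, the zlib compression, and the timestamp field are illustrative assumptions, not the original implementation.

# mongo_cache.py -- minimal sketch of the MongoCache interface the crawler relies on.
# Assumes a MongoDB instance reachable on localhost:27017; names below are placeholders.
import pickle
import zlib
from datetime import datetime

from pymongo import MongoClient
from bson.binary import Binary


class MongoCache(object):
    """Dict-like cache that stores downloaded pages in MongoDB, keyed by URL."""

    def __init__(self, client=None):
        # connect to a local MongoDB instance unless a client is supplied
        self.client = MongoClient("localhost", 27017) if client is None else client
        self.db = self.client.cache        # assumed database name
        self.collection = self.db.webpage  # assumed collection name

    def __contains__(self, url):
        # supports "url_str not in self.mcache"
        return self.collection.find_one({"_id": url}) is not None

    def __getitem__(self, url):
        # return the stored binary content for a URL, or raise KeyError if absent
        record = self.collection.find_one({"_id": url})
        if record is None:
            raise KeyError(url + " does not exist")
        return pickle.loads(zlib.decompress(record["result"]))

    def __setitem__(self, url, result):
        # compress and upsert the page content, remembering when it was stored
        record = {
            "result": Binary(zlib.compress(pickle.dumps(result))),
            "timestamp": datetime.utcnow(),
        }
        self.collection.update_one({"_id": url}, {"$set": record}, upsert=True)

Because the crawler only uses "in", item reads, and item writes, any object exposing __contains__, __getitem__, and __setitem__ can be dropped in here, which makes it easy to substitute an in-memory dict for MongoDB when testing the crawler locally.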