python系列整理---爬虫架构简单代码实现
爬虫spider流程示意图
根据以上流程,下面用代码简单实现爬虫功能。这里只演示一种简化的实现思路,实际项目会更复杂,本文不做具体讨论。
1. 目录
2. engine.py
# encoding=utf-8
import os

from spider.scheduler import Scheduler


def read_urls(file_path):
    """Return the non-blank lines of a text file, stripped of whitespace.

    :param file_path: path of a UTF-8 text file read line by line.
    :return: list of stripped, non-empty lines in file order.
    """
    # Read-only mode: the previous 'r+' also requested write access, which
    # makes the call fail on a read-only file although nothing is written.
    with open(file_path, 'r', encoding='utf-8') as fp:
        stripped = (line.strip() for line in fp)
        return [line for line in stripped if line]


def engine():
    """Run one crawl pass: read urls.txt, download, parse, then store.

    urls.txt is looked up in the directory that contains this module.
    """
    # Anchor on the absolute module directory and join portably. When
    # __file__ is a bare relative name, dirname() returns '' and plain
    # string concatenation would yield '/urls.txt' (the filesystem root).
    base_dir = os.path.dirname(os.path.abspath(__file__))
    urls = read_urls(os.path.join(base_dir, 'urls.txt'))
    htmls = Scheduler.download(urls)
    data = Scheduler.analysis(htmls)
    Scheduler.storage(data)


if __name__ == '__main__':
    engine()
3. scheduler.py
# encoding=utf-8
from spider.downloader import Download
from spider.analysis import Analysis
from spider.storage import Storage


class Scheduler:
    """Static helpers that chain the download, parse and store steps."""

    def __init__(self):
        pass

    @staticmethod
    def download(urls):
        """Fetch every URL with Download.get.

        :param urls: a list of URLs; any non-list value is wrapped into a
            one-element list first.
        :return: list of (url, html) tuples in input order.
        """
        if not isinstance(urls, list):
            urls = [urls]
        return [(target, Download.get(target)) for target in urls]

    @staticmethod
    def analysis(_tuple):
        """Parse downloaded pages with Analysis.parse.

        :param _tuple: iterable of (url, html) pairs,
            e.g. [(url, html), (url, html)].
        :return: list holding one Analysis.parse result per pair.
        """
        return [
            Analysis.parse(page_url, page_html)
            for page_url, page_html in _tuple
        ]

    @staticmethod
    def storage(data):
        """Pass each parsed record to Storage.storage, one at a time.

        :param data: iterable of records as returned by analysis().
        """
        for record in data:
            Storage.storage(record)
4. downloader.py
# encoding=utf-8
import requests


class Download:
    """
    HTTP fetch helpers built on requests.

    Topics the author lists for this component:
    1. Efficient crawling
    2. Common anti-anti-crawler techniques
    3. Data-volume concerns: concurrency, distribution
    """

    def __init__(self):
        pass

    @staticmethod
    def get(url, headers=None, timeout=None):
        """Send a GET request and return the response body as text.

        :param url: address to fetch.
        :param headers: optional dict of request headers; None means no
            extra headers.
        :param timeout: optional timeout forwarded to requests; None keeps
            the previous behaviour of waiting without a limit.
        :return: response body decoded by requests (``response.text``).
        """
        # A fresh dict per call replaces the shared mutable default ({}).
        headers = {} if headers is None else headers
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.text

    @staticmethod
    def post(url, data=None, headers=None, timeout=None):
        """Send a POST request and return the response body as text.

        :param url: address to post to.
        :param data: optional request payload; None means an empty dict.
        :param headers: optional dict of request headers; None means no
            extra headers.
        :param timeout: optional timeout forwarded to requests; None keeps
            the previous behaviour of waiting without a limit.
        :return: response body decoded by requests (``response.text``).
        """
        # Fresh dicts per call replace the shared mutable defaults ({}).
        data = {} if data is None else data
        headers = {} if headers is None else headers
        response = requests.post(
            url, data=data, headers=headers, timeout=timeout
        )
        return response.text

    @staticmethod
    def get_headers(params):
        """Return params unchanged (placeholder; no header logic yet)."""
        return params
5. storage.py
# encoding=utf-8
import hashlib
import pymysql


class Storage:
    """Insert-or-update helper that writes records through SqlUtil."""

    # Name of the table passed to every SqlUtil call.
    table = 'spider'

    def __init__(self):
        pass

    @staticmethod
    def storage(params):
        """Insert params, or update the existing row with the same id.

        The id is the MD5 hex digest of params['url'].

        :param params: mapping that contains at least a 'url' key; it is
            passed to SqlUtil as the values to write.
        :return: None
        """
        # NOTE(review): SqlUtil is neither imported nor defined in this
        # listing - confirm where it is expected to come from.
        db = SqlUtil('127.0.0.1', 3306, 'root', '123456', 'mysql')
        row_id = Storage.url2md5(url=params['url'])
        if not db.exists(Storage.table, row_id):
            db.insert(Storage.table, params)
            return
        db.update(Storage.table, where={'id': row_id}, dict_value=params)

    @staticmethod
    def url2md5(url):
        """Return the MD5 hex digest of url.

        :param url: str (UTF-8 encoded before hashing) or bytes-like value.
        :return: 32-character hexadecimal digest string.
        """
        raw = url.encode('utf-8') if isinstance(url, str) else url
        return hashlib.md5(raw).hexdigest()
6. urls.txt
https://www.cnblogs.com/sui776265233/p/9719463.html
https://github.com/jiangsiwei2018/BigData.git
实例代码git仓库地址