Spider basics and core methods (Chapter 1)
Checking what a website is built with
import builtwith

print builtwith.parse("http://example.webscraping.com")
The most basic crawler
import urllib2

def download(url):
    print "down", url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
    return html

print download("http://example.webscraping.com")
Adding retries via recursion
import urllib2

def download(url, num_retries):
    print "down", url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0 and hasattr(e, "code") and 500 <= e.code < 600:
            # recursively retry 5xx HTTP errors (stop when num_retries runs out)
            return download(url, num_retries - 1)
    return html

print download("http://example.webscraping.com", 2)
User agent
import urllib2

def download(url, user_agent="wswp", num_retries=2):
    print "down", url
    headers = {"User-agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0 and hasattr(e, "code") and 500 <= e.code < 600:
            # recursively retry 5xx HTTP errors
            return download(url, user_agent, num_retries - 1)
    return html

print download("http://www.meetup.com")
Using the download function above: a sitemap crawler
Note: the script may read no tags at all (possibly because the <loc> tag does not exist in the sitemap)
import re

def crawl_sitemap(url):
    # download the sitemap
    site_map = download(url)
    print "site_map", site_map
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', site_map)
    print 'links', links
    # download each link
    for link in links:
        html = download(link)

crawl_sitemap("http://example.webscraping.com/sitemap.xml")
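If the sitemap download fails, the version above crashes on re.findall, and if the <loc> tag is missing it silently does nothing. A minimal defensive sketch reusing the same download function (crawl_sitemap_safe is a hypothetical name, not from the original):

import re

def crawl_sitemap_safe(url):
    # download the sitemap; stop early if the request failed
    site_map = download(url)
    if site_map is None:
        print "sitemap download failed", url
        return
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', site_map)
    if not links:
        # the <loc> tag may simply not exist in this sitemap
        print "no <loc> tags found in", url
        return
    for link in links:
        download(link)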
Iterating over IDs until an error occurs
import itertools

# itertools.count(1) is an infinite iterator: 1, 2, 3, ...
for page in itertools.count(1):
    url = "http://example.webscraping.com/view/-%d" % page
    html = download(url)
    if html is None:
        break
If an ID in the middle has been deleted, the simple loop above stops at that gap even though later records still exist. To solve this, the script below checks for 5 consecutive failures and only ends when all 5 are empty.
import itertools

max_error = 5   # maximum number of consecutive errors allowed
num_error = 0   # current count of consecutive errors

for page in itertools.count(1):
    url = "http://example.webscraping.com/view/-%d" % page
    html = download(url)
    if html is None:
        num_error += 1
        if num_error == max_error:
            # 5 consecutive errors: stop the program
            break
    else:
        num_error = 0   # the errors were not consecutive, so reset the counter
Link crawler
import re

def get_link(html):
    """Return a list of links from html."""
    # re.IGNORECASE makes the match case-insensitive
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

def link_crawler(seed_url, link_regex):
    """Crawl from seed_url, following links matched by link_regex."""
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_link(html):
            if re.match(link_regex, link):
                crawl_queue.append(link)
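One caveat: get_link usually returns relative paths such as /index or /view/..., which download cannot fetch on its own. A minimal sketch of resolving them with urlparse.urljoin before queueing (link_crawler_abs is a hypothetical name, not from the original):

import re
import urlparse

def link_crawler_abs(seed_url, link_regex):
    """Like link_crawler, but converts relative links into absolute URLs."""
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_link(html):
            if re.match(link_regex, link):
                # resolve the relative path against the seed URL
                link = urlparse.urljoin(seed_url, link)
                crawl_queue.append(link)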
Proxy support (the Python HTTP module requests handles proxies more cleanly, but the snippet below sticks with urllib2)
import urllib2
import urlparse

# fragment: url and request are assumed to come from the surrounding download code
proxy = ""
opener = urllib2.build_opener()
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
response = opener.open(request)
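For comparison, a minimal sketch of the same proxy support using the requests module mentioned above (the proxy address is a placeholder assumption, not from the original):

import requests

# placeholder proxy address; replace with a real proxy
proxies = {"http": "http://127.0.0.1:8080"}
response = requests.get("http://example.webscraping.com",
                        headers={"User-agent": "wswp"},
                        proxies=proxies)
html = response.text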
New version of the download function
import urllib2
import urlparse

def download(url, user_agent="wswp", proxy=None, num_retries=2):
    print "Downloading", url
    headers = {"User-agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, "code") and 500 <= e.code < 600:
                # retry 5xx HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html
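A short usage sketch of the new signature (the proxy address is a made-up placeholder):

# plain download, same as before
html = download("http://example.webscraping.com")

# download through a proxy (placeholder address)
html = download("http://example.webscraping.com", proxy="http://127.0.0.1:8080")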
Download throttling (adding a delay between consecutive downloads)
import time
import datetime
import urlparse

class Throttle:
    """Add a delay between downloads to the same domain."""
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently, so need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()
Using the throttle in practice
# delay, url, user_agent, proxy and num_retries come from the enclosing crawl loop
throttle = Throttle(delay)
throttle.wait(url)
result = download(url, user_agent=user_agent, proxy=proxy, num_retries=num_retries)

Crawler traps: some sites dynamically generate content (for example "next month" / "next year" links), so a naive crawler can follow them forever. The fix is to add a depth limit by changing the seen variable: it originally only recorded visited links, and now becomes a dict that also records each page's depth.

def link_crawler(..., max_depth=2):
    seen = {}
    ...
    depth = seen[url]
    if depth != max_depth:
        for link in links:
            if link not in seen:
                seen[link] = depth + 1
                crawl_queue.append(link)

To disable the depth limit, set max_depth to a negative number so depth will never equal max_depth.
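For reference, a sketch of how the pieces above might be assembled into one crawler with throttling and the depth limit. This combination is an assumption, not code from the original; it reuses the download, Throttle and get_link definitions from earlier:

import re
import urlparse

def link_crawler(seed_url, link_regex, user_agent="wswp", proxy=None,
                 num_retries=2, delay=1, max_depth=2):
    """Crawl from seed_url, following links matched by link_regex,
    with a download delay and a depth limit against crawler traps."""
    throttle = Throttle(delay)
    crawl_queue = [seed_url]
    seen = {seed_url: 0}    # visited URLs mapped to their page depth
    while crawl_queue:
        url = crawl_queue.pop()
        throttle.wait(url)
        html = download(url, user_agent=user_agent, proxy=proxy,
                        num_retries=num_retries)
        if html is None:
            continue
        depth = seen[url]
        if depth != max_depth:
            for link in get_link(html):
                if re.match(link_regex, link):
                    # resolve relative links and skip pages already seen
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)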
Calling the final version
seed_url = "http://example.webscraping.com/index"
link_regex = "/(index|view)"

# user_agent="BadCrawler": this user agent is blocked, so the crawl cannot run
link_crawler(seed_url, link_regex, user_agent="BadCrawler")

# default user agent with a depth limit of 1
link_crawler(seed_url, link_regex, max_depth=1)