Python 3 Crawler - 1
Note on HTTP errors: 5xx codes are server-side errors; 4xx codes mean a bad address or some other client-side error.
1. Basic page download
from urllib import request
from urllib.error import URLError

def download(url):
    req = request.Request(url)
    html = request.urlopen(req).read().decode('utf-8')
    return html
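A minimal usage sketch (the URL is only an example; any reachable page works):
# Example call; example.webscraping.com is the demo site used later in these notes.
html = download('http://example.webscraping.com')
print(html[:100])    # first 100 characters of the page source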
2. When too many links fail to download, the crawler needs a retry policy: retry a failed link a limited number of times, then give up.
def download(url, num_retried=2):
    print('download:', url)
    try:
        req = request.Request(url)
        html = request.urlopen(req).read().decode('utf-8')
    except URLError as e:
        print('Download Error:', e.reason)
        html = None
        if num_retried > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry only on 5xx server errors
                return download(url, num_retried - 1)
    return html
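To watch the retries happen, point the function at an address that answers with a 5xx status; httpstat.us is a public test service assumed to be reachable here:
# Assumed test endpoint that always returns HTTP 500.
# Expect one initial attempt plus two retries, then None.
html = download('http://httpstat.us/500')
print(html)    # None once the retries are exhausted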
3. Retry download with a user agent
def download(url, num_retried=2, user_agent='wswp'):
    print('download:', url)
    headers = {'User-agent': user_agent}
    req = request.Request(url, headers=headers)
    try:
        html = request.urlopen(req).read()
    except URLError as e:
        print('Download Error:', e.reason)
        html = None
        if num_retried > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry with the same user agent on 5xx server errors
                return download(url, num_retried - 1, user_agent)
    return html
4. Sitemap crawler
import re

def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    print(type(sitemap))
    # extract the links listed between <loc> tags
    pattern = re.compile('<loc>(.*?)</loc>')
    links = pattern.findall(str(sitemap))
    # download each linked page
    for link in links:
        html = download(link)
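Usage sketch, assuming the target site publishes its sitemap at /sitemap.xml (adjust the path if it does not):
# Assumed sitemap location for the demo site.
crawl_sitemap('http://example.webscraping.com/sitemap.xml')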
5. ID iteration crawler
import itertools

def crawl_ID(url):
    # keep incrementing the numeric ID until a page fails to download
    for page in itertools.count(1):
        urls = url + '-%d' % page
        html = download(urls)
        if html is None:
            break
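Usage sketch, assuming the site numbers its pages with a trailing ID so that the crawler generates url + '-1', url + '-2', and so on (an assumed URL pattern):
# Assumed pattern: pages live at .../view/-1, .../view/-2, ...
crawl_ID('http://example.webscraping.com/view/')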
6. Upgraded ID crawler: downloading only stops after max_error (e.g. 5) consecutive link errors
def crawl_ID_1(url, max_error):
    num_error = 0
    for page in itertools.count(1):
        urls = url + '-%d' % page
        html = download(urls)
        if html is None:
            # count consecutive failures
            num_error += 1
            if num_error == max_error:
                break
        else:
            # reset the counter after a successful download
            num_error = 0
7. Link crawler
import re

def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # queue the links that match the regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                crawl_queue.append(link)

def get_links(html):
    # extract every href value from the <a> tags on the page
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(str(html))
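Usage sketch; the regex is site-specific and the one below is only an example. Note that relative links found this way will not download correctly until step 8 resolves them into absolute URLs:
# Example: follow only links whose href matches /index or /view on the demo site.
link_crawler('http://example.webscraping.com', '/(index|view)')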
8. Upgraded link crawler: resolves links into absolute URLs
from urllib.parse import urljoin

def link_crawler_1(seed_url, link_regex):
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):
                # resolve the (possibly relative) link against the seed URL
                link = urljoin(seed_url, link)
                crawl_queue.append(link)

def get_links(html):
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(str(html))
9. Parsing the robots.txt file
Run in an interactive shell:
import urllib.robotparser
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()
url = 'http://example.webscraping.com'
user_agent = 'BadCrawler'
rp.can_fetch(user_agent, url)
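A sketch of how this check could be wired into the crawl loop from step 7 (variable names reuse the snippets above):
# Sketch: skip URLs that robots.txt forbids for this user agent.
while crawl_queue:
    url = crawl_queue.pop()
    if rp.can_fetch(user_agent, url):
        html = download(url)
    else:
        print('Blocked by robots.txt:', url)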
10. Proxy support
from urllib import request
from urllib.error import URLError
from urllib.parse import urlparse

def download(url, user_agent='wswp', proxy=None, num_retries=2):
    print('download:', url)
    headers = {'User-agent': user_agent}
    req = request.Request(url, headers=headers)
    opener = request.build_opener()
    if proxy:
        # route requests for this URL's scheme through the proxy
        proxy_params = {urlparse(url).scheme: proxy}
        opener.add_handler(request.ProxyHandler(proxy_params))
    try:
        html = opener.open(req).read()
    except URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry on 5xx server errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html
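Usage sketch; the proxy address below is a hypothetical local proxy and must be replaced with a real host:port:
# Hypothetical proxy; substitute a working HTTP proxy before running.
html = download('http://example.webscraping.com', proxy='http://127.0.0.1:8080')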
11. Download throttling
import datetime
import time
from urllib.parse import urlparse

class Throttle:
    # delay downloads so the same domain is not requested too frequently
    def __init__(self, delay):
        self.delay = delay      # minimum seconds between requests to one domain
        self.domains = {}       # domain -> time of its last access
    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()
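Usage sketch: call wait() before each download so the same domain is hit at most once per delay interval:
# Sketch: allow at most one request every 2 seconds per domain.
throttle = Throttle(2)
for link in ['http://example.webscraping.com/view/-1',
             'http://example.webscraping.com/view/-2']:
    throttle.wait(link)     # sleeps if this domain was accessed less than 2 s ago
    html = download(link)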
12. Avoiding duplicate downloads
def link_crawler_1(seed_url, link_regex):
    crawl_queue = [seed_url]
    seen = set([seed_url])      # URLs that have already been queued
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
13. Avoiding crawler traps (limit the crawl depth)
def link_crawler(seed_url, link_regex, max_depth=2):
    # record the depth at which each URL was discovered
    seen = {seed_url: 0}
    ...
    depth = seen[url]
    if depth != max_depth:
        for link in links:
            seen[link] = depth + 1
            crawl_queue.append(link)
Summary: by combining the methods above you can build a pretty good crawler; a sketch of one such combination follows below.
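A minimal sketch of that combination, reusing the helpers defined above (download from step 10, Throttle from step 11, get_links from step 7); the crawl() wrapper name and the assumption that robots.txt sits at the site root are illustrative, not part of the original snippets:
import re
import urllib.robotparser
from urllib.parse import urljoin

def crawl(seed_url, link_regex, user_agent='wswp', delay=2, max_depth=2):
    # honour robots.txt (assumed to live at <site root>/robots.txt)
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    throttle = Throttle(delay)
    crawl_queue = [seed_url]
    seen = {seed_url: 0}            # URL -> depth at which it was found
    while crawl_queue:
        url = crawl_queue.pop()
        if not rp.can_fetch(user_agent, url):
            print('Blocked by robots.txt:', url)
            continue
        throttle.wait(url)          # per-domain rate limit
        html = download(url, user_agent=user_agent)
        if html is None:
            continue
        depth = seen[url]
        if depth != max_depth:      # stop following links past the depth limit
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)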