Introduction

Downloading pages with a crawler

Code

Basic download

#!/usr/bin/env python
# coding=utf-8
import urllib2

def download(url):
	# Fetch a URL and return its HTML, or None if the request fails.
	print 'Downloading:', url
	try:
		html = urllib2.urlopen(url).read()
	except urllib2.URLError as e:
		print 'Download error:', e.reason
		html = None
	return html

if __name__ == '__main__':
	download('http://www.baidu.com')

It seems this does not actually manage to download Baidu's HTML.

Retrying on 5XX server errors and setting a user agent

Many websites do not want to be visited by crawlers, but since they cannot block them completely, they apply anti-scraping measures instead. One of these is checking the User Agent (UA): it is sent in the request headers, and the server inspects it to decide who is making the request.
Different browsers send different User-Agent strings, so a crawler that does not set one is easy to identify and may have its access restricted. A common approach is to collect a number of real User-Agent strings and pick one at random for each request.
def download(url, user_agent='wswp', num_retries=2):
	print 'Downloading:', url
	headers = {'User-agent': user_agent}
	request = urllib2.Request(url, headers=headers)
	try:
		html = urllib2.urlopen(request).read()
	except urllib2.URLError as e:
		print 'Download error:', e.reason
		html = None
		if num_retries > 0:
			if hasattr(e, 'code') and 500 <= e.code < 600:
				# retry 5XX HTTP errors
				return download(url, user_agent, num_retries-1)
	return html
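The paragraph above suggests rotating among several User-Agent strings rather than always sending the same one. Below is a minimal sketch of that idea, reusing the download() function defined above; the browser strings in USER_AGENTS are illustrative examples, not part of the original code.

import random

# A few example desktop browser strings; in practice you would collect more.
USER_AGENTS = [
	'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0 Safari/537.36',
	'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
	'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
]

def download_random_ua(url, num_retries=2):
	# Same retry logic as download(), but with a randomly chosen User-Agent.
	return download(url, user_agent=random.choice(USER_AGENTS), num_retries=num_retries)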

Downloading pages listed in the sitemap

import re

def crawl_sitemap(url):
	# download the sitemap file
	sitemap = download(url)
	# extract the sitemap links
	links = re.findall('<loc>(.*?)</loc>', sitemap)
	# download each link
	for link in links:
		html = download(link)
		print link
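A minimal usage sketch, assuming the target site exposes its sitemap at /sitemap.xml (the URL below is the example site used throughout this post):

if __name__ == '__main__':
	crawl_sitemap('http://example.webscraping.com/sitemap.xml')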

The site appears to ignore the text before the numeric ID, so the pages can be crawled by iterating over the trailing number alone.

import itertools

def crawl_string():
	# Iterate over numeric page IDs until a download fails.
	for page in itertools.count(1):
		url = 'http://example.webscraping.com/view/-%d' % page
		html = download(url)
		if html is None:
			break
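Stopping at the first failed download is fragile if some IDs have been deleted. A hedged variant (the max_errors / num_errors names are introduced here for illustration, not from the original) tolerates a few consecutive misses before giving up:

import itertools

def crawl_ids_tolerant(max_errors=5):
	# Give up only after max_errors consecutive failed downloads.
	num_errors = 0
	for page in itertools.count(1):
		url = 'http://example.webscraping.com/view/-%d' % page
		html = download(url)
		if html is None:
			num_errors += 1
			if num_errors == max_errors:
				break
		else:
			num_errors = 0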

Crawling the site by following the links on each page

import re
import urlparse

def get_links(html):
	# Return the list of href values found in anchor tags.
	webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
	return webpage_regex.findall(html)

def link_crawler(seed_url, link_regex):
	crawl_queue = [seed_url]
	# keep track of which URLs have been seen before
	seen = set(crawl_queue)
	while crawl_queue:
		url = crawl_queue.pop()
		html = download(url)
		print "getlinks", get_links(html)
		for link in get_links(html):
			if re.match(link_regex, link):
				# convert the relative link into an absolute URL
				link = urlparse.urljoin(seed_url, link)
				if link not in seen:
					seen.add(link)
					crawl_queue.append(link)

if __name__ == '__main__':
	link_crawler('http://example.webscraping.com', '/places/default/(index|view)')
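To make the regex in get_links() concrete, here is a small self-contained check; the HTML snippet is made up for illustration:

sample_html = '<a href="/places/default/view/1">One</a> <a class="x" href="/places/default/index/2">Two</a>'
print get_links(sample_html)
# expected output: ['/places/default/view/1', '/places/default/index/2']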

Adding support for parsing robots.txt

import robotparser

def link_crawler(seed_url, link_regex):
	# Parse the site's robots.txt before crawling.
	rp = robotparser.RobotFileParser()
	rp.set_url(seed_url + '/robots.txt')
	rp.read()
	crawl_queue = [seed_url]
	# keep track of which URLs have been seen before
	seen = set(crawl_queue)
	while crawl_queue:
		url = crawl_queue.pop()
		user_agent = 'wswp'
		if rp.can_fetch(user_agent, url):
			html = download(url)
			print "getlinks", get_links(html)
			for link in get_links(html):
				if re.match(link_regex, link):
					link = urlparse.urljoin(seed_url, link)
					if link not in seen:
						seen.add(link)
						crawl_queue.append(link)
		else:
			print 'Blocked by robots.txt:', url
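As a quick standalone check of the robotparser module; the user agent names and URLs below are just examples, and the result depends on what the site's robots.txt actually allows:

import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()
print rp.can_fetch('BadCrawler', 'http://example.webscraping.com/')
print rp.can_fetch('wswp', 'http://example.webscraping.com/')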

Proxy

def link_crawler(seed_url, link_regex, proxy=False):
	if proxy:  # proxy support is not working yet

		proxy_info = {
			'host': '106.12.38.133',
			'port': 22
		}
		# We create a handler for the proxy
		proxy_support = urllib2.ProxyHandler({"http": "http://%(host)s:%(port)d" % proxy_info})
		# We create an opener which uses this handler:
		opener = urllib2.build_opener(proxy_support)
		# Then we install this opener as the default opener for urllib2:
		urllib2.install_opener(opener)

		# If the proxy requires authentication:
		proxy_info = {
			'host': '106.12.38.133',
			'port': 20,
			'user': 'root',
			'pass': 'Woaini7758258!'
		}
		proxy_support = urllib2.ProxyHandler({"http": "http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info})
		opener = urllib2.build_opener(proxy_support)
		urllib2.install_opener(opener)
		#htmlpage = urllib2.urlopen("http://sebsauvage.net/").read(200000)

	rp = robotparser.RobotFileParser()
	rp.set_url(seed_url + '/robots.txt')
	rp.read()
	crawl_queue = [seed_url]
	# keep track of which URLs have been seen before
	seen = set(crawl_queue)
	while crawl_queue:
		url = crawl_queue.pop()
		user_agent = 'wswp'
		if rp.can_fetch(user_agent, url):
			html = download(url)
			print "getlinks", get_links(html)
			for link in get_links(html):
				if re.match(link_regex, link):
					link = urlparse.urljoin(seed_url, link)
					if link not in seen:
						seen.add(link)
						crawl_queue.append(link)
		else:
			print 'Blocked by robots.txt:', url
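Instead of installing the proxy opener globally with install_opener, an alternative (a sketch, not from the original post; the host and port are placeholders) is to keep the opener local and route individual requests through it with opener.open():

import urllib2

def download_via_proxy(url, proxy_host, proxy_port, user_agent='wswp'):
	# Build an opener that routes HTTP traffic through the given proxy,
	# without changing urllib2's global default opener.
	proxy_support = urllib2.ProxyHandler({'http': 'http://%s:%d' % (proxy_host, proxy_port)})
	opener = urllib2.build_opener(proxy_support)
	request = urllib2.Request(url, headers={'User-agent': user_agent})
	try:
		return opener.open(request).read()
	except urllib2.URLError as e:
		print 'Download error:', e.reason
		return None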

Throttling downloads

import datetime
import time
import urlparse

class Throttle:
	"""
	Add a delay between downloads to the same domain; call wait() before each download.
	"""
	def __init__(self, delay):
		self.delay = delay
		# timestamp of the last access to each domain
		self.domains = {}

	def wait(self, url):
		domain = urlparse.urlparse(url).netloc
		last_accessed = self.domains.get(domain)

		if self.delay > 0 and last_accessed is not None:
			sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
			if sleep_secs > 0:
				time.sleep(sleep_secs)
		self.domains[domain] = datetime.datetime.now()
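A minimal sketch of how the crawl loop could use Throttle, calling wait() right before each download; the 1-second delay and the helper name throttled_download are just examples:

throttle = Throttle(1)  # at least 1 second between requests to the same domain

def throttled_download(url):
	throttle.wait(url)
	return download(url)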

