A Web Crawler
Below is a simple web crawler program, written for Python 2. Starting from a given URL, it downloads each page to a local file and follows the links it finds, staying within the starting domain.
#!/usr/bin/env python

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
#from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO
from HTMLParser import HTMLParser
'''The next three lines set the default encoding to utf8. Without this, Python
falls back to ASCII when decoding, so any Unicode content raises an error.
sys is imported and then reloaded because the default import of sys normally
removes the setdefaultencoding function; reload brings it back.'''
import sys
reload(sys)
sys.setdefaultencoding('utf8')


class RetrieveURL(HTMLParser):  # a new parser class derived from HTMLParser
    def __init__(self):
        HTMLParser.__init__(self)
        # the only reason to override __init__ is to give each instance an anchorlist
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        # override handle_starttag so that every <A> tag's href attribute
        # (the hyperlink it points to) is recorded in anchorlist
        if tag == 'a' or tag == 'A':
            for t in attrs:
                if t[0] == 'href' or t[0] == 'HREF':
                    self.anchorlist.append(t[1])


class Retriever(object):  # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)  # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':
            # no file name, use the default
            # (e.g. a path with no extension, like https://www.baidu.com/file1)
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)          # local directory
        if sep != '/':                # os-independent path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):           # create archive dir if necessary
            if exists(ldir):
                unlink(ldir)
            print 'ldir is ', ldir
            makedirs(ldir)
        return path

    def download(self):  # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
            return retval
        return retval

    '''def parseAndGetLinks(self):  # parse HTML, save links (old htmllib version)
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist'''

    def parseAndGetLinks(self):  # parse HTML with RetrieveURL, save links
        self.parser = RetrieveURL()
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):  # manage entire crawling process
    count = 0  # static downloaded page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0][0] == '*':  # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()  # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):  # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
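To make the parsing step easier to follow, here is a minimal sketch of the RetrieveURL class used on its own, outside the crawler. It assumes the script above has been saved as crawl.py so the class can be imported; that filename, the example URLs, and the HTML snippet are assumptions made only for this illustration.

# Assumes the crawler above is saved as crawl.py (hypothetical filename).
from crawl import RetrieveURL

parser = RetrieveURL()
parser.feed('<html><body>'
            '<a href="http://www.example.com/page1.htm">one</a>'
            '<A HREF="/page2.htm">two</A>'
            '</body></html>')
parser.close()
# anchorlist now holds every href that handle_starttag collected
print parser.anchorlist   # ['http://www.example.com/page1.htm', '/page2.htm']

Note that a relative link such as /page2.htm stays relative here; it is Crawler.getPage that later resolves it against the page URL with urljoin before deciding whether to queue it.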
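To run the crawler itself, pass the starting URL on the command line, for example python crawl.py http://www.example.com/ (again assuming the hypothetical filename crawl.py), or simply run the script and type the URL at the prompt. Each downloaded page is written under a local directory named after the host, links pointing outside the starting domain are discarded, and go() keeps popping the queue until no new in-domain links remain.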