A Web Crawler
Below is a simple web crawler program, written for Python 2. Starting from a given URL, it downloads each page to a local file and follows the links it finds, staying within the starting domain.
#!/usr/bin/env python

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
#from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO
from HTMLParser import HTMLParser
'''The next three lines set the default encoding to utf8. Without this, Python
falls back to ASCII when decoding, so any Unicode content raises an error.
sys is imported and then reloaded because the default import of sys normally
removes the setdefaultencoding function; reload brings it back.'''
import sys
reload(sys)
sys.setdefaultencoding('utf8')


class RetrieveURL(HTMLParser):  # a new parser class derived from HTMLParser
    def __init__(self):
        HTMLParser.__init__(self)
        # the only reason to override __init__ is to give each instance an anchorlist
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        # override handle_starttag so that every <A> tag's href attribute
        # (the hyperlink it points to) is recorded in anchorlist
        if tag == 'a' or tag == 'A':
            for t in attrs:
                if t[0] == 'href' or t[0] == 'HREF':
                    self.anchorlist.append(t[1])


class Retriever(object):  # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)  # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':
            # no file name, use the default
            # (e.g. a path with no extension, like https://www.baidu.com/file1)
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)          # local directory
        if sep != '/':                # os-independent path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):           # create archive dir if necessary
            if exists(ldir):
                unlink(ldir)
            print 'ldir is ', ldir
            makedirs(ldir)
        return path

    def download(self):  # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
            return retval
        return retval

    '''def parseAndGetLinks(self):  # parse HTML, save links (old htmllib version)
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist'''

    def parseAndGetLinks(self):  # parse HTML with RetrieveURL, save links
        self.parser = RetrieveURL()
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):  # manage entire crawling process
    count = 0  # static downloaded page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0][0] == '*':  # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()  # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):  # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
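To make the parsing step easier to follow, here is a minimal sketch of the RetrieveURL class used on its own, outside the crawler. It assumes the script above has been saved as crawl.py so the class can be imported; that filename, the example URLs, and the HTML snippet are assumptions made only for this illustration.

# Assumes the crawler above is saved as crawl.py (hypothetical filename).
from crawl import RetrieveURL

parser = RetrieveURL()
parser.feed('<html><body>'
            '<a href="http://www.example.com/page1.htm">one</a>'
            '<A HREF="/page2.htm">two</A>'
            '</body></html>')
parser.close()
# anchorlist now holds every href that handle_starttag collected
print parser.anchorlist   # ['http://www.example.com/page1.htm', '/page2.htm']

Note that a relative link such as /page2.htm stays relative here; it is Crawler.getPage that later resolves it against the page URL with urljoin before deciding whether to queue it.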
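To run the crawler itself, pass the starting URL on the command line, for example python crawl.py http://www.example.com/ (again assuming the hypothetical filename crawl.py), or simply run the script and type the URL at the prompt. Each downloaded page is written under a local directory named after the host, links pointing outside the starting domain are discarded, and go() keeps popping the queue until no new in-domain links remain.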