A General-Purpose Web Crawler Script in Python
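This Python 2 script implements a simple single-domain crawler: starting from a seed URL, it downloads each page to the local filesystem, parses the downloaded HTML for anchor links, and queues any links that belong to the same domain as the seed. Links to other domains, mailto: links, and pages already downloaded or already queued are skipped. The save directory is hard-coded in the Retriever class and should be adjusted to your own system.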
from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


class Retriever(object):                # downloads Web pages
    def __init__(self, url):
        self.url = url
        # local save path; adjust the directory prefix for your own system
        self.file = 'E:\install\Python27\\' + self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)            # separate the extension from the path
        if ext[1] == '':                # no file name -- use the default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile

        ldir = dirname(path)            # local directory for the page
        if sep != '/':                  # use the OS-specific path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):             # create the directory if necessary
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):                 # download the page to the local file
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = '*** ERROR: invalid URL "%s"' % self.url
        return retval

    def parseAndGetLinks(self):         # parse the HTML, return a list of anchors
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):                  # manages the entire crawl
    count = 0                           # static downloaded page counter

    def __init__(self, url):
        self.q = [url]                  # queue of links still to download
        self.seen = []                  # links already processed
        self.dom = urlparse(url)[1]     # domain of the seed URL

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':            # error situation -- do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process the links
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue
            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                       # process links in the queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
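The script can be run directly, passing the starting URL on the command line or typing it at the prompt. It can also be driven programmatically; the sketch below is a minimal example, assuming the listing above has been saved as crawl.py (the module name is an assumption, not part of the original) and that the hard-coded save directory in Retriever has been adjusted. It must run under Python 2, since htmllib, urlparse, cStringIO, and the old string/urllib functions were removed in Python 3.

# Minimal usage sketch (Python 2); crawl.py is an assumed module name.
from crawl import Crawler

robot = Crawler('http://www.example.com/')  # seed URL; the crawl stays inside this domain
robot.go()                                  # download pages and follow in-domain links until the queue is empty

Because go() pops links from the end of the queue, the crawl proceeds roughly depth-first; popping from the front (self.q.pop(0)) would make it breadth-first instead.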