#!/usr/bin/env python

import cStringIO  # StringIO object serves as a throwaway output buffer for the formatter
import formatter  # Provides the AbstractFormatter and DumbWriter used by the HTML parser
from htmllib import HTMLParser  # We use this class for parsing the downloaded HTML
import httplib  # We only need an exception (InvalidURL) from this module
import os  # This provides various file system functions
import sys  # We are just using argv for command-line arguments
import urllib  # We only need the urlretrieve() function for downloading Web pages
import urlparse  # We use the urlparse() and urljoin() functions for URL manipulation

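# The Retriever class handles a single URL: it maps the URL to a local file
# path that mirrors the host and path, downloads the page to that file, and
# extracts the anchor (href) links from the saved HTML.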
class Retriever(object):
    __slots__ = ('url', 'file')

    def __init__(self, url):
        self.url, self.file = self.get_file(url)

    def get_file(self, url, default='index.html'):
        'Create usable local filename from URL'
        parsed = urlparse.urlparse(url)  # e.g. ParseResult(scheme='http', netloc='www.baidu.com', path='', params='', query='', fragment='')
        host = parsed.netloc.split('@')[-1].split(':')[0]  # strip any userinfo and port: 'www.baidu.com'
        filepath = '%s%s' % (host, parsed.path)  # 'www.baidu.com'
        if not os.path.splitext(parsed.path)[1]:  # path has no file extension
            filepath = os.path.join(filepath, default)  # 'www.baidu.com\\index.html'
        linkdir = os.path.dirname(filepath)  # 'www.baidu.com'
        if not os.path.isdir(linkdir):
            if os.path.exists(linkdir):  # a plain file is in the way; remove it
                os.unlink(linkdir)
            os.makedirs(linkdir)  # create the local directory that mirrors the URL
        return url, filepath

    def download(self):
        'Download URL to the file name chosen by get_file()'
        try:
            # urlretrieve() returns a (local_filename, headers) tuple on success
            retval = urllib.urlretrieve(self.url, self.file)
        except (IOError, httplib.InvalidURL) as e:
            retval = (('*** ERROR: bad URL "%s": %s' % (self.url, e)),)
        return retval

    def parse_links(self):
        'Parse out the links found in downloaded HTML file'
        f = open(self.file, 'r')
        data = f.read()
        f.close()
        # Run the saved HTML through htmllib's parser; the formatted output is
        # discarded into an in-memory StringIO -- we only want the anchor list.
        parser = HTMLParser(formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
        parser.feed(data)
        parser.close()
        return parser.anchorlist  # the href values collected while parsing

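# The Crawler class drives the whole crawl: it keeps a work list of URLs to
# visit, remembers the URLs already processed, and confines the crawl to the
# domain of the starting URL.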
class Crawler(object):
    count = 0  # the number of pages downloaded from the Internet

    def __init__(self, url):
        self.q = [url]  # work list of links still to be downloaded
        self.seen = set()  # all the links that we have seen (downloaded) already
        parsed = urlparse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        self.dom = '.'.join(host.split('.')[-2:])  # domain to stay within, e.g. 'baidu.com'

    def get_page(self, url, media=False):
        'Download page & parse links, adding new ones to the queue if necessary'
        r = Retriever(url)
        fname = r.download()[0]  # local file name, e.g. 'www.baidu.com\\index.html'
        if fname[0] == '*':  # download() signals an error with a '*** ERROR' string
            print fname, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url  # e.g. URL: http://www.baidu.com
        print 'FILE:', fname  # e.g. FILE: www.baidu.com\\index.html
        self.seen.add(url)  # mark this URL as processed
        ftype = os.path.splitext(fname)[1]
        if ftype not in ('.htm', '.html'):  # only HTML files are parsed for links
            return

        for link in r.parse_links():
            if link.startswith('mailto:'):
                print '... discarded, mailto link'
                continue
            if not media:  # skip common audio/video files unless media is requested
                ftype = os.path.splitext(link)[1]
                if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                    print '... discarded, media file'
                    continue
            if not link.startswith('http://'):  # resolve relative links
                link = urlparse.urljoin(url, link)
            print '*', link,
            if link not in self.seen:
                if self.dom not in link:  # stay within the starting domain
                    print '... discarded, not in domain'
                else:
                    if link not in self.q:
                        self.q.append(link)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self, media=False):
        'Process pages from the work list until it is empty'
        while self.q:
            url = self.q.pop()  # pop() takes the most recently added link first
            self.get_page(url, media)

def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    if not url.startswith('http://') and not url.startswith('ftp://'):
        url = 'http://%s/' % url  # assume HTTP when no scheme is given
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
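
# Typical invocation (assuming this script is saved as crawl.py; the URL below
# is only an example):
#   $ python crawl.py http://www.example.com
# Run it without an argument to be prompted for a starting URL; downloaded
# pages are written under a local directory named after the site's host.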