A Naive Website Crawler in Python
#!/usr/bin/python
# Python 2 only: the urllib2 module was split into urllib.request/urllib.error in Python 3.
import urllib2
import re

# Download the resource at url and save it to a local file named filename.
def downURL(url, filename):
    try:
        fp = urllib2.urlopen(url)
    except urllib2.URLError as e:
        print 'download exception:', url, e
        return False
    op = open(filename, 'wb')
    while True:
        s = fp.read(8192)        # read in chunks instead of the whole body at once
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return True

# Return the list of absolute http:// URLs found in the page at url.
def getURLs(url):
    try:
        fp = urllib2.urlopen(url)
    except urllib2.URLError as e:
        print 'get url exception:', url, e
        return []
    # Naive pattern: matches only the scheme and host part, never paths or query strings.
    pattern = re.compile(r'http://[\w.]+')
    content = fp.read()          # read the whole page before matching, so links
    fp.close()                   # are not lost at read-buffer boundaries
    return pattern.findall(content)

# Crawl one level deep: download the start page and every page it links to.
def spider(startURL):
    urls = [startURL]
    for url in getURLs(startURL):
        print url
        if url not in urls:      # skip duplicates so each page is fetched once
            urls.append(url)
    i = 0
    while urls:
        url = urls.pop(0)
        i += 1
        downURL(url, str(i) + '.html')
    return True

# test
spider('http://www.baidu.com')
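The listing above runs only under Python 2, because urllib2 was removed in Python 3. As a rough sketch (not part of the original listing), the same one-level crawl can be written against urllib.request; it keeps the original's naive URL regex and file-naming scheme, and the helper names down_url/get_urls are simply illustrative choices here.

#!/usr/bin/env python3
# A Python 3 sketch of the same naive one-level crawler (an assumed equivalent,
# not the original code).
import re
import urllib.request
from urllib.error import URLError

def down_url(url, filename):
    """Save the resource at url to filename; return True on success."""
    try:
        with urllib.request.urlopen(url) as fp, open(filename, 'wb') as op:
            op.write(fp.read())
        return True
    except URLError as e:
        print('download exception:', url, e)
        return False

def get_urls(url):
    """Return the absolute http:// URLs found in the page at url."""
    try:
        with urllib.request.urlopen(url) as fp:
            content = fp.read().decode('utf-8', errors='replace')
    except URLError as e:
        print('get url exception:', url, e)
        return []
    # Same naive pattern as the original: scheme plus host only.
    return re.findall(r'http://[\w.]+', content)

def spider(start_url):
    """Crawl one level: download start_url and every page it links to."""
    urls = [start_url]
    for url in get_urls(start_url):
        if url not in urls:
            urls.append(url)
    for i, url in enumerate(urls, start=1):
        down_url(url, '%d.html' % i)

if __name__ == '__main__':
    spider('http://www.baidu.com')

Like the original, this sketch ignores robots.txt, relative links, and https:// URLs; it is meant only to show the control flow, not to be a production crawler.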