写了一个单线程的爬虫。多线程的还在研究!

废话不多说,直接上代码。请多多指点!!

 1 #!/usr/bin/env python
 2 #-*- coding = utf-8 -*-
 3 # author: h3i_dan
 4 # version: v1.0
 5 #########################
 6 
 7 import urllib
 8 from sgmllib import SGMLParser
 9 
class Urllist(SGMLParser):
    """SGML parser that records the href value of every <a> tag it sees."""

    def reset(self):
        # Let the base parser clear its own state before adding ours.
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # Invoked by SGMLParser for each opening <a ...> tag; attrs is a
        # list of (name, value) pairs.
        self.attrs = attrs
        for name, value in attrs:
            if name == 'href':
                self.urls.append(value)
21 
def getUrl(url):
    """Fetch *url* and return the list of absolute (http/https) links in it.

    Returns an empty list when the server does not answer 200 or the page
    contains no usable links.
    """
    urls = []
    usock = urllib.urlopen(url)
    try:
        if usock.code == 200:
            parser = Urllist()
            parser.feed(usock.read())
            parser.close()
            # Keep only absolute links; a plain substring test ('http' in
            # url) would wrongly accept relative hrefs such as
            # "/a?ref=http://x".
            for link in parser.urls:
                if link.startswith('http'):
                    urls.append(link)
    finally:
        # Close the socket on every path, not only after a 200 response.
        usock.close()
    # BUG FIX: the original never returned, so callers always received
    # None and crashed on len(urls).
    return urls
39 
40 def spider(start_url, depth):
41     
42     if depth < 0:
43         return False
44     else:
45         urls = getUrl(start_url)   # there is repeatitive url in urls
46         global num
47         if len(urls) > 0:
48             for url in urls:
49                 print url, num
50                 num += 1
51                 spider(url, depth-1)
52         else:
53             return False
54     print '^^^^^^^^^^^^^^^^^^^^^^^^^^^-'
55     return True
56 
57 
def main():
    """Script entry point: run a depth-1 crawl from the seed URL."""
    global num
    num = 0   # running count of printed urls, shared with spider()
    spider('http://www.baidu.com', 1)


if __name__ == '__main__':
    main()

 

代码基本就是这样的。URL 去重,以及过滤类似 http://www.xxx.com/file/xxx.doc 这样的文件链接,还没有想到好的办法。多线程一直都搞不太懂,如果有 Python 大牛看到,请多多指教了。

posted on 2012-07-25 10:27  h3idan  阅读(199)  评论(0编辑  收藏  举报

导航