爬网页哦

 1 # -*- coding: UTF-8-*- 
 2 import urllib2,re
 3 
 4 mylist = []
 5 
 6 p = re.compile( r"<a.+?href=.+?>.+?</a>")
 7 pname = re.compile( r"(?<=>).*?(?=</a>)" )
 8 phref = re.compile( r"(?<=href\=\")http.*?(?=\")")
 9 
10 html_c = urllib2.urlopen("http://www.baidu.com/?vit=1").read()
11 
def geturltest(str):
    """Print the links found on a single page, without following them.

    Downloads the document at ``str``, finds every span matched by the
    module-level anchor pattern ``p`` and, for each span in which ``phref``
    matches, prints the first such match. Spans without a match are skipped.
    """
    page = urllib2.urlopen(str).read()
    for anchor in p.findall(page):
        hrefs = phref.findall(anchor)
        if hrefs:
            print(hrefs[0])
19         
def _fetch_links(url):
    """Return the links found on the page at ``url``.

    For every span matched by the module-level pattern ``p`` the first
    ``phref`` match inside it is collected; spans without one are skipped.
    An empty list is returned when the page cannot be downloaded, so that
    one dead link does not abort the whole crawl.
    """
    import sys  # local import: only needed for the error report below

    try:
        response = urllib2.urlopen(url)
        try:
            page = response.read()
        finally:
            # Release the connection now instead of waiting for the GC.
            response.close()
    except IOError as err:
        # urllib2.URLError / HTTPError derive from IOError (Python 2.6+).
        sys.stderr.write("skip %s: %s\n" % (url, err))
        return []

    links = []
    for anchor in p.findall(page):
        hrefs = phref.findall(anchor)
        if hrefs:
            links.append(hrefs[0])
    return links


def geturl(str):
    """Crawl depth-first from ``str``, printing every link encountered.

    Each link is printed every time it is seen. A link not yet present in
    the module-level ``mylist`` is appended to it and its page is crawled
    in turn before the remaining links of the current page are handled.

    The walk keeps an explicit stack of per-page link iterators instead of
    recursing, so crawl depth is no longer capped by the interpreter's
    recursion limit. Pages that fail to download are reported on stderr
    and treated as having no links.

    Note: the parameter name shadows the ``str`` builtin; it is kept so
    existing keyword callers continue to work.
    """
    # Set mirror of mylist for O(1) membership tests; mylist itself stays
    # the public, ordered record of discovered URLs.
    seen = set(mylist)
    pending = [iter(_fetch_links(str))]
    while pending:
        # Links are non-empty strings, so None is a safe "exhausted" marker.
        link = next(pending[-1], None)
        if link is None:
            pending.pop()
            continue
        print(link)
        if link in seen:
            continue
        seen.add(link)
        mylist.append(link)
        # Descend immediately; the parent page resumes once this one is done.
        pending.append(iter(_fetch_links(link)))
def main():
    """Start the crawl from the Baidu home page."""
    start_url = "http://www.baidu.com/?vit=1"
    geturl(start_url)
36 
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

 

posted @ 2013-03-08 00:10  邵贤军  阅读(511)  评论(0)  编辑  收藏  举报