(转)Python:正则表达式找出网页上所有链接

转自:http://www.linuxany.com/archives/596.html

import re
import urllib
def test(html,rex):
    """Return the unique matches of *rex* in *html*, in first-seen order.

    Args:
        html: Text to search.
        rex: Regular-expression pattern, passed to ``re.compile``.

    Returns:
        A list of the items ``findall`` produces for the pattern, with
        duplicates removed and the order of first occurrence preserved.
        The list is empty when nothing matches.
    """
    alist = []
    seen = set()
    # findall always returns a list (never None), so no None check is needed.
    for found in re.compile(rex).findall(html):
        # A set lookup keeps de-duplication linear instead of rescanning alist.
        if found not in seen:
            seen.add(found)
            alist.append(found)
    return alist

# The name matches pytest's default "test*" collection prefix; opt out so the
# helper is not collected as a test case when imported into a test module.
test.__test__ = False
             
# Pattern capturing the double-quoted href value that directly follows "<a".
rex = r'<a\s*href=\"(.*?)\"'

# urlopen lives in urllib.request on Python 3 and in urllib on Python 2.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen

page = urlopen('http://hi.baidu.com')
try:
    html = page.read()
finally:
    # Release the connection even if read() raises.
    page.close()

# On Python 3 read() returns bytes, which a str pattern cannot search.
# NOTE(review): UTF-8 is assumed for the page; undecodable bytes are replaced.
if not isinstance(html, str):
    html = html.decode('utf-8', 'replace')

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(test(html, rex))

 

posted @ 2014-01-21 09:20  少年的梦  阅读(1295)  评论(0)  编辑  收藏  举报