(转)Python:正则表达式找出网页上所有链接
转自:http://www.linuxany.com/archives/596.html
import re
import urllib

try:  # Python 3 moved urlopen into urllib.request
    from urllib.request import urlopen
except ImportError:  # Python 2
    urlopen = urllib.urlopen


def test(html, rex):
    """Return every distinct match of ``rex`` in ``html``, in first-seen order.

    Args:
        html: Text to search.
        rex: Regular-expression pattern; it is passed through ``re.compile``.

    Returns:
        A list of the items ``findall`` yields for the pattern (strings, or
        tuples of strings when the pattern has several groups) with
        duplicates removed. Empty when nothing matches.
    """
    seen = set()
    unique = []
    # findall always returns a list (never None), so no None check is needed.
    for found in re.compile(rex).findall(html):
        # Set membership keeps de-duplication linear; the list keeps order.
        if found not in seen:
            seen.add(found)
            unique.append(found)
    return unique


# Despite its name this is a helper, not a unit test; keep test collectors
# that pick up "test*" callables from trying to run it.
test.__test__ = False

# Captures the href value of anchors written as <a href="...">.
# NOTE(review): only matches when href is the first attribute and is
# double-quoted; left unchanged to preserve existing results.
rex = r'<a\s*href=\"(.*?)\"'

if __name__ == '__main__':
    page = urlopen('http://hi.baidu.com')
    try:
        html = page.read()
    finally:
        # Release the connection even if the read fails.
        page.close()
    if not isinstance(html, str):
        # Python 3 returns bytes, which a str pattern cannot search.
        # NOTE(review): UTF-8 is assumed; undecodable bytes are replaced.
        html = html.decode('utf-8', 'replace')
    print(test(html, rex))