python 正则表达式
# Regex-based page scraper: downloads one page, then prints the text of its
# <li><a>...</a></li> entries followed by the .jpg image sources it finds.
# Written to run unchanged on both Python 2 and Python 3.
import re
import urllib

try:  # Python 3 keeps urlopen in urllib.request
    from urllib.request import urlopen
except ImportError:  # Python 2 exposes it directly on urllib
    from urllib import urlopen

# Patterns are compiled once here instead of on every call / loop iteration.
_IMAGE_RE = re.compile(r'src="(.+?\.jpg)"')
_LIST_LINK_RE = re.compile(r'<li><a.+?</a></li>')
_TAG_RE = re.compile(r'<[^>]+>', re.S)


class getHead:
    """Download a page and extract fragments from its HTML with regexes."""

    def __init__(self, url):
        """Store the URL that getContent() will download."""
        self.url = url

    def getContent(self):
        """Download self.url and return the response body.

        Prints "begin to download" before fetching.  When read() yields
        something other than str (bytes on Python 3) the body is decoded
        using the charset named in the response headers, falling back to
        UTF-8 and replacing undecodable bytes, so that the str patterns
        used by getImage()/getALI() can search it.  A str body (Python 2)
        is returned unchanged.  Errors raised by urlopen()/read() propagate.
        """
        print("begin to download")
        page = urlopen(self.url)
        try:
            html = page.read()
            if not isinstance(html, str):
                charset = page.headers.get_content_charset() or "utf-8"
                try:
                    html = html.decode(charset, "replace")
                except LookupError:
                    # The header named a codec Python does not know.
                    html = html.decode("utf-8", "replace")
        finally:
            # The response holds an open connection/file; always release it.
            page.close()
        return html

    def getImage(self, pagestr):
        """Return the captured text of every src="....jpg" match in pagestr.

        The capture is non-greedy but may still span several attributes when
        the first src value does not end in .jpg, e.g.
        'b.png" data-src="c.jpg'.  Returns an empty list when nothing matches.
        """
        return _IMAGE_RE.findall(pagestr)

    def getALI(self, pagestr):
        """Return every '<li><a ...</a></li>' fragment found in pagestr.

        Each element is the full matched markup, tags included.  '.' does not
        match newlines here, so a fragment must sit on a single line.
        """
        return _LIST_LINK_RE.findall(pagestr)


def _strip_tags(fragment):
    """Return fragment with every <...> tag removed."""
    return _TAG_RE.sub('', fragment)


def _clean_image_src(src):
    """Reduce a getImage() capture to the .jpg value it ends with.

    Captures without a space are returned as-is.  Otherwise the last
    space-separated token is kept (it is the one ending in .jpg) and a
    leading 'data-src="' is removed from it.
    """
    if ' ' not in src:
        return src
    # The original took token [1], which is only right when exactly one
    # attribute follows the first src value; the last token is always the
    # one that carries the .jpg text.
    return src.split(' ')[-1].replace('data-src="', "")


def main(url="http://www.sina.com.cn/"):
    """Download url once and print its list-link texts, then its image sources."""
    fetcher = getHead(url)
    # Fetch a single time so both passes look at the same HTML.
    html = fetcher.getContent()
    for item in fetcher.getALI(html):
        print(_strip_tags(item))
    for src in fetcher.getImage(html):
        print(_clean_image_src(src))


# Guarded so that importing this module does not trigger a download.
if __name__ == "__main__":
    main()