python抓取网页图片
网页的图片大多是通过 <img> 标签引入的，使用的是相对路径，例如
<img src="image/bg.jpg"/>
通过匹配可以获取image/bg.jpg,与页面地址组合可以得到图片的地址
除了直接引入的图片,还有通过CSS,HTML引入的图片,也需要处理
# -*- coding: utf-8 -*- import urllib, httplib, urlparse import sys import re def httpExists(url): host, path = urlparse.urlsplit(url)[1:3] if ':' in host: # port specified, try to use it host, port = host.split(':', 1) try: port = int(port) except ValueError: print 'invalid port number %r' % (port,) return False else: # no port specified, use default port port = None try: connection = httplib.HTTPConnection(host, port=port) connection.request("HEAD", path) resp = connection.getresponse( ) if resp.status == 200: # normal 'found' status found = True elif resp.status == 302: # recurse on temporary redirect found = httpExists(urlparse.urljoin(url,resp.getheader('location', ''))) else: # everything else -> not found print "Status %d %s : %s" % (resp.status, resp.reason, url) found = False except Exception, e: print e.__class__, e, url found = False return found """根据url获取文件名""" def gGetFileName(url): if url==None: return None if url=="" : return "" arr=url.split("/") return arr[len(arr)-1] """根据url下载文件,文件名参数指定""" def gDownloadWithFilename(url,savePath,file): #参数检查,现忽略 try: urlopen=urllib.URLopener() fp = urlopen.open(url) data = fp.read() fp.close() print 'download file url :',url file=open(savePath + file,'w+b') file.write(data) file.close() except IOError: print "download error!"+ url def gDownload(url,savePath): fileName = gGetFileName(url) gDownloadWithFilename(url,savePath,fileName) def getRexgList(lines,regx,searchRegx): if lines==None : return lists =[] for line in lines: ismatch = re.search(regx,line,re.IGNORECASE) if ismatch : matchs = re.search(searchRegx,line,re.IGNORECASE) if matchs != None: groups = matchs.groups() for str in groups: if str not in lists: lists.append(str) return lists def checkLine(lines): for line in lines : matchs = re.search(r'url\((\S+)\)',re.IGNORECASE) if matchs != None : print matchs.groups() def getPageLines(url): if url==None : return if not httpExists(url): return try: page = urllib.urlopen(url) html = page.readlines() 
page.close() return html except: print "getPageLines() error!" return def getCurrentPageImage(url,savePath): lines = getPageLines(url) print 'lines.length',len(lines) regxlists = getRexgList(lines,r'src\s*="images(\S+)"',r'src\s*="(\S+)"') if regxlists==None: return print 'getCurrentPageImage() images.length',len(regxlists) for jpg in regxlists: jpg =url + jpg gDownload(jpg,savePath) def getCSSImages(link,savePath,url): lines = getPageLines(link) print 'lines.length',len(lines) regxlists = getRexgList(lines,r'url\((\S+)\)',r'url\((\S+)\)') if regxlists==None: return print 'getCurrentPageImage() images.length',len(regxlists) for jpg in regxlists: jpg =url + jpg gDownload(jpg,savePath) """根据url获取其上的相关htm、html链接,返回list""" def gGetHtmlLink(url): #参数检查,现忽略 rtnList=[] lines=getPageLines(url) regx = r"""href="?(\S+)\.htm""" for link in getRexgList(lines,regx,r'href="(\S+)"'): link =url + link if link not in rtnList: rtnList.append(link) print link return rtnList """根据url获取其上的相关css链接,返回list""" def gGetCSSLink(url): #参数检查,现忽略 rtnList=[] lines=getPageLines(url) regx = r"""href="?(\S+)\.css""" for link in getRexgList(lines,regx,r'href="(\S+)"'): link = url + link if link not in rtnList: rtnList.append(link) return rtnList def getPageImage(url,savePath): """getCurrentPageImage(url,savePath)""" """读取其他的CSS,html文件中的图片 links=gGetHtmlLink(url) for link in links: print u'get images on link-html读取' getCurrentPageImage(link,savePath)""" links=gGetCSSLink(url) for link in links: print 'get images on link:',link getCSSImages(link,savePath,url) if __name__ == '__main__': url = 'http://www.templatemo.com/templates/templatemo_281_chrome/' savePath = 'd:/tmp/' print 'download pic from [' + url +']' print 'save to [' +savePath+'] ...' getPageImage(url,savePath) print "download finished"
具体使用的时候根据URL的情况,具体分析得到图片地址的方式。