Python配合BeautifulSoup读取网络图片并保存在本地
本例为Python配合BeautifulSoup读取网络图片,并保存在本地。
BeautifulSoup 可代替正则表达式,更方便地解析 HTML 文本,并获取其中的指定内容,如标签(Tag)、属性(Attribute)等。
# -*- coding: gbk -*-
# NOTE: this is Python 2 code (urllib2, urllib.urlopen, urllib.urlretrieve).
# print(...) is always called with a single argument so the output is the
# same as the Python 2 print statement.
import urllib
import urllib2
from bs4 import BeautifulSoup
import time
import re
import os
import sys
import chardet


def _ensure_dir(path):
    """Create directory *path* (and missing parents) if it does not exist."""
    if not os.path.exists(path):
        os.makedirs(path)


def _first_lazy_img_src(html):
    """Return the 'lazysrc' attribute of the first <img> in the first <center>.

    Returns None when the page has no <center> tag, the first <center> has
    no <img>, or that <img> carries no 'lazysrc' attribute.
    """
    page = BeautifulSoup(html)
    centers = page.find_all('center')
    if not centers:
        return None
    imgs = centers[0].find_all('img')
    if not imgs:
        return None
    return imgs[0].get('lazysrc')


def _save_lazy_gallery(first_url, page_template, savedir):
    """Download one image per page for pages 1..9 of a gallery.

    Page 1 is *first_url*; page i (i >= 2) is ``page_template % i``.
    Each image is saved in *savedir* as '<timestamp><i>.jpg'.
    Stops at the first page from which no image URL can be extracted.
    """
    _ensure_dir(savedir)
    for i in range(1, 10):
        tempurl = first_url if i == 1 else page_template % i
        print(tempurl)
        data = urllib.urlopen(tempurl).read()
        imgsrc = _first_lazy_img_src(data)
        if not imgsrc:
            # Previously an IndexError / urlretrieve(None) crash.
            print('response has no contents,exit')
            return
        print(imgsrc)
        lct = time.strftime('%Y%m%d%H%M%S')
        urllib.urlretrieve(imgsrc, os.path.join(savedir, '%s%d.jpg' % (lct, i)))


def req(url):
    """Fetch *url* with a browser-like User-Agent, print the body and return it.

    The request must go through urllib2.urlopen: urllib.urlopen only accepts
    a URL string, not a urllib2.Request object, so the header was never sent.
    """
    header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    request = urllib2.Request(url, headers=header)
    data = urllib2.urlopen(request).read()
    print(data)
    return data


def reqImg():
    """Download the image of each of the 19 pages of junmeng.com gallery 22376.

    The image link is located with a regular expression. Images are saved to
    C:\\Users\\hp\\Desktop\\results as '<timestamp><page>.jpg'. Pages on which
    no usable image link is found are reported and skipped.
    """
    url = r'http://www.junmeng.com/tj/22376.html'
    link_pattern = r'<a href=".*/tj/22376_\d*.html"><img src.+</a>'
    # The last page is matched with a looser pattern than the others.
    last_page_pattern = r'<a href=.*><img src=.*</a>'
    savedir = r'C:\Users\hp\Desktop\results'
    _ensure_dir(savedir)
    for i in range(1, 20):
        if i == 1:
            tempurl = url
        else:
            tempurl = 'http://www.junmeng.com/tj/22376_%d.html' % i
        print(tempurl)
        data = urllib.urlopen(tempurl).read()
        patnLink = last_page_pattern if i == 19 else link_pattern
        imgLinks = re.findall(patnLink, data)
        if not imgLinks:
            print('no image link found,skip')
            continue
        link = imgLinks[0]
        src_pos = link.find('src=')
        jpg_pos = link.find('.jpg')
        # +5 skips 'src=' and the opening quote; +4 keeps the '.jpg' suffix.
        imgLink = link[src_pos + 5:jpg_pos + 4]
        if src_pos < 0 or jpg_pos < 0 or not imgLink:
            print('no image link found,skip')
            continue
        print(imgLink)
        fullLink = r'http://www.junmeng.com%s' % imgLink
        lct = time.strftime('%Y%m%d%H%M%S')
        urllib.urlretrieve(fullLink, os.path.join(savedir, '%s%d.jpg' % (lct, i)))


def reqImg2():
    """Download pages 1..9 of ik6.com gallery 40569 (images found via BeautifulSoup)."""
    _save_lazy_gallery(
        r'http://www.ik6.com/meinv/40569/index.html',
        'http://www.ik6.com/meinv/40569/index_%d.html',
        r'C:\Users\hp\Desktop\results',
    )


def reqImg3():
    """Download pages 1..9 of ik6.com gallery 40572 (images found via BeautifulSoup)."""
    _save_lazy_gallery(
        r'http://www.ik6.com/meinv/40572/index.html',
        'http://www.ik6.com/meinv/40572/index_%d.html',
        r'C:\Users\hp\Desktop\results',
    )


def reqImg4(url, themecount, imgcount):
    """Download up to *imgcount* images of the gallery whose first page is *url*.

    Page i (i >= 2) is derived from *url* by replacing the trailing
    '.htm...' with '_<i>.html'. Images are saved to
    C:\\Users\\hp\\Desktop\\result0128 as '<themecount>_<i>.jpg'.

    Best-effort: on an empty response, a page without an image, or any
    error, a message is printed and the function returns.
    """
    savedir = r'C:\Users\hp\Desktop\result0128'
    _ensure_dir(savedir)
    ext_pos = url.rfind('.htm')
    # rfind returns -1 when '.htm' is absent; slicing with -1 would silently
    # drop the last character of the URL.
    base = url[:ext_pos] if ext_pos >= 0 else url
    print(base + '_%d.html')
    for i in range(1, imgcount + 1):
        # Format the page number separately from the URL so that a '%' inside
        # the URL cannot be misread as a format directive.
        tempurl = url if i == 1 else '%s_%d.html' % (base, i)
        print(tempurl)
        try:
            data = urllib.urlopen(tempurl).read()
            if not data:
                print('no response,exit')
                return
            imgsrc = _first_lazy_img_src(data)
            if not imgsrc:
                print('response has no contents,exit')
                return
            print(imgsrc)
            urllib.urlretrieve(imgsrc, os.path.join(savedir, '%d_%d.jpg' % (themecount, i)))
        except Exception as e:
            # Keep the crawl best-effort, but report the failure instead of
            # swallowing it silently.
            print('download failed,exit: %s' % e)
            return
使用示例:
if __name__ == '__main__':
    # Fetch and print two pages.
    req('http://blog.csdn.net/suwei19870312/article/details/8148427')
    req('http://www.taobao.com')

    # Download the three fixed galleries.
    reqImg()
    reqImg2()
    reqImg3()

    # Walk 1000 consecutive gallery ids starting at 11170.
    for i in range(1000):
        count = 11170 + i
        url = r'http://www.ik6.com/meinv/%d/index.html' % count
        # reqImg4 takes (url, themecount, imgcount); the previous two-argument
        # call raised TypeError. The gallery id is passed as themecount so the
        # saved file names stay unique per gallery.
        # NOTE(review): assumes 8 was meant as imgcount - confirm.
        reqImg4(url, count, 8)