Fetching album covers from the verycd Asian music index page (BeautifulSoup, regular expressions)
Using regular expressions
#! /usr/bin/env python
#coding=utf-8
import urllib2, re

PATTERN = re.compile(r'<span class="post_origin_img" originLink="(.+?)">')
PATTERN2 = re.compile(r'<a href="/topics/(\d+)/"')

def getNo(url):
    '''Get the topic number of each album on the index page.'''
    fd = urllib2.urlopen(url)
    data = fd.read()
    match = PATTERN2.findall(data)
    if not match:  # findall returns a list, never None
        print 'no match!'
    match = list(set(match))  # drop duplicate album numbers
    return match

def getPage(url):
    '''Get the cover image URL from one album page.'''
    fd = urllib2.urlopen(url)
    data = fd.read()
    match = PATTERN.search(data)
    if match is None:
        print 'no match!'
        return None
    # The saved file and the editor both use utf-8, so decoding once avoids
    # mojibake in the console output; it does not affect the result.
    return match.group(1)  # match.group(1).decode('utf-8')

def getDownloadFile(url, savePath, fname):
    try:
        fd = urllib2.urlopen(url)
        data = fd.read()
        fd.close()
        print savePath + fname + '.jpg'
        imgfile = open(savePath + fname + '.jpg', 'w+b')  # don't shadow the builtin "file"
        imgfile.write(data)
        imgfile.close()
    except IOError:
        print "download error!"

if __name__ == '__main__':
    #getPage('http://www.verycd.com/topics/2864988/')
    all = getNo('http://www.verycd.com/sto/music/asia/')
    print 'Total albums found:'
    print all
    print len(all)
    for no in all:  # range(len(all)-1) would skip the last album
        Urlname = 'http://www.verycd.com/topics/' + no + '/'
        print Urlname
        img_url = getPage(Urlname)
        print img_url
        if img_url is not None:
            getDownloadFile(img_url, 'E:\\Code\\Python\\spider\\img\\', no)
    print 'All done!'
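To see why getNo deduplicates with set(), here is a quick sanity check of PATTERN2 on a made-up HTML fragment (the fragment is an assumption about the page markup, not a real capture): the same album link can appear more than once on the index page, so findall returns duplicates.

#! /usr/bin/env python
#coding=utf-8
import re

PATTERN2 = re.compile(r'<a href="/topics/(\d+)/"')
# A made-up fragment with a duplicated link, as happens when an album
# shows up both in the listing and in a sidebar.
sample = '<a href="/topics/2864988/">A</a><a href="/topics/38366/">B</a><a href="/topics/2864988/">C</a>'
print PATTERN2.findall(sample)             # ['2864988', '38366', '2864988']
print list(set(PATTERN2.findall(sample)))  # duplicates removed; order not preserved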
Using BeautifulSoup is simpler
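The trick the script below relies on is findAll's attrs parameter: passing a dict lets you filter on any attribute, including ones like class whose names are Python keywords and so cannot be passed as keyword arguments. A tiny demo (BeautifulSoup 3 API, same as the script):

#! /usr/bin/env python
#coding=utf-8
import re
from BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<a href="/topics/123/">x</a><a href="/other/">y</a>')
# The attrs dict filters on attribute values; only the first link matches.
print soup.findAll('a', {'href': re.compile('^/topics/')})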
#! /usr/bin/env python
#coding=utf-8
import re
import urllib2
from BeautifulSoup import BeautifulSoup

# find all the albums on this page
def getNo(url):
    '''Get the topic number of each album on the index page.'''
    data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(data)
    # BeautifulSoup's attrs parameter lets us filter on attributes whose
    # names are Python keywords, such as class.
    attrs = {'href': re.compile('^/topics/')}
    links = soup.findAll('a', attrs)
    album_nos = set()  # a set drops duplicate album numbers
    if not links:  # findAll returns an empty result set, never None
        print 'no match!'
    for link in links:
        album_nos.add(link['href'].split('/')[2])
    return album_nos

# get the image URL from the album page
def getImgURL(filePath, url):
    '''Get the cover image from one album page.'''
    data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(data)
    fname = soup.head.title.string[:-16] + ".jpg"  # the page title, minus the site suffix, names the file
    print fname,
    attrs = {'class': 'post_origin_img'}
    mspan = soup.find('span', attrs)
    imgURL = mspan.find('img')['src']  # the real address of the image
    print imgURL
    getDownloadFile(imgURL, filePath, fname)  # save it under the given directory

# save the image to a file according to its URL
def getDownloadFile(url, savePath, fname):
    try:
        fd = urllib2.urlopen(url)
        data = fd.read()
        mfile = open(savePath + fname, 'w+b')
        mfile.write(data)
        mfile.close()
        fd.close()
    except IOError:
        print "download error!"

if __name__ == '__main__':
    filePath = 'E:\\Code\\Python\\spider\\img2\\'
    #getImgURL(filePath, 'http://www.verycd.com/topics/38366/')
    # http://www.verycd.com/sto/music/china/page2 is the second page
    many = 5  # how many index pages to fetch
    #temp = {}
    all = set()
    mainpage = 'http://www.verycd.com/sto/music/china/'  # the Chinese music index
    for no in range(1, many + 1):  # the first "many" pages; range excludes the end point
        otherpages = mainpage + 'page' + str(no)
        other = getNo(otherpages)
        #temp[no] = other  # save each page's numbers
        all = all | other  # union the sets, dropping duplicates
    #ands = temp[1] & temp[2]  # the overlap: the "today's hot" sidebar shows up on every page
    count = len(all)
    print 'There are %d albums.' % count
    for x in all:
        Urlname = 'http://www.verycd.com/topics/' + x + '/'
        #print Urlname
        getImgURL(filePath, Urlname)
        count -= 1
        print "There are %d jobs left." % count
    print 'All Jobs done!'
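One caveat with deriving the file name from the page title: album titles can contain characters that Windows forbids in file names (\ / : * ? " < > |), which makes open() raise IOError. A minimal sanitizing helper (my addition, not part of the original script) could be applied to fname before saving:

#! /usr/bin/env python
#coding=utf-8
import re

def sanitizeFilename(fname):
    '''Replace characters that Windows forbids in file names.'''
    return re.sub(r'[\\/:*?"<>|]', '_', fname)

print sanitizeFilename('Artist: Best? Of.jpg')  # Artist_ Best_ Of.jpg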