Fetching album covers from the verycd Asian music index page (BeautifulSoup, regular expressions)
Using regular expressions
#! /usr/bin/env python
#coding=utf-8
import urllib2, re

PATTERN = re.compile(r'<span class="post_origin_img" originLink="(.+?)">')
PATTERN2 = re.compile(r'<a href="/topics/(\d+)/"')

def getNo(url):
    '''Get the topic number of each album on the index page.'''
    fd = urllib2.urlopen(url)
    data = fd.read()
    match = PATTERN2.findall(data)
    if not match:  # findall returns a list, never None
        print 'no match!'
    match = list(set(match))  # drop duplicate album numbers
    return match

def getPage(url):
    '''Get the cover image URL from one album page.'''
    fd = urllib2.urlopen(url)
    data = fd.read()
    match = PATTERN.search(data)
    if match is None:
        print 'no match!'
        return None
    # The saved file and the editor both use utf-8, so decoding once avoids
    # mojibake in the console output; it does not affect the result.
    return match.group(1)  # match.group(1).decode('utf-8')

def getDownloadFile(url, savePath, fname):
    try:
        fd = urllib2.urlopen(url)
        data = fd.read()
        fd.close()
        print savePath + fname + '.jpg'
        imgfile = open(savePath + fname + '.jpg', 'w+b')  # don't shadow the builtin "file"
        imgfile.write(data)
        imgfile.close()
    except IOError:
        print "download error!"

if __name__ == '__main__':
    #getPage('http://www.verycd.com/topics/2864988/')
    all = getNo('http://www.verycd.com/sto/music/asia/')
    print 'Total albums found:'
    print all
    print len(all)
    for no in all:  # range(len(all)-1) would skip the last album
        Urlname = 'http://www.verycd.com/topics/' + no + '/'
        print Urlname
        img_url = getPage(Urlname)
        print img_url
        if img_url is not None:
            getDownloadFile(img_url, 'E:\\Code\\Python\\spider\\img\\', no)
    print 'All done!'
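To see why getNo deduplicates with set(), here is a quick sanity check of PATTERN2 on a made-up HTML fragment (the fragment is an assumption about the page markup, not a real capture): the same album link can appear more than once on the index page, so findall returns duplicates.

#! /usr/bin/env python
#coding=utf-8
import re

PATTERN2 = re.compile(r'<a href="/topics/(\d+)/"')
# A made-up fragment with a duplicated link, as happens when an album
# shows up both in the listing and in a sidebar.
sample = '<a href="/topics/2864988/">A</a><a href="/topics/38366/">B</a><a href="/topics/2864988/">C</a>'
print PATTERN2.findall(sample)             # ['2864988', '38366', '2864988']
print list(set(PATTERN2.findall(sample)))  # duplicates removed; order not preserved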
Using BeautifulSoup is simpler
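The trick the script below relies on is findAll's attrs parameter: passing a dict lets you filter on any attribute, including ones like class whose names are Python keywords and so cannot be passed as keyword arguments. A tiny demo (BeautifulSoup 3 API, same as the script):

#! /usr/bin/env python
#coding=utf-8
import re
from BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<a href="/topics/123/">x</a><a href="/other/">y</a>')
# The attrs dict filters on attribute values; only the first link matches.
print soup.findAll('a', {'href': re.compile('^/topics/')})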
#! /usr/bin/env python
#coding=utf-8
import re
import urllib2
from BeautifulSoup import BeautifulSoup

# find all the albums on this page
def getNo(url):
    '''Get the topic number of each album on the index page.'''
    data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(data)
    # BeautifulSoup's attrs parameter lets us filter on attributes whose
    # names are Python keywords, such as class.
    attrs = {'href': re.compile('^/topics/')}
    links = soup.findAll('a', attrs)
    album_nos = set()  # a set drops duplicate album numbers
    if not links:  # findAll returns an empty result set, never None
        print 'no match!'
    for link in links:
        album_nos.add(link['href'].split('/')[2])
    return album_nos

# get the image URL from the album page
def getImgURL(filePath, url):
    '''Get the cover image from one album page.'''
    data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(data)
    fname = soup.head.title.string[:-16] + ".jpg"  # the page title, minus the site suffix, names the file
    print fname,
    attrs = {'class': 'post_origin_img'}
    mspan = soup.find('span', attrs)
    imgURL = mspan.find('img')['src']  # the real address of the image
    print imgURL
    getDownloadFile(imgURL, filePath, fname)  # save it under the given directory

# save the image to a file according to its URL
def getDownloadFile(url, savePath, fname):
    try:
        fd = urllib2.urlopen(url)
        data = fd.read()
        mfile = open(savePath + fname, 'w+b')
        mfile.write(data)
        mfile.close()
        fd.close()
    except IOError:
        print "download error!"

if __name__ == '__main__':
    filePath = 'E:\\Code\\Python\\spider\\img2\\'
    #getImgURL(filePath, 'http://www.verycd.com/topics/38366/')
    # http://www.verycd.com/sto/music/china/page2 is the second page
    many = 5  # how many index pages to fetch
    #temp = {}
    all = set()
    mainpage = 'http://www.verycd.com/sto/music/china/'  # the Chinese music index
    for no in range(1, many + 1):  # the first "many" pages; range excludes the end point
        otherpages = mainpage + 'page' + str(no)
        other = getNo(otherpages)
        #temp[no] = other  # save each page's numbers
        all = all | other  # union the sets, dropping duplicates
    #ands = temp[1] & temp[2]  # the overlap: the "today's hot" sidebar shows up on every page
    count = len(all)
    print 'There are %d albums.' % count
    for x in all:
        Urlname = 'http://www.verycd.com/topics/' + x + '/'
        #print Urlname
        getImgURL(filePath, Urlname)
        count -= 1
        print "There are %d jobs left." % count
    print 'All Jobs done!'
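One caveat with deriving the file name from the page title: album titles can contain characters that Windows forbids in file names (\ / : * ? " < > |), which makes open() raise IOError. A minimal sanitizing helper (my addition, not part of the original script) could be applied to fname before saving:

#! /usr/bin/env python
#coding=utf-8
import re

def sanitizeFilename(fname):
    '''Replace characters that Windows forbids in file names.'''
    return re.sub(r'[\\/:*?"<>|]', '_', fname)

print sanitizeFilename('Artist: Best? Of.jpg')  # Artist_ Best_ Of.jpg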