Scraping album covers from the VeryCD Asian music index page (BeautifulSoup, regular expressions)

Using regular expressions

#! /usr/bin/env python
#coding=utf-8
import urllib2
import re

PATTERN = re.compile(r'<span class="post_origin_img" originLink="(.+?)">')
PATTERN2 = re.compile(r'<a href="/topics/(\d+)/"')

def getNo(url):
    '''Get the topic number of each album on the listing page'''
    fd = urllib2.urlopen(url)
    data = fd.read()

    match = PATTERN2.findall(data)
    if not match:  # findall returns a list, so test for emptiness rather than None
        print 'no match!'
    match = list(set(match))  # drop duplicate topic numbers
    return match


def getPage(url):
    '''Get the cover image URL from an album page'''
    fd = urllib2.urlopen(url)
    data = fd.read()

    match = PATTERN.search(data)
    if match is None:
        print 'no match!'
        return None
    img_url = match.group(1)

    # The file is saved as UTF-8 and edited as UTF-8, so decoding once avoids
    # mojibake; either way it does not affect the result.
    return img_url #match.decode('utf-8')

def getDownloadFile(url, savePath, fname):
    try:
        fd = urllib2.urlopen(url)
        data = fd.read()
        fd.close()
        print savePath + fname + '.jpg'
        mfile = open(savePath + fname + '.jpg', 'w+b')  # don't shadow the built-in `file`
        mfile.write(data)
        mfile.close()
    except IOError:
        print "download error!"
       

if __name__ == '__main__':
    #getPage('http://www.verycd.com/topics/2864988/')
    all = getNo('http://www.verycd.com/sto/music/asia/')
    print 'Total number of albums found:'
    print all
    print len(all)

    for x in range(len(all)):  # range(len(all)-1) would skip the last album
        Urlname = 'http://www.verycd.com/topics/' + all[x] + '/'
        print Urlname
        img_url = getPage(Urlname)
        print img_url
        if img_url is not None:
            getDownloadFile(img_url, 'E:\\Code\\Python\\spider\\img\\', all[x])
    print 'All done!'
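
Before pointing the script at the live site, the two patterns can be checked against a small HTML fragment shaped like the markup they expect. The snippet below is invented for illustration; the real pages may differ:

#! /usr/bin/env python
#coding=utf-8
import re

PATTERN = re.compile(r'<span class="post_origin_img" originLink="(.+?)">')
PATTERN2 = re.compile(r'<a href="/topics/(\d+)/"')

# Invented fragment mimicking one listing entry and one album page.
sample = ('<a href="/topics/2864988/" class="title">Some Album</a>'
          '<span class="post_origin_img" originLink="http://example.com/cover.jpg">')

print PATTERN2.findall(sample)         # ['2864988'] -- the album's topic number
print PATTERN.search(sample).group(1)  # http://example.com/cover.jpg; (.+?) is
                                       # non-greedy, so it stops at the first ">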

Using BeautifulSoup is simpler

#! /usr/bin/env python
#coding=utf-8
import re
import urllib2
from BeautifulSoup import BeautifulSoup

# find all the albums on this page
def getNo(url):
    '''Get the topic number of each album'''
    data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(data)
    # BeautifulSoup's attrs argument lets us filter on attribute names that
    # collide with Python keywords, such as class
    attrs = {'href': re.compile('^/topics/')}
    links = soup.findAll('a', attrs)
    album_nos = set()  # a set drops duplicate album numbers automatically
    if not links:      # findAll returns a (possibly empty) list, never None
        print 'no match!'
    for link in links:
        album_nos.add(link['href'].split('/')[2])
    return album_nos

# get the image url from the album page
def getImgURL(filePath, url):
    '''Get the cover image from an album page'''
    data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(data)
    fname = soup.head.title.string[:-16] + ".jpg"  # use the page title (minus the site suffix) as the file name
    print fname,
    attrs = {'class': 'post_origin_img'}
    mspan = soup.find('span', attrs)
    imgURL = mspan.find('img')['src']  # the real address of the image
    print imgURL
    getDownloadFile(imgURL, filePath, fname)  # save it under the given directory

# save the image to a file according to its url
def getDownloadFile(url, savePath, fname):
    try:
        fd = urllib2.urlopen(url)
        data = fd.read()
        mfile = open(savePath + fname, 'w+b')  # open the target file
        mfile.write(data)
        mfile.close()
        fd.close()
    except IOError:
        print "download error!"

if __name__ == '__main__':
    filePath = 'E:\\Code\\Python\\spider\\img2\\'
    #getImgURL(filePath,'http://www.verycd.com/topics/38366/')
    # http://www.verycd.com/sto/music/china/page2 # the second page
    many = 5  # how many listing pages to fetch
    #temp = {}
    all = set()
    mainpage = 'http://www.verycd.com/sto/music/china/'  # Chinese-language music listing
    for no in range(1, many + 1):  # URLs of the first `many` pages; range excludes the end point
        otherpages = mainpage + 'page' + str(no)
        other = getNo(otherpages)
        #temp[no] = other # save each page's numbers
        all = all | other  # union the two sets, dropping duplicates
    #ands = temp[1]&temp[2] # the overlap: the "hot today" sidebar on the left appears on every page
    count = len(all)
    print 'There are %d albums.' % count  # how many albums were found in total

    for x in all:
        Urlname = 'http://www.verycd.com/topics/' + x + '/'
        #print Urlname
        getImgURL(filePath, Urlname)
        count -= 1
        print "There are %d jobs left." % count
    print 'All Jobs done!'
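
What makes this version shorter is that findAll accepts a compiled regular expression as an attribute value, and the set then collapses whatever the sidebar repeats. A minimal sketch of that idea, with invented HTML:

#! /usr/bin/env python
#coding=utf-8
import re
from BeautifulSoup import BeautifulSoup

# Invented fragment: the same album linked twice plus an unrelated link.
html = ('<a href="/topics/38366/">Album A</a>'
        '<a href="/topics/38366/">Album A (sidebar)</a>'
        '<a href="/sto/music/china/">not an album</a>')

soup = BeautifulSoup(html)
links = soup.findAll('a', {'href': re.compile('^/topics/')})
nos = set(link['href'].split('/')[2] for link in links)
print nos  # set(['38366']) -- the duplicate collapses inside the set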

    

