采集百度top500歌曲，python2.7.2

http://blog.b999.net/post/141/

#-*- coding: UTF-8 -*-
'''
Created on 2012-3-8

@author: tiantian

Modify: 2012-4-15
Fixed: files are now saved correctly on Windows
'''
import urllib
import re
import platform
import os

# URL of the Baidu MP3 chart page that main() downloads and parses.
top500 = 'http://list.mp3.baidu.com/top/top500.html'
# Alternative list page, kept disabled:
#top500 = 'http://list.mp3.baidu.com/list/shaoergequ.html'

# Accumulates one [name, author] pair per song processed by main().
songs = []

# Downloads are written under ./songs, so create it before anything runs.
# (The os.mkdir call must be indented under the check; at column 0 the
# script does not even compile.)
if not os.path.exists('songs'):
    os.mkdir('songs')

def main():
    # Fetches the chart page at `top500`, decodes it from GBK, and walks three
    # nested levels of regex matches (divr -> ulr -> lir). For each innermost
    # match it extracts a song name and an author, records the pair in `songs`,
    # resolves a download URL through getSongUrl() and saves the file under
    # songs/<name>-<author>.mp3.
    #
    # NOTE(review): the pattern literals divr, ulr, lir, songName and
    # songAuthor look damaged -- presumably their HTML tag text was stripped
    # when this listing was extracted from a web page. As written, divr,
    # songName and songAuthor are single-quoted strings broken across several
    # lines (not valid Python), ulr and lir are empty patterns, and three
    # statements sit at column 0 inside the function. The original patterns
    # must be restored from the author's source before this can run.

divr = '

.*?.*?

'
    # Download the chart page; the bytes are decoded as GBK.
    mf = urllib.urlopen(top500)
    content = mf.read()
    content = content.decode('gbk')

    # Collapse runs of newlines into a single space so the patterns can match
    # across what were originally separate lines.
    content = re.sub('\n+',' ',content)
    alldiv = re.findall(divr,content)
    # i counts every innermost match seen so far, across all divs.
    i =0
    for div in alldiv:
        ulr = ''
        allul = re.findall(ulr,div)

        for ul in allul:
            lir = ''
            allli = re.findall(lir,ul)

            for li in allli:
                # The first 245 entries are counted but otherwise skipped.
                # NOTE(review): presumably left over from resuming an
                # interrupted run -- confirm whether this is still wanted.
                if i<245:
                    i = i+1
                    continue
                i = i+1
                songName = '

.*?(.*?).*?

'
name = re.findall(songName,li)
songAuthor = '

.*?(.*?).*?

'
                author = re.findall(songAuthor,li)

                # name[0] / author[0] raise IndexError when a pattern finds
                # no match in this entry; nothing here guards against that.
                songs.append([name[0],author[0]])

                songUrl = getSongUrl(name[0],author[0])

                # On Windows the target path is encoded to GBK bytes; on Linux
                # it is passed through unchanged.
                sysstr = platform.system()
                if(sysstr =="Windows"):
                 filename = ('songs/'+name[0]+'-'+author[0]+'.mp3').encode('gbk')
                elif(sysstr == "Linux"):
                 filename = 'songs/'+name[0]+'-'+author[0]+'.mp3'
                else:
                 print ("Other System tasks")
                # NOTE(review): on any other platform `filename` is not
                # assigned in this iteration, so the line below raises
                # NameError on the first song or reuses the previous song's
                # path afterwards.
                print filename

                try:
                    urllib.urlretrieve(songUrl,filename)
                    # The absence of an exception does not prove the download
                    # succeeded; some additional check is needed.
                    print i,name[0],author[0],'下载成功'

                except Exception :
                    print i,name[0],author[0],'没下载成功'

def getSongUrl(songName,authorName):
    '''Resolve a download URL for a song via box.zhangmen.baidu.com.

    The song and author names scraped from the chart may be incomplete, in
    which case the lookup can fail to produce a usable URL.

    songName   -- unicode song title; sent GBK-encoded and URL-quoted.
    authorName -- unicode author name; sent GBK-encoded and URL-quoted.

    Returns a URL string. When the response contains no match for the first
    pattern, the fixed placeholder 'http://box.zhangmen.baidu.com/unknow.mp3'
    is returned instead.

    NOTE(review): the '$$' separators in the query string and the
    <encode>/<decode> tag names in the two patterns were reconstructed from
    literals that had been damaged by text extraction -- confirm them against
    the author's original script or an actual service response.
    '''
    # Query format: title=<song>$$<author>$$$$ followed by fixed parameters.
    songUrl = 'http://box.zhangmen.baidu.com/x?op=12&count=1&mtype=1&title=%s$$%s$$$$&url=&listenreelect=0&.r=0.1696378872729838' % (urllib.quote(songName.encode('gbk')),urllib.quote(authorName.encode('gbk')))
    f = urllib.urlopen(songUrl)
    try:
        c = f.read()
    finally:
        # Release the connection even if reading fails.
        f.close()

    # Each pattern captures the payload of a CDATA section.
    url1 = re.findall(r'<encode>.*?CDATA\[(.*?)\]].*?</encode>',c)
    url2 = re.findall(r'<decode>.*?CDATA\[(.*?)\]].*?</decode>',c)
    if not url1:
        return 'http://box.zhangmen.baidu.com/unknow.mp3'

    try:
        # Keep url1 up to and including its last '/', then append url2.
        return url1[0][:url1[0].rindex('/')+1] + url2[0]
    except (ValueError, IndexError):
        # ValueError: url1 has no '/'; IndexError: no url2 match.
        # Either way fall back to url1 unchanged.
        return url1[0]

# Run the scraper only when this file is executed as a script, not on import.
if "__main__" == __name__:
    main()

采集到的 mp3 文件保存在新建的 songs 目录下。

阅读(554) | 评论(0) | 转发(1) |

上一篇：关于revision 的cover letter

下一篇：汽车操作系统革命：封闭还是开源?

相关热门文章

给主人留下些什么吧！~~

评论热议

posted @ 2016-02-01 00:00 张同光阅读(82) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

阅读排行：
· 无需6万激活码！GitHub神秘组织3小时极速复刻Manus，手把手教你使用OpenManus搭建本
· Manus爆火，是硬核还是营销？
· 终于写完轮子一部分：tcp代理了，记录一下
· 别再用vector＜bool＞了！Google高级工程师：这可能是STL最大的设计失误
· 单元测试从入门到精通

公告

昵称：张同光
园龄： 8年11个月
粉丝： 70
关注： 0

+加关注

2025年3月

日

一

二

三

四

五

六

张同光 (Tongguang Zhang)

张同光 (Tongguang Zhang)：Hello everyone !
Let us make progress together every day ! —— 微信号：ztguang

采集百度top500歌曲，python2.7.2

公告

搜索

常用链接

最新随笔

我的标签

积分与排名

随笔分类 (929)

随笔档案 (3269)

阅读排行榜

评论排行榜

推荐排行榜

最新评论

张同光 (Tongguang Zhang)

张同光 (Tongguang Zhang)：Hello everyone ! Let us make progress together every day ! —— 微信号：ztguang

采集百度top500歌曲，python2.7.2

公告

搜索

常用链接

最新随笔

我的标签

积分与排名

随笔分类 (929)

随笔档案 (3269)

阅读排行榜

评论排行榜

推荐排行榜

最新评论

张同光 (Tongguang Zhang)：Hello everyone !
Let us make progress together every day ! —— 微信号：ztguang