采集百度top500歌曲,python2.7.2
http://blog.b999.net/post/141/
#-*- coding: UTF-8 -*-
'''
Created on 2012-3-8
@author: tiantian
Modify: 2012-4-15
Fix: save downloaded files correctly on Windows
'''
import urllib
import re
import platform
import os
# URL of the Baidu MP3 chart page that main() fetches.
top500 = 'http://list.mp3.baidu.com/top/top500.html'
# Alternative list page, kept for reference:
#top500 = 'http://list.mp3.baidu.com/list/shaoergequ.html'
# Collects one [name, author] pair for every song main() processes.
songs = []
# Make sure the download directory exists before any file is written to it.
if not os.path.exists('songs'):
    os.mkdir('songs')
# main(): download the songs listed on the chart page at `top500`.
#
# As written, it fetches the page, decodes it from GBK, replaces runs of
# newlines with a space, and then walks div -> ul -> li regex matches.  The
# first 245 items are skipped; for every later item it extracts the song name
# and author, asks getSongUrl() for a download URL, and saves the result as
# songs/<name>-<author>.mp3, printing a status line per song.
#
# NOTE(review): this listing has lost its indentation, the string literals
# assigned to divr, songName and songAuthor are unterminated, and ulr/lir are
# empty strings.  It looks like the markup inside the patterns was stripped
# when the code was published; the patterns must be restored before this
# function can run.
def main():
divr = '
mf = urllib.urlopen(top500)
content = mf.read()
content = content.decode('gbk')
# Replace each run of newlines with a single space before matching.
content = re.sub('\n+',' ',content)
alldiv = re.findall(divr,content)
# i counts every list item seen so far, across all divs and uls.
i =0
for div in alldiv:
ulr = ''
allul = re.findall(ulr,div)
for ul in allul:
lir = ''
allli = re.findall(lir,ul)
for li in allli:
# Skip the first 245 items.  The reason is not stated here (presumably
# resuming an interrupted run -- confirm).
if i<245:
i = i+1
continue
i = i+1
songName = '
name = re.findall(songName,li)
songAuthor = '
author = re.findall(songAuthor,li)
# NOTE(review): name[0] / author[0] raise IndexError when a pattern finds
# nothing in this item; there is no guard.
songs.append([name[0],author[0]])
songUrl = getSongUrl(name[0],author[0])
sysstr = platform.system()
# On Windows the target path is encoded to GBK bytes; on Linux it is used
# unchanged.
if(sysstr =="Windows"):
filename = ('songs/'+name[0]+'-'+author[0]+'.mp3').encode('gbk')
elif(sysstr == "Linux"):
filename = 'songs/'+name[0]+'-'+author[0]+'.mp3'
else:
# NOTE(review): filename is never assigned on this branch, yet it is used
# right after -- confirm the intended behaviour on other systems.
print ("Other System tasks")
print filename
try:
urllib.urlretrieve(songUrl,filename)
# Catching exceptions cannot tell whether the download really succeeded;
# some other check is needed.
print i,name[0],author[0],'下载成功'
except Exception :
print i,name[0],author[0],'没下载成功'
def getSongUrl(songName,authorName):
    '''Look up the download URL of a song through the Baidu music box service.

    Because the song title or artist name may be incomplete, the lookup can
    fail to yield a usable URL.

    songName   -- unicode song title; encoded as GBK and URL-quoted.
    authorName -- unicode artist name; encoded as GBK and URL-quoted.

    Returns the URL string to download from.  When the response contains no
    matching CDATA section, a fixed 'unknow.mp3' placeholder URL is returned.
    '''
    songUrl = 'http://box.zhangmen.baidu.com/x?op=12&count=1&mtype=1&title=%s$$%s$$$$&url=&listenreelect=0&.r=0.1696378872729838' % (
        urllib.quote(songName.encode('gbk')),
        urllib.quote(authorName.encode('gbk')))
    f = urllib.urlopen(songUrl)
    try:
        c = f.read()
    finally:
        # Release the connection even if reading the response fails.
        f.close()
    # NOTE(review): in the published listing both patterns were byte-identical,
    # which would make url2 a copy of url1 and the joined result malformed.
    # The <encode>/<decode> wrappers are restored on the assumption that the
    # response carries the base URL in <encode> and the file part in <decode>
    # -- confirm against a real response.
    url1 = re.findall(r'<encode>.*?CDATA\[(.*?)\]\].*?</encode>', c)
    url2 = re.findall(r'<decode>.*?CDATA\[(.*?)\]\].*?</decode>', c)
    if not url1:
        return 'http://box.zhangmen.baidu.com/unknow.mp3'
    try:
        # Keep url1 up to and including its last '/', then append url2.
        return url1[0][:url1[0].rindex('/') + 1] + url2[0]
    except (ValueError, IndexError):
        # No '/' in url1 or no second match: fall back to url1 unchanged.
        return url1[0]
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
采集的mp3文件保存在新建的目录 songs下
阅读(554) | 评论(0) | 转发(1) |
<script>window._bd_share_config={"common":{"bdSnsKey":{},"bdText":"","bdMini":"2","bdMiniList":false,"bdPic":"","bdStyle":"0","bdSize":"16"},"share":{}};with(document)0[(getElementsByTagName('head')[0]||body).appendChild(createElement('script')).src='http://bdimg.share.baidu.com/static/api/js/share.js?v=89860593.js?cdnversion='+~(-new Date()/36e5)];</script>
#-*- coding: UTF-8 -*-
'''
Created on 2012-3-8
@author: tiantian
Modify: 2012-4-15
Fix: save downloaded files correctly on Windows
'''
import urllib
import re
import platform
import os
# URL of the Baidu MP3 chart page that main() fetches.
top500 = 'http://list.mp3.baidu.com/top/top500.html'
# Alternative list page, kept for reference:
#top500 = 'http://list.mp3.baidu.com/list/shaoergequ.html'
# Collects one [name, author] pair for every song main() processes.
songs = []
# Make sure the download directory exists before any file is written to it.
if not os.path.exists('songs'):
    os.mkdir('songs')
# main(): download the songs listed on the chart page at `top500`.
#
# As written, it fetches the page, decodes it from GBK, replaces runs of
# newlines with a space, and then walks div -> ul -> li regex matches.  The
# first 245 items are skipped; for every later item it extracts the song name
# and author, asks getSongUrl() for a download URL, and saves the result as
# songs/<name>-<author>.mp3, printing a status line per song.
#
# NOTE(review): this listing has lost its indentation, and the patterns
# assigned to divr, songName and songAuthor are split across lines with only
# their `.*?` skeleton left; ulr/lir are empty strings.  It looks like the
# markup inside the patterns was stripped when the code was published; the
# patterns must be restored before this function can run.
def main():
divr = '
.*?.*?
'mf = urllib.urlopen(top500)
content = mf.read()
content = content.decode('gbk')
# Replace each run of newlines with a single space before matching.
content = re.sub('\n+',' ',content)
alldiv = re.findall(divr,content)
# i counts every list item seen so far, across all divs and uls.
i =0
for div in alldiv:
ulr = ''
allul = re.findall(ulr,div)
for ul in allul:
lir = ''
allli = re.findall(lir,ul)
for li in allli:
# Skip the first 245 items.  The reason is not stated here (presumably
# resuming an interrupted run -- confirm).
if i<245:
i = i+1
continue
i = i+1
songName = '
.*?(.*?).*?
'name = re.findall(songName,li)
songAuthor = '
.*?(.*?).*?
'author = re.findall(songAuthor,li)
# NOTE(review): name[0] / author[0] raise IndexError when a pattern finds
# nothing in this item; there is no guard.
songs.append([name[0],author[0]])
songUrl = getSongUrl(name[0],author[0])
sysstr = platform.system()
# On Windows the target path is encoded to GBK bytes; on Linux it is used
# unchanged.
if(sysstr =="Windows"):
filename = ('songs/'+name[0]+'-'+author[0]+'.mp3').encode('gbk')
elif(sysstr == "Linux"):
filename = 'songs/'+name[0]+'-'+author[0]+'.mp3'
else:
# NOTE(review): filename is never assigned on this branch, yet it is used
# right after -- confirm the intended behaviour on other systems.
print ("Other System tasks")
print filename
try:
urllib.urlretrieve(songUrl,filename)
# Catching exceptions cannot tell whether the download really succeeded;
# some other check is needed.
print i,name[0],author[0],'下载成功'
except Exception :
print i,name[0],author[0],'没下载成功'
def getSongUrl(songName,authorName):
    '''Look up the download URL of a song through the Baidu music box service.

    Because the song title or artist name may be incomplete, the lookup can
    fail to yield a usable URL.

    songName   -- unicode song title; encoded as GBK and URL-quoted.
    authorName -- unicode artist name; encoded as GBK and URL-quoted.

    Returns the URL string to download from.  When the response contains no
    matching CDATA section, a fixed 'unknow.mp3' placeholder URL is returned.
    '''
    songUrl = 'http://box.zhangmen.baidu.com/x?op=12&count=1&mtype=1&title=%s$$%s$$$$&url=&listenreelect=0&.r=0.1696378872729838' % (
        urllib.quote(songName.encode('gbk')),
        urllib.quote(authorName.encode('gbk')))
    f = urllib.urlopen(songUrl)
    try:
        c = f.read()
    finally:
        # Release the connection even if reading the response fails.
        f.close()
    # NOTE(review): in the published listing both patterns were byte-identical,
    # which would make url2 a copy of url1 and the joined result malformed.
    # The <encode>/<decode> wrappers are restored on the assumption that the
    # response carries the base URL in <encode> and the file part in <decode>
    # -- confirm against a real response.
    url1 = re.findall(r'<encode>.*?CDATA\[(.*?)\]\].*?</encode>', c)
    url2 = re.findall(r'<decode>.*?CDATA\[(.*?)\]\].*?</decode>', c)
    if not url1:
        return 'http://box.zhangmen.baidu.com/unknow.mp3'
    try:
        # Keep url1 up to and including its last '/', then append url2.
        return url1[0][:url1[0].rindex('/') + 1] + url2[0]
    except (ValueError, IndexError):
        # No '/' in url1 or no second match: fall back to url1 unchanged.
        return url1[0]
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
采集的mp3文件保存在新建的目录 songs下
相关热门文章
给主人留下些什么吧!~~
评论热议