MusiCode: batch-download every album of a given artist (CAPTCHA restriction lifted)
I had long wanted to download and organize every album by my favorite artists, but there were simply too many of them. Since I had recently started learning Python, I figured: why not write a script to automate the downloading? So I spent some time putting this together, and I'm sharing it for anyone who needs it. :)
When I first wrote it, I didn't expect that crawling too frequently or for too long would trigger a CAPTCHA. After trying several ways to deal with the CAPTCHA without finding a good solution, I added a separate step that generates a download list first. Initially that step stored the final download URLs, but it turned out those URLs expire. So in the end the list stores the download-page URLs instead, and the download command visits each download page again to fetch the current real URL.
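Each line of a generated list file is tab-separated: download-page URL, track title, target folder. A minimal sketch of the deferred resolution, with illustrative names (get_download_url stands in for the resolver, fetch for the actual downloader; the song id is made up):

# One stored line: download page, title, folder -- no final URL anywhere.
line = 'http://music.baidu.com/song/123456\tSome Track\tG:\\crawl\\david bowie\\Some Album\\\n'

def resolve_and_download(line, get_download_url, fetch):
    music_page, title, path = line.rstrip('\n').split('\t')
    down_url = get_download_url(music_page)  # resolved only now, so it cannot have expired
    fetch(down_url, path + title + '.mp3')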
The script relies on two open-source modules: gevent and BeautifulSoup.
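Both take only a few lines to use. Here is a minimal sketch of the pattern the script builds on: monkey-patching so urllib2 becomes cooperative, one greenlet per page, and BeautifulSoup (version 3, hence the import path) for parsing. The URL parameters and CSS class below are just placeholders:

from gevent import monkey
monkey.patch_all()                     # patch sockets so greenlets overlap on network I/O
import gevent, urllib2
from BeautifulSoup import BeautifulSoup

def crawl(url):
    soup = BeautifulSoup(urllib2.urlopen(url).read())
    return [a['href'] for a in soup.findAll('a', {'class': 'cover'})]

jobs = [gevent.spawn(crawl, 'http://music.baidu.com/artist/2825?start=%d' % (i * 10))
        for i in range(3)]
gevent.joinall(jobs)                   # wait for all pages; each result lands in job.value
print [job.value for job in jobs]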
Updated -------------------------------------------------------------------------------------------
The CAPTCHA restriction has been lifted: if a CAPTCHA page comes back, the script extracts the required cookie from that page and re-issues the request.
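For the curious: the CAPTCHA page carries three hidden form fields, vcode, id and di, and joining their values with colons yields a BAIDUVERIFY cookie that Baidu accepts in place of solving the CAPTCHA. A quick way to sanity-check the extraction against a CAPTCHA page you have saved locally (the file name is hypothetical):

import re

def extract_verify(html):
    # the same three hidden fields the script reads; '' means this is a normal page
    fields = [re.search(r'name="%s" value="(.*?)"' % name, html, re.I)
              for name in ('vcode', 'id', 'di')]
    return ':'.join(m.group(1) for m in fields) if all(fields) else ''

print extract_verify(open('captcha_page.html').read())   # expect vcode:id:di

The full script below folds this check into request(): whenever a response turns out to be a CAPTCHA page, it sets the cookie and retries.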
#coding=utf-8
import urllib, urllib2, re, os, json, gevent, traceback
from BeautifulSoup import BeautifulSoup
from gevent import monkey
monkey.patch_all()

rootUrl = 'http://music.baidu.com'
artistId = 2825          # Replace with the artist's Baidu Music id, e.g. http://music.baidu.com/artist/2825
pagesize = 10
savePath = 'G:\\crawl\\david bowie\\'   # Change to the folder you want to save to
listDir = '_____downlist\\'
handleCount = 0
BAIDUVERIFY = ''

def crawlList():
    # Build the download lists for every album page of the artist.
    artistUrl = rootUrl + '/artist/' + str(artistId)
    homeHtml = request(artistUrl)
    soup = BeautifulSoup(homeHtml)
    try:
        pagecount = len(soup.findAll('div', {'class': 'page-inner'})[1].findAll(text=re.compile(r'\d+')))
    except:
        traceback.print_exc()
        print homeHtml
        return
    jobs = []
    listPath = savePath + listDir
    if not os.path.exists(listPath):
        os.mkdir(listPath)
    for i in range(pagecount):
        jobs.append(gevent.spawn(crawlPage, i))
    gevent.joinall(jobs)

def request(url):
    # GET the url; if the response is a CAPTCHA page, extract the
    # BAIDUVERIFY cookie from it and retry with that cookie set.
    global BAIDUVERIFY
    req = urllib2.Request(url)
    if BAIDUVERIFY != '':
        req.add_header('Cookie', 'BAIDUVERIFY=' + BAIDUVERIFY + ';')
    resp = urllib2.urlopen(req)
    html = resp.read()
    verify = getBaiduVerify(html)
    if verify != '':
        print 'Extracted verify cookie from the CAPTCHA page, retrying'
        BAIDUVERIFY = verify
        return request(url)
    return html

def getBaiduVerify(html):
    # The CAPTCHA page carries vcode/id/di hidden fields; joined with
    # colons they form a valid BAIDUVERIFY cookie. Returns '' on a normal page.
    vcode = re.search(r'name="vcode" value="(.*?)"', html, re.I)
    id = re.search(r'name="id" value="(.*?)"', html, re.I)
    di = re.search(r'name="di" value="(.*?)"', html, re.I)
    if vcode and id and di:
        return vcode.group(1) + ':' + id.group(1) + ':' + di.group(1)
    return ''

def crawlPage(page):
    # Fetch one page of the artist's album list and build a list file per album.
    start = page * pagesize
    albumListUrl = 'http://music.baidu.com/data/user/getalbums?start=%d&ting_uid=%d&order=time' % (start, artistId)
    print albumListUrl
    albumListHtml = json.loads(request(albumListUrl))['data']['html']
    albumListSoup = BeautifulSoup(albumListHtml)
    covers = albumListSoup.findAll('a', {'class': 'cover'})
    pagePath = savePath + listDir + str(page) + '\\'
    if not os.path.exists(pagePath):
        os.mkdir(pagePath)
    for cover in covers:
        try:
            crawlAlbum(pagePath, rootUrl + cover['href'], cover['title'])
        except:
            traceback.print_exc()

def crawlAlbum(pagePath, albumUrl, title):
    # Write one list file per album: download page, title and target folder per track.
    print albumUrl, title
    albumHtml = request(albumUrl)
    albumSoup = BeautifulSoup(albumHtml)
    musicWraps = albumSoup.findAll('span', {'class': 'song-title '})
    title = re.subn(r'\\|\/|:|\*|\?|\"|\<|\>|\|', '', title)[0]   # strip characters illegal in Windows file names
    path = savePath + title + '\\'
    albumListPath = pagePath + title + '.txt'
    albumFile = open(albumListPath, 'w')
    for wrap in musicWraps:
        link = wrap.find('a')
        try:
            musicPage = rootUrl + link['href']
            albumFile.write('%s\t%s\t%s\n' % (musicPage, link['title'], path))  # real download URLs expire, so store the download page
        except:
            traceback.print_exc()
    albumFile.close()

def crawlDownloadUrl(musicPage):
    # Resolve the current real download URL from a track's download page.
    downPageUrl = musicPage + '/download'
    downHtml = request(downPageUrl)
    downUrl = re.search(r'http://[^ ]*xcode.[a-z0-9]*', downHtml, re.M).group()
    return downUrl

def downList():
    listPath = savePath + listDir
    jobs = []
    for pageDir in os.listdir(listPath):
        jobs.append(gevent.spawn(downPage, listPath + pageDir))
    gevent.joinall(jobs)

def downPage(pagePath):
    # Download every track referenced by the list files under one page folder.
    for filename in os.listdir(pagePath):
        filePath = pagePath + '\\' + filename
        albumFile = open(filePath, 'r')
        try:
            for args in albumFile.readlines():
                arrArgs = args.split('\t')
                downMusic(arrArgs[0], arrArgs[1], arrArgs[2].replace('\n', ''))
        except:
            traceback.print_exc()
        finally:
            albumFile.close()

def downMusic(musicPage, title, path):
    global handleCount
    if not os.path.exists(path):
        os.mkdir(path)
    handleCount += 1
    print handleCount, musicPage, title, path
    filename = path + re.subn(r'\\|\/|:|\*|\?|\"|\<|\>|\|', '', title)[0] + '.mp3'
    if os.path.isfile(filename):
        return   # already downloaded, skip
    downUrl = crawlDownloadUrl(musicPage)
    try:
        urllib.urlretrieve(downUrl, filename)
    except:
        traceback.print_exc()
        os.remove(filename)   # drop the partial file so a re-run retries it

if __name__ == '__main__':
    print 'Commands:\n\tlist\tgenerate the download list\n\tdown\tstart downloading\n\texit\tquit'
    cmd = raw_input('>>>')
    while cmd != 'exit':
        if cmd == 'list':
            crawlList()
            print 'Download list generated'
        elif cmd == 'down':
            downList()
            print 'Download finished'
        else:
            print 'unknown command'
        cmd = raw_input('>>>')
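A typical session, assuming the paths above exist (the script file name here, down_albums.py, is just whatever you saved it as). list only needs to run once; down can be re-run safely, since tracks already on disk are skipped:

G:\crawl> python down_albums.py
Commands:
        list    generate the download list
        down    start downloading
        exit    quit
>>>list
http://music.baidu.com/data/user/getalbums?start=0&ting_uid=2825&order=time
...
Download list generated
>>>down
...
Download finished
>>>exit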