用 Python 写的批量下载百度 MP3 的程序，至少到 2009-09-18 仍然可用。
在 Windows 平台下用 Python 2.6 开发，移植到 Linux 应该也很容易。
默认使用 10 个线程下载。
其中多线程下载部分参考了 http://hi.baidu.com/zjw0358/blog 。
mydown.py
#!/usr/bin/env python
# coding=utf-8
import httplib,urllib,urllib2
import re,os
from downmp3 import GetSize,DownMp3
def BaiduUrlDecode(enurl):
    """Decode an obfuscated Baidu MP3 URL.

    Every alphanumeric character of ``enurl`` is rotated through the
    62-character alphabet ``0-9A-Za-z``; all other characters are copied
    unchanged.  The rotation is derived from the second character, which is
    taken to decode to ``t``.  The rotated text is GBK-encoded and then
    percent-unquoted.

    If the rotation cannot be derived (input shorter than two characters, or
    a second character outside the alphabet) a diagnostic is printed and the
    unquoted empty string is returned.
    """
    try:
        from urllib import unquote  # Python 2
    except ImportError:
        from urllib.parse import unquote  # Python 3
    k = u'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
    pieces = []
    try:
        # k.index raises ValueError when enurl[1] is not alphanumeric;
        # enurl[1] raises IndexError when the input is too short.
        key = k.index(u't') - k.index(enurl[1])
        for char in enurl:
            pos = k.find(char)
            if pos >= 0:
                pieces.append(k[(pos + key) % 62])
            else:
                pieces.append(char)
    except (IndexError, ValueError):
        print('enurl IndexError: %s $' % enurl)
    return unquote(u''.join(pieces).encode('gbk'))
def BadiuUrlProcess(baidu_url):
    """Percent-encode the non-ASCII parts of a Baidu URL using GBK.

    Each run of characters that is neither an ASCII word character nor one
    of the URL punctuation characters listed in the pattern (in practice:
    Chinese text and whitespace) is GBK-encoded and percent-quoted.  Every
    run is quoted on its own, so URLs containing several separate Chinese
    segments (e.g. ``song+artist``) are encoded correctly.  URLs without
    such runs are returned unchanged.
    """
    import re
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3
    # ASCII word characters are spelled out so the class means the same
    # thing regardless of regex flags or Python version.
    ch_pattern = re.compile(
        r'[^A-Za-z0-9_:._,~`!@#\|{}\^\*\(\)<>;%/\"\'\[\]\+\=\?\-\$\&\\]+')
    # A callable replacement quotes each match individually instead of
    # reusing the first match for all of them.
    return ch_pattern.sub(lambda m: quote(m.group(0).encode('gbk')), baidu_url)
if __name__ == "__main__":
print """ 支持百度MP3的大部分列表音乐的下载,默认采用10线程下载
[1] 新歌top100
[2] 歌曲top500
[3] 歌手top200 (暂不支持下载)
[4] 中文金曲榜
[5] 经典老歌
[6] 热舞dj
[7] 流金岁月
[8] 电视金曲
[9] 歌曲列表
[0] 退出
--by auxten auxtenwpc[at]gmail[dot]com
""".decode('utf-8').encode('gbk')
id = int(raw_input('输入你想下载的list的编号: '.decode('utf-8').encode('gbk')))
if id == 1: topid = '/list/newhits.html?id=1?top1'
elif id == 2: topid = '/topso/mp3topsong.html?id=1?top2'
elif id == 3: topid = '/list/tvs.html?id=1?top5';exit(1)
elif id == 4: topid = '/list/bangping.html?id=1'#;exit(1)
elif id == 5: topid = '/list/oldsong.html?top6'
elif id == 6: topid = '/list/dj.html'
elif id == 7: topid = '/list/liujinsuiyue.html'
elif id == 8: topid = '/list/tvs.html?id=1?top5'
elif id == 9: topid = '/list/tvs.html?id=1?top5'#;exit(1)
elif id == 0: exit(1)
# topid = '/list/oldsong.html?top6'
# topid = '/list/tvs.html?id=1?top5'
print "Processing please wait.:)"
errorlist = []
conn = httplib.HTTPConnection('list.mp3.baidu.com')
conn.request("GET",topid)
response = conn.getresponse()
html = response.read().decode('gb18030')
# print html.encode('gbk')
conn.close()
match_type1 = r'">(\d{,3})\.</td>' #编号
match_type2 = r'">(\d{,3})\.</td>[\s\S]*?<a href="(.*?)" target="_blank">(.*?)</a>[^)].*' #no url songname for 流金岁月
match_type3 = r'">(\d{,3})\.</td>[\s\S]*?<a href="(.*?)" target="_blank">(.*?)</a>[^)].*">(.*)</a>\)' #name+author
match_type4 = r'<tr>[\s]*?<td>(\d{,3})</td>[\s\S]*?<a href="(http://.*?)" target="_blank">(.*?)</a></td>[\s\S]*?target="_blank">(.*?)</a>' #no url songname singer 中文金曲榜
list_number = re.findall(match_type1, html)
list_all = re.findall(match_type3, html)
if list_all == []:
# print 'list_all empty1!'
list_all = re.findall(match_type2, html)
if list_all == []:
# print 'list_all empty2!'
list_all = re.findall(match_type4, html)
# print 'list_all',list_all
# print 'list_all__len__',list_all.__len__()
conn = httplib.HTTPConnection('mp3.baidu.com')
songnumlst = range(0,list_all.__len__())
for num in songnumlst:
try:
try: authorname = '-'+list_all[num][3]
except IndexError:
authorname = ''
print list_all[num][0].encode('gbk'),list_all[num][2].encode('gbk'),authorname.encode('gbk')
# print num
conn.request("GET",BadiuUrlProcess(list_all[num][1]))
# print "URL!",BadiuUrlProcess(list_all[num][1]).encode('gbk')
response = conn.getresponse()
html = response.read().decode('gb18030')
conn.close()
# print html.encode('gbk')
html = re.search(r'<a href="(.*?)" title', html).groups()[0]
down_page_url = BadiuUrlProcess(html)
html = urllib.urlopen(down_page_url).read().decode('gb18030')
# print html.encode('gbk')
mp3_url_list = re.findall(r'{var B="(.*?)".*?{var C=\["(.*?)","(.*?)","(.*?)","(.*?)"\];', html)[0]
# print mp3_url_list
mp3_url_list = [BaiduUrlDecode(i) for i in mp3_url_list ]
# print mp3_url_list
# for i in mp3_url_list:
# print i.decode('utf-8').encode('gbk')
print 'Downloading .'
except UnicodeDecodeError:
print "Error in main loop"#"UnicodeDecodeError"
songnumlst.append(num)
continue
try: assert DownMp3(urlist = mp3_url_list, name = '%s%s' % (list_all[num][2],authorname) ) == 0
except AssertionError:
print 'DownMp3 error!'
songnumlst.append(num)
#!/usr/bin/env python
# coding=utf-8
import httplib,urllib,urllib2
import re,os
from downmp3 import GetSize,DownMp3
def BaiduUrlDecode(enurl):
    """Decode an obfuscated Baidu MP3 URL.

    Every alphanumeric character of ``enurl`` is rotated through the
    62-character alphabet ``0-9A-Za-z``; all other characters are copied
    unchanged.  The rotation is derived from the second character, which is
    taken to decode to ``t``.  The rotated text is GBK-encoded and then
    percent-unquoted.

    If the rotation cannot be derived (input shorter than two characters, or
    a second character outside the alphabet) a diagnostic is printed and the
    unquoted empty string is returned.
    """
    try:
        from urllib import unquote  # Python 2
    except ImportError:
        from urllib.parse import unquote  # Python 3
    k = u'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
    pieces = []
    try:
        # k.index raises ValueError when enurl[1] is not alphanumeric;
        # enurl[1] raises IndexError when the input is too short.
        key = k.index(u't') - k.index(enurl[1])
        for char in enurl:
            pos = k.find(char)
            if pos >= 0:
                pieces.append(k[(pos + key) % 62])
            else:
                pieces.append(char)
    except (IndexError, ValueError):
        print('enurl IndexError: %s $' % enurl)
    return unquote(u''.join(pieces).encode('gbk'))
def BadiuUrlProcess(baidu_url):
    """Percent-encode the non-ASCII parts of a Baidu URL using GBK.

    Each run of characters that is neither an ASCII word character nor one
    of the URL punctuation characters listed in the pattern (in practice:
    Chinese text and whitespace) is GBK-encoded and percent-quoted.  Every
    run is quoted on its own, so URLs containing several separate Chinese
    segments (e.g. ``song+artist``) are encoded correctly.  URLs without
    such runs are returned unchanged.
    """
    import re
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3
    # ASCII word characters are spelled out so the class means the same
    # thing regardless of regex flags or Python version.
    ch_pattern = re.compile(
        r'[^A-Za-z0-9_:._,~`!@#\|{}\^\*\(\)<>;%/\"\'\[\]\+\=\?\-\$\&\\]+')
    # A callable replacement quotes each match individually instead of
    # reusing the first match for all of them.
    return ch_pattern.sub(lambda m: quote(m.group(0).encode('gbk')), baidu_url)
if __name__ == "__main__":
print """ 支持百度MP3的大部分列表音乐的下载,默认采用10线程下载
[1] 新歌top100
[2] 歌曲top500
[3] 歌手top200 (暂不支持下载)
[4] 中文金曲榜
[5] 经典老歌
[6] 热舞dj
[7] 流金岁月
[8] 电视金曲
[9] 歌曲列表
[0] 退出
--by auxten auxtenwpc[at]gmail[dot]com
""".decode('utf-8').encode('gbk')
id = int(raw_input('输入你想下载的list的编号: '.decode('utf-8').encode('gbk')))
if id == 1: topid = '/list/newhits.html?id=1?top1'
elif id == 2: topid = '/topso/mp3topsong.html?id=1?top2'
elif id == 3: topid = '/list/tvs.html?id=1?top5';exit(1)
elif id == 4: topid = '/list/bangping.html?id=1'#;exit(1)
elif id == 5: topid = '/list/oldsong.html?top6'
elif id == 6: topid = '/list/dj.html'
elif id == 7: topid = '/list/liujinsuiyue.html'
elif id == 8: topid = '/list/tvs.html?id=1?top5'
elif id == 9: topid = '/list/tvs.html?id=1?top5'#;exit(1)
elif id == 0: exit(1)
# topid = '/list/oldsong.html?top6'
# topid = '/list/tvs.html?id=1?top5'
print "Processing please wait.:)"
errorlist = []
conn = httplib.HTTPConnection('list.mp3.baidu.com')
conn.request("GET",topid)
response = conn.getresponse()
html = response.read().decode('gb18030')
# print html.encode('gbk')
conn.close()
match_type1 = r'">(\d{,3})\.</td>' #编号
match_type2 = r'">(\d{,3})\.</td>[\s\S]*?<a href="(.*?)" target="_blank">(.*?)</a>[^)].*' #no url songname for 流金岁月
match_type3 = r'">(\d{,3})\.</td>[\s\S]*?<a href="(.*?)" target="_blank">(.*?)</a>[^)].*">(.*)</a>\)' #name+author
match_type4 = r'<tr>[\s]*?<td>(\d{,3})</td>[\s\S]*?<a href="(http://.*?)" target="_blank">(.*?)</a></td>[\s\S]*?target="_blank">(.*?)</a>' #no url songname singer 中文金曲榜
list_number = re.findall(match_type1, html)
list_all = re.findall(match_type3, html)
if list_all == []:
# print 'list_all empty1!'
list_all = re.findall(match_type2, html)
if list_all == []:
# print 'list_all empty2!'
list_all = re.findall(match_type4, html)
# print 'list_all',list_all
# print 'list_all__len__',list_all.__len__()
conn = httplib.HTTPConnection('mp3.baidu.com')
songnumlst = range(0,list_all.__len__())
for num in songnumlst:
try:
try: authorname = '-'+list_all[num][3]
except IndexError:
authorname = ''
print list_all[num][0].encode('gbk'),list_all[num][2].encode('gbk'),authorname.encode('gbk')
# print num
conn.request("GET",BadiuUrlProcess(list_all[num][1]))
# print "URL!",BadiuUrlProcess(list_all[num][1]).encode('gbk')
response = conn.getresponse()
html = response.read().decode('gb18030')
conn.close()
# print html.encode('gbk')
html = re.search(r'<a href="(.*?)" title', html).groups()[0]
down_page_url = BadiuUrlProcess(html)
html = urllib.urlopen(down_page_url).read().decode('gb18030')
# print html.encode('gbk')
mp3_url_list = re.findall(r'{var B="(.*?)".*?{var C=\["(.*?)","(.*?)","(.*?)","(.*?)"\];', html)[0]
# print mp3_url_list
mp3_url_list = [BaiduUrlDecode(i) for i in mp3_url_list ]
# print mp3_url_list
# for i in mp3_url_list:
# print i.decode('utf-8').encode('gbk')
print 'Downloading .'
except UnicodeDecodeError:
print "Error in main loop"#"UnicodeDecodeError"
songnumlst.append(num)
continue
try: assert DownMp3(urlist = mp3_url_list, name = '%s%s' % (list_all[num][2],authorname) ) == 0
except AssertionError:
print 'DownMp3 error!'
songnumlst.append(num)
HttpGetThread.py
#!/usr/bin/env python
#coding=utf-8
import re
import os
import sys
import time
import glob
import string
import socket
import getopt
import urllib
import urllib2
import threading
#############################################################################
#
# self-defined exception classes
#
#############################################################################
class ConnectionError(Exception):
    """Connection failure; declared here but not raised by the code shown in this module."""


class URLUnreachable(Exception):
    """Raised by MyHttpGet when the remote file size is reported as 0."""


class CanotDownload(Exception):
    """Raised by MyHttpGet when the download makes no progress for too long."""
#############################################################################
#
# multiple threads download module starts here
#
#############################################################################
class HttpGetThread(threading.Thread):
    """Worker thread that downloads one byte range of a URL into a part file.

    ``range`` is an inclusive ``(first_byte, last_byte)`` pair.  Bytes already
    present in ``filename`` count as downloaded, so an interrupted download
    resumes where the part file ends.  Progress is exposed through the
    ``downloaded`` (bytes) and ``percent`` attributes.

    NOTE: the default ``range=0`` is not usable (it is indexed as a pair);
    callers must always pass a ``(first, last)`` tuple.
    """

    def __init__(self, name, url, filename, range=0):
        threading.Thread.__init__(self, name=name)
        self.url = url
        self.filename = filename
        self.range = range
        self.totalLength = range[1] - range[0] + 1
        self.bufferSize = 8192
        self.downloaded = self._existing_size()
        self.percent = self._percent()
        self.headerrange = (self.range[0] + self.downloaded, self.range[1])

    def _existing_size(self):
        """Return the current size of the part file, or 0 if it does not exist."""
        try:
            return os.path.getsize(self.filename)
        except OSError:
            return 0

    def _percent(self):
        """Return progress in percent; an empty range counts as complete."""
        if self.totalLength <= 0:
            return 100.0
        return self.downloaded / float(self.totalLength) * 100

    def run(self):
        """Download the remaining bytes of the range, trying at most 10 times."""
        self.downloaded = self._existing_size()
        self.percent = self._percent()
        self.bufferSize = 8192
        attempts = 0
        # Stop as soon as the range is complete: asking the server for a
        # range whose start lies past its end is never valid.
        while self.downloaded < self.totalLength and attempts < 10:
            attempts += 1
            try:
                self.headerrange = (self.range[0] + self.downloaded, self.range[1])
                request = urllib2.Request(self.url)
                request.add_header('Range', 'bytes=%d-%d' % self.headerrange)
                conn = urllib2.urlopen(request)
                try:
                    startTime = time.time()
                    # Open the part file once per attempt, not once per chunk.
                    with open(self.filename, 'ab') as part:
                        data = conn.read(self.bufferSize)
                        while data:
                            part.write(data)
                            self.time = int(time.time() - startTime)
                            self.downloaded += len(data)
                            self.percent = self._percent()
                            data = conn.read(self.bufferSize)
                finally:
                    conn.close()
            except Exception:
                # Deliberately broad: any network or I/O failure just costs
                # one attempt; the next attempt resumes from self.downloaded.
                time.sleep(1)
def Split(size, blocks):
    """Split ``size`` bytes into ``blocks`` inclusive ``(first, last)`` ranges.

    The first ``blocks - 1`` ranges each cover ``size // blocks`` bytes; the
    last range absorbs the remainder and always ends at ``size - 1``.  When
    ``blocks`` exceeds ``size`` the leading ranges are empty (``last`` is
    ``first - 1``).

    Raises ValueError if ``blocks`` is less than 1.
    """
    if blocks < 1:
        raise ValueError('blocks must be at least 1')
    # Explicit floor division keeps the byte offsets integral everywhere.
    blocksize = size // blocks
    ranges = [(i * blocksize, (i + 1) * blocksize - 1) for i in range(blocks - 1)]
    ranges.append((blocksize * (blocks - 1), size - 1))
    return ranges
def GetHttpFileSize(url):
    """Return the ``Content-Length`` reported for ``url`` as an int.

    Returns 0 when the URL cannot be opened, the header is missing, or its
    value is not a valid integer.
    """
    length = 0
    try:
        conn = urllib.urlopen(url)
        try:
            # Look the header up by name instead of matching any header
            # line that happens to contain the text "Length".
            value = conn.info().getheader('Content-Length')
        finally:
            conn.close()
        if value is not None:
            length = int(value.strip())
    except Exception:
        # Deliberately best-effort: any failure means "size unknown" (0).
        length = 0
    return length
def hasLive(ts):
    """Return True if at least one thread in ``ts`` is still running."""
    # is_alive() is the supported spelling (available since Python 2.6);
    # isAlive() is only a legacy alias.
    return any(t.is_alive() for t in ts)
def MyHttpGet(url, output=None, connections=4):
"""
arguments:
url, in GBK encoding
output, default encoding, do no convertion
connections, integer
"""
length = GetHttpFileSize(url)
print length
mb = length/1024/1024.0
if length == 0:
raise URLUnreachable
blocks = connections
if output:
filename = output
else:
output = url.split('/')[-1]
ranges = Split(length, blocks)
names = ["%s_%d" %(output,i) for i in xrange(blocks)]
ts = []
for i in xrange(blocks):
t = HttpGetThread(i, url, names[i], ranges[i])
t.setDaemon(True)
t.start()
ts.append(t)
live = hasLive(ts)
startSize = sum([t.downloaded for t in ts])
startTime = time.time()
etime = 0
lastd = 0
nobytecounter = 0
while live:
try:
etime = time.time() - startTime
d = sum([t.downloaded for t in ts])/float(length)*100
if lastd == d:
nobytecounter += 1
else:
nobytecounter = 0
lastd = d
if nobytecounter > 100:
raise CanotDownload
downloadedThistime = sum([t.downloaded for t in ts])-startSize
try:
rate = downloadedThistime / float(etime)/1024
except:
rate = 100.0
progressStr = u'\rFilesize: %d(%.2fM) Downloaded: %.2f%% Avg rate: %.1fKB/s' %(length, mb, d, rate)
sys.stdout.write(progressStr)
sys.stdout.flush()
#sys.stdout.write('\b'*(len(progressStr)+1))
live = hasLive(ts)
time.sleep(0.2)
except URLUnreachable:
print
print "Url Unreachable"
for n in names:
try:
os.remove(n)
except:
pass
return -1
except CanotDownload:
print
print "can't download!"
for n in names:
try:
os.remove(n)
except:
pass
return -1
except KeyboardInterrupt:
print
print "Exit"
for n in names:
try:
os.remove(n)
except:
pass
sys.exit(1)
print
try:
f = open(filename, 'wb')
for n in names:
f.write(open(n,'rb').read())
try:
os.remove(n)
except:
pass
f.close()
except :
print
print 'File write Error'
for n in names:
try:
os.remove(n)
except:
pass
return -1
return 0
if __name__ == '__main__':
    # Manual smoke test: fetch one sample file over four connections.
    MyHttpGet(url='http://jsz.com.cn./18/Hongdou.mp3',
              output='Hongdou.mp3',
              connections=4)
#!/usr/bin/env python
#coding=utf-8
import re
import os
import sys
import time
import glob
import string
import socket
import getopt
import urllib
import urllib2
import threading
#############################################################################
#
# self-defined exception classes
#
#############################################################################
class ConnectionError(Exception):
    """Connection failure; declared here but not raised by the code shown in this module."""


class URLUnreachable(Exception):
    """Raised by MyHttpGet when the remote file size is reported as 0."""


class CanotDownload(Exception):
    """Raised by MyHttpGet when the download makes no progress for too long."""
#############################################################################
#
# multiple threads download module starts here
#
#############################################################################
class HttpGetThread(threading.Thread):
    """Worker thread that downloads one byte range of a URL into a part file.

    ``range`` is an inclusive ``(first_byte, last_byte)`` pair.  Bytes already
    present in ``filename`` count as downloaded, so an interrupted download
    resumes where the part file ends.  Progress is exposed through the
    ``downloaded`` (bytes) and ``percent`` attributes.

    NOTE: the default ``range=0`` is not usable (it is indexed as a pair);
    callers must always pass a ``(first, last)`` tuple.
    """

    def __init__(self, name, url, filename, range=0):
        threading.Thread.__init__(self, name=name)
        self.url = url
        self.filename = filename
        self.range = range
        self.totalLength = range[1] - range[0] + 1
        self.bufferSize = 8192
        self.downloaded = self._existing_size()
        self.percent = self._percent()
        self.headerrange = (self.range[0] + self.downloaded, self.range[1])

    def _existing_size(self):
        """Return the current size of the part file, or 0 if it does not exist."""
        try:
            return os.path.getsize(self.filename)
        except OSError:
            return 0

    def _percent(self):
        """Return progress in percent; an empty range counts as complete."""
        if self.totalLength <= 0:
            return 100.0
        return self.downloaded / float(self.totalLength) * 100

    def run(self):
        """Download the remaining bytes of the range, trying at most 10 times."""
        self.downloaded = self._existing_size()
        self.percent = self._percent()
        self.bufferSize = 8192
        attempts = 0
        # Stop as soon as the range is complete: asking the server for a
        # range whose start lies past its end is never valid.
        while self.downloaded < self.totalLength and attempts < 10:
            attempts += 1
            try:
                self.headerrange = (self.range[0] + self.downloaded, self.range[1])
                request = urllib2.Request(self.url)
                request.add_header('Range', 'bytes=%d-%d' % self.headerrange)
                conn = urllib2.urlopen(request)
                try:
                    startTime = time.time()
                    # Open the part file once per attempt, not once per chunk.
                    with open(self.filename, 'ab') as part:
                        data = conn.read(self.bufferSize)
                        while data:
                            part.write(data)
                            self.time = int(time.time() - startTime)
                            self.downloaded += len(data)
                            self.percent = self._percent()
                            data = conn.read(self.bufferSize)
                finally:
                    conn.close()
            except Exception:
                # Deliberately broad: any network or I/O failure just costs
                # one attempt; the next attempt resumes from self.downloaded.
                time.sleep(1)
def Split(size, blocks):
    """Split ``size`` bytes into ``blocks`` inclusive ``(first, last)`` ranges.

    The first ``blocks - 1`` ranges each cover ``size // blocks`` bytes; the
    last range absorbs the remainder and always ends at ``size - 1``.  When
    ``blocks`` exceeds ``size`` the leading ranges are empty (``last`` is
    ``first - 1``).

    Raises ValueError if ``blocks`` is less than 1.
    """
    if blocks < 1:
        raise ValueError('blocks must be at least 1')
    # Explicit floor division keeps the byte offsets integral everywhere.
    blocksize = size // blocks
    ranges = [(i * blocksize, (i + 1) * blocksize - 1) for i in range(blocks - 1)]
    ranges.append((blocksize * (blocks - 1), size - 1))
    return ranges
def GetHttpFileSize(url):
    """Return the ``Content-Length`` reported for ``url`` as an int.

    Returns 0 when the URL cannot be opened, the header is missing, or its
    value is not a valid integer.
    """
    length = 0
    try:
        conn = urllib.urlopen(url)
        try:
            # Look the header up by name instead of matching any header
            # line that happens to contain the text "Length".
            value = conn.info().getheader('Content-Length')
        finally:
            conn.close()
        if value is not None:
            length = int(value.strip())
    except Exception:
        # Deliberately best-effort: any failure means "size unknown" (0).
        length = 0
    return length
def hasLive(ts):
    """Return True if at least one thread in ``ts`` is still running."""
    # is_alive() is the supported spelling (available since Python 2.6);
    # isAlive() is only a legacy alias.
    return any(t.is_alive() for t in ts)
def MyHttpGet(url, output=None, connections=4):
"""
arguments:
url, in GBK encoding
output, default encoding, do no convertion
connections, integer
"""
length = GetHttpFileSize(url)
print length
mb = length/1024/1024.0
if length == 0:
raise URLUnreachable
blocks = connections
if output:
filename = output
else:
output = url.split('/')[-1]
ranges = Split(length, blocks)
names = ["%s_%d" %(output,i) for i in xrange(blocks)]
ts = []
for i in xrange(blocks):
t = HttpGetThread(i, url, names[i], ranges[i])
t.setDaemon(True)
t.start()
ts.append(t)
live = hasLive(ts)
startSize = sum([t.downloaded for t in ts])
startTime = time.time()
etime = 0
lastd = 0
nobytecounter = 0
while live:
try:
etime = time.time() - startTime
d = sum([t.downloaded for t in ts])/float(length)*100
if lastd == d:
nobytecounter += 1
else:
nobytecounter = 0
lastd = d
if nobytecounter > 100:
raise CanotDownload
downloadedThistime = sum([t.downloaded for t in ts])-startSize
try:
rate = downloadedThistime / float(etime)/1024
except:
rate = 100.0
progressStr = u'\rFilesize: %d(%.2fM) Downloaded: %.2f%% Avg rate: %.1fKB/s' %(length, mb, d, rate)
sys.stdout.write(progressStr)
sys.stdout.flush()
#sys.stdout.write('\b'*(len(progressStr)+1))
live = hasLive(ts)
time.sleep(0.2)
except URLUnreachable:
print "Url Unreachable"
for n in names:
try:
os.remove(n)
except:
pass
return -1
except CanotDownload:
print "can't download!"
for n in names:
try:
os.remove(n)
except:
pass
return -1
except KeyboardInterrupt:
print "Exit"
for n in names:
try:
os.remove(n)
except:
pass
sys.exit(1)
try:
f = open(filename, 'wb')
for n in names:
f.write(open(n,'rb').read())
try:
os.remove(n)
except:
pass
f.close()
except :
print 'File write Error'
for n in names:
try:
os.remove(n)
except:
pass
return -1
return 0
if __name__ == '__main__':
    # Manual smoke test: fetch one sample file over four connections.
    MyHttpGet(url='http://jsz.com.cn./18/Hongdou.mp3',
              output='Hongdou.mp3',
              connections=4)
downmp3.py
#!/usr/bin/env python
#coding=utf-8
# Sample media URLs passed to DownMp3 when this module is run as a script.
url = [
    'http://www.efu.com.cn/topic/611/1.mp3',
    'http://bbs.baby8.cn//upload/vip/10/200831715101730.mp3',
    'http://www1.neacn.com/file_db/music/1153/f/26.wma',
    'http://www.tklk8.cn/bbs/uploadfile/2009-8/200981013174970090.mp3',
    'http://media.winglish.com/sound/winhao/free_content/music/music_20090507.mp3',
]
from HttpGetThread import *
def GetSize(resource_url):
import httplib
from urlparse import urlparse
try:
parsedurl = urlparse(resource_url)
host = parsedurl[1]
path = parsedurl[2]
httpConn = httplib.HTTPConnection(host);
httpConn.request("GET", path)
r = httpConn.getresponse()
httpConn.close()
if r.status == 200:
size = r.getheader('Content-Length')
size = int(size) / 1024
else:
print r.status, r.reason
size = -1
except :
size = -1
print parsedurl
return size
def DownMp3(urlist, name):
    """Download one song from the first plausible URL in ``urlist``.

    The first URL whose reported size is between 1536 KB and 10240 KB
    (inclusive) is used; if none qualifies the first URL is used.  The file
    is saved as ``name`` plus the extension taken from the URL (``mp3`` when
    the URL has none) using 10 connections.

    Returns 0 on success and -1 on any failure, including an empty
    ``urlist``.
    """
    if not urlist:
        return -1
    mp3url = urlist[0]
    for candidate in urlist:
        Size = GetSize(candidate)
        if 1536 <= Size <= 10240:
            mp3url = candidate
            break
    # Take the extension from the last path segment only, so a dot in the
    # host name is never mistaken for one.
    tail = mp3url.rsplit('/', 1)[-1]
    if '.' in tail:
        extension = tail.rsplit('.', 1)[1]
    else:
        extension = 'mp3'
    fulname = name + '.' + extension
    # The download is called directly rather than inside `assert`: asserts
    # are removed under -O / -OO, which would skip the download entirely.
    try:
        status = MyHttpGet(mp3url.decode('gb18030').encode('gbk'), fulname, 10)
    except (URLUnreachable, UnicodeError):
        return -1
    if status == 0:
        return 0
    return -1
if __name__ == '__main__':
    # Manual smoke test: download from the sample list under a GBK file name.
    song_name = '红豆'.decode('utf-8').encode('gbk')
    DownMp3(url, song_name)
#!/usr/bin/env python
#coding=utf-8
# Sample media URLs passed to DownMp3 when this module is run as a script.
url = [
    'http://www.efu.com.cn/topic/611/1.mp3',
    'http://bbs.baby8.cn//upload/vip/10/200831715101730.mp3',
    'http://www1.neacn.com/file_db/music/1153/f/26.wma',
    'http://www.tklk8.cn/bbs/uploadfile/2009-8/200981013174970090.mp3',
    'http://media.winglish.com/sound/winhao/free_content/music/music_20090507.mp3',
]
from HttpGetThread import *
def GetSize(resource_url):
import httplib
from urlparse import urlparse
try:
parsedurl = urlparse(resource_url)
host = parsedurl[1]
path = parsedurl[2]
httpConn = httplib.HTTPConnection(host);
httpConn.request("GET", path)
r = httpConn.getresponse()
httpConn.close()
if r.status == 200:
size = r.getheader('Content-Length')
size = int(size) / 1024
else:
print r.status, r.reason
size = -1
except :
size = -1
print parsedurl
return size
def DownMp3(urlist, name):
    """Download one song from the first plausible URL in ``urlist``.

    The first URL whose reported size is between 1536 KB and 10240 KB
    (inclusive) is used; if none qualifies the first URL is used.  The file
    is saved as ``name`` plus the extension taken from the URL (``mp3`` when
    the URL has none) using 10 connections.

    Returns 0 on success and -1 on any failure, including an empty
    ``urlist``.
    """
    if not urlist:
        return -1
    mp3url = urlist[0]
    for candidate in urlist:
        Size = GetSize(candidate)
        if 1536 <= Size <= 10240:
            mp3url = candidate
            break
    # Take the extension from the last path segment only, so a dot in the
    # host name is never mistaken for one.
    tail = mp3url.rsplit('/', 1)[-1]
    if '.' in tail:
        extension = tail.rsplit('.', 1)[1]
    else:
        extension = 'mp3'
    fulname = name + '.' + extension
    # The download is called directly rather than inside `assert`: asserts
    # are removed under -O / -OO, which would skip the download entirely.
    try:
        status = MyHttpGet(mp3url.decode('gb18030').encode('gbk'), fulname, 10)
    except (URLUnreachable, UnicodeError):
        return -1
    if status == 0:
        return 0
    return -1
if __name__ == '__main__':
    # Manual smoke test: download from the sample list under a GBK file name.
    song_name = '红豆'.decode('utf-8').encode('gbk')
    DownMp3(url, song_name)
setup.py
# coding=utf-8
from distutils.core import setup
import py2exe
# Build mydown.py into a single console executable with py2exe.
setup(
    console=['mydown.py'],
    options={
        "py2exe": {
            "includes": ["downmp3"],
            "compressed": True,
            # Keep this at 0.  Levels 1 and 2 correspond to -O / -OO, which
            # strip `assert` statements, so any download logic guarded by an
            # assert would silently not run in the frozen executable.
            "optimize": 0,
            "bundle_files": 1,
        }
    },
    zipfile=None,
)
# coding=utf-8
from distutils.core import setup
import py2exe
# Build mydown.py into a single console executable with py2exe.
setup(
    console=['mydown.py'],
    options={
        "py2exe": {
            "includes": ["downmp3"],
            "compressed": True,
            # Keep this at 0.  Levels 1 and 2 correspond to -O / -OO, which
            # strip `assert` statements, so any download logic guarded by an
            # assert would silently not run in the frozen executable.
            "optimize": 0,
            "bundle_files": 1,
        }
    },
    zipfile=None,
)
嗯，就是这样。我比较懒，具体细节请直接看代码吧。