python爬取酷狗音乐
1、酷狗音乐用了md5加密,把我难住了(这应该就是加密了吧,,要不然挺尴尬T_T)。我这个不是爬取酷狗TOP500,而是搜索之后再下载歌曲
如下图所示,当你播放歌曲时会跳到另一个页面(酷狗有一个专门播放歌曲的页面),F12打开network,然后刷新页面,就会发现歌曲下载地址在下图所示类型的数据包中
2、然后我们去分析它的请求头
https://wwwapi.kugou.com/yy/index.php?r=play/getdata&callback=jQuery1910026577801017397817_1596164131720&hash=AC9B095DCF364F751BCED2E148638AFA&album_id=&dfid=0zPSYM1q2qH90xArIr1yOGtc&mid=5cd316b98602d9b29f31085c96e6c682&platid=4&_=1596164131721
他用的get请求将参数加到链接后面,我们只需要分析一下参数就可以了,这里我就直接说结果了
只需要它的hash值改变,那么这个请求链接就是那首歌的信息
也就可以说hash值相当于歌曲的唯一标识id
3、然后我们去找hash值,刚开始以为hash值是在js文件中生成的,搜索之后部分js代码如下
找了半天都没有找到是怎么来的,代码里面好像option就是参数,其他地方直接传过来的,,,那么估计hash值不是js中生成的了,那么就去其他文件里面找找有没有这个hash值的来源
在这个文件中有歌曲的信息,你把列表信息打开,就会发现有一个FileHash,和那个hash值一对比,果然一样
(我的不一样,是因为这两首歌都不一样,我只是给你们说一下过程,不要在意细节)
然后我们就要去分析这个数据包的请求头
https://complexsearch.kugou.com/v2/search/song?callback=callback123&keyword=%E7%A8%BB%E9%A6%99&page=1&pagesize=30&bitrate=0&isfuzzy=0&tag=em&inputtype=0&platform=WebFilter&userid=-1&clientver=2000&iscorrection=1&privilege_filter=0&srcappid=2919&clienttime=1596167570360&mid=1596167570360&uuid=1596167570360&dfid=-&signature=BA1355A64A559B7A779E96825A7631E8 callback: callback123 keyword: 稻香 page: 1 pagesize: 30 bitrate: 0 isfuzzy: 0 tag: em inputtype: 0 platform: WebFilter userid: -1 clientver: 2000 iscorrection: 1 privilege_filter: 0 srcappid: 2919 clienttime: 1596167570360 mid: 1596167570360 uuid: 1596167570360 dfid: - signature: BA1355A64A559B7A779E96825A7631E8
这个请求也是get请求,我们分析它的参数就可以了,你多分析几个这样的数据包,看一下参数就会看出来哪些值在改变,这里我直接说结果
clienttime和mid和uuid和signature一直在改变,而且clienttime和mid和uuid的值都一样
然后我们就接着去找js代码
大致一找就会发现clienttime和mid和uuid的值都是时间戳(我怎么说看着这么眼熟),但是没有signature
o.join方法就是把o这个列表里面的信息变成一个字符串,例如:
<script type="text/javascript">
var arr = new Array(3);
arr[0] = "George";
arr[1] = "John";
arr[2] = "Thomas";
document.write(arr.join());
</script>
输出: George,John,Thomas
我们断点调试发现o里面的数据是
0: "NVPh5oo715z5DIWAeQlhMDsWXXQV4hwt" 1: "bitrate=0" 2: "callback=callback123" 3: "clienttime=1596179805676" 4: "clientver=2000" 5: "dfid=-" 6: "inputtype=0" 7: "iscorrection=1" 8: "isfuzzy=0" 9: "keyword=稻香" 10: "mid=1596179805676" 11: "page=1" 12: "pagesize=30" 13: "platform=WebFilter" 14: "privilege_filter=0" 15: "srcappid=2919" 16: "tag=em" 17: "userid=-1" 18: "uuid=1596179805676" 19: "NVPh5oo715z5DIWAeQlhMDsWXXQV4hwt" length: 20
然后我们去看faultylabs.MD5代码
/**
 * MD5 digest of `a`, returned as an upper-case hexadecimal string.
 *
 * Accepted inputs (see the dispatch at the bottom of the function):
 *   - a string (converted to UTF-8 bytes),
 *   - an Array of strings (each converted to UTF-8 bytes and concatenated),
 *   - an Array of numbers (used as the byte array as-is),
 *   - an ArrayBuffer or a typed array (copied into a plain byte array).
 *
 * The initial state, the per-step constants and the shift amounts below are
 * the ones from the MD5 specification, so for string input the result is the
 * ordinary MD5 of the UTF-8 bytes, upper-cased.
 *
 * NOTE(review): on an unsupported input type the code calls `alert(...)` and
 * then still runs the digest with `p === null`, which fails on `p.length`.
 * That behaviour is kept unchanged here.
 */
faultylabs.MD5 = function(a) {
    // Format a number as an unsigned 32-bit value in 8 hex digits, zero-padded.
    function b(a) {
        var b = (a >>> 0).toString(16);
        return "00000000".substr(0, 8 - b.length) + b
    }
    // Array of strings -> one flat array of UTF-8 bytes.
    function c(a) {
        for (var b = [], c = 0; c < a.length; c++)
            b = b.concat(k(a[c]));
        return b
    }
    // Number -> 8 bytes, least significant byte first (the length trailer).
    function d(a) {
        for (var b = [], c = 0; 8 > c; c++)
            b.push(255 & a),
            a >>>= 8;
        return b
    }
    // Rotate the 32-bit value `a` left by `b` bits.
    function e(a, b) {
        return a << b & 4294967295 | a >>> 32 - b
    }
    // The four MD5 auxiliary functions (F, G, H, I in the specification).
    function f(a, b, c) {
        return a & b | ~a & c
    }
    function g(a, b, c) {
        return c & a | ~c & b
    }
    function h(a, b, c) {
        return a ^ b ^ c
    }
    function i(a, b, c) {
        return b ^ (a | ~c)
    }
    // Read a little-endian 32-bit word from byte array `a` at offset `b`.
    function j(a, b) {
        return a[b + 3] << 24 | a[b + 2] << 16 | a[b + 1] << 8 | a[b]
    }
    // String -> UTF-8 bytes. Code units <= 127 are pushed directly; anything
    // else goes through encodeURIComponent and its %XX escapes are parsed.
    function k(a) {
        for (var b = [], c = 0; c < a.length; c++)
            if (a.charCodeAt(c) <= 127)
                b.push(a.charCodeAt(c));
            else
                for (var d = encodeURIComponent(a.charAt(c)).substr(1).split("%"), e = 0; e < d.length; e++)
                    b.push(parseInt(d[e], 16));
        return b
    }
    // Serialise the four state words to hex. Arguments are consumed from the
    // last to the first, and the bytes of each word are reversed before
    // formatting, which yields the usual MD5 hex byte order.
    function l() {
        for (var a = "", c = 0, d = 0, e = 3; e >= 0; e--)
            d = arguments[e],
            c = 255 & d,
            d >>>= 8,
            c <<= 8,
            c |= 255 & d,
            d >>>= 8,
            c <<= 8,
            c |= 255 & d,
            d >>>= 8,
            c <<= 8,
            c |= d,
            a += b(c);
        return a
    }
    // Copy an array-like into a plain Array.
    function m(a) {
        for (var b = new Array(a.length), c = 0; c < a.length; c++)
            b[c] = a[c];
        return b
    }
    // Addition truncated to 32 bits.
    function n(a, b) {
        return 4294967295 & a + b
    }
    // Pad the byte array `p` and run the MD5 compression over every 64-byte block.
    function o() {
        // One MD5 step: fold the step into `t` and rotate the working registers
        // (s, t, u, v).
        function a(a, b, c, d) {
            var f = v;
            v = u,
            u = t,
            t = n(t, e(n(s, n(a, n(b, c))), d)),
            s = f
        }
        // Padding: a 0x80 byte, zeros up to 56 mod 64, then the original
        // length in bits as 8 little-endian bytes.
        var b = p.length;
        p.push(128);
        var c = p.length % 64;
        if (c > 56) {
            for (var k = 0; 64 - c > k; k++)
                p.push(0);
            c = p.length % 64
        }
        for (k = 0; 56 - c > k; k++)
            p.push(0);
        p = p.concat(d(8 * b));
        // Initial chaining values (0x67452301, 0xEFCDAB89, 0x98BADCFE,
        // 0x10325476) and the per-block working registers.
        var m = 1732584193
          , o = 4023233417
          , q = 2562383102
          , r = 271733878
          , s = 0
          , t = 0
          , u = 0
          , v = 0;
        for (k = 0; k < p.length / 64; k++) {
            s = m,
            t = o,
            u = q,
            v = r;
            var w = 64 * k;
            // Round 1
            a(f(t, u, v), 3614090360, j(p, w), 7),
            a(f(t, u, v), 3905402710, j(p, w + 4), 12),
            a(f(t, u, v), 606105819, j(p, w + 8), 17),
            a(f(t, u, v), 3250441966, j(p, w + 12), 22),
            a(f(t, u, v), 4118548399, j(p, w + 16), 7),
            a(f(t, u, v), 1200080426, j(p, w + 20), 12),
            a(f(t, u, v), 2821735955, j(p, w + 24), 17),
            a(f(t, u, v), 4249261313, j(p, w + 28), 22),
            a(f(t, u, v), 1770035416, j(p, w + 32), 7),
            a(f(t, u, v), 2336552879, j(p, w + 36), 12),
            a(f(t, u, v), 4294925233, j(p, w + 40), 17),
            a(f(t, u, v), 2304563134, j(p, w + 44), 22),
            a(f(t, u, v), 1804603682, j(p, w + 48), 7),
            a(f(t, u, v), 4254626195, j(p, w + 52), 12),
            a(f(t, u, v), 2792965006, j(p, w + 56), 17),
            a(f(t, u, v), 1236535329, j(p, w + 60), 22),
            // Round 2
            a(g(t, u, v), 4129170786, j(p, w + 4), 5),
            a(g(t, u, v), 3225465664, j(p, w + 24), 9),
            a(g(t, u, v), 643717713, j(p, w + 44), 14),
            a(g(t, u, v), 3921069994, j(p, w), 20),
            a(g(t, u, v), 3593408605, j(p, w + 20), 5),
            a(g(t, u, v), 38016083, j(p, w + 40), 9),
            a(g(t, u, v), 3634488961, j(p, w + 60), 14),
            a(g(t, u, v), 3889429448, j(p, w + 16), 20),
            a(g(t, u, v), 568446438, j(p, w + 36), 5),
            a(g(t, u, v), 3275163606, j(p, w + 56), 9),
            a(g(t, u, v), 4107603335, j(p, w + 12), 14),
            a(g(t, u, v), 1163531501, j(p, w + 32), 20),
            a(g(t, u, v), 2850285829, j(p, w + 52), 5),
            a(g(t, u, v), 4243563512, j(p, w + 8), 9),
            a(g(t, u, v), 1735328473, j(p, w + 28), 14),
            a(g(t, u, v), 2368359562, j(p, w + 48), 20),
            // Round 3
            a(h(t, u, v), 4294588738, j(p, w + 20), 4),
            a(h(t, u, v), 2272392833, j(p, w + 32), 11),
            a(h(t, u, v), 1839030562, j(p, w + 44), 16),
            a(h(t, u, v), 4259657740, j(p, w + 56), 23),
            a(h(t, u, v), 2763975236, j(p, w + 4), 4),
            a(h(t, u, v), 1272893353, j(p, w + 16), 11),
            a(h(t, u, v), 4139469664, j(p, w + 28), 16),
            a(h(t, u, v), 3200236656, j(p, w + 40), 23),
            a(h(t, u, v), 681279174, j(p, w + 52), 4),
            a(h(t, u, v), 3936430074, j(p, w), 11),
            a(h(t, u, v), 3572445317, j(p, w + 12), 16),
            a(h(t, u, v), 76029189, j(p, w + 24), 23),
            a(h(t, u, v), 3654602809, j(p, w + 36), 4),
            a(h(t, u, v), 3873151461, j(p, w + 48), 11),
            a(h(t, u, v), 530742520, j(p, w + 60), 16),
            a(h(t, u, v), 3299628645, j(p, w + 8), 23),
            // Round 4
            a(i(t, u, v), 4096336452, j(p, w), 6),
            a(i(t, u, v), 1126891415, j(p, w + 28), 10),
            a(i(t, u, v), 2878612391, j(p, w + 56), 15),
            a(i(t, u, v), 4237533241, j(p, w + 20), 21),
            a(i(t, u, v), 1700485571, j(p, w + 48), 6),
            a(i(t, u, v), 2399980690, j(p, w + 12), 10),
            a(i(t, u, v), 4293915773, j(p, w + 40), 15),
            a(i(t, u, v), 2240044497, j(p, w + 4), 21),
            a(i(t, u, v), 1873313359, j(p, w + 32), 6),
            a(i(t, u, v), 4264355552, j(p, w + 60), 10),
            a(i(t, u, v), 2734768916, j(p, w + 24), 15),
            a(i(t, u, v), 1309151649, j(p, w + 52), 21),
            a(i(t, u, v), 4149444226, j(p, w + 16), 6),
            a(i(t, u, v), 3174756917, j(p, w + 44), 10),
            a(i(t, u, v), 718787259, j(p, w + 8), 15),
            a(i(t, u, v), 3951481745, j(p, w + 36), 21),
            // Fold this block's result into the chaining values.
            m = n(m, s),
            o = n(o, t),
            q = n(q, u),
            r = n(r, v)
        }
        return l(r, q, o, m).toUpperCase()
    }
    // p: the message as a byte array; q: the name of an unsupported input type.
    var p = null
      , q = null;
    // Normalise the input to a byte array, report an unsupported type, then
    // compute the digest (the comma expression evaluates to o()'s result).
    return "string" == typeof a ? p = k(a) : a.constructor == Array ? 0 === a.length ? p = a : "string" == typeof a[0] ? p = c(a) : "number" == typeof a[0] ? p = a : q = typeof a[0] : "undefined" != typeof ArrayBuffer ? a instanceof ArrayBuffer ? p = m(new Uint8Array(a)) : a instanceof Uint8Array || a instanceof Int8Array ? p = m(a) : a instanceof Uint32Array || a instanceof Int32Array || a instanceof Uint16Array || a instanceof Int16Array || a instanceof Float32Array || a instanceof Float64Array ? p = m(new Uint8Array(a.buffer)) : q = typeof a : q = typeof a,
    q && alert("MD5 type mismatch, cannot process " + q),
    o()
}
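我感觉我已经分析不出来了,兄弟们,谁要是弄出来了给我说一说可不^_^,我感觉太太太太太复杂了!!!!!
(补充:从上面代码里的初始常量1732584193、4023233417、2562383102、271733878可以看出,faultylabs.MD5其实就是标准的MD5算法,只是最后用toUpperCase()把结果转成了大写。所以理论上signature可以用Python的hashlib.md5对o拼接后的字符串计算再转大写得到;o.join具体用什么分隔符,需要以断点调试时看到的调用为准。)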
虽然我们不能从头开始模拟,但是我们可以下载一首歌曲找到这首歌曲的hash值,然后再去模拟请求
这也是没有办法中的办法了,唉,没办法,,,这加密我也看不懂呀(但是代码还是可以爬取vip音乐)
注意:
请求到的数据包不是纯json格式(请求里带了callback参数,返回内容外面应该包了一层回调函数,也就是JSONP),不能直接转化成字典,,,唉,,,,,可以用re模块把需要的字段提取出来
然后就是最后歌曲的链接也需要处理一下,因为原链接里面包含"\"
代码:
import hashlib
import json
import re
import time

import requests


class Spider:
    """Download a Kugou track to disk, given the track's file hash.

    The hash is typed in by the user; it is then sent to the
    ``play/getdata`` endpoint, the ``play_url`` field is pulled out of the
    response text, and the audio behind that URL is saved locally.
    """

    # Seconds to wait for a response. Without a timeout, requests may block
    # indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
        }
        # NOTE: searching by song name is not implemented. The search request
        # needs a `signature` parameter that this script does not generate, so
        # the file hash has to be looked up by hand and typed in.

    def down_music(self, url, filename='1.mp3'):
        """Fetch ``url`` and write the response body to ``filename``.

        Args:
            url: Direct URL of the audio file.
            filename: Path of the output file. Defaults to ``'1.mp3'``.

        Raises:
            requests.RequestException: If the request fails or the server
                answers with an HTTP error status.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        # Fail loudly instead of saving an error page as an audio file.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)

    @staticmethod
    def _extract_play_url(html):
        """Return the unescaped ``play_url`` value found in ``html``.

        Raises:
            ValueError: If ``html`` has no ``play_url`` field or the field
                is empty.
        """
        match = re.search(r'play_url":"(.*?)"', html)
        if match is None or not match.group(1):
            raise ValueError(
                'no play_url found in the response; check the hash value'
            )
        raw = match.group(1)
        try:
            # The captured text is the body of a JSON string literal, so let
            # the JSON decoder undo escapes such as "\/" and "\uXXXX".
            return json.loads('"' + raw + '"')
        except ValueError:
            # Not a valid JSON string body: fall back to dropping backslashes.
            return raw.replace('\\', '')

    def get_music_playurl(self):
        """Ask for a file hash, resolve its play URL and download the track.

        Raises:
            ValueError: If the response contains no usable ``play_url``.
            requests.RequestException: If either HTTP request fails.
        """
        self.url2 = 'https://wwwapi.kugou.com/yy/index.php?'
        hash_result = input("请输入需下载歌曲的哈希值:").strip()
        self.data2 = [
            'r=play/getdata',
            'callback=jQuery1910026577801017397817_1596164131720',
            'hash={}'.format(hash_result),
            'album_id=',
            'dfid=0zPSYM1q2qH90xArIr1yOGtc',
            'mid=5cd316b98602d9b29f31085c96e6c682',
            'platid=4',
            '_=1596164131722',
        ]
        query = "&".join(self.data2)
        response = requests.get(
            self.url2 + query,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        url = self._extract_play_url(response.text)
        print(url)
        self.down_music(url)


if __name__ == '__main__':
    spider = Spider()
    spider.get_music_playurl()

# Example input: the file hash of the song "稻香" is
# 0A62227CAAB66F54D43EC084B4BDD81F