喜马拉雅音频获取
"""Download the audio tracks listed on Ximalaya album pages.

Sample ``xm-sign`` value, split into its parts (kept from the original notes):

    18b608469d0f3590019d6debfa2537db   (himalaya-1622708589034)
    (56)
    1622704460086   server time
    (60)
    1622704430067   current system time
"""
import requests
from lxml import etree
import hashlib
import os
import random  # random numbers
import re
import time

# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would block the script indefinitely.
REQUEST_TIMEOUT = 10

# Directory the .m4a files are written to (created on demand).
DOWNLOAD_DIR = "./喜马拉雅"

# Characters that cannot appear in a file name on common file systems.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


# Request headers reverse-engineered from the site's JavaScript.
def Sign(headers=None):
    """Build the request headers, including ``xm-sign``, for the play API.

    The sign is assembled as
    ``md5("himalaya-<server time>") + "(<rand>)" + <server time> +
    "(<rand>)" + <local time in ms>``.

    Args:
        headers: Ignored. Kept only so existing callers that pass an
            argument keep working; the header set is always built from
            scratch.

    Returns:
        A new dict of request headers with the ``xm-sign`` entry filled in.
    """
    # NOTE(review): this is a hard-coded session cookie/token; it should be
    # moved out of the source (environment variable or config file).
    signed_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Host': 'www.ximalaya.com',
        'Cookie': '_xmLog=h5&cd0c8f27-1907-40a9-b88d-6c2bbbd0d3da&2.2.17; 1&remember_me=y; 1&_token=326676124&668C8190240NB324166601BB6E71B86E87886115A93F7E42C87D2AB6C5A627548C8E449A44B42MAA74BF85A07816A_; x_xmly_traffic=utm_source%253A%2526utm_medium%253A%2526utm_campaign%253A%2526utm_content%253A%2526utm_term%253A%2526utm_from%253A; Hm_lvt_4a7d8ec50cfd6af753c4f8aee3425070=1622618068,1622621264,1622683454,1622704399; Hm_lpvt_4a7d8ec50cfd6af753c4f8aee3425070=1622704399',
    }
    # Server time, returned by the site as plain text.
    url = 'https://www.ximalaya.com/revision/time'
    server_time = requests.get(
        url, headers=signed_headers, timeout=REQUEST_TIMEOUT
    ).text
    # Current local timestamp in milliseconds.
    now_time = str(round(time.time() * 1000))
    # .encode() turns the text into bytes for md5; .hexdigest() returns the
    # digest as a hex string; round(random.random() * 100) yields an integer
    # between 0 and 100.
    digest = hashlib.md5('himalaya-{}'.format(server_time).encode()).hexdigest()
    sign = (
        digest
        + "({})".format(str(round(random.random() * 100)))
        + server_time
        + "({})".format(str(round(random.random() * 100)))
        + now_time
    )
    signed_headers['xm-sign'] = str(sign)
    return signed_headers


# Request headers used when fetching the album page source.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
}


def get_html(url):
    """Fetch ``url`` with a GET request and return the response.

    The response encoding is set to the encoding detected from the body so
    that ``response.text`` decodes correctly.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``requests`` response object.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = response.apparent_encoding
    return response


def parse_html(html):
    """Parse HTML text into an lxml element tree.

    Args:
        html: HTML source text.

    Returns:
        The root element produced by ``etree.HTML``.
    """
    return etree.HTML(html)


def main(url):
    """Download every track listed on one album page.

    Titles and track links are extracted with XPath, paired up in document
    order, and each pair is handed to ``get_m4a``.

    Args:
        url: Address of the album page.
    """
    page = get_html(url)
    tree = parse_html(page.text)
    # Track titles.
    titles = tree.xpath('//*[@id="anchor_sound_list"]/div[2]/ul/li/div[2]/a/@title')
    # Track links.
    hrefs = tree.xpath('//div[@class="text lF_"]/a/@href')
    for title, link in zip(titles, hrefs):
        # The track id is the fourth "/"-separated piece of the link.
        track_id = link.split('/')[3]
        get_m4a(title, track_id)


def get_m4a(title, id):
    """Look up the audio address of one track and download it.

    Args:
        title: Track title, used for the output file name.
        id: Track id inserted into the play API URL.
    """
    url = "https://www.ximalaya.com/revision/play/v1/audio?id=%s&ptype=1" % id
    signed_headers = Sign()
    payload = requests.get(
        url, headers=signed_headers, timeout=REQUEST_TIMEOUT
    ).json()
    # The API may answer without an audio address; skip such tracks instead
    # of failing inside requests with a missing-URL error.
    src = (payload.get("data") or {}).get("src")
    if not src:
        print("%s没有可下载的音频地址,已跳过" % title)
        return
    down_load(src, title)


def down_load(url, title="默认文档"):
    """Download ``url`` and save it as ``<title>.m4a`` in the download folder.

    Args:
        url: Address of the audio file.
        title: Track title used as the file name. Characters that are not
            allowed in file names are replaced with ``_``.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved as an audio file.
    """
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    # The target folder is not guaranteed to exist on a fresh checkout.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    safe_title = _UNSAFE_FILENAME_CHARS.sub("_", title)
    # The context manager closes the file even if the write fails.
    with open("%s/%s.m4a" % (DOWNLOAD_DIR, safe_title), "wb") as f:
        f.write(res.content)
    print("%s下载成功!" % title)


if __name__ == '__main__':
    pag = int(input('你要爬取多少页:'))
    for i in range(1, pag + 1):
        page_url = "https://www.ximalaya.com/youshengshu/22963309/p%s/" % i
        print(page_url)
        main(page_url)
JS 逆向加密请求的参考资料:https://pan.baidu.com/s/1Fp9rp8E5cKXXK57kLFfVsw (提取码:b4qx)