喜马拉雅音频获取

'''
 18b608469d0f3590019d6debfa2537db (himalaya-1622708589034)
 (56)
 1622704460086 服务器时间
 (60)
 1622704430067 系统当前时间

'''
import hashlib
import os
import random  # random numbers
import time

import requests
from lxml import etree



# 解码js反推解密请求头
def Sign(headers):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Host': 'www.ximalaya.com',
        'Cookie': '_xmLog=h5&cd0c8f27-1907-40a9-b88d-6c2bbbd0d3da&2.2.17; 1&remember_me=y; 1&_token=326676124&668C8190240NB324166601BB6E71B86E87886115A93F7E42C87D2AB6C5A627548C8E449A44B42MAA74BF85A07816A_; x_xmly_traffic=utm_source%253A%2526utm_medium%253A%2526utm_campaign%253A%2526utm_content%253A%2526utm_term%253A%2526utm_from%253A; Hm_lvt_4a7d8ec50cfd6af753c4f8aee3425070=1622618068,1622621264,1622683454,1622704399; Hm_lpvt_4a7d8ec50cfd6af753c4f8aee3425070=1622704399',
    }
    # 服务器时间
    url = 'https://www.ximalaya.com/revision/time'
    response = requests.get(url,headers=headers).text
    # 生成当前系统时间戳
    nowTime = str(round(time.time() * 1000))

    # .encode() 编码
    # .hexdigest() 解码
    # .format() 替换
    # round(random.random() * 100) 保留100以内的整数
    sign = hashlib.md5('himalaya-{}'.format(response).encode()).hexdigest() + "({})".format(str(round(random.random() * 100))) + response + "({})".format(str(round(random.random() * 100))) +nowTime
    headers['xm-sign'] = str(sign)
    return headers

# 爬取源代码请求头
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
}

# get请求
def get_html(url):
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    return response

# 页面解析
def parse_html(html):
    e = etree.HTML(html)
    return e

# xpath 解析页面
def main(url):
    html = get_html(url)
    e = parse_html(html.text)

    titles = e.xpath('//*[@id="anchor_sound_list"]/div[2]/ul/li/div[2]/a/@title') # 标题
    href = e.xpath('//div[@class="text lF_"]/a/@href') # 链接
    href =iter(href) # 迭代器
    for title,ds in zip(titles,href):
        id =ds.split('/')[3]
        get_m4a(title,id)

# 接收id值 构建请求头
def get_m4a(title,id):
    url = "https://www.ximalaya.com/revision/play/v1/audio?id=%s&ptype=1" % id
    headers = Sign(headers=url)
    response = requests.get(url, headers=headers).json()
    down_load(response["data"]["src"], title)

# 保存文件
def down_load(url, title="默认文档"):
    res = requests.get(url)
    f = open("./喜马拉雅/%s.m4a" % title, "wb")
    f.write(res.content)
    f.close()
    print("%s下载成功!" % title)


if __name__ == '__main__':
    pag = int(input('你要爬取多少页:'))
    for i in range(1, pag + 1):
        print("https://www.ximalaya.com/youshengshu/22963309/p%s/" % i)
        main("https://www.ximalaya.com/youshengshu/22963309/p%s/" % i)

js反推加密请求,参考:https://pan.baidu.com/s/1Fp9rp8E5cKXXK57kLFfVsw  提取码:b4qx

posted @ 2021-06-07 09:19  .笨蛋小韩  阅读(298)  评论(0)  编辑  收藏  举报