Python爬虫-喜马拉雅_晚安妈妈睡前故事
这里先说下思路:
1、首先要获取当前书的音频信息
def get_book_msg(self):
    """Collect the play URL and title of every track in the album.

    Requests each page URL in ``self.book_url`` with ``self.headers``,
    parses the JSON body and reads the track entries stored under
    ``data -> tracksAudioPlay``.

    Returns:
        list[dict]: one ``{'src': ..., 'name': ...}`` dict per track, in
        the order the pages return them.

    Raises:
        requests.RequestException: if a page request times out or comes
            back with an HTTP error status.
    """
    all_list = []
    for url in self.book_url:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=self.headers, timeout=30)
        # Fail loudly on an HTTP error instead of feeding an error page to json.loads.
        response.raise_for_status()
        # The response body is a JSON string; turn it into a Python dict.
        payload = json.loads(response.content.decode())
        for track in payload['data']['tracksAudioPlay']:
            # Keep only the play address and the title of each track.
            info = {'src': track['src'], 'name': track['trackName']}
            print(info)
            all_list.append(info)
    return all_list  # plain list holding the info of every track
2、然后遍历保存
def save(self, all_list, save_dir='D:\\xima'):
    """Download every track in ``all_list`` into ``save_dir``.

    Each file is named ``<self.name><track title>.m4a``.

    Args:
        all_list: iterable of dicts with a ``'src'`` (download URL) and a
            ``'name'`` (track title) key.
        save_dir: target directory; created when it does not exist yet.

    Raises:
        requests.RequestException: if a download times out or comes back
            with an HTTP error status.
    """
    import os

    print("开始下载")
    os.makedirs(save_dir, exist_ok=True)
    for track in all_list:
        print(track)
        # Drop every character Windows forbids in a file name (not only the
        # double quote), working on a copy so the caller's dict is untouched.
        title = re.sub(r'[\\/:*?"<>|]', '', track['name'])
        path = os.path.join(save_dir, '{}.m4a'.format(self.name + title))
        # Download before opening the file, so a failed request cannot leave
        # an empty file behind.
        response = requests.get(track['src'], headers=self.headers, timeout=30)
        response.raise_for_status()
        # 'wb' rather than 'ab': appending would glue a second copy of the
        # audio onto the existing file every time the script is re-run.
        with open(path, 'wb') as f:
            f.write(response.content)
    print("下载完毕")
3、最后直接上代码啦!
import json
import os
import re

import requests
from lxml import etree  # not used by the code below; kept from the original listing


class Xima(object):
    """Crawler that downloads every track of one Ximalaya album.

    Attributes set by ``__init__``:
        name: prefix put in front of every saved file name.
        headers: HTTP headers sent with every request.
        start_url: page URL template with one ``{}`` placeholder for the
            page number.
        book_url: list of concrete page URLs to crawl.
    """

    # ``{album_id}`` is filled in once; ``{{}}`` survives as the ``{}``
    # placeholder for the page number.
    ALBUM_URL = (
        "https://www.ximalaya.com/revision/play/album"
        "?albumId={album_id}&pageNum={{}}&sort=-1&pageSize=30"
    )
    # Seconds to wait on a silent connection before giving up.
    REQUEST_TIMEOUT = 30
    # Characters Windows does not allow in a file name.
    ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')

    def __init__(self, name, album_id=260744, page_count=30):
        """Build the list of page URLs to crawl.

        Args:
            name: prefix for every saved file name.
            album_id: id of the album to crawl; the default is the album
                this script was written for.
            page_count: number of pages (30 tracks each) to request.
        """
        self.name = name
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
        }
        self.start_url = self.ALBUM_URL.format(album_id=album_id)
        # Page numbers in the URL start at 1.
        self.book_url = [
            self.start_url.format(page) for page in range(1, page_count + 1)
        ]
        print(self.book_url)
        print(len(self.book_url))

    def get_book_msg(self):
        """Collect the play URL and title of every track in the album.

        Requests each page URL in ``self.book_url``, parses the JSON body
        and reads the track entries stored under
        ``data -> tracksAudioPlay``.

        Returns:
            list[dict]: one ``{'src': ..., 'name': ...}`` dict per track,
            in the order the pages return them.

        Raises:
            requests.RequestException: if a page request times out or
                comes back with an HTTP error status.
        """
        all_list = []
        for url in self.book_url:
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            # Fail loudly on an HTTP error instead of feeding an error page
            # to json.loads.
            response.raise_for_status()
            # The response body is a JSON string; turn it into a dict.
            payload = json.loads(response.content.decode())
            for track in payload['data']['tracksAudioPlay']:
                # Keep only the play address and the title of each track.
                info = {'src': track['src'], 'name': track['trackName']}
                print(info)
                all_list.append(info)
        return all_list  # plain list holding the info of every track

    def save(self, all_list, save_dir='D:\\xima'):
        """Download every track in ``all_list`` into ``save_dir``.

        Each file is named ``<self.name><track title>.m4a``.

        Args:
            all_list: iterable of dicts with a ``'src'`` (download URL) and
                a ``'name'`` (track title) key, as returned by
                ``get_book_msg``.
            save_dir: target directory; created when it does not exist.

        Raises:
            requests.RequestException: if a download times out or comes
                back with an HTTP error status.
        """
        print("开始下载")
        os.makedirs(save_dir, exist_ok=True)
        for track in all_list:
            print(track)
            # Drop every character Windows forbids in a file name (not only
            # the double quote), without mutating the caller's dict.
            title = self.ILLEGAL_FILENAME_CHARS.sub('', track['name'])
            path = os.path.join(save_dir, '{}.m4a'.format(self.name + title))
            # Download before opening the file, so a failed request cannot
            # leave an empty file behind.
            response = requests.get(
                track['src'], headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            # 'wb' rather than 'ab': appending would glue a second copy of
            # the audio onto the existing file on every re-run.
            with open(path, 'wb') as f:
                f.write(response.content)
        print("下载完毕")

    def run(self):
        """Fetch the track list, then download every track."""
        all_list = self.get_book_msg()
        self.save(all_list)


if __name__ == "__main__":
    xima = Xima('晚安妈妈睡前故事')
    xima.run()
引用"虫师":生活依旧,工作依旧,学习依旧,趁着对技术还热情,继续学习与总结。