import requests
import os

headers = {
    'Cookie': '_ga=GA1.2.701818100.1612092981; _gid=GA1.2.748589379.1612092981; Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1612092982; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1612094717; kw_token=ZALW965FXG',
    'csrf': 'ZALW965FXG',
    'Host': 'www.kuwo.cn',
    'Referer': 'https://www.kuwo.cn/singer_detail/1600',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
}

# Create the output directory if it does not exist yet
if not os.path.exists('mics'):
    os.mkdir('mics')

def Index(page):
    # The reqId parameter comes from the captured request and changes between sessions
    # url = 'https://www.kuwo.cn/api/www/artist/artistMusic?artistid=1600&pn=' + str(page) + '&rn=30&httpsStatus=1&reqId=9d0df070-63bc-11eb-8632-19dcd503126a'
    url = 'https://www.kuwo.cn/api/www/artist/artistMusic?artistid=1600&pn=' + str(page) + '&rn=30&httpsStatus=1&reqId=50b03180-63ca-11eb-b714-332080487537'
    response = requests.get(url=url, headers=headers).json()
    musicList = response['data']['list']
    print(musicList)
    for music in musicList:
        rid = music['rid']
        name = music['name']
        musicSave(rid, name)

def musicSave(rid, name):
    # url = 'https://www.kuwo.cn/url?format=mp3&rid=' + str(rid) + '&response=url&type=convert_url3&br=128kmp3&from=web&t=1612094725726&httpsStatus=1&reqId=9a3777e1-63bc-11eb-8632-19dcd503126a'
    url = 'https://www.kuwo.cn/url?format=mp3&rid=' + str(rid) + '&response=url&type=convert_url3&br=128kmp3&from=web&t=1612100615341&httpsStatus=1&reqId=50b38ce1-63ca-11eb-b714-332080487537'
    response = requests.get(url=url, headers=headers).json()
    mp3path = response['url']
    print(mp3path)
    # No headers on this GET -- see the error note below
    data = requests.get(url=mp3path).content
    # File storage: 'a' = append, 'b' = binary (the audio file is byte data)
    with open('mics/{}.mp3'.format(name), 'ab') as f:
        f.write(data)
    print('{}.mp3 downloaded'.format(name))

for page in range(1, 11):
    Index(page)
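One caveat with the script above: song names can contain characters that are illegal in file names (/, ?, :, " and so on), and open() will then throw. A minimal sketch of a sanitizer -- the sanitize_name helper is my own addition, not part of the original code:

import re

def sanitize_name(name):
    # Replace characters Windows forbids in file names with an underscore
    return re.sub(r'[\\/:*?"<>|]', '_', name)

# Usage inside musicSave:
# with open('mics/{}.mp3'.format(sanitize_name(name)), 'ab') as f: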
1. Both the song-list API address and the mp3 playback address have to be captured (e.g. from the browser's network panel).
2. The 'Cookie' and 'csrf' values need to be updated after the page refreshes. For large-scale crawling you can use proxy IPs and forge the User-Agent (a small sketch follows below), or reverse-engineer the JS; I'll cover that in a later update.
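As a minimal sketch of the User-Agent forging mentioned in note 2 (the pool below is illustrative -- fill it with real, current UA strings), you can pick a random one per request:

import random
import requests

# Illustrative pool of UA strings -- replace with real, current ones
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
]

def get_with_random_ua(url):
    # Each request goes out with a randomly chosen User-Agent
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers, timeout=5)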
An error occurred at data = requests.get(url=mp3path).content (the line I had marked with five red stars). The version that failed was the one that sent the scraping headers along:

data = requests.get(mp3path, headers=headers).content

After taking the headers off, the GET worked. The likely reason: the headers dict hard-codes 'Host': 'www.kuwo.cn', but mp3path points at a different (CDN) host, so the mismatched Host header breaks that request.
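If you still want to look like a browser when fetching the mp3 itself, a sketch (my own variant, not the original code) is to send only the User-Agent, so the hard-coded Host and Referer never reach the CDN:

# Sketch: fetch the mp3 with a minimal header set instead of the full kuwo headers
mp3_headers = {'User-Agent': headers['User-Agent']}
data = requests.get(mp3path, headers=mp3_headers, timeout=10).content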
1. A demo of crawling through free proxy IPs
import urllib.request

def create_proxy_handler():
    url = "https://www.baidu.com"
    # Free proxy IPs to rotate through
    proxy_list = [
        {"http": "60.168.207.219:9999"},
        {"http": "58.23.67.208:9999"},
        {"http": "42.7.28.217:9999"},
        {"http": "61.145.49.177:9999"},
        {"http": "36.250.156.78:9999"},
        {"http": "36.248.133.145:9999"},
        {"http": "42.56.238.117:9999"},
        {"http": "36.249.119.34:9999"},
        {"http": "58.22.177.60:9999"}
    ]
    for proxy in proxy_list:
        print(proxy)
        # Build a proxy handler for each IP we iterate over
        proxy_handler = urllib.request.ProxyHandler(proxy)
        # Build our own opener from the handler
        opener = urllib.request.build_opener(proxy_handler)
        try:
            # Send the request through this proxy
            data = opener.open(url, timeout=1).read()
            print("success")
        except Exception as e:
            print(e)

create_proxy_handler()
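The same try-each-proxy idea with the requests library (a sketch; note that requests wants full proxy URLs such as {"http": "http://60.168.207.219:9999"}, and the free IPs above are long dead by now):

import requests

def fetch_via_proxies(url, proxy_list):
    # Try each proxy until one answers; the short timeout skips dead ones quickly
    for proxy in proxy_list:
        try:
            return requests.get(url, proxies=proxy, timeout=1).text
        except requests.RequestException as e:
            print(proxy, e)
    return None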
Logging in automatically by carrying cookies
import urllib.request
from http import cookiejar
from urllib import parse

"""
Goal: fetch the member-center page directly.
1. Log in from code
   1.1 the login URL
   1.2 the login parameters
   1.3 send the login request
2. Visit the page carrying the cookies
"""
login_url = 'https://www.yaozh.com/login'
login_form_data = {
    "username": "xiaomaoera12",
    "pwd": "lina081012",
    "formhash": "89B42EA5FF",
    "backurl": "https%3A%2F%2Fjob.yaozh.com%2FtopicComp%2F14"
}
# 1.3 Send the login request as a POST
cook_jar = cookiejar.CookieJar()
# A handler that stores the response cookies into the jar
cook_handler = urllib.request.HTTPCookieProcessor(cook_jar)
# Build our own opener from the handler
opener = urllib.request.build_opener(cook_handler)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
}
login_str = parse.urlencode(login_form_data).encode("utf-8")
login_request = urllib.request.Request(login_url, headers=headers, data=login_str)
opener.open(login_request)

# 2. The opener now carries the login cookies automatically
center_url = "https://www.yaozh.com/member/"
center_request = urllib.request.Request(center_url, headers=headers)
# Open the Request object (not the bare URL) so the User-Agent header is sent too
response = opener.open(center_request)
data = response.read()
print(data)

with open('02cook.html', 'wb') as f:
    f.write(data)
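For comparison, the same login-then-visit flow with requests.Session, which persists cookies across requests on its own (a sketch; formhash is captured from the login page and is per-session, so it may need re-capturing):

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
form = {
    "username": "xiaomaoera12",
    "pwd": "lina081012",
    "formhash": "89B42EA5FF",
    "backurl": "https%3A%2F%2Fjob.yaozh.com%2FtopicComp%2F14",
}

session = requests.Session()  # the Session object keeps cookies between requests
session.post('https://www.yaozh.com/login', data=form, headers=headers)

# The session now carries the login cookies automatically
resp = session.get('https://www.yaozh.com/member/', headers=headers)
with open('02cook_requests.html', 'wb') as f:
    f.write(resp.content)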