爬取抖音(Douyin)作者主页的第一页视频数据
import json
import os
import re
from urllib import parse

import requests

# Douyin "posts of one author" API URL.  Presumably sec_uid selects the author,
# count is the page size and max_cursor=0 asks for the first page -- confirm
# against the API before changing them.
url = "https://www.iesdouyin.com/web/api/v2/aweme/post/?sec_uid=MS4wLjABAAAAPiYAuTiuaI39UV-QJtyFYdT74-Y0NdNOddeS3JUaeWg&count=21&max_cursor=0&_signature=xHcvZwAApmfZvSVp7lyHfMR3L3"
headers = {
    'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
}

# Directory the videos are saved into.  The earlier version set path='/' but
# then ran the filename filter over the whole path, which removed that '/'
# again, so files really landed in the current working directory.  That
# effective behaviour is kept, and the filter now only touches the title.
path = '.'

# Seconds to wait on connect/read so a stalled server cannot hang the script.
REQUEST_TIMEOUT = 30

# Number of bytes requested per iter_content() step while streaming a video.
CHUNK_SIZE = 1000000

# Characters removed from a video title before it is used as a file name.
_ILLEGAL_CHARS = re.compile(r"[\/\\\:;\*#¥%$!@^……&()\?\"\<\>\|]")  # '/ \ : * ? " < > |'


def safe_filename(title, fallback):
    """Return *title* cleaned up for use as a file name (without extension).

    Characters matched by ``_ILLEGAL_CHARS`` are dropped and every run of
    whitespace (including newlines inside a title) is collapsed to one space.
    *fallback* is returned when nothing is left, so the caller never ends up
    with a bare ``.mp4``.
    """
    cleaned = _ILLEGAL_CHARS.sub("", title or "")
    cleaned = " ".join(cleaned.split())
    return cleaned or fallback


def fetch_aweme_list():
    """Request the author's post list and return the ``aweme_list`` entries.

    Returns an empty list when the payload has no ``aweme_list``.

    Raises:
        requests.HTTPError: the API answered with a 4xx/5xx status.
        json.JSONDecodeError: the body is not JSON; a hint is printed first.
    """
    resp = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Prints the response object, e.g. <Response [200]> on success.
    print("初始访问状态:", resp)
    resp.raise_for_status()
    try:
        data_json = json.loads(resp.text)
    except json.JSONDecodeError:
        # An empty or non-JSON body is what produces
        # "Expecting value: line 1 column 1 (char 0)".
        print("接口返回的内容不是JSON,请检查请求头(例如cookie)和URL中的签名参数")
        raise
    return data_json.get('aweme_list') or []


def download_video(video_url, file_path):
    """Stream *video_url* into *file_path*, printing progress per chunk.

    Raises:
        requests.RequestException: the request failed or returned 4xx/5xx.
        OSError: *file_path* could not be opened or written.
    """
    # The context manager closes the streamed connection even on errors.
    with requests.get(url=video_url, headers=headers, stream=True,
                      timeout=REQUEST_TIMEOUT) as resp:
        print(resp)
        # Without this check an error page would be saved as an .mp4 file.
        resp.raise_for_status()
        # The header may be missing; treat that as "size unknown" (0).
        total = int(resp.headers.get("Content-Length") or 0)
        print("视频的数据长度为:", total)
        written = 0
        with open(file_path, "wb") as fh:
            for chunk in resp.iter_content(chunk_size=CHUNK_SIZE):
                written += fh.write(chunk)
                if total:
                    print("下载进度:%02.6f%%" % (100 * written / total))
                else:
                    # No total available, so report the byte count instead of
                    # dividing by zero.
                    print("已下载字节数:", written)


def main():
    """Download every video listed in the fetched page of posts."""
    for index, aweme in enumerate(fetch_aweme_list()):
        try:
            video_url = aweme['video']['play_addr_lowbr']['url_list'][0]
        except (KeyError, IndexError, TypeError):
            # One malformed entry should not abort the remaining downloads.
            print("第%d个作品没有可用的视频地址,已跳过" % index)
            continue

        file_name = safe_filename(aweme.get('desc'), "video_%d" % index) + '.mp4'
        file_path = os.path.join(path, file_name)
        try:
            download_video(video_url, file_path)
        except (requests.RequestException, OSError) as err:
            print("下载失败,已跳过:", file_path, err)


if __name__ == "__main__":
    main()
运行时出现报错,JSON 解析失败:
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
原因可能出在请求头(header)的参数上:格式不对或缺少必要的字段,导致接口没有返回 JSON 数据。这里在请求头中添加 cookie 之后就可以正常解析了:
import json
import os
import re
from urllib import parse

import requests

# Douyin "posts of one author" API URL.  Presumably sec_uid selects the author,
# count is the page size and max_cursor=0 asks for the first page -- confirm
# against the API before changing them.
# Previously used URL, kept for reference:
# "https://www.iesdouyin.com/web/api/v2/aweme/post/?sec_uid=MS4wLjABAAAAPiYAuTiuaI39UV-QJtyFYdT74-Y0NdNOddeS3JUaeWg&count=21&max_cursor=0&_signature=xHcvZwAApmfZvSVp7lyHfMR3L3"
url = "https://www.iesdouyin.com/web/api/v2/aweme/post/?reflow_source=reflow_page&sec_uid=MS4wLjABAAAA7gdVQLJr7WPYgx3YO90RZw9SU9DoJCgFVbvAwbd6kkQ&count=21&max_cursor=0&msToken=6u7tnUcZLCxkZ3I_bvz2Nm9EYGn6yTJq5f7YyvRVdFgCvoYt_6tvpDxBWiXczXekd_AYDc-xUFWffe1M7yUtIqRWgdkOlvXuRRT4KxFItTZS6Dr5u6Y8pyFI9sVujg==&X-Bogus=DFSzKwVOUKhANCFitc0FcF9WX7jS&_signature=_02B4Z6wo00001vG.7sAAAIDCcb0UgkgR9N7xr-pAANiFol76tIVSE6ObmhzdoBocqDTHelV68upzkUi6jjJr6FNy4kCFmZQ8Bxln96efDYyxEUR-oxiTX1Jnui7jAW3KWjMOFVJ7sszUVig69a"

# NOTE(review): the cookie (and the msToken / X-Bogus / _signature values in
# the URL) are hard-coded values copied from a browser session.  They are
# presumably session-bound; if the API starts returning a non-JSON body again,
# refresh them, and avoid committing real session cookies to a shared repo.
headers = {
    'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36",
    'cookie': "ttwid=1%7C8SJ6I0piwi6aLXcYCfO44e_ICVpTT7wImJ5bi31yH0w%7C1679115782%7C8138e7a0e3a9cc329791eaaa731418451697249b392e58a28cf987f27e4284e8; __ac_nonce=06417ab78003ab1d14e43; __ac_signature=_02B4Z6wo00f01TncvSwAAIDBud5HbZ28K5057LmAACpj8a; _tea_utm_cache_1243={%22utm_source%22:%22copy%22%2C%22utm_medium%22:%22android%22%2C%22utm_campaign%22:%22client_share%22}; s_v_web_id=verify_lfg3om5y_TIAGl8SV_p9PF_4kOf_9ThY_Pp4K2j7RHQo1; _tea_utm_cache_2018={%22utm_source%22:%22copy%22%2C%22utm_medium%22:%22android%22%2C%22utm_campaign%22:%22client_share%22}; msToken=6u7tnUcZLCxkZ3I_bvz2Nm9EYGn6yTJq5f7YyvRVdFgCvoYt_6tvpDxBWiXczXekd_AYDc-xUFWffe1M7yUtIqRWgdkOlvXuRRT4KxFItTZS6Dr5u6Y8pyFI9sVujg==; msToken=DQwwyXA5lreBS2fXFA-e8qh0B0oDVvZItWU2kl-qGbfa6a910HP1J_SuCTaH6alQPLRXtU-te0ZvCbCRgifaB5xZKP0W80VYgLew-UyrOzZZ_zDvnsVxt-Dri2NZNw==; ttcid=417a2e9adcf04735b4567e5d71b7d2a291; tt_scid=7YmVP3cRSd4tStECPikphUZGHF0bQ9TmfbfJJUBrILLhomHyOD3UXugYIFhia6pUdf37",
}

# Directory the videos are saved into.  The earlier version set path='/' but
# then ran the filename filter over the whole path, which removed that '/'
# again, so files really landed in the current working directory.  That
# effective behaviour is kept, and the filter now only touches the title.
path = '.'

# Seconds to wait on connect/read so a stalled server cannot hang the script.
REQUEST_TIMEOUT = 30

# Number of bytes requested per iter_content() step while streaming a video.
CHUNK_SIZE = 1000000

# Characters removed from a video title before it is used as a file name.
_ILLEGAL_CHARS = re.compile(r"[\/\\\:;\*#¥%$!@^……&()\?\"\<\>\|]")  # '/ \ : * ? " < > |'


def safe_filename(title, fallback):
    """Return *title* cleaned up for use as a file name (without extension).

    Characters matched by ``_ILLEGAL_CHARS`` are dropped and every run of
    whitespace (including newlines inside a title) is collapsed to one space.
    *fallback* is returned when nothing is left, so the caller never ends up
    with a bare ``.mp4``.
    """
    cleaned = _ILLEGAL_CHARS.sub("", title or "")
    cleaned = " ".join(cleaned.split())
    return cleaned or fallback


def fetch_aweme_list():
    """Request the author's post list and return the ``aweme_list`` entries.

    Returns an empty list when the payload has no ``aweme_list``.

    Raises:
        requests.HTTPError: the API answered with a 4xx/5xx status.
        json.JSONDecodeError: the body is not JSON; a hint is printed first.
    """
    resp = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Prints the response object, e.g. <Response [200]> on success.
    print("初始访问状态:", resp)
    resp.raise_for_status()
    try:
        data_json = json.loads(resp.text)
    except json.JSONDecodeError:
        # An empty or non-JSON body is what produces
        # "Expecting value: line 1 column 1 (char 0)".
        print("接口返回的内容不是JSON,请检查请求头(例如cookie)和URL中的签名参数")
        raise
    return data_json.get('aweme_list') or []


def download_video(video_url, file_path):
    """Stream *video_url* into *file_path*, printing progress per chunk.

    Raises:
        requests.RequestException: the request failed or returned 4xx/5xx.
        OSError: *file_path* could not be opened or written.
    """
    # The context manager closes the streamed connection even on errors.
    with requests.get(url=video_url, headers=headers, stream=True,
                      timeout=REQUEST_TIMEOUT) as resp:
        print(resp)
        # Without this check an error page would be saved as an .mp4 file.
        resp.raise_for_status()
        # The header may be missing; treat that as "size unknown" (0).
        total = int(resp.headers.get("Content-Length") or 0)
        print("视频的数据长度为:", total)
        written = 0
        with open(file_path, "wb") as fh:
            for chunk in resp.iter_content(chunk_size=CHUNK_SIZE):
                written += fh.write(chunk)
                if total:
                    print("下载进度:%02.6f%%" % (100 * written / total))
                else:
                    # No total available, so report the byte count instead of
                    # dividing by zero.
                    print("已下载字节数:", written)


def main():
    """Download every video listed in the fetched page of posts."""
    for index, aweme in enumerate(fetch_aweme_list()):
        try:
            video_url = aweme['video']['play_addr_lowbr']['url_list'][0]
        except (KeyError, IndexError, TypeError):
            # One malformed entry should not abort the remaining downloads.
            print("第%d个作品没有可用的视频地址,已跳过" % index)
            continue

        file_name = safe_filename(aweme.get('desc'), "video_%d" % index) + '.mp4'
        file_path = os.path.join(path, file_name)
        try:
            download_video(video_url, file_path)
        except (requests.RequestException, OSError) as err:
            print("下载失败,已跳过:", file_path, err)


if __name__ == "__main__":
    main()