Python 爬取原力文档日语学习资料
参考
https://blog.csdn.net/weixin_46184311/article/details/115291441
代码
import requests, json, re, time, urllib.request
import time
import wget
def getParameter(url, timeout=10):
    """Fetch a document page and extract the parameters used to request previews.

    Relies on the module-level ``headers`` dict being defined before the call.

    Args:
        url: Address of the document page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A tuple ``(actual_page, aid, view_token)``: the page count as an int,
        followed by the document id and the view token as strings.

    Raises:
        ValueError: If any of the three fields cannot be found in the page.
    """
    text_response = requests.get(url=url, headers=headers, timeout=timeout).text
    # The values are matched inside the page text; the Chinese trailing
    # comments are part of the text being matched and must stay verbatim.
    patterns = {
        'actual_page': r'actual_page: (\d+), //真实页数',
        'aid': r'aid: (\d+), //解密后的id',
        'view_token': r"view_token: '(.*?)'",
    }
    found = {}
    for name, pattern in patterns.items():
        match = re.search(pattern, text_response)
        if match is None:
            # Fail with a readable message instead of AttributeError on None.
            raise ValueError(f'{name} not found in page: {url}')
        found[name] = match.group(1)
    actual_page = int(found['actual_page'])
    aid = found['aid']
    view_token = found['view_token']
    print('actual_page:', actual_page, '\naid:', aid, '\nview_token:', view_token)
    return actual_page, aid, view_token
def requests_data(parameter, page, timeout=10):
    """Request preview data for one page offset from the getPreview endpoint.

    Relies on the module-level ``headers`` dict being defined before the call.

    Args:
        parameter: Sequence whose items 1 and 2 are the document ``aid`` and
            ``view_token`` (item 0 is not used here).
        page: Page number sent as the ``page`` query parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The value stored under the ``'data'`` key of the decoded JSON payload.

    Raises:
        ValueError: If the response is not wrapped as ``jsonpReturn(...);``.
        KeyError: If the decoded payload has no ``'data'`` key.
    """
    url = 'https://openapi.book118.com/getPreview.html'
    params = {
        'project_id': '1',
        'aid': parameter[1],
        'view_token': parameter[2],
        'page': page,
    }
    response = requests.get(
        url=url, headers=headers, params=params, timeout=timeout
    ).text
    # The endpoint answers with JSONP; strip the callback wrapper to get JSON.
    match = re.search(r'jsonpReturn\((.*?)\);', response)
    if match is None:
        raise ValueError(
            f'unexpected getPreview response for page {page}: {response[:200]!r}'
        )
    return json.loads(match.group(1))['data']
if __name__ == '__main__':
    import os

    # Must stay a module-level name: getParameter and requests_data read it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    text_url = 'https://max.book118.com/html/2023/0208/6230222112005044.shtm'

    # The download target directory has to exist before files are written to it.
    os.makedirs('imgs', exist_ok=True)

    parameter = getParameter(text_url)
    print(parameter)
    # Step through the document six pages at a time.
    # NOTE(review): presumably each getPreview call returns a batch of up to
    # six pages - confirm against the API response.
    for page in range(1, parameter[0] + 1, 6):
        print(page)
        result = requests_data(parameter, page)
        print(result)
        for page_id, url in result.items():
            if not url:
                # An empty value would become the bare URL 'https:', which
                # cannot be downloaded, so skip it instead of crashing.
                print(f'skipping: {page_id}, empty url')
                continue
            url = 'https:' + url
            print(f'downloading: {page_id}, {url} ')
            wget.download(url=url, out=f'imgs/{page_id}.png')
            # Pause between downloads to avoid hammering the server.
            time.sleep(1)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 【自荐】一款简洁、开源的在线白板工具 Drawnix
· 园子的第一款AI主题卫衣上架——"HELLO! HOW CAN I ASSIST YOU TODAY"
· Docker 太简单,K8s 太复杂?w7panel 让容器管理更轻松!