Python 爬取原力文档日语学习资料
参考
https://blog.csdn.net/weixin_46184311/article/details/115291441
代码
import requests, json, re, time, urllib.request
import time
import wget
def getParameter(url):  # fetch the document parameters
    """Download a document page and extract its preview parameters.

    The page is fetched with ``requests`` using the module-level ``headers``
    dict, and three values are pulled out of the returned text with regular
    expressions.

    Args:
        url: Address of the document page to read.

    Returns:
        A tuple ``(actual_page, aid, view_token)``: ``actual_page`` is the
        page count converted to ``int``; ``aid`` and ``view_token`` are the
        matched strings.

    Raises:
        ValueError: If one of the three fields is not present in the page.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    text_response = requests.get(url=url, headers=headers, timeout=30).text

    # Raw strings so that ``\d`` reaches the regex engine instead of being
    # treated as an (invalid) string escape. The trailing ``//...`` parts
    # match comments embedded in the page text.
    patterns = {
        'actual_page': r'actual_page: (\d+), //真实页数',
        'aid': r'aid: (\d+), //解密后的id',
        'view_token': r"view_token: '(.*?)'",
    }
    found = {}
    for field, pattern in patterns.items():
        match = re.search(pattern, text_response)
        if match is None:
            # Fail with a message naming the missing field rather than an
            # AttributeError on ``None.group``.
            raise ValueError(f'could not find {field!r} in page: {url}')
        found[field] = match.group(1)

    actual_page = int(found['actual_page'])  # page count
    aid = found['aid']
    view_token = found['view_token']
    print('actual_page:', actual_page, '\naid:', aid, '\nview_token:', view_token)
    return actual_page, aid, view_token
def requests_data(parameter, page):  # request preview data
    """Request the preview data for one page of a document.

    Sends a GET request to the ``getPreview.html`` endpoint using the
    module-level ``headers`` dict, strips the ``jsonpReturn(...)`` wrapper
    from the response and decodes the JSON inside it.

    Args:
        parameter: Sequence as returned by ``getParameter``; index 1 is used
            as ``aid`` and index 2 as ``view_token``.
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The value stored under the ``'data'`` key of the decoded JSON.

    Raises:
        ValueError: If the response does not contain a ``jsonpReturn(...);``
            wrapper.
    """
    url = 'https://openapi.book118.com/getPreview.html'
    params = {
        'project_id': '1',
        'aid': parameter[1],
        'view_token': parameter[2],
        'page': page,
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        url=url, headers=headers, params=params, timeout=30
    ).text

    # Raw string so the escaped parentheses reach the regex engine untouched.
    match = re.search(r'jsonpReturn\((.*?)\);', response)
    if match is None:
        raise ValueError(f'unexpected preview response for page {page}')
    data = json.loads(match.group(1))['data']
    return data
if __name__ == '__main__':
    # Script entry point: read the document parameters, then request the
    # preview data in steps and download every image URL that comes back.
    import os  # local import: only the script entry point needs it

    # Module-level on purpose: getParameter and requests_data read ``headers``
    # as a global.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    text_url = 'https://max.book118.com/html/2023/0208/6230222112005044.shtm'
    parameter = getParameter(text_url)
    print(parameter)

    # The download target below is inside ``imgs/``; make sure it exists.
    os.makedirs('imgs', exist_ok=True)

    # NOTE(review): the step of 6 presumably matches the number of pages the
    # preview endpoint returns per request - confirm against the API.
    for page in range(1, parameter[0] + 1, 6):
        print(page)
        result = requests_data(parameter, page)
        print(result)
        for page_id, url in result.items():
            if not url:
                # Without a URL the download target would be the bare string
                # 'https:', so report the entry and move on instead.
                print(f'skipping: {page_id}, no url returned')
                continue
            url = 'https:' + url
            print(f'downloading: {page_id}, {url} ')
            wget.download(url=url, out=f'imgs/{page_id}.png')
            # Pause between downloads to avoid hammering the server.
            time.sleep(1)