bookSpyder
备注
Highlights information that users should take into account, even when skimming.
提示
Optional information to help a user be more successful.
重要
Crucial information necessary for users to succeed.
警告
Critical content demanding immediate user attention due to potential risks.
注意
Negative potential consequences of an action.
import re
import time
import requests
from bs4 import BeautifulSoup
import os
# Root URL of the GitBook being mirrored; every relative link is resolved
# against it by plain string concatenation.
index_url = 'https://wizardforcel.gitbooks.io/ios-sec-wiki/content/'

# Static assets that get_books() fetches explicitly, before any page.
css_url = 'gitbook/style.css'
font_url = 'gitbook/fonts/fontawesome/fontawesome-webfont.woff'

# Request headers carrying a desktop-browser User-Agent string.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
}
# Download the resources referenced by a page
def get_source(soup, tag, attr):
    """Download every site-relative resource referenced by matching tags.

    Args:
        soup: Parsed document; it is queried with ``soup.find_all(tag)``.
        tag: Tag name, or list of tag names, to scan.
        attr: Name of the attribute that holds the resource link
            (``href`` and ``src`` at the call sites in this file).

    Links containing ``http`` or ``.html`` are skipped. Every other link
    has its ``../`` segments removed and is passed to ``download()``.
    """
    for link in soup.find_all(tag):
        if not link.has_attr(attr):
            continue
        url = link.get(attr)
        # An empty value, an inline ``data:`` URI or a protocol-relative
        # ``//host/...`` URL does not name a file under the book root;
        # passing one on would produce a bogus request and local path.
        if not url or url.startswith(('data:', '//')):
            continue
        if "http" in url or ".html" in url:
            continue
        # Files are stored relative to the book root, so strip any
        # parent-directory hops before downloading.
        download(url.replace('../', ''))
# Download a single resource given its site-relative link
def download(url):
    """Fetch ``index_url + url`` and store the body at the local path ``url``.

    Args:
        url: Link relative to ``index_url``; it is also used verbatim as
            the local file path, so the site's directory layout is
            reproduced under the current working directory.

    A file that already exists locally is left untouched. A response with
    an error status is reported and not written, so that a later run can
    retry it instead of finding an error page saved under that name.
    """
    if os.path.exists(url):
        return

    filepath = os.path.dirname(url)
    # A link without a directory part has an empty dirname, and
    # os.makedirs('') raises FileNotFoundError.
    if filepath:
        os.makedirs(filepath, exist_ok=True)

    # Send the same headers as the other requests in this file, and bound
    # the wait so that one stalled connection cannot hang the whole crawl.
    response = requests.get(index_url + url, headers=headers, timeout=30)
    if not response.ok:
        print('skip %s: HTTP %s' % (url, response.status_code))
        return

    with open(url, 'wb') as f:
        f.write(response.content)
# Collect the page links from the table of contents
def get_content_url():
    """Return the page links listed in the book's summary list.

    The index page is fetched and parsed, and the first ``<ul>`` with
    class ``summary`` is taken as the table of contents. Each entry's
    ``href`` is returned with every ``./`` occurrence removed.

    Returns:
        list of str: page links relative to ``index_url``, in document
        order.

    Raises:
        IndexError: if the index page has no ``<ul class="summary">``.
    """
    res = requests.get(index_url, headers=headers, timeout=30).text
    soup = BeautifulSoup(res, 'html5lib')
    summary = soup.find_all('ul', class_="summary")[0]

    pages = []
    # The first three <li> entries are skipped -- presumably they are not
    # chapter pages; TODO confirm against the live sidebar.
    for item in summary.find_all('li')[3:]:
        anchor = item.find('a')
        # Entries without a link, or with a missing or empty href, are
        # skipped explicitly rather than through a blanket except/pass.
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        pages.append(href.replace('./', ''))
    return pages
# Process a single page
def deal_page(page_url):
    """Mirror one page together with the resources it references.

    Args:
        page_url: Page link relative to ``index_url``.

    The page is fetched and parsed, its title is printed as progress
    output, the files referenced by ``<link href>`` and by
    ``<script src>`` / ``<img src>`` are downloaded, and finally the page
    itself is saved through ``download()``.
    """
    # The body is read in full straight away, so streaming gains nothing.
    res = requests.get(index_url + page_url, headers=headers, timeout=30).text
    soup = BeautifulSoup(res, 'html5lib')

    title = soup.find('title')
    # A page without <title> must not abort the crawl; print its link
    # instead.
    print(title.string if title is not None else page_url)

    get_source(soup, 'link', 'href')
    get_source(soup, ['script', 'img'], 'src')
    download(page_url)
# Download the whole book
def get_books():
    """Mirror the book: the shared stylesheet and font, then every page."""
    # Fixed assets first, in the same order as before: CSS, then font.
    for static_asset in (css_url, font_url):
        download(static_asset)
    # Then each page listed in the table of contents, in order.
    for page in get_content_url():
        deal_page(page)
# Start the crawl only when the file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    get_books()
- 本文链接: https://www.cnblogs.com/gshang/articles/15853910.html
- 版权声明: 本博客所有文章除特别声明外,均采用 CC-BY-NC-SA 4.0 许可协议。转载请注明出处!