bookSpyder
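
A small Python crawler that mirrors the ios-sec-wiki GitBook for offline reading: it walks the book's table of contents, downloads every chapter page, and pulls in the stylesheets, scripts, images, and web font that those pages reference, preserving the site's directory layout on disk.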

import os

import requests
from bs4 import BeautifulSoup


index_url = 'https://wizardforcel.gitbooks.io/ios-sec-wiki/content/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}
# Shared stylesheet and icon font used by every page of the book
css_url = 'gitbook/style.css'
font_url = 'gitbook/fonts/fontawesome/fontawesome-webfont.woff'

# Download the static assets (stylesheets, scripts, images) referenced by a page
def get_source(soup, tag, attr):
    for link in soup.find_all(tag):
        if link.has_attr(attr):
            url = link.get(attr)
            # Keep only relative asset links; skip absolute URLs and page links
            if not url.startswith('http') and not url.endswith('.html'):
                download(url.replace('../', ''))
                
# Download a single resource, preserving its relative path on disk
def download(url):
    dirname = os.path.dirname(url)
    if not os.path.exists(url):  # skip files we already have, so re-runs resume
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        r = requests.get(index_url + url, headers=headers)
        with open(url, 'wb') as f:
            f.write(r.content)

# Collect the relative URL of every page from the book's table of contents
def get_content_url():
    res = requests.get(index_url, headers=headers).text
    soup = BeautifulSoup(res, 'html5lib')
    summary = soup.find('ul', class_='summary')
    urls = []
    # The first three <li> entries are navigation chrome, not chapters
    for li in summary.find_all('li')[3:]:
        try:
            href = li.find('a').get('href')
            urls.append(href.replace('./', ''))
        except AttributeError:
            # An <li> without a link (e.g. a section divider); skip it
            pass
    return urls

# Download one page and every asset it references
def deal_page(page_url):
    res = requests.get(index_url + page_url, headers=headers).text
    soup = BeautifulSoup(res, 'html5lib')
    print(soup.find('title').string)
    get_source(soup, 'link', 'href')            # stylesheets
    get_source(soup, ['script', 'img'], 'src')  # scripts and images
    download(page_url)                          # the page itself

# Download the whole book: shared assets first, then every page
def get_books():
    download(css_url)
    download(font_url)
    for page in get_content_url():
        deal_page(page)


if __name__ == "__main__":
    get_books()
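
Because download skips files that already exist, an interrupted crawl can simply be re-run and it picks up where it left off. The script has no retry logic, though; on a flaky network, a small wrapper along these lines could stand in for the bare requests.get calls (a minimal sketch of my own, not part of the original script; the helper name and parameters are illustrative):

import time

import requests

def get_with_retry(url, retries=3, backoff=2, **kwargs):
    # Hypothetical helper: retry transient network and HTTP errors
    # with exponential backoff before giving up.
    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=10, **kwargs)
            r.raise_for_status()
            return r
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(backoff ** attempt)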