Download All Your cnblogs (博客园) Posts and Save Them as Markdown

A simple script (log in to cnblogs in your browser first, then copy your cookies into the dict below):

import requests
from bs4 import BeautifulSoup
import re
import html2text
import os

session = requests.session()

cookies = {
    # replace with your own cnblogs cookies (copied from the browser after logging in)
}
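# An illustrative sketch (not part of the original script) for filling in the
# dict above: copy the raw "Cookie:" header value from your browser's dev tools
# while logged in to cnblogs, then split it into key/value pairs. The cookie
# names shown are placeholders.
# raw_cookie = ".CNBlogsCookie=...; .Cnblogs.AspNetCore.Cookies=..."
# cookies = dict(pair.split("=", 1) for pair in raw_cookie.split("; "))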

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
total_page = 31  # number of list pages + 1 (range() below stops before this value)
name = "angelyan"  # blog account name, i.e. the part after cnblogs.com/
out_dir = "./博客园"  # output directory for the .md files
os.makedirs(out_dir, exist_ok=True)  # create it up front so open() does not fail

for page in range(1, total_page):
    params = {
        'page': page,
    }
    # fetch one page of the post list
    response = session.get('https://www.cnblogs.com/%s' % name, cookies=cookies, headers=headers, params=params)
    soup = BeautifulSoup(response.text, "lxml")
    days = soup.find_all("div", class_="day")
    for d in days:
        # each "day" block holds the post title links; follow each one
        a_url = d.find("a", class_=re.compile('^postTitle2')).attrs["href"]
        print(a_url)
        res = session.get(a_url, cookies=cookies, headers=headers)
        post_soup = BeautifulSoup(res.text, "lxml")
        try:
            title = post_soup.find("h1", class_="postTitle").text.strip()
        except AttributeError:
            # find() returned None: not a standard post page, skip it
            continue
        html = post_soup.find("div", class_="post")
        print(title)
        # convert the post's HTML body to Markdown
        markdown = html2text.html2text(str(html))
        # replace characters that are illegal in file names
        safe_title = re.sub(r'[\\/:*?"<>|]', "_", title)
        with open(os.path.join(out_dir, "%s.md" % safe_title), "w", encoding="utf-8") as f:
            f.write(markdown)
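
The total page count above is hardcoded; as a rough sketch, you could discover it from the first list page instead. The helper below is my own addition, not part of the original script: it scans every link on page 1 for a page=N query parameter and takes the largest N. Whether that matches the actual cnblogs pager markup is an assumption you should verify against your own blog.

def detect_total_pages(session, name, cookies, headers):
    # Fetch the first list page and return the largest page number
    # referenced by any link on it (assumes pagination links carry
    # a "page=N" query string; verify against the real markup).
    resp = session.get('https://www.cnblogs.com/%s' % name, cookies=cookies, headers=headers)
    soup = BeautifulSoup(resp.text, "lxml")
    pages = [int(m.group(1))
             for a in soup.find_all("a", href=True)
             for m in [re.search(r'page=(\d+)', a["href"])] if m]
    return max(pages) if pages else 1

# usage: total_page = detect_total_pages(session, name, cookies, headers) + 1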

posted @ 2023-10-26 13:30  Maple_feng