Python 实现博客园文章导出为 Word 文档
确保安装了以下Python库:
pip install requests beautifulsoup4 python-docx
import io
import os
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
def get_article_content(url, timeout=10):
    """Download an article page and extract its title and body element.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, article_body)`` tuple: the whitespace-stripped text of the
        first ``<h1>`` on the page, and the ``<div>`` element whose id is
        ``cnblogs_post_body``.

    Raises:
        requests.RequestException: The request failed or the server answered
            with an error status.
        ValueError: The page has no ``<h1>`` or no ``cnblogs_post_body`` div.
    """
    # Without a timeout, requests can wait forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    heading = soup.find('h1')
    if heading is None:
        raise ValueError(f'No <h1> title found at {url}')

    # Adjust this selector if the page layout differs.
    article_body = soup.find('div', {'id': 'cnblogs_post_body'})
    if article_body is None:
        raise ValueError(f'No article body (#cnblogs_post_body) found at {url}')

    return heading.get_text().strip(), article_body
def add_text_to_doc(doc, text):
    """Append ``text`` to ``doc`` as a new paragraph.

    Args:
        doc: Document object exposing ``add_paragraph``.
        text: Paragraph text to append.

    Returns:
        Whatever ``doc.add_paragraph`` returns, so callers can style the new
        paragraph further if they wish.
    """
    return doc.add_paragraph(text)
def add_image_to_doc(doc, image_url, timeout=10):
    """Download the image at ``image_url`` and append it to ``doc``.

    The picture is inserted 6 inches wide.

    Args:
        doc: Document object exposing ``add_picture``.
        image_url: Absolute URL of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: The download failed or the server answered
            with an error status.
    """
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()
    # Hand the bytes over as an in-memory stream instead of a fixed-name
    # temporary file: nothing is left on disk if add_picture raises, and no
    # existing file in the working directory can be overwritten.
    doc.add_picture(io.BytesIO(response.content), width=Inches(6))
def process_article_to_word(article_url, output_filename):
    """Export one article to a Word document.

    The title becomes a level-1 heading. Every ``<p>`` inside the article
    body becomes a paragraph and every ``<img>`` becomes a picture, in
    document order. Other elements are not exported.

    Args:
        article_url: Address of the article page.
        output_filename: Path of the ``.docx`` file to write.

    Raises:
        ValueError: No article body was found on the page.
    """
    from docx.image.exceptions import UnrecognizedImageError

    title, content = get_article_content(article_url)
    if content is None:
        raise ValueError(f'No article body found at {article_url}')

    doc = Document()
    doc.add_heading(title, level=1)

    for element in content.descendants:
        if element.name == 'p':
            text = element.get_text()
            # A <p> that only wraps an image has no text; skip it so the
            # document is not littered with empty paragraphs.
            if text.strip():
                add_text_to_doc(doc, text)
        elif element.name == 'img':
            src = element.get('src')
            if not src:
                # An <img> without a usable src cannot be downloaded.
                continue
            img_url = urljoin(article_url, src)
            try:
                add_image_to_doc(doc, img_url)
            except (requests.RequestException, UnrecognizedImageError) as exc:
                # Best effort: one broken or unsupported image should not
                # abort the export of the whole article.
                print(f'Skipping image {img_url}: {exc}')

    doc.save(output_filename)
    print(f'Article saved to {output_filename}')
if __name__ == '__main__':
    # Optional command-line usage: [ARTICLE_URL [OUTPUT_FILENAME]].
    # Without arguments the defaults below are used; replace the default URL
    # with the actual cnblogs article to export.
    cli_args = sys.argv[1:]
    article_url = (
        cli_args[0] if len(cli_args) > 0
        else 'https://www.cnblogs.com/2022-yang/p/18236357'
    )
    output_filename = cli_args[1] if len(cli_args) > 1 else 'bo.docx'
    process_article_to_word(article_url, output_filename)
主要注意的是:
1
# 示例博客园文章链接 article_url = 'https://www.cnblogs.com/your-article-url'
示例:我的其中一篇博客url:https://www.cnblogs.com/2022-yang/p/18252952
2
# 生成Word文档 output_filename = 'bo.docx'
bo.docx为word文档名称
唯一的不足是只能一篇一篇地导出,不过好像也确实没有批量导出的必要。