使用 Python 将博客园文章导出为 Word 文档

确保安装了以下Python库:

pip install requests beautifulsoup4 python-docx

 

import io
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches

def get_article_content(url, timeout=30):
    """Download an article page and return its title and body element.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A ``(title, article_body)`` tuple. ``title`` is the stripped text of
        the first ``<h1>`` on the page, falling back to the ``<title>`` tag
        and finally to an empty string. ``article_body`` is the
        ``<div id="cnblogs_post_body">`` tag, or ``None`` when the page has
        no such element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # A page without <h1> used to crash with AttributeError on None.
    heading = soup.find('h1')
    if heading is None:
        heading = soup.find('title')
    title = heading.get_text(strip=True) if heading is not None else ''

    # Adjust this selector if the page layout differs.
    article_body = soup.find('div', {'id': 'cnblogs_post_body'})

    return title, article_body

def add_text_to_doc(doc, text):
    """Append ``text`` to ``doc`` as a new paragraph.

    Args:
        doc: Document object exposing ``add_paragraph``.
        text: Paragraph text to append.
    """
    doc.add_paragraph(text)

def add_image_to_doc(doc, image_url, timeout=30):
    """Download an image and append it to ``doc`` at a width of 6 inches.

    The image bytes are handed to ``add_picture`` as an in-memory stream, so
    no temporary file is created in the working directory and nothing is
    left behind when inserting the picture fails.

    Args:
        doc: Document object exposing ``add_picture``.
        image_url: Absolute URL of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()
    doc.add_picture(io.BytesIO(response.content), width=Inches(6))

def process_article_to_word(article_url, output_filename):
    """Export one article to a Word document.

    The article title becomes a level-1 heading. The body is then walked in
    document order: every ``<p>`` contributes a text paragraph and every
    ``<img>`` with a ``src`` attribute contributes a picture. Other elements
    are not exported.

    Args:
        article_url: URL of the article page; also used as the base for
            resolving relative image addresses.
        output_filename: Path the resulting ``.docx`` file is saved to.

    Raises:
        ValueError: If the article body element cannot be found on the page.
    """
    title, content = get_article_content(article_url)
    if content is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f'Article body not found at {article_url}')

    doc = Document()
    doc.add_heading(title, level=1)

    for element in content.descendants:
        if element.name == 'p':
            add_text_to_doc(doc, element.get_text())
        elif element.name == 'img':
            src = element.get('src')
            if not src:
                # An <img> without a usable src has nothing to download.
                continue
            add_image_to_doc(doc, urljoin(article_url, src))

    doc.save(output_filename)
    print(f'Article saved to {output_filename}')

if __name__ == '__main__':
    # Replace with the URL of the cnblogs article to export.
    article_url = 'https://www.cnblogs.com/2022-yang/p/18236357'
    # Name of the Word document that will be written.
    output_filename = 'bo.docx'
    process_article_to_word(article_url, output_filename)


 

需要注意的主要有两点:


1. 文章链接:把脚本中的 article_url 替换为要导出的博客园文章地址。

# 示例博客园文章链接
article_url = 'https://www.cnblogs.com/your-article-url'
示例:我的其中一篇博客的 url:https://www.cnblogs.com/2022-yang/p/18252952
2. 输出文件名:
# 生成Word文档
output_filename = 'bo.docx'
其中 bo.docx 为生成的 Word 文档的文件名,可按需修改。

唯一的不足就是只能一篇一篇地导出,好像确实没有必要。
posted @ 2024-06-19 21:14  *太¥^白%  阅读(6)  评论(0)  编辑  收藏  举报