获取网页的 Markdown

# 获取网页源码
import re

import html2text
import requests
def preprocess_html(html):
    """Remove ``<img>`` tags that have no ``src`` attribute.

    Tag and attribute names are matched case-insensitively, and optional
    whitespace before the ``=`` of ``src`` is tolerated.  Attributes that
    merely end in ``src`` (for example ``data-src``) do not count as a
    ``src`` attribute, so such tags are still removed.

    This is a regex-based filter, not an HTML parser: the text ``src=``
    preceded by whitespace anywhere inside the tag (even inside another
    attribute's value) is treated as a ``src`` attribute.

    Args:
        html: HTML source as a string.

    Returns:
        The HTML string with every src-less ``<img>`` tag deleted.
    """
    # ``\b`` keeps the pattern from matching other tag names that merely
    # start with "img".  The negative lookahead rejects tags that contain a
    # whitespace-preceded ``src=`` before the closing ``>``.
    return re.sub(
        r'<img\b(?![^>]*\ssrc\s*=)[^>]*>',
        '',
        html,
        flags=re.IGNORECASE,
    )
# Address of the page that the script downloads and converts.
page_url = 'https://www.ysxiao.cn/c/202212/57443.html'

# Request headers passed to requests.get(); the User-Agent is a desktop
# Chrome string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/127.0.0.0 Safari/537.36'
    ),
}
def requests_page(url):
    """Download *url* and return the response body decoded as UTF-8 text.

    The request is sent with the module-level ``headers`` and a 10-second
    timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For other request failures raised by
            ``requests.get`` (connection errors, timeouts, ...).
    """
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on error responses instead of handing an error page to
    # the HTML-to-Markdown conversion.
    response.raise_for_status()
    # Force UTF-8 rather than relying on the encoding requests infers from
    # the response headers.
    response.encoding = 'utf-8'
    return response.text
# Download the page source.  requests_page() returns ``Response.text``,
# which is always a str, so no bytes-decoding step is needed here.
fp = requests_page(page_url)
# Drop <img> tags without a src attribute before the conversion.
original_format = preprocess_html(fp)
# Convert the cleaned HTML to Markdown and print it to stdout.
markdown = html2text.html2text(original_format)
print(markdown)

  

posted @ 2024-10-08 11:30  布都御魂  阅读(5)  评论(0)  编辑  收藏  举报