# 获取网页的 Markdown
# 获取网页源码
import re
import html2text
import requests
def preprocess_html(html):
    """Strip ``<img>`` tags that carry no ``src`` attribute.

    Tag and attribute names are matched case-insensitively and whitespace
    around the ``=`` of ``src`` is tolerated, so ``<IMG SRC = "a.png">`` is
    kept while ``<img alt="x">`` is removed.

    Args:
        html: HTML markup as a string.

    Returns:
        The markup with every src-less ``<img>`` tag removed; all other
        text is returned unchanged.
    """
    # ``\b`` keeps the pattern from matching other tags whose name merely
    # starts with "img". The negative lookahead rejects any tag that has a
    # whitespace-preceded ``src`` attribute before its closing ``>`` (so
    # ``data-src`` alone does not count as a source).
    return re.sub(
        r'<img\b(?![^>]*\ssrc\s*=)[^>]*>',
        '',
        html,
        flags=re.IGNORECASE,
    )
# Address of the page that the script downloads and converts.
page_url = "https://www.ysxiao.cn/c/202212/57443.html"

# Request headers sent with every download; only a desktop Chrome
# User-Agent string is set.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/127.0.0.0 Safari/537.36"
    ),
}
def requests_page(url):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``headers`` and a 10-second
    timeout. The body is always decoded as UTF-8, whatever encoding the
    server declares.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as UTF-8 text.
    """
    # NOTE(review): the HTTP status code is not checked, so the body of an
    # error response is returned like any other page.
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'utf-8'
    page_text = response.text
    return page_text
# Script body: download the page, clean the markup, print it as Markdown.
fp = requests_page(page_url)

# Normalise the payload to str; a bytes payload is decoded as UTF-8.
# NOTE(review): requests_page returns ``.text``, so the bytes case looks
# unreachable here -- confirm before removing it.
original_format = fp.decode('utf-8') if isinstance(fp, bytes) else fp

# Drop <img> tags without a src attribute before the conversion.
original_format = preprocess_html(original_format)

markdown = html2text.html2text(original_format)
print(markdown)