# -*- coding:utf8 -*-
"""html2markdown converts an html string to markdown while preserving unsupported markup."""
#
# Copyright 2017-2018 David Lönnhager (dlon)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

import re
import sys
from html.parser import HTMLParser

import bs4
from bs4 import BeautifulSoup
if sys.version_info[0] > 2:
    # Python 3 has no ``unicode`` builtin; alias it so the module can keep
    # calling unicode(...) to serialise the parsed tree.
    unicode = str

# Tag names that have a markdown representation.
_supportedTags = {
    # NOTE: will be ignored if they have unsupported attributes (cf. _supportedAttributes)
    'blockquote',
    'p',
    'a',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'strong', 'b',
    'em', 'i',
    'ul', 'ol', 'li',
    'br',
    'img',
    'pre', 'code',
    'hr',
}
# Each entry is "<tag name> <attribute name>", separated by one space.
# _supportedAttrs() splits entries on that space to find the attributes a
# given tag is allowed to carry.
_supportedAttributes = (
    'a href', 'a title',
    'img alt', 'img src', 'img title',
)
_inlineTags = {
    # these can be mixed with markdown (when unprocessed)
    # block tags will be surrounded by newlines and be unprocessed inside
    # (unless supported tag + supported attribute[s])
    'a',
    'abbr',
    'acronym',
    'audio',
    'b',
    'bdi',
    'bdo',
    'big',
    # 'br',
    'button',
    # 'canvas',
    'cite',
    'code',
    'data',
    'datalist',
    'del',
    'dfn',
    'em',
    # 'embed',
    'i',
    # 'iframe',
    # 'img',
    # 'input',
    'ins',
    'kbd',
    'label',
    'map',
    'mark',
    'meter',
    # 'noscript',
    'object',
    # 'output',
    'picture',
    # 'progress',
    'q',
    'ruby',
    's',
    'samp',
    # 'script',
    'select',
    'slot',
    'small',
    'span',
    'strike',
    'strong',
    'sub',
    'sup',
    'svg',
    'template',
    'textarea',
    'time',
    'u',
    'tt',
    'var',
    # 'video',
    'wbr',
}
def _supportedAttrs(tag):
    """Return True if every attribute on *tag* is listed for its tag name.

    The allowed attributes come from the "<tag> <attribute>" entries of
    _supportedAttributes whose first word equals ``tag.name``. A tag with no
    attributes is always accepted.
    """
    allowed = set()
    for entry in _supportedAttributes:
        parts = entry.split(' ')
        if parts[0] == tag.name:
            allowed.add(parts[1])
    return all(attr in allowed for attr in tag.attrs)


def _recursivelyValid(tag):
    """Return True if *tag* and all of its descendants may be converted.

    not all tags require this property
    requires: <blockquote><p style="...">asdf</p></blockquote>
    does not: <div><p style="...">asdf</p></div>
    """
    for child in tag.find_all(recursive=False):
        if not _recursivelyValid(child):
            return False
    if tag.name == '[document]':
        return True
    elif tag.name in _inlineTags:
        return True
    # NOTE: the stricter checks (reject names outside _supportedTags, reject
    # tags failing _supportedAttrs) are disabled in this version, so every
    # tag is accepted once its children have been visited.
    return True


# Characters that get a backslash put in front of them when text is escaped.
_escapeCharSequence = tuple(r'\`*_[]#')
# One capturing character class matching any single character listed above.
_escapeCharRegexStr = '([{}])'.format(''.join(re.escape(c) for c in _escapeCharSequence))
# Bound ``sub`` of the compiled pattern: _escapeCharSub(replacement, text).
_escapeCharSub = re.compile(_escapeCharRegexStr).sub
def _escapeCharacters(tag):
    """Non-recursively backslash-escape markdown control characters in *tag*.

    Only the direct text children of *tag* are rewritten; nested tags are
    left alone. The characters escaped are those matched by _escapeCharSub.
    """
    # Iterate over a snapshot: replace_with() edits tag.contents in place.
    for node in list(tag.contents):
        # Exact type test on purpose: isinstance() would also match
        # subclasses of NavigableString, which the original skipped.
        if type(node) is not bs4.element.NavigableString:
            continue
        node.replace_with(_escapeCharSub(r'\\\1', node))
def _breakRemNewlines(tag):
    """Non-recursively collapse space runs and drop newlines in *tag*.

    For each direct text child, every run of two or more spaces becomes a
    single space and every newline character is removed. Nested tags are
    left alone.
    """
    # Iterate over a snapshot: replace_with() edits tag.contents in place.
    for node in list(tag.contents):
        # Exact type test on purpose: isinstance() would also match
        # subclasses of NavigableString, which the original skipped.
        if type(node) is not bs4.element.NavigableString:
            continue
        node.replace_with(re.sub(r' {2,}', ' ', node).replace('\n', ''))
# Captures <lang> from a class value of the form "brush:<lang>;".
_brushPattern = re.compile('brush:(.*?);')

_headingTags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')


def _brushLanguage(tag):
    """Return the language named by a ``brush:<lang>;`` class on *tag*, or ''."""
    classes = tag.get("class")
    if not classes:
        return ''
    found = _brushPattern.findall(classes[0])
    return found[0] if found else ''


def _markdownify(tag, _listType=None, _blockQuote=False, _listIndex=1):
    """Recursively convert *tag* into markdown, editing the tree in place.

    Supported tags are replaced by their markdown text through
    insert_before / insert_after / unwrap; nothing is returned.

    Args:
        tag: the BeautifulSoup node to convert.
        _listType: 'ul' or 'ol' when *tag* is an item of such a list,
            otherwise None. An <li> seen with no list type is left alone.
        _blockQuote: True while converting the contents of a <blockquote>.
        _listIndex: 1-based position used as the number of an <ol> item.
    """
    # Snapshot the direct child tags now; the edits below restructure the tree.
    children = tag.find_all(recursive=False)

    if tag.name == '[document]':
        for child in children:
            _markdownify(child)
        return

    if tag.name == 'div':
        tag.insert_before('\n')
        tag.insert_after('\n')
        for child in children:
            _markdownify(child)
        tag.unwrap()
        return
    elif tag.name == 'span' or tag.name == 'strong':
        # Dropped without emphasis markers; their children are still
        # converted by the generic branch at the end of the chain below.
        tag.unwrap()

    if tag.name == 'p':
        if tag.string is not None and tag.string.strip() == u'':
            # Keep a whitespace-only paragraph as a single no-break space.
            tag.string = u'\xa0'
            tag.unwrap()
            return
        if _blockQuote:
            tag.insert_before('\n')
            tag.insert_after('\n')
        # NOTE(review): the pasted source lost its indentation; unwrap() is
        # assumed to run for every paragraph, not only inside blockquotes.
        tag.unwrap()
        for child in children:
            _markdownify(child)
    elif tag.name == 'br':
        # Two trailing spaces before the newline form a markdown hard line
        # break (the pasted source showed one space; runs were collapsed).
        tag.string = '  \n'
        tag.unwrap()
    elif tag.name == 'img':
        alt = ''
        title = ''
        if tag.has_attr('alt'):
            alt = tag['alt']
        if tag.has_attr('title') and tag['title']:
            title = ' "%s"' % tag['title']
        if tag.has_attr('src'):
            # The pasted source had an empty format string here, which
            # raises TypeError; restore the markdown image syntax.
            tag.string = '![%s](%s%s)' % (alt, tag['src'], title)
            # NOTE(review): unwrap() is assumed to belong to this ``if`` so
            # that an <img> without src is left as raw HTML.
            tag.unwrap()
    elif tag.name == 'hr':
        tag.string = '\n---\n'
        tag.unwrap()
    elif tag.name == 'pre':
        tag.insert_before('\n')
        tag.insert_after('\n')
        if tag.code:
            if not _supportedAttrs(tag.code):
                return
            for child in tag.code.find_all(recursive=False):
                if child.name != 'br':
                    # <code> holding markup other than <br> stays as HTML.
                    return
            # code block
            for br in tag.code.find_all('br'):
                br.string = '\n'
                br.unwrap()
            tag.code.unwrap()
            lines = unicode(tag).strip().split('\n')
            lines[0] = lines[0][5:]      # strip the leading "<pre>"
            lines[-1] = lines[-1][:-6]   # strip the trailing "</pre>"
            if not lines[-1]:
                lines.pop()
            # Four leading spaces mark an indented markdown code block (the
            # pasted source showed one space; runs were collapsed).
            lines = ['    %s' % line.replace(u'\xa0', ' ') for line in lines]
            tag.replace_with(BeautifulSoup('\n'.join(lines), 'html.parser'))
        # NOTE(review): nesting below is reconstructed. replace_with() above
        # detaches the tag, so ``parent is None`` identifies the <pre><code>
        # path and the else branch handles a <pre> with no convertible <code>.
        if tag.parent is None:
            for child in children:
                _markdownify(child, _listType=tag.name, _blockQuote=True)
        else:
            tag.insert_before('```{}\n'.format(_brushLanguage(tag)))
            tag.insert_after('\n```\n')
            try:
                tag.unwrap()
            except ValueError:
                # Best effort, as in the original: if the tree edit is
                # rejected, leave the <pre> in place.
                pass
        return
    elif tag.name == 'code' and tag.parent is not None:
        # inline code
        tag.insert_before('```\n ')
        tag.insert_after('\n```\n')
        tag.unwrap()
    elif _recursivelyValid(tag):
        if tag.name == 'blockquote':
            # ! FIXME: hack -- convert() turns these markers into "> " lines
            tag.insert_before('<<<BLOCKQUOTE: ')
            tag.insert_after('>>>')
            tag.unwrap()
            for child in children:
                _markdownify(child, _blockQuote=True)
            return
        elif tag.name == 'a':
            # process children first
            for child in children:
                if child.name == 'img' and tag.has_attr('href'):
                    # A linked image is emitted as the image alone.
                    _markdownify(child)
                    tag.unwrap()
                    return
                _markdownify(child)
            if not tag.has_attr('href'):
                return
            if tag.string != tag.get('href') or tag.has_attr('title'):
                title = ''
                if tag.has_attr('title'):
                    title = ' "%s"' % tag['title']
                tag.string = '[%s](%s%s)' % (
                    BeautifulSoup(unicode(tag), 'html.parser').string,
                    tag.get('href', ''),
                    title)
            else:
                # ! FIXME: hack -- link text equals the URL; emit it as is
                tag.string = '%s' % tag.string
            tag.unwrap()
            return
        elif tag.name in _headingTags:
            # 'h3' -> '### ': the digit in the tag name is the heading level.
            level = int(tag.name[1])
            tag.insert_before('\n\n%s ' % ('#' * level))
            tag.insert_after('\n\n')
            tag.unwrap()
        elif tag.name in ('ul', 'ol'):
            tag.unwrap()
            for i, child in enumerate(children):
                _markdownify(child, _listType=tag.name, _listIndex=i + 1)
            return
        elif tag.name == 'li':
            if not _listType:
                # <li> outside of list; ignore
                return
            if _listType == 'ul':
                tag.insert_before('+ ')
            else:
                tag.insert_before('%d. ' % _listIndex)
            for child in children:
                _markdownify(child)
            tag.unwrap()
            return
        elif tag.name == 'b':
            tag.insert_before('__')
            tag.insert_after('__')
            tag.unwrap()
        elif tag.name in ('em', 'i'):
            tag.insert_before('_')
            tag.insert_after('_')
            tag.unwrap()
        # Tags that did not return above still get their children converted.
        for child in children:
            _markdownify(child)
def convert(html):
    """Convert an html string to markdown while preserving unsupported markup.

    Args:
        html: the HTML source text.

    Returns:
        The markdown text with HTML entities decoded and leading/trailing
        newlines stripped.
    """
    # Function-scope import: the parameter name ``html`` shadows the stdlib
    # module. HTMLParser().unescape() was removed in Python 3.9;
    # html.unescape() is its replacement.
    from html import unescape

    bs = BeautifulSoup(html, 'html.parser')
    _markdownify(bs)
    ret = unicode(bs).replace(u'\xa0', ' ')
    # Never leave more than one blank line in a row.
    ret = re.sub(r'\n{3,}', r'\n\n', ret)
    # ! FIXME: hack -- placeholder for a link whose text is its own URL
    ret = re.sub(r'<<<FLOATING LINK: (.+)>>>', r'<\1>', ret)
    # ! FIXME: hack -- turn blockquote placeholders into "> " prefixed lines
    marker = '<<<BLOCKQUOTE:'
    parts = re.split(r'(<<<BLOCKQUOTE: .*?>>>)', ret, flags=re.DOTALL)
    for i, piece in enumerate(parts):
        if piece.startswith(marker):
            quoted = '> ' + piece[len(marker):-len('>>>')]
            parts[i] = quoted.replace('\n', '\n> ')
    ret = ''.join(parts)
    return unescape(ret).strip('\n')
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 【自荐】一款简洁、开源的在线白板工具 Drawnix
· 园子的第一款AI主题卫衣上架——"HELLO! HOW CAN I ASSIST YOU TODAY
· Docker 太简单,K8s 太复杂?w7panel 让容器管理更轻松!