将 HTML 格式的文本去掉 HTML 标签(tag),转换为纯文本

使用lxml

import lxml.etree
import lxml.html


# Demo: convert HTML markup to plain text with lxml.
#
# Read the saved page with an explicit encoding. Without `encoding`, open()
# uses the locale's default codec, which can fail on or garble non-ASCII
# (e.g. Chinese) content. Assumes the file is UTF-8 -- adjust if it was saved
# with a different encoding.
with open('/tmp/hzh/a.html', 'r', encoding='utf-8') as file:
    data = file.read()
# NOTE: `data` is not used below; the demo parses the inline sample instead.
# Pass `data` to lxml.html.fromstring() to convert the file's contents.
html_str = '<p>hzh。<div>ddiivv</div></p>   \n  <p> l1</p>'
root = lxml.html.fromstring(html_str)

# Optionally remove nodes that browsers do not normally render: comments,
# <script> and <head>. strip_elements() deletes each matching element together
# with its whole subtree; append further tag names to drop more.
lxml.etree.strip_elements(root, lxml.etree.Comment, "script", "head")

# Serialise with method="text" so only the text content is emitted (no tags);
# encoding='unicode' makes tostring() return a str.
result_str = lxml.html.tostring(root, method="text", encoding='unicode')
print(result_str)

如果想进行细粒度控制,可以使用 strip_elements(连同标签里的内容一起删除)或 strip_tags(只去掉标签、保留里面的内容):

# Fine-grained control, variant 1: delete <div> elements including their content.
html_str = '<p>hzh。<div>ddiivv</div></p>   \n  <p> l1</p>'
root = lxml.html.fromstring(html_str)
# Print the parsed tree first so the effect of stripping can be compared with it.
parsed_markup = lxml.etree.tostring(root, encoding='unicode', pretty_print=True)
print(parsed_markup)
# strip_elements() removes every <div> together with everything inside it;
# with_tail=False keeps the text that follows the removed element.
# Text recorded by the author afterwards:    hzh。   \n    l1
lxml.etree.strip_elements(root, 'div', with_tail=False)

# Fine-grained control, variant 2: start again from a freshly parsed tree and
# remove only the <div> tags, keeping the text/children they contained.
root = lxml.html.fromstring(html_str)
lxml.etree.strip_tags(root, 'div')
# The outermost <div> in the output exists because the fragment has two sibling
# <p> elements: a <div> is created to act as their single root, since every
# element needs a root.
# Output recorded by the author: <div><p>hz<br/>h。</p>ddiivv   \n  <p> l1</p></div>
# NOTE(review): the <br/> in that recorded output is not present in html_str;
# it was presumably captured from a different input -- verify.
stripped_markup = lxml.etree.tostring(root, encoding='unicode', pretty_print=True)
print(stripped_markup)

使用 XPath 的 string() 函数,例如:text = root.xpath('string()')

参考文章见 [1]、[2]。

如果使用webdriver就更简单了

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By

# Demo: fetch a page with Selenium and print the text of one element.
#
# Initialise the WebDriver (Chrome here; other browsers such as Firefox work too).
chrome_options = Options()
chrome_options.add_argument("start-maximized")
chrome_service = ChromeService(executable_path='/home/hzh/disk2/dl/chromedriver')
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# Everything after the driver is created runs inside try/finally so the browser
# is always shut down, even if loading the page or locating the element raises.
try:
    # Open the URL.
    driver.get("https://www.cnblogs.com/welhzh/p/17272452.html")

    # Locate the element holding the post body.
    element = driver.find_element(By.ID, "cnblogs_post_body")

    # Read the element's text content (no markup) and print it.
    text_content = element.text
    print(text_content)
finally:
    # Close the browser and end the WebDriver session.
    driver.quit()
posted @ 2023-04-07 16:08  微信公众号--共鸣圈  阅读(84)  评论(0)  编辑  收藏  举报