将 HTML 格式的文本去掉 HTML 标签,转换为纯文本
使用lxml
import lxml.etree
import lxml.html
# Read the HTML document from disk. The explicit encoding keeps non-ASCII
# (e.g. Chinese) text from depending on the platform default encoding.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open('/tmp/hzh/a.html', 'r', encoding='utf-8') as file:
    data = file.read()
# NOTE: the example below parses an inline literal; pass `data` to
# lxml.html.fromstring() instead to process the file that was just read.
html_str = '<p>hzh。<div>ddiivv</div></p> \n <p> l1</p>'
root = lxml.html.fromstring(html_str)
# Optionally remove elements that browsers do not normally render
# (comments, <script>, <head>); append any other tag names to discard.
lxml.etree.strip_elements(root, lxml.etree.Comment, "script", "head")
# Serialize with method="text" so the markup is dropped and only the text
# content is returned, as a str because encoding='unicode'.
result_str = lxml.html.tostring(root, method="text", encoding='unicode')
print(result_str)
如果想更细粒度地控制,可以使用 lxml.etree.strip_elements 和 lxml.etree.strip_tags:
html_str = '<p>hzh。<div>ddiivv</div></p> \n <p> l1</p>'
root = lxml.html.fromstring(html_str)
print(lxml.etree.tostring(root, pretty_print=True, encoding='unicode'))
# Remove <div>ddiivv</div> entirely: strip_elements drops the element together with its content.
lxml.etree.strip_elements(root, 'div', with_tail=False) # remaining text is: hzh。 \n l1
# Parse the same string again into `root` before the next example.
root = lxml.html.fromstring(html_str)
# Remove only the div tag itself and keep the content that was inside it.
lxml.etree.strip_tags(root, 'div')
# The outermost div is present because the fragment has two sibling p tags at the top level,
# so a new div is created to serve as their root; every element needs a single root.
print(lxml.etree.tostring(root, pretty_print=True, encoding='unicode')) # expected result (html_str contains no <br/>; TODO confirm by running): <div><p>hzh。</p>ddiivv \n <p> l1</p></div>
也可以使用 XPath 的 string() 函数:例如 root.xpath('string()') 会返回该元素及其所有后代文本节点拼接而成的字符串。
如果使用webdriver就更简单了
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
# 初始化WebDriver(这里使用的是Chrome,你也可以选择其他的如Firefox等)
# Start a Chrome WebDriver session (other browsers such as Firefox work too).
chrome_options = Options()
chrome_options.add_argument("start-maximized")
chrome_service = ChromeService(executable_path='/home/hzh/disk2/dl/chromedriver')
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
try:
    # Open the page.
    driver.get("https://www.cnblogs.com/welhzh/p/17272452.html")
    # Look up the target element by its id attribute.
    element = driver.find_element(By.ID, "cnblogs_post_body")
    # WebElement.text gives the element's rendered text without any markup.
    text_content = element.text
    print(text_content)
finally:
    # Always close the browser, even when loading the page or locating the
    # element raised; otherwise the Chrome/chromedriver processes are leaked.
    driver.quit()
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
支付宝扫一扫捐赠
支付宝扫一扫捐赠
微信公众号: 共鸣圈
欢迎讨论,邮件: 924948$qq.com 请把$改成@
QQ群:263132197
QQ: 924948