BeautifulSoup 去除指定的html样式属性
只想保留 table 中的 "colspan"、"rowspan" 两个属性,其余的属性都去掉
最开始想到的是正则匹配,发现太费劲
后面发现BeautifulSoup可以解决
# bs4: strip specific attributes
def remove_css_tags():
    """Demo: print a sample HTML table with only colspan/rowspan attributes kept.

    Parses the embedded sample markup with the ``html.parser`` backend, passes
    the parsed document to ``remove_attrs`` with a whitelist of ``colspan`` and
    ``rowspan``, and prints the result. Takes no arguments and returns None.
    """
    sample_markup = '''<table cellspacing="0" cellpadding="0" width="90%" align="center" style="border:1px solid #000000; text-align:center; border-collapse:collapse;"><tbody><tr style="background-color: rgb(217, 217, 217);" class="firstRow"><td width="12" height="1" style="border:1px solid #000000;"><p>产品</p></td><td width="29" colspan="2" height="1" style="border:1px solid #000000;"><p><strong><span style="color: black;">价格类型</span></strong></p></td><td width="20" height="1" style="border:1px solid #000000;"><p><strong><span style="color: black;">价格</span></strong></p></td><td width="16" height="1" style="border:1px solid #000000;"><p><strong><span style="color: black;">涨跌</span></strong></p></td><td width="21" height="1" style="border:1px solid #000000;"><p><strong><span style="color: black;">单位</span></strong></p></td></tr><tr style=";height:35px"><td width="12" rowspan="15" height="35" style="border:1px solid #000000;"><p><strong>液氯</strong></p></td><td width="8" rowspan="8" height="35" style="border:1px solid #000000;"><p><span style="color: black;">厂家</span></p></td><td width="20" height="35" style="border:1px solid #000000;"><p><span style="color: black;">茌平信发</span></p></td><td width="20" height="35" style="border:1px solid #000000;"><p><span style="color: black;">2000</span></p></td><td width="16" valign="top" height="35" style="border:1px solid #000000;"><p><span style="color: red;">+500</span></p></td><td width="21" height="35" style="border:1px solid #000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr style=";height:8px"><td width="20" height="8" style="border:1px solid #000000;"><p><span style="color: black;">东营华泰</span></p></td><td width="20" height="8" style="border:1px solid #000000;"><p><span style="color: black;">2000</span></p></td><td width="16" valign="top" height="8" style="border:1px solid #000000;"><p><span style="color: red;">+500</span></p></td><td width="21" height="8" style="border:1px solid 
#000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr style=";height:1px"><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">河北冀衡</span></p></td><td width="20" valign="top" height="1" style="border:1px solid #000000;"><p><span style="color: black;">2100</span></p></td><td width="16" valign="top" height="1" style="border:1px solid #000000;"><p><span style="color: red;">+500</span></p></td><td width="21" height="1" style="border:1px solid #000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr style=";height:1px"><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">沧州聚隆</span></p></td><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">1800</span></p></td><td width="16" valign="top" height="1" style="border:1px solid #000000;"><p><span style="color: red;">+500</span></p></td><td width="21" height="1" style="border:1px solid #000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr style=";height:8px"><td width="20" height="8" style="border:1px solid #000000;"><p><span style="color: black;">江苏新浦</span></p></td><td width="20" height="8" style="border:1px solid #000000;"><p><span style="color: black;">3300</span></p></td><td width="16" height="8" style="border:1px solid #000000;"><p><span style="color: black;">0</span></p></td><td width="21" height="8" style="border:1px solid #000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr style=";height:35px"><td width="20" height="35" style="border:1px solid #000000;"><p><span style="color: black;">江苏富强</span></p></td><td width="20" height="35" style="border:1px solid #000000;"><p><span style="color: black;">封盘</span></p></td><td width="16" height="35" style="border:1px solid #000000;"><p><span style="color: rgb(34, 42, 53);">0</span></p></td><td width="21" height="35" style="border:1px solid #000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr 
style=";height:8px"><td width="20" height="8" style="border:1px solid #000000;"><p><span style="color: black;">内蒙吉兰泰</span></p></td><td width="20" height="8" style="border:1px solid #000000;"><p><span style="color: black;">2800</span></p></td><td width="16" height="8" style="border:1px solid #000000;"><p><span style="color: rgb(34, 42, 53);">0</span></p></td><td width="21" height="8" style="border:1px solid #000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr style=";height:36px"><td width="20" height="36" style="border:1px solid #000000;"><p><span style="color: black;">方大锦化</span></p></td><td width="20" height="36" style="border:1px solid #000000;"><p><span style="color: black;">2000</span></p></td><td width="16" valign="top" height="36" style="border:1px solid #000000;"><p><span style="color: red;">+500</span></p></td><td width="21" height="36" style="border:1px solid #000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr style=";height:1px"><td width="8" rowspan="6" height="1" style="border:1px solid #000000;"><p><span style="color: black;">市 场</span></p></td><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">山东地区</span></p></td><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">1800-2000</span></p></td><td width="16" valign="top" height="1" style="border:1px solid #000000;"><p><span style="color: red;">+300/500</span></p></td><td width="21" height="1" style="border:1px solid #000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr style=";height:1px"><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">河北地区</span></p></td><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">1300/2100</span></p></td><td width="16" height="1" style="border:1px solid #000000;"><p><span style="color: red;">+0/500</span></p></td><td width="21" height="1" style="border:1px solid 
#000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr style=";height:1px"><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">江苏地区</span></p></td><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">2200-3100</span></p></td><td width="16" height="1" style="border:1px solid #000000;"><p><span style="color: black;">0</span></p></td><td width="21" height="1" style="border:1px solid #000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr style=";height:1px"><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">河南地区</span></p></td><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">1500</span></p></td><td width="16" valign="top" height="1" style="border:1px solid #000000;"><p><span style="color: black;">0</span></p></td><td width="21" height="1" style="border:1px solid #000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr style=";height:1px"><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">辽宁地区</span></p></td><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">1800/2000</span></p></td><td width="16" height="1" style="border:1px solid #000000;"><p><span style="color: red;">+500</span></p></td><td width="21" height="1" style="border:1px solid #000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr style=";height:1px"><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">内蒙古地区</span></p></td><td width="20" height="1" style="border:1px solid #000000;"><p><span style="color: black;">2800</span></p></td><td width="16" height="1" style="border:1px solid #000000;"><p><span style="color: rgb(34, 42, 53);">0</span></p></td><td width="21" height="1" style="border:1px solid #000000;"><p><span style="color: black;">元/吨</span></p></td></tr><tr 
style=";height:188px"><td width="8" height="188" style="border:1px solid #000000;"><p><span style="color: black;">市场简述及后市预测</span></p></td><td width="78" colspan="4" height="188" style="border:1px solid #000000;"><p><span style="color: black;">液氯:今日国内液氯市场低位反弹,市场呈上行趋势。华泰装置检修、大地装置降负荷,鲁中东部供应端减少,但鲁西区域企业复产,加之配套下游停车,市场处于博弈阶段,但企业存看涨心态,鉴于此,预计明日不排除有继续上行可能。</span></p></td></tr></tbody></table>'''
    # Only the cell-merging attributes survive; everything else is stripped.
    kept_attrs = ["colspan", "rowspan"]
    parsed = BeautifulSoup(sample_markup, "html.parser")
    print(remove_attrs(parsed, whitelist=kept_attrs))
# Strip every attribute that is not explicitly whitelisted.
def remove_attrs(soup, whitelist=("colspan", "rowspan")):
    """Delete all non-whitelisted attributes from every tag under *soup*.

    The tree is modified in place and the same object is returned so the call
    can be chained.

    Args:
        soup: A parsed Beautiful Soup document or tag; it must provide
            ``find_all`` and yield tags exposing an ``attrs`` mapping.
        whitelist: Container of attribute names to keep. Defaults to the
            table cell-merging attributes ``colspan`` and ``rowspan``. The
            default is an immutable tuple so it cannot be altered between
            calls.

    Returns:
        The same *soup* object, with the disallowed attributes removed.
    """
    # find_all(True) matches every tag; it is the bs4 spelling of the legacy
    # ``findAll`` alias.
    for tag in soup.find_all(True):
        # Snapshot the names first: deleting from ``tag.attrs`` while
        # iterating over it would raise RuntimeError.
        unwanted = [name for name in tag.attrs if name not in whitelist]
        for name in unwanted:
            del tag[name]
    return soup