数据清洗之微博内容清洗

获取文字和表情（表情取自 img 标签的 alt 属性）

#!/usr/bin/env python  
# encoding: utf-8
from functools import reduce
from lxml import html
from bs4 import BeautifulSoup
# Sample Weibo markup used by the demo: an emoticon <img> (whose alt text is
# the emoticon name) followed by plain text.
# NOTE: this assignment rebinds the name `html`, which was imported from lxml
# above; any later code that needs lxml's html module must import it under
# another name.
html="""
<div><span class="url-icon"><img alt="[馋嘴]" src="//h5.sinaimg.cn/m/emoticon/icon/default/d_chanzui-ad3f4f182c.png" style="width:1em; height:1em;"/></span>听着就很好吃​</div>
"""

def main():
    """Parse the sample markup with BeautifulSoup and print its cleaned text."""
    soup = BeautifulSoup(html, 'html.parser')
    # Only the first <div> in the document is flattened.
    first_div = soup.find('div')
    print(parse_div(first_div))
def parse_div(div_tags):
    """Flatten a parsed tag into cleaned text, using ``alt`` text for images.

    Walks ``div_tags.contents`` in order. Text nodes are kept after removing
    newlines, ASCII spaces and zero-width spaces (U+200B); a child that has
    an ``alt`` attribute contributes that attribute's value; any other child
    is flattened recursively.

    Args:
        div_tags: A BeautifulSoup tag (any object exposing ``contents``),
            or None when the lookup that produced it found nothing.

    Returns:
        The concatenated cleaned text, or an empty string for None.
    """
    if div_tags is None:
        return ''
    # One translate pass removes every unwanted character, including the
    # zero-width space that Weibo text frequently carries.
    unwanted = str.maketrans('', '', '\n \u200b')
    pieces = []
    for node in div_tags.contents:
        if isinstance(node, str):
            # Text nodes are str subclasses in BeautifulSoup.
            pieces.append(node.translate(unwanted))
        elif node.has_attr('alt'):
            pieces.append(node.get('alt', ''))
        else:
            pieces.append(parse_div(node))
    return ''.join(pieces)
# Preferred approach: a single XPath query collects text nodes and alt values.
def main(self, htmlstr):
    """Return the cleaned text of an HTML fragment, with ``alt`` text inlined.

    Args:
        self: Unused; kept so existing callers that pass it keep working.
        htmlstr: HTML markup to clean.

    Returns:
        Every text node and ``alt`` attribute value joined in the order the
        XPath query returns them, with newlines, ASCII spaces and zero-width
        spaces (U+200B) removed. Blank input yields an empty string.
    """
    # Imported locally under an alias because the module-level name `html`
    # is rebound to the sample markup string in this file.
    from lxml import html as lxml_html

    if not htmlstr or not htmlstr.strip():
        # lxml refuses to parse an empty document, so short-circuit here.
        return ''
    root = lxml_html.fromstring(htmlstr)
    nodes = root.xpath(".//text()|.//@alt")
    unwanted = str.maketrans('', '', '\n \u200b')
    return ''.join(node.translate(unwanted) for node in nodes)



if __name__ == '__main__':
    # `main` is defined twice in this file and the later definition takes
    # (self, htmlstr), so a bare main() call raises TypeError. Run the
    # BeautifulSoup demo on the sample markup directly instead.
    print(parse_div(BeautifulSoup(html, 'html.parser').find('div')))





posted @ 2018-07-20 11:40  公众号python学习开发  阅读(2224)  评论(0)  编辑  收藏  举报