数据清洗之微博内容清洗

获取文字加表情(表情取自 img 标签的 alt 属性)

#!/usr/bin/env python  
# encoding: utf-8
from functools import reduce
from lxml import html
from bs4 import BeautifulSoup
# Sample markup used by the demo below: a <div> holding an emoticon <img>
# (its name is carried in the ``alt`` attribute) followed by plain text.
# NOTE: this assignment rebinds the module-level name ``html`` that was
# imported from lxml above; code that needs the lxml module after this point
# must import it under a different name.
html="""
<div><span class="url-icon"><img alt="[馋嘴]" src="//h5.sinaimg.cn/m/emoticon/icon/default/d_chanzui-ad3f4f182c.png" style="1em; height:1em;"/></span>听着就很好吃​</div>
"""

def main():
    """Parse the sample markup with BeautifulSoup and print its cleaned text."""
    soup = BeautifulSoup(html, 'html.parser')
    # Only the first <div> of the sample is processed.
    print(parse_div(soup.find('div')))
def parse_div(div_tags):
    """Flatten a parsed tag into cleaned text, keeping emoticon names.

    Walks ``div_tags.contents`` in order. String children are kept with
    newlines and ASCII spaces removed; a child tag that has an ``alt``
    attribute contributes that attribute's value (its own children are not
    visited); any other child tag is processed recursively.

    Args:
        div_tags: A tag-like object exposing ``contents``. Non-string
            children must provide ``has_attr`` and ``get`` (presumably
            ``bs4.Tag`` objects -- the caller in this file passes the result
            of ``BeautifulSoup.find``).

    Returns:
        The concatenated pieces as a single ``str``.
    """
    pieces = []
    for node in div_tags.contents:
        if isinstance(node, str):
            # Text node: drop line breaks and spaces. The newline must be
            # written as the escape '\n'; a physical line break inside the
            # quotes is a syntax error.
            pieces.append(node.replace('\n', '').replace(' ', ''))
        elif node.has_attr('alt'):
            # Emoticon image: its name lives in the alt attribute.
            pieces.append(node.get('alt', ''))
        else:
            # Wrapper tag: descend and collect whatever it contains.
            pieces.append(parse_div(node))
    return ''.join(pieces)
# Optimal solution: let lxml gather every text node and alt attribute via XPath.
def main(self, htmlstr):
    """Return the cleaned text of an HTML fragment, including emoticon names.

    Args:
        self: Unused by this function; kept so the signature stays the same
            (it looks like this was lifted from a class -- TODO confirm).
        htmlstr: HTML markup to parse.

    Returns:
        Every text node and ``alt`` attribute value selected by the XPath
        query, joined into one ``str`` with newlines, ASCII spaces and
        zero-width spaces (U+200B) removed.
    """
    # Imported locally under an alias: the module-level name ``html`` is
    # rebound to the sample markup string earlier in this file, so
    # ``html.fromstring`` would be looked up on a str and fail.
    from lxml import html as lxml_html

    root = lxml_html.fromstring(htmlstr)
    nodes = root.xpath(".//text()|.//@alt")
    # The escapes matter: '\n' is a newline and "\u200b" is the zero-width
    # space; without the backslashes the literals are invalid or match the
    # wrong text.
    return ''.join(
        i.replace('\n', '').replace(" ", "").replace("\u200b", "")
        for i in nodes
    )



if __name__ == '__main__':
    # ``main`` is bound to the last definition above, which requires
    # ``(self, htmlstr)`` and returns the text instead of printing it, so a
    # bare ``main()`` raises TypeError. Pass a placeholder for the unused
    # ``self`` plus the sample markup, and print the result.
    print(main(None, html))





原文地址:https://www.cnblogs.com/c-x-a/p/9340620.html