Python text from html

From RHS Wiki
Jump to navigation Jump to search
def clean_text(content):
    """Return the visible text of an HTML document, one fragment per line.

    The markup in *content* is parsed with BeautifulSoup's ``html.parser``
    backend. Every text node is checked with ``visible``; the nodes that
    pass are stripped of surrounding whitespace, empty results are
    discarded, and the remaining fragments are joined with newlines.
    """
    parsed = BeautifulSoup(content, features="html.parser")

    fragments = []
    for node in parsed.findAll(text=True):
        # Skip nodes that belong to non-rendered parts of the page.
        if not visible(node):
            continue
        stripped = node.strip()
        # Whitespace-only nodes contribute nothing to the output.
        if stripped:
            fragments.append(stripped)

    return '\n'.join(fragments)


def visible(element):
    """Return True if a parsed text node should be kept as visible text.

    A node is rejected when:

    * its parent's ``name`` is one of ``style``, ``script``, ``[document]``,
      ``head`` or ``title``;
    * it is an HTML comment, recognised either by its class (a string
      subclass named ``Comment``) or by text of the form ``<!-- ... -->``.

    Args:
        element: a string-like text node exposing ``parent.name``.

    Returns:
        ``False`` for nodes that should be dropped, ``True`` otherwise.
    """
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False

    # NOTE(review): Beautiful Soup 4 is believed to deliver comments as
    # ``Comment`` string nodes whose text excludes the ``<!--``/``-->``
    # delimiters, so a regex alone would miss them. The class is detected by
    # name to avoid adding an import here -- confirm against the parser
    # version in use.
    if any(cls.__name__ == 'Comment' for cls in type(element).__mro__):
        return False

    # Match the text itself. The previous ``str(element.encode('utf-8'))``
    # yields the bytes repr ("b'...'") on Python 3, which can never start
    # with "<!--". DOTALL lets the pattern span multi-line comments.
    if re.match(r'<!--.*-->', element, re.DOTALL):
        return False

    return True