Difference between revisions of "Python text from html"

From RHS Wiki
Jump to navigation Jump to search
(Created page with "<syntaxhighlight lang="python3"> def clean_text(content): soup = BeautifulSoup(content, features="html.parser") # body = soup.find('body') # data = body.findAll(te...")
Tag: visualeditor
 
(No difference)

Latest revision as of 09:38, 28 May 2019

def clean_text(content):
    """Extract the visible text of an HTML document, one fragment per line.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and
    every text node is offered to ``visible``; nodes it rejects are dropped.
    Each remaining node is stripped of surrounding whitespace, and nodes
    that are empty after stripping are discarded.

    Args:
        content: HTML markup to parse.

    Returns:
        str: The kept fragments joined with newline characters, in the
        order the parser yields them.
    """
    soup = BeautifulSoup(content, features="html.parser")

    fragments = []
    for node in soup.findAll(text=True):
        if not visible(node):
            continue
        stripped = node.strip()
        # Whitespace-only nodes (indentation, blank lines) carry no text.
        if stripped:
            fragments.append(stripped)

    return '\n'.join(fragments)


def visible(element):
    """Return True if a parsed text node should count as visible page text.

    A node is rejected when its parent is one of the non-rendered
    containers (``style``, ``script``, ``head``, ``title`` or the
    ``[document]`` root), or when the node itself is an HTML comment.

    Args:
        element: A text node produced by BeautifulSoup. It must expose a
            ``parent`` attribute whose ``name`` is the enclosing tag name.

    Returns:
        bool: False for nodes that should be skipped, True otherwise.
    """
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False

    # Local import: only this check needs the Comment class, and bs4 is
    # already a requirement of the code that produces these nodes.
    from bs4 import Comment

    # Comments are detected by type rather than by matching '<!--...-->':
    # bs4 stores only the comment body in the string, and on Python 3
    # str(bytes) yields "b'...'", so the former regex test never matched.
    return not isinstance(element, Comment)