Python text from html

From RHS Wiki
Revision as of 09:38, 28 May 2019 by Rafahsolis (talk | contribs) (Created page with "<syntaxhighlight lang="python3"> def clean_text(content): soup = BeautifulSoup(content, features="html.parser") # body = soup.find('body') # data = body.findAll(te...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search
def clean_text(content):
    """Extract the visible text of an HTML document, one fragment per line.

    Args:
        content: HTML markup; it is parsed with BeautifulSoup's
            ``html.parser`` backend.

    Returns:
        str: Every text node accepted by ``visible``, stripped of
        surrounding whitespace and joined with newlines. Nodes that are
        empty after stripping are dropped.
    """
    soup = BeautifulSoup(content, features="html.parser")
    # find_all(string=True) is the current bs4 spelling of the legacy
    # findAll(text=True); both select every text node in the tree.
    stripped = (
        node.strip()
        for node in soup.find_all(string=True)
        if visible(node)
    )
    # Strip once per node, then discard whitespace-only fragments.
    return '\n'.join(fragment for fragment in stripped if fragment)


def visible(element):
    """Return True if a parsed text node should count as visible page text.

    A node is treated as hidden when its parent's ``name`` is ``style``,
    ``script``, ``head``, ``title`` or ``[document]``, or when the node
    itself is an HTML comment.

    Args:
        element: A text node from a BeautifulSoup tree; it must expose
            ``parent.name``.

    Returns:
        bool: False for hidden nodes, True otherwise.
    """
    if element.parent.name in {'style', 'script', '[document]', 'head', 'title'}:
        return False
    # Function-scope import: keeps this helper self-contained regardless of
    # which bs4 names the module imports at the top.
    from bs4 import Comment
    # The earlier check ran a regex over str(element.encode('utf-8')). On
    # Python 3 that text is a bytes repr starting with "b'", and a bs4
    # comment node's text does not include the <!-- --> delimiters, so the
    # anchored pattern never matched and comments leaked into the output.
    # Checking the node type is the reliable test.
    return not isinstance(element, Comment)