Python text from HTML
Jump to navigation
Jump to search
def clean_text(content):
    """Extract the visible text of an HTML document, one fragment per line.

    Args:
        content: HTML markup, passed unchanged to ``BeautifulSoup`` and
            parsed with the built-in ``html.parser`` backend.

    Returns:
        str: Every text node accepted by ``visible``, stripped of
        surrounding whitespace, with empty fragments dropped, joined
        with newlines in the order the parser yields them.
    """
    soup = BeautifulSoup(content, features="html.parser")
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); both select every text node in the tree.
    fragments = (
        node.strip() for node in soup.find_all(string=True) if visible(node)
    )
    # Strip once, then discard fragments that were whitespace only.
    return '\n'.join(fragment for fragment in fragments if fragment)
def visible(element):
    """Return True if a parsed text node is user-visible page text.

    A node is rejected when its direct parent is a non-rendered
    container (``style``, ``script``, ``head``, ``title`` or the
    ``[document]`` root), or when the node is an HTML comment.

    Args:
        element: A text node from the parse tree. It must expose
            ``parent.name``.

    Returns:
        bool: False for hidden or comment nodes, True otherwise.
    """
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False
    # The previous check matched '<!--.*-->' against
    # str(element.encode('utf-8')). On Python 3 that string is the bytes
    # repr ("b'...'"), so the pattern could never match and comments were
    # reported as visible.
    # NOTE(review): relies on BeautifulSoup 4 marking comment nodes with a
    # class-level PREFIX of '<!--' (ordinary strings use ''); this avoids
    # needing a bs4 import here. Confirm against the installed bs4.
    return getattr(element, 'PREFIX', '') != '<!--'