Python text from HTML
Revision as of 09:38, 28 May 2019 by Rafahsolis (talk | contribs) (Created page with "<syntaxhighlight lang="python3"> def clean_text(content): soup = BeautifulSoup(content, features="html.parser") # body = soup.find('body') # data = body.findAll(te...")
def clean_text(content):
    """Return the visible text of an HTML document, one fragment per line.

    ``content`` is parsed with BeautifulSoup's ``html.parser`` backend. Each
    text node found in the parsed tree is passed to ``visible``; nodes that
    pass are stripped of surrounding whitespace, fragments that end up empty
    are discarded, and the remainder is joined with newline characters.
    """
    document = BeautifulSoup(content, features="html.parser")

    fragments = []
    # NOTE(review): ``findAll`` / ``text=`` are the legacy spellings of
    # ``find_all`` / ``string=``; kept as-is so behavior does not depend on
    # the installed bs4 version.
    for node in document.findAll(text=True):
        if not visible(node):
            continue
        stripped = node.strip()
        if stripped:
            fragments.append(stripped)

    return '\n'.join(fragments)
def visible(element):
    """Tell whether a parsed text node is text a reader would actually see.

    Args:
        element: A text node from a BeautifulSoup tree (for example an item
            returned by ``soup.findAll(text=True)``). It must expose a
            ``parent`` whose ``name`` is the enclosing tag name.

    Returns:
        ``False`` when the node's parent is named ``style``, ``script``,
        ``head``, ``title`` or ``[document]``, or when the node is an HTML
        comment; ``True`` otherwise.
    """
    # Function-scope import keeps this block self-contained; bs4 is the
    # library that produces the nodes this predicate is applied to.
    from bs4.element import Comment

    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False

    # A Comment node's string value holds only the text between the
    # ``<!--`` / ``-->`` delimiters, and ``str(bytes)`` renders as "b'...'"
    # on Python 3, so matching '<!--.*-->' against the encoded text can never
    # succeed. Checking the node type is the reliable way to drop comments.
    return not isinstance(element, Comment)