def text_cleaner(text):
    """Convert an HTML fragment to lower-cased plain text.

    A fixed sequence of regular-expression substitutions is applied in
    order: whitespace is collapsed, line-breaking tags become newlines,
    the ``<head>`` section is dropped, links are replaced by their
    ``href`` target, and every remaining tag is removed.

    Args:
        text: The HTML source as a string.

    Returns:
        The cleaned text with leading and trailing whitespace removed
        and all characters lower-cased (this includes link URLs).
    """
    # Imported locally so the function works regardless of what the
    # enclosing module imports.
    import re

    # Order matters: later rules rely on the whitespace normalisation
    # done by the first two, and the catch-all tag remover must run
    # after the rules that give specific tags a textual meaning.
    rules = (
        (r'>\s+', u'>'),  # remove spaces after a tag opens or closes
        (r'\s+', u' '),  # collapse consecutive whitespace to one space
        (r'\s*<br\s*/?>\s*', u'\n'),  # newline for <br>
        (r'</(div)\s*>\s*', u'\n'),  # newline after </div>
        (r'</(p|h\d)\s*>\s*', u'\n\n'),  # blank line after </p>, </h1>, ...
        # remove <head> up to </head> (or up to the opening <body> tag)
        (r'<head>.*<\s*(/head|body)[^>]*>', u''),
        # show the link target instead of the link text; non-greedy so
        # several links on one line are handled individually
        (r'<a\s+href="([^"]+)"[^>]*>.*?</a>', r'\1'),
        (r'[ \t]*<[^<]*?/?>', u''),  # remove remaining tags
        (r'^\s+', u''),  # remove whitespace at the beginning
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text.rstrip().lower()