# Ordered (pattern, replacement) pairs applied by text_cleaner().
# Order matters: whitespace is collapsed to single spaces first, so "." in the
# later patterns never has to cross a newline, and block-level closing tags are
# converted to newlines before the catch-all rule strips every remaining tag.
_TEXT_CLEANER_RULES = (
    (r'>\s+', u'>'),  # remove spaces after a tag opens or closes
    (r'\s+', u' '),  # replace consecutive whitespace with one space
    (r'\s*<br\s*/?>\s*', u'\n'),  # newline after a <br>
    (r'</(div)\s*>\s*', u'\n'),  # newline after </div>
    (r'</(p|h\d)\s*>\s*', u'\n\n'),  # blank line after </p> and </h1>...
    (r'<head>.*<\s*(/head|body)[^>]*>', u''),  # remove <head> to </head>
    # Show the link target instead of the link text. The text match is
    # non-greedy so that several links on one line are handled one by one
    # rather than swallowed from the first <a ...> to the last </a>.
    (r'<a\s+href="([^"]+)"[^>]*>.*?</a>', r'\1'),
    (r'[ \t]*<[^<]*?/?>', u''),  # remove remaining tags
    (r'^\s+', u''),  # remove whitespace at the beginning
)


def text_cleaner(text):
    """Convert an HTML fragment to lower-cased plain text.

    The rules in ``_TEXT_CLEANER_RULES`` are applied in order with
    ``re.sub``: whitespace is normalised, ``<br>``, ``</div>``, ``</p>`` and
    ``</hN>`` become line breaks, the ``<head>`` section is dropped, anchors
    are replaced by their ``href`` value, and all remaining tags are removed.

    Matching is case-sensitive, so only lower-case tag names trigger the
    tag-specific rules; other tags are still removed by the catch-all rule.

    :param text: HTML markup as a string.
    :returns: the cleaned text, with trailing whitespace stripped and all
        characters lower-cased.
    """
    for pattern, replacement in _TEXT_CLEANER_RULES:
        text = re.sub(pattern, replacement, text)
    return text.rstrip().lower()