Created
April 10, 2023 11:07
-
-
Save dynamicguy/afc2147b79bbd283a7a91b53eca5e299 to your computer and use it in GitHub Desktop.
Revisions
-
dynamicguy created this gist
Apr 10, 2023. There are no files selected for viewing
import re

# Ordered (compiled pattern, replacement) pairs applied by text_cleaner().
# Order matters: whitespace is normalised first, block-level tags are turned
# into newlines next, and only then are the remaining tags stripped.
_CLEANING_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Drop whitespace that directly follows any '>' character.
        (r'>\s+', '>'),
        # Collapse every whitespace run (including newlines) to one space.
        (r'\s+', ' '),
        # <br>, <br/>, <br /> become a single newline.
        (r'\s*<br\s*/?>\s*', '\n'),
        # A closing </div> becomes a single newline.
        (r'</(div)\s*>\s*', '\n'),
        # A closing </p> or </hN> becomes a blank line.
        (r'</(p|h\d)\s*>\s*', '\n\n'),
        # Remove everything from <head> up to </head> or the <body ...> tag.
        (r'<head>.*<\s*(/head|body)[^>]*>', ''),
        # Replace each link with its href. The link text must be matched
        # non-greedily: a greedy '.*' would run from the first link to the
        # last '</a>' on the line and silently drop every link in between.
        (r'<a\s+href="([^"]+)"[^>]*>.*?</a>', r'\1'),
        # Strip any tag that is still left, plus blanks/tabs before it.
        (r'[ \t]*<[^<]*?/?>', ''),
        # Remove leading whitespace at the very start of the text.
        (r'^\s+', ''),
    )
)


def text_cleaner(text: str) -> str:
    """Convert an HTML fragment to lower-cased plain text.

    The rules in ``_CLEANING_RULES`` are applied in order: whitespace is
    collapsed, ``<br>``/``</div>`` become a newline, ``</p>``/``</hN>``
    become a blank line, the ``<head>`` section is dropped, links are
    replaced by their ``href`` value and every remaining tag is removed.

    Note that whitespace directly after any ``>`` is removed, so text that
    follows a tag is joined to whatever precedes it without a space.

    Args:
        text: The HTML (or plain) text to clean.

    Returns:
        The cleaned text with leading/trailing whitespace removed,
        converted to lower case.
    """
    for pattern, replacement in _CLEANING_RULES:
        text = pattern.sub(replacement, text)
    # No rule depends on trailing whitespace, so stripping once at the end
    # gives the same result as stripping after every substitution.
    return text.rstrip().lower()