@pmbaumgartner
Created January 10, 2022 15:49
cleaning_tokenizer.py
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc


class CTLTokenizer(Tokenizer):
    # Subclassing approach via https://stackoverflow.com/a/58718664
    def __call__(self, string: str) -> Doc:
        string = self.clean_string(string)
        doc = super().__call__(string)
        return doc

    def clean_string(self, string: str) -> str:
        """String cleaning function. You can call this to clean a string
        without tokenizing, e.g.

            nlp.tokenizer.clean_string("Some example sentence")
        """
        if not string.endswith("."):
            string = string + "."
        return string
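To actually use a tokenizer subclass like this, you have to install it on a pipeline. A reasonable way (a sketch, not part of the original gist) is to construct the subclass from the pipeline's existing tokenizer settings, so the default prefix/suffix/infix rules are preserved and only the pre-tokenization cleaning is added. The `spacy.blank("en")` pipeline and the example sentence below are illustrative choices, not from the source:

```python
import spacy
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc


class CTLTokenizer(Tokenizer):
    # Clean the incoming string before delegating to the normal tokenizer.
    def __call__(self, string: str) -> Doc:
        return super().__call__(self.clean_string(string))

    def clean_string(self, string: str) -> str:
        # Same cleaning rule as the gist: ensure a trailing period.
        if not string.endswith("."):
            string = string + "."
        return string


nlp = spacy.blank("en")

# Rebuild the tokenizer as our subclass, reusing the existing rules so
# punctuation splitting and exceptions still behave as before.
old = nlp.tokenizer
nlp.tokenizer = CTLTokenizer(
    nlp.vocab,
    rules=old.rules,
    prefix_search=old.prefix_search,
    suffix_search=old.suffix_search,
    infix_finditer=old.infix_finditer,
    token_match=old.token_match,
)

doc = nlp("Some example sentence")
print(doc.text)                     # cleaned text, with the trailing period added
print([t.text for t in doc])
```

Because cleaning happens inside `__call__`, every route into the pipeline (`nlp(...)`, `nlp.pipe(...)`) gets the cleaned text, and `nlp.tokenizer.clean_string(...)` remains available for cleaning without tokenizing.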