lppier · August 23, 2019 02:49 · Aug 23, 2019 · Aug 23, 2019
diff --git a/detect_percentage_english.py b/detect_percentage_english.py
@@ -0,0 +1,42 @@
+import string
+import urllib.request
+from nltk.corpus import words
+
+punctuation = set(string.punctuation)
+
+# Materialise the NLTK word list once as a set. words.words() returns a
+# list, so testing membership against it inside the loop re-scanned the
+# whole corpus for every single input word.
+english_vocab = set(words.words())
+
+
+def remove_punc(str):
+    """Return ``str`` with every ASCII punctuation character removed.
+
+    The parameter keeps its original name (which shadows the builtin
+    inside this function) so existing keyword callers keep working.
+    """
+    return ''.join(c for c in str if c not in punctuation)
+
+
+total_count = 0
+eng_count = 0
+
+with open('hsbc_th_supplement-pdf-page-1-text.txt') as f:
+    for line in f:
+        # Strip punctuation, lower-case, then split on whitespace.
+        text_words = remove_punc(line).lower().split()
+        print(text_words)
+        total_count += len(text_words)
+        for word in text_words:
+            print(f"Finding {word}")
+            # Lookup is case-sensitive, exactly as before.
+            if word in english_vocab:
+                eng_count += 1
+
+print('%s English words found' % eng_count)
+print('%s total words found' % total_count)
+
+# Guard against an empty input file to avoid dividing by zero.
+percentage_eng = 0 if total_count == 0 else (float(eng_count) / total_count * 100)
+print('%s%% of words were English' % percentage_eng)
No results found