Skip to content

Instantly share code, notes, and snippets.

@brennerhaverlock
Last active August 11, 2022 10:11
Show Gist options
  • Select an option

  • Save brennerhaverlock/e1589d1f160ce20db4f94e98ef534612 to your computer and use it in GitHub Desktop.

Select an option

Save brennerhaverlock/e1589d1f160ce20db4f94e98ef534612 to your computer and use it in GitHub Desktop.

Revisions

  1. brennerhaverlock revised this gist Mar 13, 2019. 1 changed file with 0 additions and 5 deletions.
    5 changes: 0 additions & 5 deletions text_cleaning_nltk.py
    Original file line number Diff line number Diff line change
    @@ -4,11 +4,6 @@
    import re
    from nltk.corpus import stopwords
    import time
    from tensorflow.python.layers.core import Dense
    from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
    from tensorflow.python.ops import array_ops
    from tensorflow.python.ops import tensor_array_ops
    print('TensorFlow Version: {}'.format(tf.__version__))

    clean_texts = []
    for text in dataset.review:
  2. brennerhaverlock renamed this gist Mar 13, 2019. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. brennerhaverlock created this gist Mar 13, 2019.
    57 changes: 57 additions & 0 deletions NLP
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,57 @@
    import pandas as pd
    import numpy as np
    import tensorflow as tf
    import re
    from nltk.corpus import stopwords
    import time
    from tensorflow.python.layers.core import Dense
    from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
    from tensorflow.python.ops import array_ops
    from tensorflow.python.ops import tensor_array_ops
    print('TensorFlow Version: {}'.format(tf.__version__))

    # NOTE(review): the original ran the cleaning loop here, BEFORE
    # clean_text was defined, which raises NameError as soon as the
    # module is executed.  The identical loop that appears after the
    # function definition performs the work, so this premature copy
    # is removed.

    def clean_text(text, remove_stopwords=True):
        """Normalize a raw review string to reduce null word embeddings.

        Steps: lowercase, expand contractions, strip URLs and HTML
        remnants, strip punctuation, and optionally drop English
        stopwords.

        Args:
            text: raw review text (a str).
            remove_stopwords: when True, filter out NLTK English
                stopwords from the result.

        Returns:
            The cleaned, single-space-joined string.
        """
        # Convert words to lower case.
        text = text.lower()

        # Expand contractions.  Tokenize with a regex instead of
        # str.split(), which is not fool-proof when punctuation is glued
        # to a word, e.g. "Are you kidding?I think you aren't."
        # NOTE(review): `contractions` is a module-level mapping defined
        # elsewhere in this file -- confirm it is in scope.
        words = re.findall(r"[\w']+", text)
        text = " ".join(contractions.get(word, word) for word in words)

        # Remove links first (they may contain any characters).
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)

        # Remove HTML artifacts BEFORE the punctuation pass: the
        # character class below eats '/', which would turn '<br />'
        # into '<br >' and make the original late '<br />' substitution
        # a no-op.
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'\<a href', ' ', text)  # stray anchor-tag fragments
        text = re.sub(r'&amp;', '', text)      # HTML-escaped ampersand

        # Strip remaining unwanted characters, then leftover apostrophes.
        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'\'', ' ', text)

        # Optionally, remove stop words.
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            text = " ".join(w for w in text.split() if w not in stops)

        return text

    # Run every review in the dataset through the cleaner.
    clean_texts = [clean_text(review) for review in dataset.review]
    print("Texts are complete.")