Last active
August 11, 2022 10:11
-
-
Save brennerhaverlock/e1589d1f160ce20db4f94e98ef534612 to your computer and use it in GitHub Desktop.
Revisions
-
brennerhaverlock revised this gist
Mar 13, 2019. 1 changed file with 0 additions and 5 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -4,11 +4,6 @@ import re from nltk.corpus import stopwords import time clean_texts = [] for text in dataset.review: -
brennerhaverlock renamed this gist
Mar 13, 2019. 1 changed file with 0 additions and 0 deletions. There are no files selected for viewing
File renamed without changes. -
brennerhaverlock created this gist
Mar 13, 2019. There are no files selected for viewing
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from nltk.corpus import stopwords
import time
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import tensor_array_ops

print('TensorFlow Version: {}'.format(tf.__version__))

# Lazily-filled cache so the NLTK corpus is read at most once per process
# instead of once per cleaned text.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop-word list as a cached frozenset."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(text, remove_stopwords=True):
    """Remove unwanted characters and stopwords, and normalise the text.

    The goal is to produce fewer null word embeddings downstream.

    Args:
        text: The raw string to clean.
        remove_stopwords: When true (the default), drop NLTK English
            stop words from the result.

    Returns:
        The cleaned, lower-cased text as a single space-separated string.

    Note:
        Relies on a module-level ``contractions`` mapping (contraction ->
        expanded form) that is defined outside this block.
    """
    # Convert words to lower case
    text = text.lower()

    # Strip links and HTML remnants BEFORE tokenising: the tokeniser below
    # discards ':', '/', '<' and '>', so these patterns can never match
    # afterwards. The link pattern removes everything from the URL to the
    # end of its line.
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)  # remove links
    text = re.sub(r'\<a href', ' ', text)  # remove html link tag
    text = re.sub(r'<br />', ' ', text)

    # Replace contractions with their longer forms.
    # We are not using "text.split()" here since it is not fool proof,
    # e.g. words followed by punctuation: "Are you kidding?I think you aren't."
    words = re.findall(r"[\w']+", text)
    text = " ".join(
        contractions[word] if word in contractions else word
        for word in words
    )

    # Format words and remove unwanted characters
    # NOTE(review): after tokenising no '&' can remain, so this is a no-op;
    # the pattern may originally have been an HTML entity - confirm.
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        stops = _english_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)

    return text


# Clean every review once, now that clean_text is defined (running this loop
# above the definition raises NameError). `dataset` is expected to be defined
# elsewhere; it is not visible in this script.
clean_texts = [clean_text(text) for text in dataset.review]
print("Texts are complete.")