Skip to content

Instantly share code, notes, and snippets.

@brennerhaverlock
Last active August 11, 2022 10:11
Show Gist options
  • Save brennerhaverlock/e1589d1f160ce20db4f94e98ef534612 to your computer and use it in GitHub Desktop.
Save brennerhaverlock/e1589d1f160ce20db4f94e98ef534612 to your computer and use it in GitHub Desktop.
clean_your_text_reviews
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from nltk.corpus import stopwords
import time
# Run every review through clean_text and collect the cleaned strings.
# NOTE(review): at this point in the file clean_text has not been defined yet
# (its def follows below), so when run top-to-bottom as a script this
# statement raises NameError -- confirm whether this early copy is intended.
clean_texts = [clean_text(review) for review in dataset.review]
print("Texts are complete.")
# Cache for the NLTK stop-word set so the corpus is read once, not per call.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK on first use."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def clean_text(text, remove_stopwords=True):
    """Normalize a review string for building word embeddings.

    Steps, in order: lower-case the text, strip links and HTML remnants,
    expand contractions, replace leftover punctuation with spaces, and
    optionally drop English stop words.

    Args:
        text: The raw review string.
        remove_stopwords: When true (the default), remove NLTK English stop
            words and collapse whitespace between the remaining words.

    Returns:
        The cleaned text as a single string.

    Note:
        Relies on a module-level ``contractions`` mapping (contraction ->
        expanded form) that is not defined in this section of the file.
    """
    # Convert words to lower case
    text = text.lower()

    # Remove links and HTML remnants BEFORE tokenizing. The tokenizer below
    # keeps only word characters and apostrophes, so running these patterns
    # afterwards (as before) could never match and left "http", "br", "amp"
    # behind as words.
    # \S+ stops at the end of the URL instead of eating the rest of the line.
    text = re.sub(r'https?://\S+', ' ', text)  # remove links
    text = re.sub(r'\<a href', ' ', text)  # remove html link tag
    text = re.sub(r'&amp;', ' ', text)
    text = re.sub(r'<br />', ' ', text)

    # Replace contractions with their longer forms.
    # We are not using "text.split()" here since it is not fool proof,
    # e.g. words followed by punctuation: "Are you kidding?I think you aren't."
    words = re.findall(r"[\w']+", text)
    text = " ".join(contractions.get(word, word) for word in words)

    # Replace remaining unwanted characters (e.g. "_" survives tokenizing,
    # and expanded contractions may contain punctuation) with spaces.
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        stops = _english_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)

    return text
# Build the list of cleaned reviews by applying clean_text to each entry of
# dataset.review, then report completion.
clean_texts = [clean_text(review) for review in dataset.review]
print("Texts are complete.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment