Last active
August 11, 2022 10:11
-
-
Save brennerhaverlock/e1589d1f160ce20db4f94e98ef534612 to your computer and use it in GitHub Desktop.
clean_your_text_reviews
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import numpy as np | |
| import tensorflow as tf | |
| import re | |
| from nltk.corpus import stopwords | |
| import time | |
# NOTE: the cleaning loop that used to sit here ran before `clean_text` was
# defined (a NameError when the file is executed top to bottom) and was a
# verbatim duplicate of the loop that follows the function definition, so it
# has been removed; the reviews are cleaned once, after `clean_text` exists.
# Lazily-populated cache of the NLTK English stop-word set (see below).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop-word set, loading it only on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Build the set once instead of re-reading the corpus on every call.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(text, remove_stopwords=True):
    """Normalise a raw review string.

    Steps, in order:
      1. lower-case the text;
      2. strip links and the HTML fragments ``<a href`` / ``<br />``;
      3. tokenise on runs of word characters and apostrophes, replacing any
         token found in the module-level ``contractions`` mapping with its
         expansion;
      4. replace remaining underscores and apostrophes with spaces;
      5. optionally drop NLTK English stop words.

    Args:
        text: The review text (a ``str``).
        remove_stopwords: When true (the default), remove English stop words
            and collapse whitespace between the remaining words.

    Returns:
        The cleaned text as a single space-separated string.
    """
    text = text.lower()

    # Links and markup must be removed BEFORE tokenising: the tokeniser below
    # discards ':', '/', '<' and '>', after which these patterns could never
    # match and URL pieces / "br" would survive as ordinary words.
    text = re.sub(r'https?://[^\s"\'<>]+', ' ', text)  # remove links
    text = re.sub(r'<a href', ' ', text)  # remove html link tag
    text = re.sub(r'<br\s*/?>', ' ', text)  # remove html line breaks

    # Replace contractions with their longer forms.  A regex tokeniser is used
    # rather than text.split() so that words glued to punctuation
    # ("kidding?I think you aren't.") are still recognised.
    words = re.findall(r"[\w']+", text)
    text = " ".join(contractions.get(word, word) for word in words)

    # Format words and remove unwanted characters.  After tokenising, only
    # '_' (part of \w) and apostrophes can still be present.
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words.
    if remove_stopwords:
        stops = _english_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)

    return text
# Run every review in the dataset through clean_text, keeping the results in
# the same order as the source column.
clean_texts = [clean_text(review) for review in dataset.review]
print("Texts are complete.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment