brennerhaverlock · August 11, 2022 10:11
diff --git a/text_cleaning_nltk.py b/text_cleaning_nltk.py
 import pandas as pd
 import numpy as np
 import tensorflow as tf
 import re
 from nltk.corpus import stopwords
 import time

 # clean_text() is only defined further down, so calling it at this point
 # would raise NameError when the file runs top to bottom. The reviews are
 # cleaned by the loop that follows the definition; only the (harmless)
 # initialisation is kept here.
 clean_texts = []

 def clean_text(text, remove_stopwords=True):
     """Normalise a raw review string so fewer words miss the embedding vocabulary.

     Steps, in order: lower-case, strip links and HTML remnants, expand
     contractions, replace the remaining punctuation with spaces and,
     optionally, drop English stop words.

     Links and markup are removed *before* tokenising. The tokeniser keeps
     only word characters and apostrophes, so running it first destroys the
     ``://``, ``<``, ``>`` and ``&`` that the link/HTML patterns rely on and
     those patterns can then never match.

     Args:
         text: The raw text to clean. Must be a ``str``.
         remove_stopwords: When true (the default), drop every token found
             in NLTK's English stop-word list.

     Returns:
         The cleaned text as one space-separated string.
     """
     text = text.lower()

     # Strip links and HTML while the punctuation that identifies them is
     # still present. The link pattern stops at the first whitespace so the
     # rest of the sentence after a URL is preserved.
     text = re.sub(r'https?://\S+', '', text)  # remove links
     text = re.sub(r'<a href', ' ', text)  # remove html link tag
     text = re.sub(r'&amp;', '', text)
     text = re.sub(r'<br />', ' ', text)

     # Tokenise with a regex instead of text.split(): split() is not fool
     # proof when punctuation is glued to a word, e.g.
     # "Are you kidding?I think you aren't.". Apostrophes are kept so that
     # contractions can still be looked up.
     words = re.findall(r"[\w']+", text)
     # `contractions` is expected to be a mapping defined elsewhere in the
     # module (it is not part of this function).
     text = " ".join(
         contractions[word] if word in contractions else word
         for word in words
     )

     # Replace leftover punctuation (in practice underscores and anything the
     # contraction expansions introduced) and apostrophes with spaces.
     text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
     text = re.sub(r'\'', ' ', text)

     # Optionally, remove stop words
     if remove_stopwords:
         stops = _english_stopwords()
         text = " ".join(word for word in text.split() if word not in stops)

     return text


 # Filled on first use so the stop-word list is fetched and converted to a
 # set once, rather than on every clean_text() call.
 _STOPWORD_CACHE = {}


 def _english_stopwords():
     """Return NLTK's English stop words as a frozenset, building it only once."""
     if "english" not in _STOPWORD_CACHE:
         _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
     return _STOPWORD_CACHE["english"]
   
 # Run every review through clean_text(), keeping the dataset's order.
 clean_texts = [clean_text(review) for review in dataset.review]
 print("Texts are complete.")
	import pandas as pd
	import numpy as np
	import tensorflow as tf
	import re
	from nltk.corpus import stopwords
	import time

	# clean_text() is only defined further down, so calling it at this point
	# would raise NameError when the file runs top to bottom. The reviews are
	# cleaned by the loop that follows the definition; only the (harmless)
	# initialisation is kept here.
	clean_texts = []

	def clean_text(text, remove_stopwords=True):
	    """Normalise a raw review string so fewer words miss the embedding vocabulary.

	    Steps, in order: lower-case, strip links and HTML remnants, expand
	    contractions, replace the remaining punctuation with spaces and,
	    optionally, drop English stop words.

	    Links and markup are removed *before* tokenising. The tokeniser keeps
	    only word characters and apostrophes, so running it first destroys the
	    ``://``, ``<``, ``>`` and ``&`` that the link/HTML patterns rely on and
	    those patterns can then never match.

	    Args:
	        text: The raw text to clean. Must be a ``str``.
	        remove_stopwords: When true (the default), drop every token found
	            in NLTK's English stop-word list.

	    Returns:
	        The cleaned text as one space-separated string.
	    """
	    text = text.lower()

	    # Strip links and HTML while the punctuation that identifies them is
	    # still present. The link pattern stops at the first whitespace so the
	    # rest of the sentence after a URL is preserved.
	    text = re.sub(r'https?://\S+', '', text)  # remove links
	    text = re.sub(r'<a href', ' ', text)  # remove html link tag
	    text = re.sub(r'&amp;', '', text)
	    text = re.sub(r'<br />', ' ', text)

	    # Tokenise with a regex instead of text.split(): split() is not fool
	    # proof when punctuation is glued to a word, e.g.
	    # "Are you kidding?I think you aren't.". Apostrophes are kept so that
	    # contractions can still be looked up.
	    words = re.findall(r"[\w']+", text)
	    # `contractions` is expected to be a mapping defined elsewhere in the
	    # module (it is not part of this function).
	    text = " ".join(
	        contractions[word] if word in contractions else word
	        for word in words
	    )

	    # Replace leftover punctuation (in practice underscores and anything the
	    # contraction expansions introduced) and apostrophes with spaces.
	    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
	    text = re.sub(r'\'', ' ', text)

	    # Optionally, remove stop words
	    if remove_stopwords:
	        stops = _english_stopwords()
	        text = " ".join(word for word in text.split() if word not in stops)

	    return text


	# Filled on first use so the stop-word list is fetched and converted to a
	# set once, rather than on every clean_text() call.
	_STOPWORD_CACHE = {}


	def _english_stopwords():
	    """Return NLTK's English stop words as a frozenset, building it only once."""
	    if "english" not in _STOPWORD_CACHE:
	        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
	    return _STOPWORD_CACHE["english"]

	# Run every review through clean_text(), keeping the dataset's order.
	clean_texts = [clean_text(review) for review in dataset.review]
	print("Texts are complete.")