Skip to content

Instantly share code, notes, and snippets.

@brennerhaverlock
Last active August 11, 2022 10:11
Show Gist options
  • Select an option

  • Save brennerhaverlock/e1589d1f160ce20db4f94e98ef534612 to your computer and use it in GitHub Desktop.

Select an option

Save brennerhaverlock/e1589d1f160ce20db4f94e98ef534612 to your computer and use it in GitHub Desktop.

Revisions

  1. brennerhaverlock revised this gist Mar 13, 2019. 1 changed file with 0 additions and 5 deletions.
    5 changes: 0 additions & 5 deletions text_cleaning_nltk.py
    Original file line number Diff line number Diff line change
    @@ -4,11 +4,6 @@
    import re
    from nltk.corpus import stopwords
    import time
    from tensorflow.python.layers.core import Dense
    from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
    from tensorflow.python.ops import array_ops
    from tensorflow.python.ops import tensor_array_ops
    print('TensorFlow Version: {}'.format(tf.__version__))

    clean_texts = []
    for text in dataset.review:
  2. brennerhaverlock renamed this gist Mar 13, 2019. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. brennerhaverlock created this gist Mar 13, 2019.
    57 changes: 57 additions & 0 deletions NLP
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,57 @@
    import pandas as pd
    import numpy as np
    import tensorflow as tf
    import re
    from nltk.corpus import stopwords
    import time
    from tensorflow.python.layers.core import Dense
    from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
    from tensorflow.python.ops import array_ops
    from tensorflow.python.ops import tensor_array_ops
    print('TensorFlow Version: {}'.format(tf.__version__))

    # NOTE(review): the original ran the cleaning loop here, BEFORE
    # clean_text was defined, which raises NameError as soon as the
    # module is executed.  The identical loop that appears after the
    # function definition performs the work, so this premature copy
    # is removed.

    def clean_text(text, remove_stopwords=True):
        """Normalize a raw review string to reduce null word embeddings.

        Steps: lowercase, expand contractions, strip URLs and HTML
        remnants, strip punctuation, and optionally drop English
        stopwords.

        Args:
            text: raw review text (a str).
            remove_stopwords: when True, filter out NLTK English
                stopwords from the result.

        Returns:
            The cleaned, single-space-joined string.
        """
        # Convert words to lower case.
        text = text.lower()

        # Expand contractions.  Tokenize with a regex instead of
        # str.split(), which is not fool-proof when punctuation is glued
        # to a word, e.g. "Are you kidding?I think you aren't."
        # NOTE(review): `contractions` is a module-level mapping defined
        # elsewhere in this file -- confirm it is in scope.
        words = re.findall(r"[\w']+", text)
        text = " ".join(contractions.get(word, word) for word in words)

        # Remove links first (they may contain any characters).
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)

        # Remove HTML artifacts BEFORE the punctuation pass: the
        # character class below eats '/', which would turn '<br />'
        # into '<br >' and make the original late '<br />' substitution
        # a no-op.
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'\<a href', ' ', text)  # stray anchor-tag fragments
        text = re.sub(r'&amp;', '', text)      # HTML-escaped ampersand

        # Strip remaining unwanted characters, then leftover apostrophes.
        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'\'', ' ', text)

        # Optionally, remove stop words.
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            text = " ".join(w for w in text.split() if w not in stops)

        return text

    # Run every review in the dataset through the cleaner.
    clean_texts = [clean_text(review) for review in dataset.review]
    print("Texts are complete.")