@vgpena
Last active November 17, 2020 16:39

Revisions

  1. vgpena revised this gist Sep 3, 2017. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions readme.md
    @@ -1,7 +1,7 @@
    # Text Classification with Keras and TensorFlow
    ## Blog post is here
    ## Blog post is [here](https://vgpena.github.io/classifying-tweets-with-keras-and-tensorflow/)

    If you want an intro to neural nets and the "long version" of what this is and what it does, read my blog post.
    If you want an intro to neural nets and the "long version" of what this is and what it does, read my [blog post](https://vgpena.github.io/classifying-tweets-with-keras-and-tensorflow/).

    Data can be downloaded [here](http://thinknook.com/twitter-sentiment-analysis-training-corpus-dataset-2012-09-22/). Many thanks to ThinkNook for putting such a great resource out there.

  2. vgpena revised this gist Sep 3, 2017. 3 changed files with 147 additions and 0 deletions.
    52 changes: 52 additions & 0 deletions loadModel.py
    @@ -0,0 +1,52 @@
    import json
    import numpy as np
    import keras
    import keras.preprocessing.text as kpt
    from keras.preprocessing.text import Tokenizer
    from keras.models import model_from_json

    # we're still going to use a Tokenizer here, but we don't need to fit it
    tokenizer = Tokenizer(num_words=3000)
    # for human-friendly printing
    labels = ['negative', 'positive']

    # read in our saved dictionary
    with open('dictionary.json', 'r') as dictionary_file:
        dictionary = json.load(dictionary_file)

    # this utility makes sure that all the words in your input
    # are registered in the dictionary
    # before trying to turn them into a matrix.
    def convert_text_to_index_array(text):
        words = kpt.text_to_word_sequence(text)
        wordIndices = []
        for word in words:
            if word in dictionary:
                wordIndices.append(dictionary[word])
            else:
                print("'%s' not in training corpus; ignoring." % (word))
        return wordIndices

    # read in your saved model structure
    json_file = open('model.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    # and create a model from that
    model = model_from_json(loaded_model_json)
    # and weight your nodes with your saved values
    model.load_weights('model.h5')

    # okay here's the interactive part
    while 1:
        evalSentence = raw_input('Input a sentence to be evaluated, or Enter to quit: ')

        if len(evalSentence) == 0:
            break

        # format your input for the neural net
        testArr = convert_text_to_index_array(evalSentence)
        input = tokenizer.sequences_to_matrix([testArr], mode='binary')
        # predict which bucket your input belongs in
        pred = model.predict(input)
        # and print it for the humons
        print("%s sentiment; %f%% confidence" % (labels[np.argmax(pred)], pred[0][np.argmax(pred)] * 100))
    79 changes: 79 additions & 0 deletions makeModel.py
    @@ -0,0 +1,79 @@
    import json
    import keras
    import keras.preprocessing.text as kpt
    from keras.preprocessing.text import Tokenizer
    import numpy as np

    # extract data from a csv
    # notice the cool options to skip lines at the beginning
    # and to only take data from certain columns
    training = np.genfromtxt('/path/to/your/data.csv', delimiter=',', skip_header=1, usecols=(1, 3), dtype=None)

    # create our training data from the tweets
    train_x = [x[1] for x in training]
    # index all the sentiment labels
    train_y = np.asarray([x[0] for x in training])

    # only work with the 3000 most popular words found in our dataset
    max_words = 3000

    # create a new Tokenizer
    tokenizer = Tokenizer(num_words=max_words)
    # feed our tweets to the Tokenizer
    tokenizer.fit_on_texts(train_x)

    # Tokenizers come with a convenient list of words and IDs
    dictionary = tokenizer.word_index
    # Let's save this out so we can use it later
    with open('dictionary.json', 'w') as dictionary_file:
        json.dump(dictionary, dictionary_file)

    def convert_text_to_index_array(text):
        # `text_to_word_sequence` splits the text into individual words
        # (lowercased, with punctuation stripped), which we then map to
        # their IDs in the Tokenizer's dictionary.
        return [dictionary[word] for word in kpt.text_to_word_sequence(text)]

    allWordIndices = []
    # for each tweet, change each token to its ID in the Tokenizer's word_index
    for text in train_x:
        wordIndices = convert_text_to_index_array(text)
        allWordIndices.append(wordIndices)

    # now we have a list of all tweets converted to index arrays.
    # cast as an array for future usage.
    allWordIndices = np.asarray(allWordIndices)

    # create one-hot matrices out of the indexed tweets
    train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
    # treat the labels as categories
    train_y = keras.utils.to_categorical(train_y, 2)

    from keras.models import Sequential
    from keras.layers import Dense, Dropout, Activation

    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(train_x, train_y,
              batch_size=32,
              epochs=5,
              verbose=1,
              validation_split=0.1,
              shuffle=True)

    model_json = model.to_json()
    with open('model.json', 'w') as json_file:
        json_file.write(model_json)

    model.save_weights('model.h5')

    print('saved model!')
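
    If the one-hot conversion above feels opaque, this tiny self-contained example (mine, not part of the gist) shows what sequences_to_matrix and to_categorical actually produce:

    import keras
    from keras.preprocessing.text import Tokenizer

    tok = Tokenizer(num_words=10)
    tok.fit_on_texts(['the cat sat', 'the dog sat'])
    seqs = tok.texts_to_sequences(['the cat sat'])        # e.g. [[1, 3, 2]]
    print(tok.sequences_to_matrix(seqs, mode='binary'))   # one row of length 10, with 1.0 at each word index
    print(keras.utils.to_categorical([0, 1], 2))          # [[1., 0.], [0., 1.]]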
    16 changes: 16 additions & 0 deletions requirements.txt
    @@ -0,0 +1,16 @@
    backports.weakref==1.0rc1
    bleach==1.5.0
    funcsigs==1.0.2
    html5lib==0.9999999
    Keras==2.0.6
    Markdown==2.2.0
    mock==2.0.0
    numpy==1.13.1
    pbr==3.1.1
    protobuf==3.3.0
    PyYAML==3.12
    scipy==0.19.1
    six==1.10.0
    tensorflow==1.2.0
    Theano==0.9.0
    Werkzeug==0.12.2
  3. vgpena revised this gist Sep 3, 2017. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions readme.md
    @@ -12,6 +12,7 @@ You need Python 2 to run this project; I also recommend [Virtualenv](https://vir
    Run `pip install -r requirements.txt` to install everything listed there.

    ## Usage
    You only need to train your net once; after that, you can load the saved model and dictionary and use it whenever you want without retraining.

    ### Training
    Change line 10 of `makeModel.py` to point to wherever you downloaded your data as a CSV.
  4. vgpena revised this gist Sep 3, 2017. 1 changed file with 24 additions and 1 deletion.
    25 changes: 24 additions & 1 deletion readme.md
    @@ -1 +1,24 @@
    # Hi there
    # Text Classification with Keras and TensorFlow
    ## Blog post is here

    If you want an intro to neural nets and the "long version" of what this is and what it does, read my blog post.

    Data can be downloaded [here](http://thinknook.com/twitter-sentiment-analysis-training-corpus-dataset-2012-09-22/). Many thanks to ThinkNook for putting such a great resource out there.

    ## Installation

    You need Python 2 to run this project; I also recommend [Virtualenv](https://virtualenv.pypa.io/en/stable/) and [IPython](https://ipython.org/).

    Run `pip install -r requirements.txt` to install everything listed there.

    ## Usage

    ### Training
    Change line 10 of `makeModel.py` to point to wherever you downloaded your data as a CSV.

    Then run `python makeModel.py` (or, if you're in IPython, `run makeModel.py`). Then go do something else for the 40-60 minutes it takes to train your neural net.

    When training finishes, three new files will have been created: `dictionary.json`, `model.json`, and `model.h5`. You will need these to use the net.

    ### Classification
    To use the net to classify data, run `python loadModel.py` and type a sentence into the console when prompted. Hitting Enter without typing anything will quit the program.
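
    Before committing to the 40-60 minute training run, it may be worth sanity-checking that your CSV parses into the two columns `makeModel.py` expects. A quick sketch (the path below is a placeholder for wherever you saved the ThinkNook data):

    import numpy as np

    # placeholder path -- point this at your downloaded CSV
    rows = np.genfromtxt('/path/to/your/data.csv', delimiter=',',
                         skip_header=1, usecols=(1, 3), dtype=None)
    print(len(rows))   # number of tweets
    print(rows[0])     # (sentiment label, tweet text) -- the columns used for training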
  5. vgpena created this gist Sep 3, 2017.
    1 change: 1 addition & 0 deletions readme.md
    @@ -0,0 +1 @@
    # Hi there