Last active
November 26, 2021 15:09
-
-
Save revox/49975a50d0b96cf580ca966d157186f6 to your computer and use it in GitHub Desktop.
Revisions
-
revox revised this gist
Nov 28, 2017 — 1 changed file with 0 additions and 3 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -2,9 +2,6 @@ from nltk.tokenize import TweetTokenizer from nltk.corpus import stopwords from datetime import datetime import pandas as pd import numpy as np import string -
revox created this gist
Nov 28, 2017. There are no files selected for viewing.
"""Word-frequency mining over a tweet dump.

Reads pipe-delimited tweets from ``brexit_data.csv`` (timestamp in column 1,
tweet text in column 2), tokenizes each message, counts token frequencies,
and writes the (word, count) table — most frequent first — to
``text_data.csv`` while echoing it to stdout.

NOTE(review): converted from Python 2 to Python 3 — the original used the
``print`` statement, the removed ``'rU'`` file mode, and ``.encode('utf-8')``
(which under Python 3 would emit ``b'...'`` reprs into the CSV).
"""
import csv
import string
import sys
from collections import Counter
from datetime import datetime

# conda install -c conda-forge matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer


def process(text, tokenizer=None, stopwords=()):
    """Tokenize *text*: 1. lowercase, 2. tokenize, 3. drop stop words, 4. drop digits.

    Args:
        text: raw message string.
        tokenizer: any object exposing ``tokenize(str) -> list[str]``.
            Defaults to a fresh ``TweetTokenizer`` created lazily — the
            original evaluated ``TweetTokenizer()`` once at def time, sharing
            one instance across all calls.
        stopwords: container of lowercase words to exclude. Default is an
            empty tuple (the original used a mutable ``[]`` default).

    Returns:
        List of lowercase tokens that are neither stop words nor pure digits.
    """
    if tokenizer is None:
        tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text.lower())
    return [word for word in tokens
            if word not in stopwords and not word.isdigit()]


def main():
    """Run the word-frequency mining pipeline end to end."""
    # *** word frequency mining ***
    tweet_tokenizer = TweetTokenizer()
    punct = list(string.punctuation)
    # download the 127 English stop words
    nltk.download('stopwords')
    # stop words + punctuation + Twitter noise tokens
    stopword_list = stopwords.words('english') + punct + ['rt', 'via']

    tf = Counter()   # number of occurrences of each token
    all_dates = []   # tweet timestamps (column 1), kept for later plotting
    # newline='' is the csv-module-recommended replacement for the old 'rU'
    # universal-newline mode, which Python 3.11 removed.
    with open('brexit_data.csv', newline='') as inputfile:
        tweetreader = csv.reader(inputfile, delimiter='|')
        # get the text and the time for each row
        for row in tweetreader:
            message = row[2]
            tokens = process(text=message,
                             tokenizer=tweet_tokenizer,
                             stopwords=stopword_list)
            all_dates.append(row[1])
            # update word frequencies with this tweet's tokens
            tf.update(tokens)

    # convert the counter to a list of (word, count) 2-tuples,
    # sorted by descending frequency
    tf_list_sorted = sorted(tf.items(), key=lambda pair: pair[1], reverse=True)

    # print each word and its frequency, and persist the table; the 'with'
    # block guarantees the output file is closed (the original leaked it)
    with open('text_data.csv', 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        for word, count in tf_list_sorted:
            print(word, count)
            csvwriter.writerow([word, count])


if __name__ == '__main__':
    main()