Skip to content

Instantly share code, notes, and snippets.

@revox
Last active November 26, 2021 15:09
Show Gist options
  • Select an option

  • Save revox/49975a50d0b96cf580ca966d157186f6 to your computer and use it in GitHub Desktop.

Select an option

Save revox/49975a50d0b96cf580ca966d157186f6 to your computer and use it in GitHub Desktop.
import nltk, sys, csv
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from datetime import datetime
# conda install -c conda-forge matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import numpy as np
import string
from collections import Counter
# function to tokenize a text: 1. lowercase, 2. tokenize, 3. stopwords removal, 4. digits removal
def process(text, tokenizer=None, stopwords=None):
    """Normalize and tokenize *text* for word-frequency mining.

    Steps: lowercase the text, tokenize it, then drop stop words and
    tokens that are purely digits.

    Args:
        text: raw message string.
        tokenizer: object exposing ``tokenize(str) -> list[str]``;
            a fresh ``TweetTokenizer`` is used when omitted.
        stopwords: iterable of tokens to filter out; ``None`` means
            no stop-word filtering.

    Returns:
        List of lowercase tokens with stop words and digit-only
        tokens removed.
    """
    # None sentinels avoid the mutable-default pitfall and the
    # def-time TweetTokenizer() instantiation of the original.
    if tokenizer is None:
        tokenizer = TweetTokenizer()
    # A set gives O(1) membership tests inside the comprehension.
    stopset = set(stopwords) if stopwords is not None else set()
    tokens = tokenizer.tokenize(text.lower())
    return [word for word in tokens if word not in stopset and not word.isdigit()]
# *** word frequency mining ****
# Shared tokenizer instance, reused for every tweet.
tweet_tokenizer = TweetTokenizer()
# Individual punctuation characters, filtered out alongside stop words.
punct = list(string.punctuation)
# Download the 127 English stop words (no-op if already cached locally).
nltk.download('stopwords')
# Combined filter list: stop words, punctuation, and Twitter noise tokens.
stopword_list = stopwords.words('english') + punct + ['rt', 'via']
# tf maps each token to its number of occurrences across all tweets.
tf = Counter()
all_dates = []
# NOTE(review): the original used mode 'rU', which was removed in
# Python 3.11; newline='' is what the csv module recommends instead.
with open('brexit_data.csv', 'r', newline='', encoding='utf-8') as inputfile:
    tweetreader = csv.reader(inputfile, delimiter='|')
    # Each row: column 1 holds the timestamp, column 2 the tweet text
    # (assumed from the indexing below — TODO confirm against the data file).
    for row in tweetreader:
        message = row[2]
        tokens = process(text=message, tokenizer=tweet_tokenizer, stopwords=stopword_list)
        all_dates.append(row[1])
        # update word frequency
        tf.update(tokens)
# Convert the counter to a list of (word, count) 2-tuples, most frequent first.
tf_list_sorted = sorted(tf.items(), key=lambda pair: pair[1], reverse=True)
# print each word and its frequency, and persist them as "word,count" rows.
# The with-block fixes the original's leaked file handle, and the Python 2
# `print`/`.encode('utf-8')` pair is replaced: under Python 3 the strings are
# already text, and encoding them would make csv write b'...' reprs.
with open('text_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    for word, count in tf_list_sorted:
        print(word, count)
        csvwriter.writerow([word, count])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment