"""Mine word frequencies from a pipe-delimited CSV of tweets.

Reads ``brexit_data.csv`` (columns: ..., date, text, ...), tokenizes each
tweet's text, removes stop words / punctuation / digits, tallies term
frequencies, and writes the (word, count) pairs — sorted by descending
frequency — to ``text_data.csv``.
"""
import csv
import string
import sys
from collections import Counter
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer


def process(text, tokenizer=None, stopwords=()):
    """Tokenize *text*: lowercase, tokenize, drop stop words and digits.

    Args:
        text: raw tweet text.
        tokenizer: object exposing ``tokenize(str) -> list[str]``; a fresh
            ``TweetTokenizer`` is created when omitted (avoids the original
            def-time-evaluated default instance).
        stopwords: container of tokens to exclude; a ``set`` gives O(1)
            membership tests. (Parameter name kept for caller
            compatibility, even though it shadows the nltk module.)

    Returns:
        list of lowercased tokens that are neither stop words nor digits.
    """
    if tokenizer is None:
        tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text.lower())
    return [tok for tok in tokens if tok not in stopwords and not tok.isdigit()]


def main():
    """Run the word-frequency mining pipeline end to end."""
    tweet_tokenizer = TweetTokenizer()
    # Download the English stop-word list (no-op if already cached locally).
    nltk.download('stopwords')
    # Stop words + punctuation + Twitter-specific noise tokens ('rt', 'via').
    # A set makes the per-token membership test O(1) instead of O(n).
    stopword_list = (set(stopwords.words('english'))
                     | set(string.punctuation)
                     | {'rt', 'via'})

    tf = Counter()          # word -> number of occurrences across all tweets
    all_dates = []          # collected as in the original (currently unused)

    # newline='' is the csv-module-recommended open mode; the original 'rU'
    # universal-newlines mode was removed in Python 3.11.
    with open('brexit_data.csv', newline='', encoding='utf-8') as inputfile:
        tweetreader = csv.reader(inputfile, delimiter='|')
        for row in tweetreader:
            # row[1] is presumably the timestamp, row[2] the tweet text
            # — TODO confirm against the data file.
            message = row[2]
            tokens = process(text=message,
                             tokenizer=tweet_tokenizer,
                             stopwords=stopword_list)
            all_dates.append(row[1])
            tf.update(tokens)

    # Sorted list of (word, count) 2-tuples, most frequent first.
    tf_list_sorted = sorted(tf.items(), key=lambda pair: pair[1], reverse=True)

    # Context manager guarantees the output file is closed (the original
    # leaked the handle). Writing str with encoding on open() replaces the
    # Py2-style .encode('utf-8'), which on Py3 would emit b'...' reprs.
    with open('text_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        for word, count in tf_list_sorted:
            print(word, count)
            csvwriter.writerow([word, count])


if __name__ == '__main__':
    main()