Last active
November 26, 2021 15:09
-
-
Save revox/49975a50d0b96cf580ca966d157186f6 to your computer and use it in GitHub Desktop.
Revisions
-
revox revised this gist
Nov 28, 2017 — 1 changed file with 0 additions and 3 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -2,9 +2,6 @@ from nltk.tokenize import TweetTokenizer from nltk.corpus import stopwords from datetime import datetime import pandas as pd import numpy as np import string -
revox created this gist
Nov 28, 2017. There are no files selected for viewing.
"""Word-frequency mining over a tweet dump.

Reads pipe-delimited tweets from ``brexit_data.csv`` (timestamp in column 1,
tweet text in column 2), tokenizes each message, counts token frequencies,
and writes the (word, count) table — most frequent first — to
``text_data.csv`` while echoing it to stdout.

NOTE(review): converted from Python 2 to Python 3 — the original used the
``print`` statement, the removed ``'rU'`` file mode, and ``.encode('utf-8')``
(which under Python 3 would emit ``b'...'`` reprs into the CSV).
"""
import csv
import string
import sys
from collections import Counter
from datetime import datetime

# conda install -c conda-forge matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer


def process(text, tokenizer=None, stopwords=()):
    """Tokenize *text*: 1. lowercase, 2. tokenize, 3. drop stop words, 4. drop digits.

    Args:
        text: raw message string.
        tokenizer: any object exposing ``tokenize(str) -> list[str]``.
            Defaults to a fresh ``TweetTokenizer`` created lazily — the
            original evaluated ``TweetTokenizer()`` once at def time, sharing
            one instance across all calls.
        stopwords: container of lowercase words to exclude. Default is an
            empty tuple (the original used a mutable ``[]`` default).

    Returns:
        List of lowercase tokens that are neither stop words nor pure digits.
    """
    if tokenizer is None:
        tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text.lower())
    return [word for word in tokens
            if word not in stopwords and not word.isdigit()]


def main():
    """Run the word-frequency mining pipeline end to end."""
    # *** word frequency mining ***
    tweet_tokenizer = TweetTokenizer()
    punct = list(string.punctuation)
    # download the 127 English stop words
    nltk.download('stopwords')
    # stop words + punctuation + Twitter noise tokens
    stopword_list = stopwords.words('english') + punct + ['rt', 'via']

    tf = Counter()   # number of occurrences of each token
    all_dates = []   # tweet timestamps (column 1), kept for later plotting
    # newline='' is the csv-module-recommended replacement for the old 'rU'
    # universal-newline mode, which Python 3.11 removed.
    with open('brexit_data.csv', newline='') as inputfile:
        tweetreader = csv.reader(inputfile, delimiter='|')
        # get the text and the time for each row
        for row in tweetreader:
            message = row[2]
            tokens = process(text=message,
                             tokenizer=tweet_tokenizer,
                             stopwords=stopword_list)
            all_dates.append(row[1])
            # update word frequencies with this tweet's tokens
            tf.update(tokens)

    # convert the counter to a list of (word, count) 2-tuples,
    # sorted by descending frequency
    tf_list_sorted = sorted(tf.items(), key=lambda pair: pair[1], reverse=True)

    # print each word and its frequency, and persist the table; the 'with'
    # block guarantees the output file is closed (the original leaked it)
    with open('text_data.csv', 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        for word, count in tf_list_sorted:
            print(word, count)
            csvwriter.writerow([word, count])


if __name__ == '__main__':
    main()