Skip to content

Instantly share code, notes, and snippets.

@revox
Last active November 26, 2021 15:09
Show Gist options
  • Select an option

  • Save revox/49975a50d0b96cf580ca966d157186f6 to your computer and use it in GitHub Desktop.

Select an option

Save revox/49975a50d0b96cf580ca966d157186f6 to your computer and use it in GitHub Desktop.
import nltk, sys, csv
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from datetime import datetime
# conda install -c conda-forge matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import numpy as np
import string
from collections import Counter
# function to tokenize a text: 1. lowercase, 2. tokenize, 3. stopwords removal, 4. digits removal
def process(text, tokenizer=None, stopwords=None):
    """Normalize and tokenize *text* for word-frequency mining.

    Steps: lowercase the text, tokenize it, then drop stop words and
    tokens that are purely digits.

    Args:
        text: raw message string.
        tokenizer: object exposing ``tokenize(str) -> list[str]``;
            a fresh ``TweetTokenizer`` is used when omitted.
        stopwords: iterable of tokens to filter out; ``None`` means
            no stop-word filtering.

    Returns:
        List of lowercase tokens with stop words and digit-only
        tokens removed.
    """
    # None sentinels avoid the mutable-default pitfall and the
    # def-time TweetTokenizer() instantiation of the original.
    if tokenizer is None:
        tokenizer = TweetTokenizer()
    # A set gives O(1) membership tests inside the comprehension.
    stopset = set(stopwords) if stopwords is not None else set()
    tokens = tokenizer.tokenize(text.lower())
    return [word for word in tokens if word not in stopset and not word.isdigit()]
# *** word frequency mining ****
# Shared tokenizer instance, reused for every tweet.
tweet_tokenizer = TweetTokenizer()
# Individual punctuation characters, filtered out alongside stop words.
punct = list(string.punctuation)
# Download the 127 English stop words (no-op if already cached locally).
nltk.download('stopwords')
# Combined filter list: stop words, punctuation, and Twitter noise tokens.
stopword_list = stopwords.words('english') + punct + ['rt', 'via']
# tf maps each token to its number of occurrences across all tweets.
tf = Counter()
all_dates = []
# NOTE(review): the original used mode 'rU', which was removed in
# Python 3.11; newline='' is what the csv module recommends instead.
with open('brexit_data.csv', 'r', newline='', encoding='utf-8') as inputfile:
    tweetreader = csv.reader(inputfile, delimiter='|')
    # Each row: column 1 holds the timestamp, column 2 the tweet text
    # (assumed from the indexing below — TODO confirm against the data file).
    for row in tweetreader:
        message = row[2]
        tokens = process(text=message, tokenizer=tweet_tokenizer, stopwords=stopword_list)
        all_dates.append(row[1])
        # update word frequency
        tf.update(tokens)
# Convert the counter to a list of (word, count) 2-tuples, most frequent first.
tf_list_sorted = sorted(tf.items(), key=lambda pair: pair[1], reverse=True)
# print each word and its frequency, and persist them as "word,count" rows.
# The with-block fixes the original's leaked file handle, and the Python 2
# `print`/`.encode('utf-8')` pair is replaced: under Python 3 the strings are
# already text, and encoding them would make csv write b'...' reprs.
with open('text_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    for word, count in tf_list_sorted:
        print(word, count)
        csvwriter.writerow([word, count])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment