Skip to content

Instantly share code, notes, and snippets.

@revox
Last active November 26, 2021 15:09
Show Gist options
  • Save revox/49975a50d0b96cf580ca966d157186f6 to your computer and use it in GitHub Desktop.
Save revox/49975a50d0b96cf580ca966d157186f6 to your computer and use it in GitHub Desktop.

Revisions

  1. revox revised this gist Nov 28, 2017. 1 changed file with 0 additions and 3 deletions.
    3 changes: 0 additions & 3 deletions tweet_word_frequency.py
    Original file line number Diff line number Diff line change
    @@ -2,9 +2,6 @@
    from nltk.tokenize import TweetTokenizer
    from nltk.corpus import stopwords
    from datetime import datetime
    # conda install -c conda-forge matplotlib
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    import pandas as pd
    import numpy as np
    import string
  2. revox created this gist Nov 28, 2017.
    53 changes: 53 additions & 0 deletions tweet_word_frequency.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,53 @@
    import nltk, sys, csv
    from nltk.tokenize import TweetTokenizer
    from nltk.corpus import stopwords
    from datetime import datetime
    # conda install -c conda-forge matplotlib
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    import pandas as pd
    import numpy as np
    import string
    from collections import Counter


    # Tokenize a text: 1. lowercase, 2. tokenize, 3. stop-word removal, 4. digit removal.
    def process(text, tokenizer=None, stopwords=()):
        """Return the lowercased tokens of *text* without stop words or numbers.

        Args:
            text: The string to tokenize; it is lowercased before tokenizing.
            tokenizer: Object exposing ``tokenize(text)`` that returns an iterable
                of string tokens. When omitted, a ``TweetTokenizer`` is created.
            stopwords: Collection of tokens to drop. Membership is tested against
                the lowercased tokens, so entries should be lowercase. Note that
                inside this function the name shadows the ``nltk.corpus.stopwords``
                import; the parameter name is kept for existing keyword callers.

        Returns:
            A list of the remaining tokens in their original order. Tokens found
            in *stopwords* and tokens consisting only of digits are removed.
        """
        if tokenizer is None:
            # Created lazily instead of in the signature: a default written as
            # ``TweetTokenizer()`` is evaluated once at definition time and then
            # shared by every call.
            tokenizer = TweetTokenizer()
        tokens = tokenizer.tokenize(text.lower())
        return [word for word in tokens if word not in stopwords and not word.isdigit()]


    # *** word frequency mining ***
    # Counts how often each token occurs in the text column of brexit_data.csv,
    # prints the counts and writes them to text_data.csv (most frequent first).

    # Tokenizer reused for every tweet.
    tweet_tokenizer = TweetTokenizer()
    # Punctuation marks are filtered out together with the stop words.
    punct = list(string.punctuation)
    # Make sure the NLTK English stop-word corpus is available locally.
    nltk.download('stopwords')
    # English stop words, punctuation and the tokens 'rt' and 'via'.
    stopword_list = stopwords.words('english') + punct + ['rt', 'via']
    # Built once so the per-token membership test does not scan the list.
    stopword_lookup = frozenset(stopword_list)

    # Record the number of occurrences of each word.
    tf = Counter()
    all_dates = []
    # newline='' is what the csv module expects on Python 3; it replaces the
    # old 'rU' mode, which Python 3.11 no longer accepts.
    # NOTE(review): assumes the input file is UTF-8 encoded -- confirm.
    with open('brexit_data.csv', newline='', encoding='utf-8') as inputfile:
        tweetreader = csv.reader(inputfile, delimiter='|')
        # Column 1 holds the time and column 2 the text of each row.
        for row in tweetreader:
            if len(row) < 3:
                # Blank or truncated line: there is no text column to count.
                continue
            all_dates.append(row[1])
            tokens = process(text=row[2], tokenizer=tweet_tokenizer, stopwords=stopword_lookup)
            # Update the word frequencies.
            tf.update(tokens)

    # List of (word, count) 2-tuples ordered from most to least frequent.
    tf_list_sorted = tf.most_common()

    # Print each word with its frequency and store the same pairs as CSV rows.
    # The with-block guarantees the output file is flushed and closed.
    with open('text_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        for word, count in tf_list_sorted:
            print(word, count)
            csvwriter.writerow([word, count])