#!/usr/bin/env python
"""Build an HTML word cloud from the messages in a Facebook news feed.

Run as a script with a Facebook access token as the only argument. The HTML
page is written to stdout; request logging goes to stderr.
"""

import contextlib
import json
import sys
from collections import defaultdict
from xml.sax.saxutils import escape

try:
    import urllib2
except ImportError:  # Python 3: urlopen lives in urllib.request
    import urllib.request as urllib2

try:
    from nltk import word_tokenize
except ImportError:
    # nltk is only used to tokenize messages in main(); the counting and
    # rendering helpers below stay importable without it.
    word_tokenize = None


IGNORED_WORDS = ["!", ".", ",", "(", ")", "'s", ":", "?", "...", "$", "<",
                 ">", "''", "``", "-", "c", "'", "--", "&", "and", "the",
                 "or", "not", "i", "you", "to", "this", "of", "in", "for",
                 "a", "an", "and", "your", "with", "me", "my", "be", "these",
                 "that", "do", "at", "no", "so", "1", "2", "3", "4", "5",
                 "6", "7", "8", "9", "0", "http", "@", "is", "am", "are",
                 "it", "if", "n't", "'em", "from", "one", "on", "up", "like",
                 "we", "their", "they", "'ll", "'d", "'m",
                 "//www.youtube.com/watch", "by", "have", "just", "will",
                 "as"]


def get_json(url):
    """
    Return the parsed JSON from a url.

    The request is logged to stderr (not stdout) so the log line, which
    contains the access token, never ends up inside the generated HTML.
    """
    sys.stderr.write("GET %s\n" % url)
    # closing() guarantees the connection is released even if read() fails.
    with contextlib.closing(urllib2.urlopen(url)) as response:
        return json.loads(response.read().decode("utf-8"))


def fb_feed_url(access_token):
    """
    The graph api url for your facebook feed.
    access_token should be a valid facebook access token as a string.
    """
    return "https://graph.facebook.com/me/home?access_token=%s" % access_token


def fb_news_feed(access_token, max_pages=100):
    """
    Return up to max_pages pages from a facebook news feed as a list of
    parsed json.

    Paging stops early as soon as a page carries no 'paging' -> 'next' link.
    """
    feed_url = fb_feed_url(access_token)
    pages = []
    for _ in range(max_pages):
        page = get_json(feed_url)
        pages.append(page)
        # Stop if there's no link to a further page.
        next_url = page.get('paging', {}).get('next')
        if not next_url:
            break
        feed_url = next_url
    return pages


def word_count(words):
    """
    Count the occurrences of each word.

    Words should be a list of strings that you might get from str.split()
    or nltk.word_tokenize(). Words whose lowercase form is in IGNORED_WORDS
    are skipped; the remaining words are counted case-sensitively, so
    "Hello" and "hello" get separate entries.

    Returns a defaultdict mapping word -> count (missing words read as 0).
    """
    ignored = frozenset(IGNORED_WORDS)  # O(1) lookups instead of a list scan
    seen = defaultdict(int)
    for w in words:
        if w.lower() not in ignored:
            seen[w] += 1
    return seen


def word_cloud_sizes(counts, min_size=1, max_size=5):
    """
    Map each word that occurs more than once to a display size.

    counts is a mapping of word -> occurrence count. Sizes scale linearly
    with the count, from min_size up to max_size for the most frequent word.
    Words with a count of 1 are left out of the result; looking one up
    yields min_size because the result is a defaultdict.

    Returns an empty mapping when counts is empty.
    """
    word_sizes = defaultdict(lambda: min_size)
    if not counts:
        return word_sizes
    max_count = max(counts.values())
    span = max_size - min_size
    for word, count in counts.items():
        if count > 1:
            word_sizes[word] = int(float(count) * span / max_count) + min_size
    return word_sizes


def word_cloud_header():
    """Return the HTML that precedes the word cloud body."""
    return """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>My Facebook Word Cloud</title>
</head>
<body>
"""


def word_cloud_footer():
    """Return the HTML that closes the page opened by word_cloud_header()."""
    return """
</body>
</html>
"""


def word_cloud_html(sizes):
    """
    Render a mapping of word -> size as space-separated HTML spans.

    Each word is HTML-escaped (it originates from feed posts) and wrapped in
    a span whose font size, in em, is the word's size.
    """
    def word_html(word, size):
        return '<span style="font-size: %dem">%s</span>' % (size, escape(word))

    return ' '.join([word_html(w, s) for w, s in sizes.items()])


def main():
    """Fetch the feed for the token in sys.argv[1] and print the word cloud."""
    if len(sys.argv) != 2:
        sys.exit("usage: %s ACCESS_TOKEN" % sys.argv[0])
    if word_tokenize is None:
        sys.exit("nltk is required to tokenize messages")

    pages = fb_news_feed(sys.argv[1])
    messages = [post['message']
                for page in pages
                for post in page.get('data', [])
                if 'message' in post]
    tokens = [token for msg in messages for token in word_tokenize(msg)]
    sizes = word_cloud_sizes(word_count(tokens))

    page_html = ''.join([
        word_cloud_header(),
        word_cloud_html(sizes),
        word_cloud_footer()])
    # Write UTF-8 bytes directly so redirecting to a file works regardless of
    # the terminal/locale encoding (Python 3 exposes the byte stream as
    # sys.stdout.buffer; Python 2's sys.stdout already accepts bytes).
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write((page_html + "\n").encode("utf-8"))


if __name__ == "__main__":
    main()