# -*- coding: utf-8 -*-
import re
import math

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import bigrams, trigrams

stopwords = nltk.corpus.stopwords.words('portuguese')
tokenizer = RegexpTokenizer(r"[\w']+", flags=re.UNICODE)

def freq(word, doc):
    return doc.count(word)

def word_count(doc):
    return len(doc)

def tf(word, doc):
    return freq(word, doc) / float(word_count(doc))

def num_docs_containing(word, list_of_docs):
    count = 0
    for document in list_of_docs:
        if freq(word, document) > 0:
            count += 1
    return 1 + count

def idf(word, list_of_docs):
    return math.log(len(list_of_docs) /
                    float(num_docs_containing(word, list_of_docs)))

def tf_idf(word, doc, list_of_docs):
    return tf(word, doc) * idf(word, list_of_docs)

# Compute the frequency for each term.
vocabulary = []
docs = {}
all_tips = []
for tip in ['document 1', 'document 2']:
    tokens = tokenizer.tokenize(tip)

    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopwords]

    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in stopwords]

    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in stopwords]

    # Unigrams, bigrams and trigrams together form this document's token list.
    final_tokens = []
    final_tokens.extend(tokens)
    final_tokens.extend(bi_tokens)
    final_tokens.extend(tri_tokens)

    docs[tip] = {'freq': {}, 'tf': {}, 'idf': {}, 'tf-idf': {}, 'tokens': []}

    for token in final_tokens:
        # The raw frequency computed for each tip.
        docs[tip]['freq'][token] = freq(token, final_tokens)
        # The term frequency (normalized frequency).
        docs[tip]['tf'][token] = tf(token, final_tokens)
        docs[tip]['tokens'] = final_tokens

    vocabulary.append(final_tokens)

for doc in docs:
    for token in docs[doc]['tf']:
        # The inverse document frequency.
        docs[doc]['idf'][token] = idf(token, vocabulary)
        # The tf-idf.
        docs[doc]['tf-idf'][token] = tf_idf(token, docs[doc]['tokens'], vocabulary)

# Now let's find out the most relevant words by tf-idf.
words = {}
for doc in docs:
    for token in docs[doc]['tf-idf']:
        if token not in words:
            words[token] = docs[doc]['tf-idf'][token]
        else:
            if docs[doc]['tf-idf'][token] > words[token]:
                words[token] = docs[doc]['tf-idf'][token]

    print doc
    for token in docs[doc]['tf-idf']:
        print token, docs[doc]['tf-idf'][token]

for item in sorted(words.items(), key=lambda x: x[1], reverse=True):
    print "%f <= %s" % (item[1], item[0])
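A note on setup: nltk.corpus.stopwords.words('portuguese') raises a LookupError if the stopword corpus has not been fetched yet. A minimal one-time download, assuming a standard NLTK installation:

import nltk
nltk.download('stopwords')

The listing above uses Python 2 print statements; under Python 3 they would need to be written as print(...) calls.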