@hudvin
Forked from anhpt379/tf_idf_final.py
Created August 20, 2014 08:00
Revisions

  1. @marcelcaraciolo created this gist Jan 13, 2012.
tf_idf_final.py

#-*- coding: utf-8 -*-

import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import bigrams, trigrams
import math


stopwords = nltk.corpus.stopwords.words('portuguese')
tokenizer = RegexpTokenizer(r"[\w’]+", flags=re.UNICODE)


def freq(word, doc):
    return doc.count(word)


def word_count(doc):
    return len(doc)


def tf(word, doc):
    return (freq(word, doc) / float(word_count(doc)))


def num_docs_containing(word, list_of_docs):
    count = 0
    for document in list_of_docs:
        if freq(word, document) > 0:
            count += 1
    return 1 + count


def idf(word, list_of_docs):
    return math.log(len(list_of_docs) /
                    float(num_docs_containing(word, list_of_docs)))


def tf_idf(word, doc, list_of_docs):
    return (tf(word, doc) * idf(word, list_of_docs))
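
# Illustrative check (not part of the original gist): with three toy token
# lists, a term concentrated in one document scores higher than a term that
# appears everywhere, and the 1 + count smoothing in num_docs_containing()
# keeps idf() defined even for a word seen in no document.
#
#   d1 = ['carro', 'azul', 'azul']; d2 = ['carro', 'vermelho']; d3 = ['carro', 'verde']
#   tf('azul', d1)                   -> 2/3 ~ 0.667
#   idf('azul', [d1, d2, d3])        -> log(3/2) ~ 0.405
#   tf_idf('azul', d1, [d1, d2, d3]) -> ~0.270
#   idf('carro', [d1, d2, d3])       -> log(3/4) ~ -0.288 (terms present in every
#                                       document can go slightly negative here)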

# Compute the frequency for each term.
vocabulary = []
docs = {}
all_tips = []
for tip in ['document 1', 'document 2']:
    # Placeholder strings; in practice each `tip` is the raw text of one document.
    tokens = tokenizer.tokenize(tip)

    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopwords]

    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in stopwords]

    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in stopwords]

    final_tokens = []
    final_tokens.extend(tokens)
    final_tokens.extend(bi_tokens)
    final_tokens.extend(tri_tokens)
    docs[tip] = {'freq': {}, 'tf': {}, 'idf': {},
                 'tf-idf': {}, 'tokens': []}

    for token in final_tokens:
        # The frequency computed for each tip
        docs[tip]['freq'][token] = freq(token, final_tokens)
        # The term frequency (normalized frequency)
        docs[tip]['tf'][token] = tf(token, final_tokens)
        docs[tip]['tokens'] = final_tokens

    vocabulary.append(final_tokens)

for doc in docs:
    for token in docs[doc]['tf']:
        # The inverse document frequency
        docs[doc]['idf'][token] = idf(token, vocabulary)
        # The tf-idf
        docs[doc]['tf-idf'][token] = tf_idf(token, docs[doc]['tokens'], vocabulary)
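
# At this point `docs` maps each raw document string to its per-token
# statistics, roughly this shape (illustrative, not printed by the script):
#   docs['document 1'] == {'freq': {...}, 'tf': {...}, 'idf': {...},
#                          'tf-idf': {...}, 'tokens': [...]}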

# Now let's find out the most relevant words by tf-idf.
words = {}
for doc in docs:
    for token in docs[doc]['tf-idf']:
        if token not in words:
            words[token] = docs[doc]['tf-idf'][token]
        else:
            if docs[doc]['tf-idf'][token] > words[token]:
                words[token] = docs[doc]['tf-idf'][token]

    print doc
    for token in docs[doc]['tf-idf']:
        print token, docs[doc]['tf-idf'][token]

for item in sorted(words.items(), key=lambda x: x[1], reverse=True):
    print "%f <= %s" % (item[1], item[0])