@hudvin
Forked from anhpt379/tf_idf_final.py
Created August 20, 2014 08:00
Revisions

  1. @marcelcaraciolo created this gist Jan 13, 2012.
tf_idf_final.py

#-*- coding: utf-8 -*-

import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import bigrams, trigrams
import math


stopwords = nltk.corpus.stopwords.words('portuguese')
tokenizer = RegexpTokenizer(r"[\w’]+", flags=re.UNICODE)


def freq(word, doc):
    return doc.count(word)


def word_count(doc):
    return len(doc)


def tf(word, doc):
    return (freq(word, doc) / float(word_count(doc)))


def num_docs_containing(word, list_of_docs):
    count = 0
    for document in list_of_docs:
        if freq(word, document) > 0:
            count += 1
    return 1 + count


def idf(word, list_of_docs):
    return math.log(len(list_of_docs) /
                    float(num_docs_containing(word, list_of_docs)))


def tf_idf(word, doc, list_of_docs):
    return (tf(word, doc) * idf(word, list_of_docs))
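
# Illustrative check (not part of the original gist): with three toy token
# lists, a term concentrated in one document scores higher than a term that
# appears everywhere, and the 1 + count smoothing in num_docs_containing()
# keeps idf() defined even for a word seen in no document.
#
#   d1 = ['carro', 'azul', 'azul']; d2 = ['carro', 'vermelho']; d3 = ['carro', 'verde']
#   tf('azul', d1)                   -> 2/3 ~ 0.667
#   idf('azul', [d1, d2, d3])        -> log(3/2) ~ 0.405
#   tf_idf('azul', d1, [d1, d2, d3]) -> ~0.270
#   idf('carro', [d1, d2, d3])       -> log(3/4) ~ -0.288 (terms present in every
#                                       document can go slightly negative here)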

# Compute the frequency for each term.
vocabulary = []
docs = {}
all_tips = []
for tip in ['document 1', 'document 2']:
    # Placeholder strings; in practice each `tip` is the raw text of one document.
    tokens = tokenizer.tokenize(tip)

    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopwords]

    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in stopwords]

    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in stopwords]

    final_tokens = []
    final_tokens.extend(tokens)
    final_tokens.extend(bi_tokens)
    final_tokens.extend(tri_tokens)
    docs[tip] = {'freq': {}, 'tf': {}, 'idf': {},
                 'tf-idf': {}, 'tokens': []}

    for token in final_tokens:
        # The frequency computed for each tip
        docs[tip]['freq'][token] = freq(token, final_tokens)
        # The term frequency (normalized frequency)
        docs[tip]['tf'][token] = tf(token, final_tokens)
        docs[tip]['tokens'] = final_tokens

    vocabulary.append(final_tokens)

for doc in docs:
    for token in docs[doc]['tf']:
        # The inverse document frequency
        docs[doc]['idf'][token] = idf(token, vocabulary)
        # The tf-idf
        docs[doc]['tf-idf'][token] = tf_idf(token, docs[doc]['tokens'], vocabulary)
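
# At this point `docs` maps each raw document string to its per-token
# statistics, roughly this shape (illustrative, not printed by the script):
#   docs['document 1'] == {'freq': {...}, 'tf': {...}, 'idf': {...},
#                          'tf-idf': {...}, 'tokens': [...]}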

# Now let's find out the most relevant words by tf-idf.
words = {}
for doc in docs:
    for token in docs[doc]['tf-idf']:
        if token not in words:
            words[token] = docs[doc]['tf-idf'][token]
        else:
            if docs[doc]['tf-idf'][token] > words[token]:
                words[token] = docs[doc]['tf-idf'][token]

    print doc
    for token in docs[doc]['tf-idf']:
        print token, docs[doc]['tf-idf'][token]

for item in sorted(words.items(), key=lambda x: x[1], reverse=True):
    print "%f <= %s" % (item[1], item[0])