# -*- coding: utf-8 -*-
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import bigrams, trigrams
import math

stopwords = nltk.corpus.stopwords.words('portuguese')
tokenizer = RegexpTokenizer("[\w’]+", flags=re.UNICODE)


def freq(word, doc):
    return doc.count(word)


def word_count(doc):
    return len(doc)


def tf(word, doc):
    return (freq(word, doc) / float(word_count(doc)))


def num_docs_containing(word, list_of_docs):
    count = 0
    for document in list_of_docs:
        if freq(word, document) > 0:
            count += 1
    return 1 + count


def idf(word, list_of_docs):
    return math.log(len(list_of_docs) /
            float(num_docs_containing(word, list_of_docs)))


def tf_idf(word, doc, list_of_docs):
    return (tf(word, doc) * idf(word, list_of_docs))
# Compute the frequency for each term.
vocabulary = []
docs = {}
all_tips = []
# Each entry below is the full text of one document as a plain string;
# replace the two placeholders with your real documents.
for tip in ['document 1', 'document 2']:
    tokens = tokenizer.tokenize(tip)  # tip is a plain string, so tokenize it directly
    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopwords]
    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in stopwords]
    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in stopwords]

    final_tokens = []
    final_tokens.extend(tokens)
    final_tokens.extend(bi_tokens)
    final_tokens.extend(tri_tokens)
    docs[tip] = {'freq': {}, 'tf': {}, 'idf': {},
                 'tf-idf': {}, 'tokens': []}

    for token in final_tokens:
        # The frequency computed for each tip
        docs[tip]['freq'][token] = freq(token, final_tokens)
        # The term frequency (normalized frequency)
        docs[tip]['tf'][token] = tf(token, final_tokens)

    docs[tip]['tokens'] = final_tokens
    vocabulary.append(final_tokens)

for doc in docs:
    for token in docs[doc]['tf']:
        # The inverse document frequency
        docs[doc]['idf'][token] = idf(token, vocabulary)
        # The tf-idf
        docs[doc]['tf-idf'][token] = tf_idf(token, docs[doc]['tokens'], vocabulary)

# Now let's find out the most relevant words by tf-idf.
words = {}
for doc in docs:
    for token in docs[doc]['tf-idf']:
        if token not in words:
            words[token] = docs[doc]['tf-idf'][token]
        else:
            if docs[doc]['tf-idf'][token] > words[token]:
                words[token] = docs[doc]['tf-idf'][token]

    print doc
    for token in docs[doc]['tf-idf']:
        print token, docs[doc]['tf-idf'][token]

for item in sorted(words.items(), key=lambda x: x[1], reverse=True):
    print "%f <= %s" % (item[1], item[0])
I got an error:
tokens = tokenizer.tokenize(tip.text)
AttributeError: 'str' object has no attribute 'text'
How do I pass the documents?
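That error comes from the example list: its items are plain strings, so they have no .text attribute. Either tokenize the string directly (as in the corrected listing above), or wrap each document in a small object that exposes .text, which is what the original loop expected. A minimal sketch of the latter, using a hypothetical Tip class and made-up document text:

class Tip(object):
    """Hypothetical wrapper giving each document the .text attribute the loop expects."""
    def __init__(self, text):
        self.text = text

tips = [Tip(u'primeiro documento de exemplo'), Tip(u'segundo documento de exemplo')]
for tip in tips:
    tokens = tokenizer.tokenize(tip.text)  # .text now exists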
Thank you for the code. It was really helpful!
Hey, were you able to resolve the AttributeError: 'str' object has no attribute 'text' error?
What are 'document 1' and 'document 2'?
Can you give an example?
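They are just placeholder strings standing in for the full text of two real documents. A minimal sketch, with two Portuguese sentences invented here purely for illustration:

# Each element is the complete text of one document (two made-up reviews).
docs_text = [
    u'O restaurante é muito bom e o preço é justo.',
    u'O hotel é caro, mas o restaurante do hotel é excelente.',
]

for tip in docs_text:
    tokens = tokenizer.tokenize(tip)
    # the rest of the loop above runs unchanged on these tokens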