Last active
April 19, 2016 21:17
-
-
Save geoom/c81970ef0171ecbb1a1b to your computer and use it in GitHub Desktop.
Revisions
-
geoom revised this gist
Jun 8, 2015. 5 changed files with 264 additions and 78 deletions. There are no files selected for viewing
# ---- analizer module (gist revision of Jun 8, 2015) ----
# The module names used in these separators come from the
# `from analizer import ...` / `from handler import ...` statements of the
# mining script in this gist.


class TextualAnalizer(object):
    """Compute simple textual features for one piece of text (e.g. a tweet).

    The constructor keeps two views of the input: ``self.text`` (lowercased,
    as before) and a private untouched copy used for the case-sensitive
    features (capitalized words, the ``:D`` emoticon).  None of the ``get_*``
    methods mutate the instance, so they can be called in any order.
    """

    STOP_WORDS = ['a', 'about', 'above', 'after', 'again', 'against', 'all',
                  'am', 'an', 'and', 'any', 'are', 'aren\'t', 'as', 'at',
                  'be', 'because', 'been', 'before', 'being', 'below',
                  'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could',
                  'couldn\'t', 'did', 'didn\'t', 'do', 'does', 'doesn\'t',
                  'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for',
                  'from', 'further', 'had', 'hadn\'t', 'has', 'hasn\'t',
                  'have', 'haven\'t', 'having', 'he', 'he\'d', 'he\'ll',
                  'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself', 'him',
                  'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll',
                  'i\'m', 'i\'ve', 'if', 'in', 'into', 'is', 'isn\'t', 'it',
                  'it\'s', 'its', 'itself', 'let\'s', 'me', 'more', 'most',
                  'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off',
                  'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours',
                  'ourselves', 'out', 'over', 'own', 'same', 'shan\'t', 'she',
                  'she\'d', 'she\'ll', 'she\'s', 'should', 'shouldn\'t', 'so',
                  'some', 'such', 'than', 'that', 'that\'s', 'the', 'their',
                  'theirs', 'them', 'themselves', 'then', 'there', 'there\'s',
                  'these', 'they', 'they\'d', 'they\'ll', 'they\'re',
                  'they\'ve', 'this', 'those', 'through', 'to', 'too', 'under',
                  'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d',
                  'we\'ll', 'we\'re', 'we\'ve', 'were', 'weren\'t', 'what',
                  'what\'s', 'when', 'when\'s', 'where', 'where\'s', 'which',
                  'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with',
                  'won\'t', 'would', 'wouldn\'t', 'you', 'you\'d', 'you\'ll',
                  'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                  'yourselves']

    PUNCTUATION_MARKS = ['.', ',', '?', ':', ';', '-', '...']
    EXCLAMATION_MARK = '!'
    NOISE_MARKS = ['/', '&']

    POSITIVE_EMOTICONS = [':)', ':D']
    NEUTRAL_EMOTICONS = [':|']
    NEGATIVE_EMOTICONS = [':(', ':\'(']

    def __init__(self, text):
        """Store *text*; ``self.text`` holds its lowercased form."""
        # Callers may pass UTF-8 encoded bytes.  Under Python 3 these must be
        # decoded before any str operation; under Python 2 ``bytes is str``
        # so the branch is never taken.
        if isinstance(text, bytes) and not isinstance(text, str):
            text = text.decode('utf-8')
        self._raw_text = text
        self.text = text.lower()
        # Same output as the former `print "text is ", self.text`.
        print("text is  %s" % self.text)

    @staticmethod
    def _strip_terms(text, *terms_lists):
        """Return *text* with every term of every list removed.

        A bare string argument is treated as one term.  Longer terms are
        removed first so multi-character terms (emoticons, ``...``) are not
        broken up by the removal of one of their characters.
        """
        terms = []
        for term_list in terms_lists:
            if isinstance(term_list, str):
                terms.append(term_list)
            else:
                terms.extend(term_list)
        for term in sorted(terms, key=len, reverse=True):
            text = text.replace(term, '')
        return text

    @staticmethod
    def _count_terms(text, term_list):
        """Count non-overlapping occurrences in *text* of the given terms."""
        counter = 0
        for term in sorted(term_list, key=len, reverse=True):
            if not term:
                continue
            counter += text.count(term)
            # Blank out what was counted so a shorter term ('.') does not
            # count the characters of a longer one ('...') again.
            text = text.replace(term, ' ')
        return counter

    def _discard_terms(self, *terms_lists):
        """Remove the given terms from ``self.text`` in place."""
        self.text = self._strip_terms(self.text, *terms_lists)

    def _get_ocurrences_number(self, term_list):
        """Return how many times the terms occur in the original text.

        Counts every occurrence (consistent with
        ``get_exclamation_marks_number``) and is case-sensitive, so ``:D``
        can actually match.
        """
        return self._count_terms(self._raw_text, term_list)

    def get_stop_words_number(self):
        """Return how many word tokens of the text are stop words."""
        # Whole-token comparison: a substring test would count 'a' in
        # nearly every text.
        stop_words = set(self.STOP_WORDS)
        return sum(1 for word in self.get_words_list() if word in stop_words)

    def get_words_number(self, exclude_stop_words=False):
        """Return the number of words, optionally ignoring stop words."""
        return len(self.get_words_list(exclude_stop_words=exclude_stop_words))

    def get_words_list(self, only_uniques=False, exclude_stop_words=False):
        """Return the lowercased words of the text.

        Emoticons, punctuation, exclamation and noise marks are removed;
        tokens containing ``#``, ``http`` or ``@`` are dropped.  With
        *only_uniques* each word is returned once (first-seen order); with
        *exclude_stop_words* every stop word occurrence is removed.
        """
        # Emoticons go first, on the original-case text, so ':D' is found
        # and ':' is not stripped out from under them.
        text = self._strip_terms(self._raw_text, self.POSITIVE_EMOTICONS,
                                 self.NEUTRAL_EMOTICONS,
                                 self.NEGATIVE_EMOTICONS)
        text = self._strip_terms(text.lower(), self.PUNCTUATION_MARKS,
                                 self.EXCLAMATION_MARK, self.NOISE_MARKS)
        all_words = [item for item in text.split()
                     if '#' not in item and 'http' not in item
                     and '@' not in item]
        if exclude_stop_words:
            stop_words = set(self.STOP_WORDS)
            all_words = [word for word in all_words if word not in stop_words]
        if not only_uniques:
            return all_words
        seen = set()
        uniques = []
        for word in all_words:
            if word not in seen:
                seen.add(word)
                uniques.append(word)
        return uniques

    def get_punctuation_marks_number(self):
        """Return the number of punctuation marks outside emoticons."""
        without_emoticons = self._strip_terms(
            self._raw_text, self.POSITIVE_EMOTICONS, self.NEUTRAL_EMOTICONS,
            self.NEGATIVE_EMOTICONS)
        return self._count_terms(without_emoticons, self.PUNCTUATION_MARKS)

    def get_exclamation_marks_number(self):
        """Return the number of exclamation marks in the text."""
        return self._raw_text.count(self.EXCLAMATION_MARK)

    def get_capitalized_words_number(self):
        """Return how many whitespace-separated tokens start uppercase."""
        return sum(1 for word in self._raw_text.split() if word[0].isupper())

    def get_positive_emoticons_number(self):
        """Return the number of positive emoticons in the text."""
        return self._get_ocurrences_number(self.POSITIVE_EMOTICONS)

    def get_neutral_emoticons_number(self):
        """Return the number of neutral emoticons in the text."""
        return self._get_ocurrences_number(self.NEUTRAL_EMOTICONS)

    def get_negative_emoticons_number(self):
        """Return the number of negative emoticons in the text."""
        return self._get_ocurrences_number(self.NEGATIVE_EMOTICONS)


# ---- handler module (added in the gist revision of Jun 8, 2015) ----
import contextlib
import urllib
from xml.dom import minidom

try:  # Python 3
    from urllib.parse import quote_plus
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import quote_plus, urlopen

import settings
from twitter import OAuth, Twitter


class TwitterHandler(object):
    """Run a Twitter search for one query using credentials from settings."""

    def __init__(self, query):
        self.query = query
        self.twitter = Twitter(auth=OAuth(settings.ACCESS_KEY,
                                          settings.ACCESS_SECRET,
                                          settings.CONSUMER_KEY,
                                          settings.CONSUMER_SECRET))

    def get_product_tweets(self, count=5):
        """Return the ``"statuses"`` entry of a search for ``self.query``.

        *count* is forwarded to the search call (default 5, the value that
        used to be hard-coded).
        """
        results = self.twitter.search.tweets(q=self.query, count=count)
        return results["statuses"]


class FileHandler(object):
    """Append %-formatted rows to the file named by settings.OUTPUT_FILENAME."""

    def __init__(self, row_format_in_file):
        self.row_format_in_file = row_format_in_file
        self.output_file = settings.OUTPUT_FILENAME

    def save(self, stored_data):
        """Format *stored_data* with the row format and append it."""
        row = self.row_format_in_file % stored_data
        # `with` closes the handle even if the write fails.
        with open(self.output_file, "a") as output_file:
            output_file.write(row)

    def clean(self):
        """Truncate the output file."""
        with open(self.output_file, "w") as output_file:
            output_file.write('')


class DALHandler(object):
    """Fetch per-word affect data for a word list from the remote DAL CGI."""

    NEGATIVE_WORD, NEUTRAL_WORD, POSITIVE_WORD = (-1, 0, 1)

    def __init__(self, word_list):
        # Per-instance result lists: as class attributes they were shared,
        # so every new handler also counted the words of earlier ones.
        self.word_affect_list = []
        self.polarity_list = []
        # Percent-encode each word so apostrophes and other reserved
        # characters cannot corrupt the query string.
        query = '+'.join(quote_plus(word) for word in word_list)
        self.url = 'http://compling.org/cgi-bin/DAL_sentence_xml.cgi?sentence=%s' % query

    def _get_remote_document(self):
        """Download ``self.url`` and return it parsed as a minidom document."""
        with contextlib.closing(urlopen(self.url)) as response:
            remote_doc = response.read()
        return minidom.parseString(remote_doc)

    @staticmethod
    def get_polarity(valence):
        """Map a valence value to NEGATIVE_WORD, NEUTRAL_WORD or POSITIVE_WORD.

        The valence is divided by 3.0; a result below 0.5 is negative, above
        0.8 positive, anything else neutral.
        """
        valence = float(valence)
        normalization_factor = 3.0
        result = valence / normalization_factor
        if result < 0.5:
            return DALHandler.NEGATIVE_WORD
        elif result > 0.8:
            return DALHandler.POSITIVE_WORD
        return DALHandler.NEUTRAL_WORD

    def make_word_affect_list(self):
        """Fill ``word_affect_list`` and ``polarity_list`` from the remote doc.

        Each ``<word>`` element contributes ``(token text, polarity)``; the
        polarity is ``None`` when its ``valence`` attribute is empty.
        """
        doc = self._get_remote_document()
        words = doc.getElementsByTagName("word")
        for word in words:
            token_tag = word.getElementsByTagName('token')[0]
            emotion_measure_tag = word.getElementsByTagName('measure')[0]
            valence = emotion_measure_tag.getAttribute("valence")
            polarity = DALHandler.get_polarity(valence) if len(valence) > 0 else None
            self.word_affect_list.append(
                (token_tag.firstChild.data, polarity))
            self.polarity_list.append(polarity)
        print('word_affect_list %s' % (self.word_affect_list,))

    def get_positive_word_number(self):
        """Return how many collected words have positive polarity."""
        return self.polarity_list.count(DALHandler.POSITIVE_WORD)

    def get_negative_word_number(self):
        """Return how many collected words have negative polarity."""
        return self.polarity_list.count(DALHandler.NEGATIVE_WORD)

    def get_neutral_word_number(self):
        """Return how many collected words have neutral polarity."""
        return self.polarity_list.count(DALHandler.NEUTRAL_WORD)
# ---- mining script (gist revision of Jun 8, 2015) ----
from handler import FileHandler, TwitterHandler, DALHandler
from analizer import TextualAnalizer
import settings


class MinedProductTweet(object):
    """Textual and affect metrics computed for one tweet text."""

    def __init__(self, tweet_text):
        self.tweet_text = tweet_text

        self.positive_words_number = 0
        self.neutral_words_number = 0
        self.negative_words_number = 0

        self.stop_words_number = 0
        self.words_number = 0

        self.punctuation_marks_number = 0
        self.exclamation_marks_number = 0
        self.capitalized_words_number = 0

        self.positive_emoticons_number = 0
        self.neutral_emoticons_number = 0
        self.negative_emoticons_number = 0

    def make_data(self):
        """Fill every metric from TextualAnalizer and DALHandler."""
        analizer = TextualAnalizer(self.tweet_text)

        # Read the mark/emoticon metrics BEFORE anything that builds a word
        # list: the analyzer this script was written against strips those
        # marks from its text while building the list, which made all later
        # counts come out as 0.
        self.stop_words_number = analizer.get_stop_words_number()
        self.punctuation_marks_number = analizer.get_punctuation_marks_number()
        self.exclamation_marks_number = analizer.get_exclamation_marks_number()
        self.capitalized_words_number = analizer.get_capitalized_words_number()
        self.positive_emoticons_number = analizer.get_positive_emoticons_number()
        self.neutral_emoticons_number = analizer.get_neutral_emoticons_number()
        self.negative_emoticons_number = analizer.get_negative_emoticons_number()

        self.words_number = analizer.get_words_number()
        word_list = analizer.get_words_list(exclude_stop_words=True)

        handler = DALHandler(word_list)
        handler.make_word_affect_list()
        self.positive_words_number = handler.get_positive_word_number()
        self.neutral_words_number = handler.get_neutral_word_number()
        self.negative_words_number = handler.get_negative_word_number()


class MinedProduct(object):
    """Aggregated tweet metrics for one product hashtag."""

    # (attribute on this object, attribute averaged over the tweets).
    # `average_capitalized_marks_number` keeps its existing public name.
    _AVERAGED_METRICS = (
        ('average_positive_words_number', 'positive_words_number'),
        ('average_neutral_words_number', 'neutral_words_number'),
        ('average_negative_words_number', 'negative_words_number'),
        ('average_stop_words_number', 'stop_words_number'),
        ('average_words_number', 'words_number'),
        ('average_punctuation_marks_number', 'punctuation_marks_number'),
        ('average_exclamation_marks_number', 'exclamation_marks_number'),
        ('average_capitalized_marks_number', 'capitalized_words_number'),
        ('average_positive_emoticons_number', 'positive_emoticons_number'),
        ('average_neutral_emoticons_number', 'neutral_emoticons_number'),
        ('average_negative_emoticons_number', 'negative_emoticons_number'),
    )

    def __init__(self, hashtag):
        self.hashtag = hashtag
        # Per-instance list: as a class attribute it was shared by every
        # product, so each product also averaged the tweets of earlier ones.
        self.product_tweet_list = []

        self.tweets_number = 0
        self.retweet_percentage = 0
        self.price_from_amazon_seller = 0
        self.sell_raking = 0
        self.rating_by_clients = 0
        self.elapsed_time_since_release = 0

        self.average_positive_words_number = 0
        self.average_neutral_words_number = 0
        self.average_negative_words_number = 0

        self.average_stop_words_number = 0
        self.average_words_number = 0

        self.average_punctuation_marks_number = 0
        self.average_exclamation_marks_number = 0
        self.average_capitalized_marks_number = 0

        self.average_positive_emoticons_number = 0
        self.average_neutral_emoticons_number = 0
        self.average_negative_emoticons_number = 0

        self.acceptability = 0

    def make_data(self):
        """Fetch the product's tweets, mine each one and average the metrics.

        With no tweets the averages keep their initial value of 0 (the
        former ``zip(*[...])`` unpacking raised on an empty result).
        """
        handler = TwitterHandler(self.hashtag)
        tweet_results = handler.get_product_tweets()
        self.tweets_number = len(tweet_results)

        # Start from scratch so calling make_data() twice does not average
        # the same tweets twice.
        self.product_tweet_list = []
        for tweet in tweet_results:
            product_tweet = MinedProductTweet(tweet['text'].encode('utf-8'))
            product_tweet.make_data()
            self.product_tweet_list.append(product_tweet)

        if self.product_tweet_list:
            # float() keeps Python 2 from truncating the averages.
            tweets_count = float(len(self.product_tweet_list))
            for average_attr, tweet_attr in self._AVERAGED_METRICS:
                total = sum(getattr(product_tweet, tweet_attr)
                            for product_tweet in self.product_tweet_list)
                setattr(self, average_attr, total / tweets_count)

        print(self.__dict__)

    def calculate_acceptability(self):
        """Placeholder; not implemented yet."""
        pass

    def save(self):
        """Append this product's row to the output file via FileHandler."""
        row_format_in_file = "%(tweets_number)s, %(average_positive_words_number)s, " \
                             "%(average_neutral_words_number)s, %(average_negative_words_number)s\n"
        handler = FileHandler(row_format_in_file)
        handler.save(self.__dict__)


class Miner(object):
    """Mine and save every product hashtag listed in settings."""

    product_hashtag_list = settings.ALL_PRODUCT_HASTAGS

    def perform_mining(self):
        """Build, fill and save a MinedProduct for each configured hashtag."""
        for product_hashtag in self.product_hashtag_list:
            product = MinedProduct(product_hashtag)
            product.make_data()
            # product.calculate_acceptability()
            product.save()


# Only mine when run as a script, so importing the classes has no network
# or file side effects.
if __name__ == "__main__":
    miner = Miner()
    miner.perform_mining()
# ---- settings module (added in the gist revision of Jun 8, 2015) ----
# NOTE(review): the same revision also removed an 18-line file (diff hunk
# "@@ -1,18 +0,0 @@"); its content is not shown in this view.

# Twitter OAuth credentials, read by the handler module as
# settings.CONSUMER_KEY / CONSUMER_SECRET / ACCESS_KEY / ACCESS_SECRET.
# The values are redacted placeholders.
CONSUMER_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
CONSUMER_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
ACCESS_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
ACCESS_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

# File that mined product rows are appended to.
OUTPUT_FILENAME = "products.txt"

# Hashtags to mine, one product per entry.
ALL_PRODUCT_HASTAGS = ['#gopro']
geoom created this gist
Jun 7, 2015. There are no files selected for viewing
# ---- analizer module (initial gist revision of Jun 7, 2015) ----


class Analizer(object):
    """Stateless helpers that compute simple textual features of a text."""

    STOP_WORDS = ['a', 'about', 'above', 'after', 'again', 'against', 'all',
                  'am', 'an', 'and', 'any', 'are', 'aren\'t', 'as', 'at',
                  'be', 'because', 'been', 'before', 'being', 'below',
                  'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could',
                  'couldn\'t', 'did', 'didn\'t', 'do', 'does', 'doesn\'t',
                  'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for',
                  'from', 'further', 'had', 'hadn\'t', 'has', 'hasn\'t',
                  'have', 'haven\'t', 'having', 'he', 'he\'d', 'he\'ll',
                  'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself', 'him',
                  'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll',
                  'i\'m', 'i\'ve', 'if', 'in', 'into', 'is', 'isn\'t', 'it',
                  'it\'s', 'its', 'itself', 'let\'s', 'me', 'more', 'most',
                  'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off',
                  'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours',
                  'ourselves', 'out', 'over', 'own', 'same', 'shan\'t', 'she',
                  'she\'d', 'she\'ll', 'she\'s', 'should', 'shouldn\'t', 'so',
                  'some', 'such', 'than', 'that', 'that\'s', 'the', 'their',
                  'theirs', 'them', 'themselves', 'then', 'there', 'there\'s',
                  'these', 'they', 'they\'d', 'they\'ll', 'they\'re',
                  'they\'ve', 'this', 'those', 'through', 'to', 'too', 'under',
                  'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d',
                  'we\'ll', 'we\'re', 'we\'ve', 'were', 'weren\'t', 'what',
                  'what\'s', 'when', 'when\'s', 'where', 'where\'s', 'which',
                  'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with',
                  'won\'t', 'would', 'wouldn\'t', 'you', 'you\'d', 'you\'ll',
                  'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                  'yourselves']

    PUNCTUATION_MARKS = ['.', ',', '?', ':', ';', '-', '...', '/']
    EXCLAMATION_MARK = '!'

    POSITIVE_EMOTICONS = [':)', ':D']
    NEUTRAL_EMOTICONS = [':|']
    NEGATIVE_EMOTICONS = [':(', ':\'(']

    def _discard_terms(self, text, *terms_lists):
        """Return *text* with every term of every list removed.

        A bare string argument is treated as one term.  Longer terms are
        removed first so multi-character terms (emoticons, ``...``) are not
        broken up by the removal of one of their characters.
        """
        terms = []
        for term_list in terms_lists:
            if isinstance(term_list, str):
                terms.append(term_list)
            else:
                terms.extend(term_list)
        for term in sorted(terms, key=len, reverse=True):
            text = text.replace(term, '')
        return text

    def _get_ocurrences_number(self, text, term_list):
        """Count non-overlapping occurrences in *text* of the given terms.

        Every occurrence is counted (consistent with
        ``get_exclamation_marks_number``) and matching is case-sensitive so
        that ``:D`` can match.
        """
        counter = 0
        for term in sorted(term_list, key=len, reverse=True):
            if not term:
                continue
            counter += text.count(term)
            # Blank out what was counted so a shorter term ('.') does not
            # count the characters of a longer one ('...') again.
            text = text.replace(term, ' ')
        return counter

    def get_stop_words_number(self, text):
        """Return how many word tokens of *text* are stop words."""
        # Whole-token comparison: a substring test would count 'a' in
        # nearly every text.
        stop_words = set(self.STOP_WORDS)
        return sum(1 for word in self.get_words_list(text)
                   if word in stop_words)

    def get_words_number(self, text, exclude_stop_words=False):
        """Return the number of words, optionally ignoring stop words."""
        return len(self.get_words_list(text, exclude_stop_words))

    def get_words_list(self, text, exclude_stop_words=False):
        """Return the lowercased words of *text* without marks or emoticons.

        With *exclude_stop_words* every stop word occurrence is removed.
        """
        # TODO: discard hashtag
        # Emoticons go first, on the original-case text, so ':D' is found
        # and ':' is not stripped out from under them.
        text = self._discard_terms(text, self.POSITIVE_EMOTICONS,
                                   self.NEUTRAL_EMOTICONS,
                                   self.NEGATIVE_EMOTICONS)
        text = self._discard_terms(text.lower(), self.PUNCTUATION_MARKS,
                                   self.EXCLAMATION_MARK)
        # split() without argument also handles tabs/newlines and never
        # yields empty tokens.
        all_words = text.split()
        if exclude_stop_words:
            stop_words = set(self.STOP_WORDS)
            all_words = [word for word in all_words if word not in stop_words]
        return all_words

    def get_punctuation_marks_number(self, text):
        """Return the number of punctuation marks outside emoticons."""
        without_emoticons = self._discard_terms(
            text, self.POSITIVE_EMOTICONS, self.NEUTRAL_EMOTICONS,
            self.NEGATIVE_EMOTICONS)
        return self._get_ocurrences_number(without_emoticons,
                                           self.PUNCTUATION_MARKS)

    def get_exclamation_marks_number(self, text):
        """Return the number of exclamation marks in *text*."""
        return text.count(self.EXCLAMATION_MARK)

    def get_capitalized_words_number(self, text):
        """Return how many whitespace-separated tokens start uppercase."""
        return sum(1 for word in text.split() if word[0].isupper())

    def get_positive_emoticons_number(self, text):
        """Return the number of positive emoticons in *text*."""
        return self._get_ocurrences_number(text, self.POSITIVE_EMOTICONS)

    def get_neutral_emoticons_number(self, text):
        """Return the number of neutral emoticons in *text*."""
        return self._get_ocurrences_number(text, self.NEUTRAL_EMOTICONS)

    def get_negative_emoticons_number(self, text):
        """Return the number of negative emoticons in *text*."""
        return self._get_ocurrences_number(text, self.NEGATIVE_EMOTICONS)
# ---- tweet-count script (initial gist revision of Jun 7, 2015) ----
# Appends one row per product hashtag with the number of tweets found.
from twitter import OAuth, Twitter
import parser  # unused here; kept from the original import list
import pprint

CONSUMER_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXX"
CONSUMER_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXX"
ACCESS_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXX"
ACCESS_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXX"
OUTPUT_FILENAME = "products.txt"

products_to_miner = ['#gopro', '#iphone6']

twitter = Twitter(auth=OAuth(
    ACCESS_KEY, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET))

row_format_in_file = "%(tweets_number)s, ...\n"

# `with` closes the file even when a search call raises; open() replaces
# the Python 2-only file() builtin.
with open(OUTPUT_FILENAME, "a") as products_file:
    for product_query in products_to_miner:
        results = twitter.search.tweets(q=product_query, count=100)
        # pprint.pprint(results["statuses"][0])

        stored_data = {
            "tweets_number": len(results["statuses"])
        }

        # Write in file
        products_file.write(row_format_in_file % stored_data)


# ---- DAL lookup probe (initial gist revision of Jun 7, 2015) ----
# Sends a fixed word list to the remote DAL CGI and walks the <word>
# elements of the XML answer.
from xml.dom import minidom
import contextlib
import urllib

try:  # Python 3
    from urllib.parse import quote_plus
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import quote_plus, urlopen

words_list = ['these', 'are', '73iufweghj', 'words']

# Percent-encode each word so reserved characters cannot corrupt the query.
query = '+'.join(quote_plus(word) for word in words_list)
url = 'http://compling.org/cgi-bin/DAL_sentence_xml.cgi?sentence=%s'
with contextlib.closing(urlopen(url % query)) as response:
    remote_doc = response.read()

# doc = minidom.parse("sentences.xml")
doc = minidom.parseString(remote_doc)

words = doc.getElementsByTagName("word")

for word in words:
    token_tag = word.getElementsByTagName('token')[0]
    emotion_measure_tag = word.getElementsByTagName('measure')[0]
    # The valence attribute is read but not used further in the visible
    # part of this script.
    valence = emotion_measure_tag.getAttribute("valence")