Skip to content

Instantly share code, notes, and snippets.

@geoom
Last active April 19, 2016 21:17
Show Gist options
  • Select an option

  • Save geoom/c81970ef0171ecbb1a1b to your computer and use it in GitHub Desktop.

Select an option

Save geoom/c81970ef0171ecbb1a1b to your computer and use it in GitHub Desktop.

Revisions

  1. geoom revised this gist Jun 8, 2015. 5 changed files with 264 additions and 78 deletions.
    76 changes: 38 additions & 38 deletions analizer.py
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@

    class Analizer(object):
    class TextualAnalizer(object):

    STOP_WORDS = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any',
    'are', 'aren\'t', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
    @@ -20,79 +20,79 @@ class Analizer(object):
    'would', 'wouldn\'t', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours',
    'yourself', 'yourselves']

    PUNCTUATION_MARKS = ['.', ',', '?', ':', ';', '-', '...', '/']
    PUNCTUATION_MARKS = ['.', ',', '?', ':', ';', '-', '...']

    EXCLAMATION_MARK = '!'
    NOISE_MARKS = ['/', '&']

    POSITIVE_EMOTICONS = [':)', ':D']
    NEUTRAL_EMOTICONS = [':|']
    NEGATIVE_EMOTICONS = [':(', ':\'(']

    def __init__(self, text):
    self.text = text.lower()
    print "text is ", self.text

    def _discard_terms(self, text, *terms_lists):
    def _discard_terms(self, *terms_lists):

    for _, term_list in enumerate(terms_lists):
    for term in term_list:
    if term in text:
    text = text.replace(term, '')
    if term in self.text:
    self.text = self.text.replace(term, '')

    return text

    def _get_ocurrences_number(self, text, term_list):
    def _get_ocurrences_number(self, term_list):

    counter = 0
    text = text.lower()
    self.text = self.text.lower()

    for stop_word in term_list:
    if stop_word in text:
    if stop_word in self.text:
    counter += 1

    return counter

    def get_stop_words_number(self, text):
    return self._get_ocurrences_number(text, self.STOP_WORDS)

    def get_words_number(self, text, exclude_stop_words=False):
    return len(self.get_words_list(text, exclude_stop_words))
    def get_stop_words_number(self):
    return self._get_ocurrences_number(self.STOP_WORDS)

    def get_words_list(self, text, exclude_stop_words=False):
    def get_words_number(self, exclude_stop_words=False):
    return len(self.get_words_list(self.text, exclude_stop_words))

    # TODO: discard hashtag
    def get_words_list(self, only_uniques=False, exclude_stop_words=False):

    text = text.lower()
    self._discard_terms(self.PUNCTUATION_MARKS, self.EXCLAMATION_MARK,
    self.POSITIVE_EMOTICONS, self.NEUTRAL_EMOTICONS,
    self.NEGATIVE_EMOTICONS, self.NOISE_MARKS)

    text = self._discard_terms(text, self.PUNCTUATION_MARKS, self.EXCLAMATION_MARK,
    self.POSITIVE_EMOTICONS, self.NEUTRAL_EMOTICONS,
    self.NEGATIVE_EMOTICONS)
    all_words = self.text.strip().split(' ')

    all_words = text.strip().split(' ')

    all_words = filter(lambda item: item != '', all_words)
    all_words = filter(lambda item: item != '' and '#' not in item and 'http' not in item and '@' not in item,
    all_words)

    if exclude_stop_words:
    for excl_word in self.STOP_WORDS:
    if excl_word in all_words:
    all_words.remove(excl_word)

    return all_words
    return list(set(all_words)) if only_uniques else all_words

    def get_punctuation_marks_number(self, text):
    return self._get_ocurrences_number(text, self.PUNCTUATION_MARKS)
    def get_punctuation_marks_number(self):
    return self._get_ocurrences_number(self.PUNCTUATION_MARKS)

    def get_exclamation_marks_number(self, text):
    def get_exclamation_marks_number(self):

    exclamtion_string = filter(lambda item: item == self.EXCLAMATION_MARK, text)
    return len(exclamtion_string)
    exclamation_string = filter(lambda item: item == self.EXCLAMATION_MARK, self.text)
    return len(exclamation_string)

    def get_capitalized_words_number(self, text):
    ocurrences = [word for word in text if word[0].isupper()]
    def get_capitalized_words_number(self):
    ocurrences = [word for word in self.text if word[0].isupper()]
    return len(ocurrences)

    def get_positive_emoticons_number(self, text):
    return self._get_ocurrences_number(text, self.POSITIVE_EMOTICONS)
    def get_positive_emoticons_number(self):
    return self._get_ocurrences_number(self.POSITIVE_EMOTICONS)

    def get_neutral_emoticons_number(self):
    return self._get_ocurrences_number(self.NEUTRAL_EMOTICONS)

    def get_neutral_emoticons_number(self, text):
    return self._get_ocurrences_number(text, self.NEUTRAL_EMOTICONS)
    def get_negative_emoticons_number(self):
    return self._get_ocurrences_number(self.NEGATIVE_EMOTICONS)

    def get_negative_emoticons_number(self, text):
    return self._get_ocurrences_number(text, self.NEGATIVE_EMOTICONS)
    96 changes: 96 additions & 0 deletions handler.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,96 @@

    import urllib
    import settings
    from xml.dom import minidom
    from twitter import *


    class TwitterHandler(object):

    def __init__(self, query):
    self.query = query
    self.twitter = Twitter(auth=OAuth(settings.ACCESS_KEY, settings.ACCESS_SECRET,
    settings.CONSUMER_KEY, settings.CONSUMER_SECRET))

    def get_product_tweets(self):
    results = self.twitter.search.tweets(q=self.query, count=5)

    return results["statuses"]


    class FileHandler(object):

    def __init__(self, row_format_in_file):
    self.row_format_in_file = row_format_in_file
    self.output_file = settings.OUTPUT_FILENAME

    def save(self, stored_data):
    output_file = file(self.output_file, "a")

    row = self.row_format_in_file % stored_data
    output_file.write(row)

    output_file.close()

    def clean(self):
    output_file = file(self.output_file, "w")
    output_file.write('')
    output_file.close()


    class DALHandler(object):

    NEGATIVE_WORD, NEUTRAL_WORD, POSITIVE_WORD = (-1, 0, 1)
    word_affect_list = list()
    polarity_list = list()

    def __init__(self, word_list):
    query = '+'.join(word_list)
    self.url = 'http://compling.org/cgi-bin/DAL_sentence_xml.cgi?sentence=%s' % query

    def _get_remote_document(self):

    remote_doc = urllib.urlopen(self.url).read()
    parsed_doc = minidom.parseString(remote_doc)

    return parsed_doc

    @staticmethod
    def get_polarity(valence):

    valence = float(valence)
    normalization_factor = 3.0
    result = valence/normalization_factor

    if result < 0.5:
    return DALHandler.NEGATIVE_WORD
    elif result > 0.8:
    return DALHandler.POSITIVE_WORD

    return DALHandler.NEUTRAL_WORD

    def make_word_affect_list(self):
    doc = self._get_remote_document()
    words = doc.getElementsByTagName("word")

    for word in words:
    token_tag = word.getElementsByTagName('token')[0]
    emotion_measure_tag = word.getElementsByTagName('measure')[0]
    valence = emotion_measure_tag.getAttribute("valence")

    polarity = DALHandler.get_polarity(valence) if len(valence) > 0 else None

    self.word_affect_list.append(
    (token_tag.firstChild.data, polarity))
    self.polarity_list.append(polarity)

    print 'word_affect_list', self.word_affect_list

    def get_positive_word_number(self):
    return self.polarity_list.count(DALHandler.POSITIVE_WORD)

    def get_negative_word_number(self):
    return self.polarity_list.count(DALHandler.NEGATIVE_WORD)

    def get_neutral_word_number(self):
    return self.polarity_list.count(DALHandler.NEUTRAL_WORD)
    143 changes: 121 additions & 22 deletions miner.py
    Original file line number Diff line number Diff line change
    @@ -1,33 +1,132 @@
    from twitter import *
    import parser
    import pprint
    from handler import FileHandler, TwitterHandler, DALHandler
    from analizer import TextualAnalizer
    import settings

    CONSUMER_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXX"
    CONSUMER_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXX"
    ACCESS_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXX"
    ACCESS_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXX"

    OUTPUT_FILENAME = "products.txt"
    class MinedProductTweet(object):

    products_to_miner = ['#gopro', '#iphone6']
    def __init__(self, tweet_text):

    twitter = Twitter(auth=OAuth(
    ACCESS_KEY, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET))
    self.tweet_text = tweet_text

    products_file = file(OUTPUT_FILENAME, "a")
    self.positive_words_number = 0
    self.neutral_words_number = 0
    self.negative_words_number = 0

    for product_query in products_to_miner:
    results = twitter.search.tweets(q=product_query, count=100)
    self.stop_words_number = 0
    self.words_number = 0
    self.punctuation_marks_number = 0
    self.exclamation_marks_number = 0
    self.capitalized_words_number = 0

    # pprint.pprint(results["statuses"][0])
    self.positive_emoticons_number = 0
    self.neutral_emoticons_number = 0
    self.negative_emoticons_number = 0

    stored_data = {
    "tweets_number": len(results["statuses"])
    }
    def make_data(self):

    row_format_in_file = "%(tweets_number)s, ...\n"
    analizer = TextualAnalizer(self.tweet_text)

    # Write in file
    products_file.write(row_format_in_file % stored_data)
    self.stop_words_number = analizer.get_stop_words_number()
    self.words_number = analizer.get_words_number()

    self.punctuation_marks_number = analizer.get_punctuation_marks_number()
    self.exclamation_marks_number = analizer.get_exclamation_marks_number()
    self.capitalized_words_number = analizer.get_capitalized_words_number()

    self.positive_emoticons_number = analizer.get_positive_emoticons_number()
    self.neutral_emoticons_number = analizer.get_neutral_emoticons_number()
    self.negative_emoticons_number = analizer.get_negative_emoticons_number()

    word_list = analizer.get_words_list(exclude_stop_words=True)

    handler = DALHandler(word_list)
    handler.make_word_affect_list()

    self.positive_words_number = handler.get_positive_word_number()
    self.neutral_words_number = handler.get_neutral_word_number()
    self.negative_words_number = handler.get_negative_word_number()


    class MinedProduct(object):

    product_tweet_list = list()

    def __init__(self, hashtag):

    self.hashtag = hashtag

    self.tweets_number = 0
    self.retweet_percentage = 0

    self.price_from_amazon_seller = 0
    self.sell_raking = 0
    self.rating_by_clients = 0
    self.elapsed_time_since_release = 0

    self.average_positive_words_number = 0
    self.average_neutral_words_number = 0
    self.average_negative_words_number = 0

    self.average_stop_words_number = 0
    self.average_words_number = 0
    self.average_punctuation_marks_number = 0
    self.average_exclamation_marks_number = 0
    self.average_capitalized_marks_number = 0

    self.average_positive_emoticons_number = 0
    self.average_neutral_emoticons_number = 0
    self.average_negative_emoticons_number = 0

    self.acceptability = 0

    def make_data(self):
    handler = TwitterHandler(self.hashtag)

    tweet_results = handler.get_product_tweets()
    self.tweets_number = len(tweet_results)

    for tweet in tweet_results:
    product_tweet = MinedProductTweet(tweet['text'].encode('utf-8'))
    product_tweet.make_data()

    self.product_tweet_list.append(product_tweet)

    positive_words_number_list, neutral_words_number_list, \
    negative_words_number_list = zip(*[(product_tweet.positive_words_number, product_tweet.neutral_words_number,
    product_tweet.negative_words_number)
    for product_tweet in self.product_tweet_list])

    self.average_positive_words_number = sum(positive_words_number_list)/len(positive_words_number_list)
    self.average_neutral_words_number = sum(neutral_words_number_list)/len(neutral_words_number_list)
    self.average_negative_words_number = sum(negative_words_number_list)/len(negative_words_number_list)

    print self.__dict__

    def calculate_acceptability(self):
    pass

    def save(self):
    row_format_in_file = "%(tweets_number)s, %(average_positive_words_number)s, " \
    "%(average_neutral_words_number)s, %(average_negative_words_number)s\n"

    handler = FileHandler(row_format_in_file)
    handler.save(self.__dict__)


    class Miner(object):

    product_hashtag_list = settings.ALL_PRODUCT_HASTAGS

    def perform_mining(self):
    for product_hashtag in self.product_hashtag_list:
    product = MinedProduct(product_hashtag)
    product.make_data()

    # product.calculate_acceptability()
    product.save()


    miner = Miner()
    miner.perform_mining()

    products_file.close()
    18 changes: 0 additions & 18 deletions parser.py
    Original file line number Diff line number Diff line change
    @@ -1,18 +0,0 @@

from xml.dom import minidom
import urllib

# Exploratory script (deleted in this revision): queries the DAL web
# service with a fixed word list and walks the returned XML.
words_list = ['these', 'are', '73iufweghj', 'words']
query = '+'.join(words_list)
url = 'http://compling.org/cgi-bin/DAL_sentence_xml.cgi?sentence=%s'

# NOTE(review): network fetch at import time; urllib.urlopen is Python 2
# only (urllib.request.urlopen in Python 3).
remote_doc = urllib.urlopen(url % query).read()

# doc = minidom.parse("sentences.xml")
doc = minidom.parseString(remote_doc)

# Read each word's token and its valence measure attribute.
words = doc.getElementsByTagName("word")
for word in words:
    token_tag = word.getElementsByTagName('token')[0]
    emotion_measure_tag = word.getElementsByTagName('measure')[0]
    valence = emotion_measure_tag.getAttribute("valence")
    9 changes: 9 additions & 0 deletions settings.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,9 @@

# Twitter API credentials (placeholders -- replace with real values).
CONSUMER_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
CONSUMER_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
ACCESS_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
ACCESS_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

# File where the miner appends one row of aggregated numbers per product.
OUTPUT_FILENAME = "products.txt"

# Product hashtags to mine.
ALL_PRODUCT_HASTAGS = ['#gopro']
  2. geoom created this gist Jun 7, 2015.
    98 changes: 98 additions & 0 deletions analizer.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,98 @@

    class Analizer(object):

    STOP_WORDS = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any',
    'are', 'aren\'t', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
    'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could', 'couldn\'t', 'did',
    'didn\'t', 'do', 'does', 'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each',
    'few', 'for', 'from', 'further', 'had', 'hadn\'t', 'has', 'hasn\'t', 'have', 'haven\'t',
    'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself',
    'him', 'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if',
    'in', 'into', 'is', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'let\'s', 'me', 'more',
    'most', 'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only',
    'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', 'shan\'t',
    'she', 'she\'d', 'she\'ll', 'she\'s', 'should', 'shouldn\'t', 'so', 'some', 'such', 'than',
    'that', 'that\'s', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'there\'s',
    'these', 'they', 'they\'d', 'they\'ll', 'they\'re', 'they\'ve', 'this', 'those', 'through',
    'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d', 'we\'ll',
    'we\'re', 'we\'ve', 'were', 'weren\'t', 'what', 'what\'s', 'when', 'when\'s', 'where',
    'where\'s', 'which', 'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with', 'won\'t',
    'would', 'wouldn\'t', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours',
    'yourself', 'yourselves']

    PUNCTUATION_MARKS = ['.', ',', '?', ':', ';', '-', '...', '/']

    EXCLAMATION_MARK = '!'

    POSITIVE_EMOTICONS = [':)', ':D']
    NEUTRAL_EMOTICONS = [':|']
    NEGATIVE_EMOTICONS = [':(', ':\'(']


    def _discard_terms(self, text, *terms_lists):

    for _, term_list in enumerate(terms_lists):
    for term in term_list:
    if term in text:
    text = text.replace(term, '')

    return text

    def _get_ocurrences_number(self, text, term_list):

    counter = 0
    text = text.lower()

    for stop_word in term_list:
    if stop_word in text:
    counter += 1

    return counter

    def get_stop_words_number(self, text):
    return self._get_ocurrences_number(text, self.STOP_WORDS)

    def get_words_number(self, text, exclude_stop_words=False):
    return len(self.get_words_list(text, exclude_stop_words))

    def get_words_list(self, text, exclude_stop_words=False):

    # TODO: discard hashtag

    text = text.lower()

    text = self._discard_terms(text, self.PUNCTUATION_MARKS, self.EXCLAMATION_MARK,
    self.POSITIVE_EMOTICONS, self.NEUTRAL_EMOTICONS,
    self.NEGATIVE_EMOTICONS)

    all_words = text.strip().split(' ')

    all_words = filter(lambda item: item != '', all_words)

    if exclude_stop_words:
    for excl_word in self.STOP_WORDS:
    if excl_word in all_words:
    all_words.remove(excl_word)

    return all_words

    def get_punctuation_marks_number(self, text):
    return self._get_ocurrences_number(text, self.PUNCTUATION_MARKS)

    def get_exclamation_marks_number(self, text):

    exclamtion_string = filter(lambda item: item == self.EXCLAMATION_MARK, text)
    return len(exclamtion_string)

    def get_capitalized_words_number(self, text):
    ocurrences = [word for word in text if word[0].isupper()]
    return len(ocurrences)

    def get_positive_emoticons_number(self, text):
    return self._get_ocurrences_number(text, self.POSITIVE_EMOTICONS)

    def get_neutral_emoticons_number(self, text):
    return self._get_ocurrences_number(text, self.NEUTRAL_EMOTICONS)

    def get_negative_emoticons_number(self, text):
    return self._get_ocurrences_number(text, self.NEGATIVE_EMOTICONS)
    33 changes: 33 additions & 0 deletions miner.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,33 @@
    from twitter import *
    import parser
    import pprint

    CONSUMER_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXX"
    CONSUMER_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXX"
    ACCESS_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXX"
    ACCESS_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXX"

    OUTPUT_FILENAME = "products.txt"

    products_to_miner = ['#gopro', '#iphone6']

    twitter = Twitter(auth=OAuth(
    ACCESS_KEY, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET))

    products_file = file(OUTPUT_FILENAME, "a")

    for product_query in products_to_miner:
    results = twitter.search.tweets(q=product_query, count=100)

    # pprint.pprint(results["statuses"][0])

    stored_data = {
    "tweets_number": len(results["statuses"])
    }

    row_format_in_file = "%(tweets_number)s, ...\n"

    # Write in file
    products_file.write(row_format_in_file % stored_data)

    products_file.close()
    18 changes: 18 additions & 0 deletions parser.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,18 @@

from xml.dom import minidom
import urllib

# Exploratory script: sends a fixed word list to the DAL (Dictionary of
# Affect in Language) web service and walks the XML it returns.
words_list = ['these', 'are', '73iufweghj', 'words']
query = '+'.join(words_list)
url = 'http://compling.org/cgi-bin/DAL_sentence_xml.cgi?sentence=%s'

# NOTE(review): network fetch happens at import time; urllib.urlopen is
# Python 2 only (urllib.request.urlopen in Python 3).
remote_doc = urllib.urlopen(url % query).read()

# doc = minidom.parse("sentences.xml")
doc = minidom.parseString(remote_doc)

# Walk each <word>: its <token> child plus the valence attribute of its
# <measure> child. Values are read but not yet used further.
words = doc.getElementsByTagName("word")
for word in words:
    token_tag = word.getElementsByTagName('token')[0]
    emotion_measure_tag = word.getElementsByTagName('measure')[0]
    valence = emotion_measure_tag.getAttribute("valence")