Skip to content

Instantly share code, notes, and snippets.

@Karmak23
Forked from shlomibabluki/np_extractor.py
Created October 21, 2015 09:32
Show Gist options
  • Select an option

  • Save Karmak23/0b22335748f071e3b68f to your computer and use it in GitHub Desktop.

Select an option

Save Karmak23/0b22335748f071e3b68f to your computer and use it in GitHub Desktop.

Revisions

  1. @shlomibabluki shlomibabluki revised this gist May 9, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion np_extractor.py
    Original file line number Diff line number Diff line change
    @@ -4,7 +4,7 @@

    # This is a fast and simple noun phrase extractor (based on NLTK)
    # Feel free to use it, just keep a link back to this post
    # URL
    # http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
    # Created by Shlomi Babluki
    # May, 2013

  2. @shlomibabluki shlomibabluki created this gist May 8, 2013.
    111 changes: 111 additions & 0 deletions np_extractor.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,111 @@
    # coding=UTF-8
    import nltk
    from nltk.corpus import brown

    # This is a fast and simple noun phrase extractor (based on NLTK)
    # Feel free to use it, just keep a link back to this post
    # URL
    # Created by Shlomi Babluki
    # May, 2013


    # This is our fast Part of Speech tagger
    #############################################################################
    # Training data: the tagged sentences of the Brown corpus 'news' category.
    # Note that this is evaluated at import time, so importing the module
    # requires the Brown corpus to be available to NLTK.
    brown_train = brown.tagged_sents(categories='news')
    # Last-resort tagger that guesses a tag from the surface form of a token.
    # The order of the patterns matters (presumably the first pattern that
    # matches wins -- confirm against the NLTK RegexpTagger documentation);
    # the final catch-all '.*' means every token ends up with some tag.
    regexp_tagger = nltk.RegexpTagger(
    # Numbers. NOTE(review): the '.' before the fractional part is unescaped,
    # so it accepts any separator character ("1,000", "3/4", but also "1a5").
    [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
    # A token ending in a dash, colon or semicolon.
    (r'(-|:|;)$', ':'),
    # NOTE(review): as written this pattern can only match a token made of
    # zero or more apostrophes (assuming matching starts at the beginning of
    # the token); tagging that as 'MD' looks unintended -- confirm.
    (r'\'*$', 'MD'),
    # Articles.
    (r'(The|the|A|a|An|an)$', 'AT'),
    # Suffix and capitalisation heuristics.
    (r'.*able$', 'JJ'),
    (r'^[A-Z].*$', 'NNP'),
    (r'.*ness$', 'NN'),
    (r'.*ly$', 'RB'),
    (r'.*s$', 'NNS'),
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),
    # Catch-all: anything else is treated as a noun.
    (r'.*', 'NN')
    ])
    # Back-off chain: the bigram tagger falls back to the unigram tagger,
    # which in turn falls back to the regular-expression tagger above.
    unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
    #############################################################################


    # This is our semi-CFG; Extend it according to your own needs
    #############################################################################
    # Merge rules for adjacent tagged items: each key is
    # "<left tag>+<right tag>" and the value is the tag given to the merged
    # phrase. "NNI" is the tag this script assigns to a merged noun phrase.
    cfg = {
        "NNP+NNP": "NNP",
        "NN+NN": "NNI",
        "NNI+NN": "NNI",
        "JJ+JJ": "JJ",
        "JJ+NN": "NNI",
    }
    #############################################################################


    class NPExtractor(object):
        """Extract the main noun phrases (topics) from a single sentence.

        The sentence is tokenized, tagged with the module-level
        ``bigram_tagger``, the tags are normalized, and adjacent items are
        merged according to the module-level ``cfg`` rules. Items whose final
        tag is "NNP" or "NNI" are reported as topics.
        """

        def __init__(self, sentence):
            """Store the sentence that ``extract`` will analyse."""
            self.sentence = sentence

        def tokenize_sentence(self, sentence):
            """Split the sentence into single words/tokens."""
            return nltk.word_tokenize(sentence)

        def normalize_tags(self, tagged):
            """Normalize Brown corpus tags so that they match the ``cfg`` rules.

            Rules, applied in this order to every (word, tag) pair:

            * a trailing "-TL" suffix is removed ("NN-TL" -> "NN");
            * "NP" and "NPS" become "NNP";
            * any other tag ending in "S" loses that "S" ("NNS" -> "NN").

            The suffix is removed *before* the other rules run so that tags
            such as "NNS-TL" and "NPS-TL" are fully normalized as well.

            Returns a new list of (word, tag) tuples; the input is not modified.
            """
            n_tagged = []
            for word, tag in tagged:
                if tag.endswith("-TL"):
                    tag = tag[:-3]
                if tag in ("NP", "NPS"):
                    tag = "NNP"
                elif tag.endswith("S"):
                    tag = tag[:-1]
                n_tagged.append((word, tag))
            return n_tagged

        def _merge_tags(self, tags, grammar=None):
            """Repeatedly merge adjacent items whose tag pair has a grammar rule.

            ``tags`` is a list of (text, tag) pairs. ``grammar`` maps
            "<left tag>+<right tag>" to the tag of the merged item and defaults
            to the module-level ``cfg``. The leftmost mergeable pair is always
            merged first; merged texts are joined with a single space.

            Returns a new list; the input list is not modified.
            """
            if grammar is None:
                grammar = cfg
            # Everything on the stack is already fully reduced, so after a
            # push only the top two items can form a mergeable pair. That pair
            # is therefore the leftmost one, which makes a single pass
            # equivalent to rescanning from the start after every merge.
            stack = []
            for item in tags:
                stack.append(item)
                while len(stack) >= 2:
                    (left_text, left_pos), (right_text, right_pos) = stack[-2], stack[-1]
                    merged_pos = grammar.get("%s+%s" % (left_pos, right_pos), '')
                    if not merged_pos:
                        break
                    stack[-2:] = [("%s %s" % (left_text, right_text), merged_pos)]
            return stack

        def extract(self):
            """Return the main topics of the sentence as a list of strings."""
            tokens = self.tokenize_sentence(self.sentence)
            tags = self.normalize_tags(bigram_tagger.tag(tokens))
            merged = self._merge_tags(tags)
            # Only proper nouns ("NNP") and merged noun phrases ("NNI") are
            # reported; add "NN" to this tuple to also report single nouns.
            return [text for text, pos in merged if pos in ("NNP", "NNI")]


    # Main method, just run "python np_extractor.py"
    def main():
        """Run the extractor on a sample sentence and print the topics found."""
        sentence = "Swayy is a beautiful new dashboard for discovering and curating online content."
        np_extractor = NPExtractor(sentence)
        result = np_extractor.extract()
        # The parenthesised single-argument form prints the same text under
        # both Python 2 and Python 3 (the bare print statement is a
        # SyntaxError on Python 3).
        print("This sentence is about: %s" % ", ".join(result))

    if __name__ == '__main__':
        main()