Skip to content

Instantly share code, notes, and snippets.

@zachary
Forked from edubey/bag-of-word-vectors.py
Created February 28, 2024 19:11
Show Gist options
  • Select an option

  • Save zachary/45b452870c29d3a983a98d07a2fd4c51 to your computer and use it in GitHub Desktop.

Select an option

Save zachary/45b452870c29d3a983a98d07a2fd4c51 to your computer and use it in GitHub Desktop.

Revisions

  1. @edubey edubey created this gist Dec 2, 2018.
    45 changes: 45 additions & 0 deletions bag-of-word-vectors.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,45 @@
    # import statements
    import numpy
    import re

    '''
    Tokenize each of the sentences and return the sorted, de-duplicated,
    lower-cased vocabulary, for example
    Input : ["John likes to watch movies.", "Mary likes movies too"]
    Output : ["john", "likes", "mary", "movies", "to", "too", "watch"]
    '''
    def tokenize(sentences):
        """Return the sorted list of distinct words found across *sentences*.

        Each sentence is split with ``word_extraction``; repeated words are
        collapsed so every word appears exactly once in the result.
        """
        vocabulary = set()
        for text in sentences:
            vocabulary.update(word_extraction(text))
        return sorted(vocabulary)

    def word_extraction(sentence):
        """Split *sentence* into lower-cased words, dropping stop words.

        Args:
            sentence: Text to split. Every run of word characters (letters,
                digits, underscore) counts as one word; everything else is
                treated as a separator.

        Returns:
            The lower-cased words in their original order, with the stop
            words "a", "the" and "is" removed regardless of their casing
            in the input.
        """
        ignore = {"a", "the", "is"}
        # Lower-case BEFORE filtering so capitalised stop words such as
        # "The" are dropped too. The raw-string pattern avoids the invalid
        # escape sequence that a plain "\w" literal produces.
        lowered = (word.lower() for word in re.findall(r"\w+", sentence))
        return [word for word in lowered if word not in ignore]

    def generate_bow(allsentences):
        """Print the vocabulary and one bag-of-words count vector per sentence.

        Args:
            allsentences: Sequence of sentence strings. It is traversed
                twice: once to build the vocabulary with ``tokenize`` and
                once to count the words of each sentence.

        Returns:
            None. The vocabulary and each sentence with its count vector
            are written to standard output.
        """
        vocab = tokenize(allsentences)
        print("Word List for Document \n{0} \n".format(vocab))

        # Map each vocabulary word to its column once, instead of scanning
        # the whole vocabulary for every word of every sentence.
        column_of = {word: i for i, word in enumerate(vocab)}

        for sentence in allsentences:
            bag_vector = numpy.zeros(len(vocab))
            for w in word_extraction(sentence):
                i = column_of.get(w)
                if i is not None:
                    bag_vector[i] += 1

            print("{0} \n{1}\n".format(sentence, bag_vector))


    # Sample corpus used to demonstrate the bag-of-words output.
    allsentences = [
        "Joe waited for the train",
        "The train was late",
        "Mary and Samantha took the bus",
        "I looked for Mary and Samantha at the bus station",
        "Mary and Samantha arrived at the bus station early but waited until noon for the bus",
    ]

    # Print the vocabulary followed by the count vector of every sentence.
    generate_bow(allsentences)