__author__ = 'ssbushi'

# Import the toolkit and tags
import nltk
from nltk.corpus import treebank

# Training data: the first 3000 pre-tagged sentences of the treebank corpus
train_data = treebank.tagged_sents()[:3000]

# Show the first training sentence as a quick look at the data format.
# print(...) with a single argument behaves identically under Python 2
# (parenthesised expression) and Python 3 (function call).
print(train_data[0])

# Import HMM module
from nltk.tag import hmm

# Set up a trainer with its default (None) arguments and train it on the
# tagged sentences.
# NOTE(review): no estimator is passed to train_supervised, so NLTK's default
# is used. The mis-tagged words in the sample output at the end of this file
# are presumably words unseen in training -- consider passing a smoothed
# estimator; confirm against the NLTK documentation.
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_data)

# Prints the basic data about the tagger
print(tagger)

# Tag a few sample sentences, tokenised by splitting on whitespace
print(tagger.tag("Today is a good day .".split()))

print(tagger.tag("Joe met Joanne in Delhi .".split()))

print(tagger.tag("Chicago is the birthplace of Ginny".split()))

"""
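Output of the three tagger.tag() calls, in order (notice that some tags are wrong, e.g. the final '.' in the second sentence and 'birthplace'/'of' in the third are tagged NNP):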
[('Today', u'NN'), ('is', u'VBZ'), ('a', u'DT'), ('good', u'JJ'), ('day', u'NN'), ('.', u'.')]
[('Joe', u'NNP'), ('met', u'VBD'), ('Joanne', u'NNP'), ('in', u'IN'), ('Delhi', u'NNP'), ('.', u'NNP')]
[('Chicago', u'NNP'), ('is', u'VBZ'), ('the', u'DT'), ('birthplace', u'NNP'), ('of', u'NNP'), ('Ginny', u'NNP')]
"""