"""A simple implementation of a greedy transition-based parser. Released under BSD license.""" from os import path import os import sys from collections import defaultdict import random import time import pickle SHIFT = 0; RIGHT = 1; LEFT = 2; MOVES = (SHIFT, RIGHT, LEFT) START = ['-START-', '-START2-'] END = ['-END-', '-END2-'] class DefaultList(list): """A list that returns a default value if index out of bounds.""" def __init__(self, default=None): self.default = default list.__init__(self) def __getitem__(self, index): try: return list.__getitem__(self, index) except IndexError: return self.default class Parse(object): def __init__(self, n): self.n = n self.heads = [None] * (n-1) self.labels = [None] * (n-1) self.lefts = [] self.rights = [] for i in range(n+1): self.lefts.append(DefaultList(0)) self.rights.append(DefaultList(0)) def add(self, head, child, label=None): self.heads[child] = head self.labels[child] = label if child < head: self.lefts[head].append(child) else: self.rights[head].append(child) class Parser(object): def __init__(self, load=True): model_dir = os.path.dirname(__file__) self.model = Perceptron(MOVES) if load: self.model.load(path.join(model_dir, 'parser.pickle')) self.tagger = PerceptronTagger(load=load) self.confusion_matrix = defaultdict(lambda: defaultdict(int)) def save(self): self.model.save(path.join(os.path.dirname(__file__), 'parser.pickle')) self.tagger.save() def parse(self, words): n = len(words) i = 2; stack = [1]; parse = Parse(n) tags = self.tagger.tag(words) while stack or (i+1) < n: features = extract_features(words, tags, i, n, stack, parse) scores = self.model.score(features) valid_moves = get_valid_moves(i, n, len(stack)) guess = max(valid_moves, key=lambda move: scores[move]) i = transition(guess, i, stack, parse) return tags, parse.heads def train_one(self, itn, words, gold_tags, gold_heads): n = len(words) i = 2; stack = [1]; parse = Parse(n) tags = self.tagger.tag(words) while stack or (i + 1) < n: features = extract_features(words, tags, i, n, stack, parse) scores = self.model.score(features) valid_moves = get_valid_moves(i, n, len(stack)) gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads) guess = max(valid_moves, key=lambda move: scores[move]) assert gold_moves best = max(gold_moves, key=lambda move: scores[move]) self.model.update(best, guess, features) i = transition(guess, i, stack, parse) self.confusion_matrix[best][guess] += 1 return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]]) def transition(move, i, stack, parse): if move == SHIFT: stack.append(i) return i + 1 elif move == RIGHT: parse.add(stack[-2], stack.pop()) return i elif move == LEFT: parse.add(i, stack.pop()) return i assert move in MOVES def get_valid_moves(i, n, stack_depth): moves = [] if (i+1) < n: moves.append(SHIFT) if stack_depth >= 2: moves.append(RIGHT) if stack_depth >= 1: moves.append(LEFT) return moves def get_gold_moves(n0, n, stack, heads, gold): def deps_between(target, others, gold): for word in others: if gold[word] == target or gold[target] == word: return True return False valid = get_valid_moves(n0, n, len(stack)) if not stack or (SHIFT in valid and gold[n0] == stack[-1]): return [SHIFT] if gold[stack[-1]] == n0: return [LEFT] costly = set([m for m in MOVES if m not in valid]) # If the word behind s0 is its gold head, Left is incorrect if len(stack) >= 2 and gold[stack[-1]] == stack[-2]: costly.add(LEFT) # If there are any dependencies between n0 and the stack, # pushing n0 will lose them. 
def get_gold_moves(n0, n, stack, heads, gold):
    def deps_between(target, others, gold):
        for word in others:
            if gold[word] == target or gold[target] == word:
                return True
        return False

    valid = get_valid_moves(n0, n, len(stack))
    if not stack or (SHIFT in valid and gold[n0] == stack[-1]):
        return [SHIFT]
    if gold[stack[-1]] == n0:
        return [LEFT]
    costly = set([m for m in MOVES if m not in valid])
    # If the word behind s0 is its gold head, Left is incorrect
    if len(stack) >= 2 and gold[stack[-1]] == stack[-2]:
        costly.add(LEFT)
    # If there are any dependencies between n0 and the stack,
    # pushing n0 will lose them.
    if SHIFT not in costly and deps_between(n0, stack, gold):
        costly.add(SHIFT)
    # If there are any dependencies between s0 and the buffer, popping
    # s0 will lose them.
    if deps_between(stack[-1], range(n0+1, n-1), gold):
        costly.add(LEFT)
        costly.add(RIGHT)
    return [m for m in MOVES if m not in costly]
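
# A minimal sketch (not part of the original file) of how the transition
# machinery fits together: apply a SHIFT and then a LEFT to a toy
# configuration and inspect the recorded heads.  The indices follow the
# convention used by Parser.parse: word 1 starts on the stack and word 2 is
# at the front of the buffer (index 0 is the leading pad added by pad_tokens).
def _demo_transitions():
    parse = Parse(5)
    i, stack = 2, [1]
    assert get_valid_moves(i, 5, len(stack)) == [SHIFT, LEFT]
    i = transition(SHIFT, i, stack, parse)  # push word 2, advance the buffer
    i = transition(LEFT, i, stack, parse)   # attach word 2 to the buffer front (word 3)
    return i, stack, parse.heads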
def extract_features(words, tags, n0, n, stack, parse):
    def get_stack_context(depth, stack, data):
        if depth >= 3:
            return data[stack[-1]], data[stack[-2]], data[stack[-3]]
        elif depth >= 2:
            return data[stack[-1]], data[stack[-2]], ''
        elif depth == 1:
            return data[stack[-1]], '', ''
        else:
            return '', '', ''

    def get_buffer_context(i, n, data):
        if i + 1 >= n:
            return data[i], '', ''
        elif i + 2 >= n:
            return data[i], data[i + 1], ''
        else:
            return data[i], data[i + 1], data[i + 2]

    def get_parse_context(word, deps, data):
        if word == -1:
            return 0, '', ''
        deps = deps[word]
        valency = len(deps)
        if not valency:
            return 0, '', ''
        elif valency == 1:
            return 1, data[deps[-1]], ''
        else:
            return valency, data[deps[-1]], data[deps[-2]]

    features = {}
    # Set up the context pieces --- the word (W) and tag (T) of:
    # S0-2: Top three words on the stack
    # N0-2: First three words of the buffer
    # n0b1, n0b2: Two leftmost children of the first word of the buffer
    # s0b1, s0b2: Two leftmost children of the top word of the stack
    # s0f1, s0f2: Two rightmost children of the top word of the stack
    depth = len(stack)
    s0 = stack[-1] if depth else -1

    Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words)
    Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags)

    Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words)
    Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags)

    Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words)
    Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags)

    Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words)
    _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags)

    Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words)
    _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags)

    Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words)
    _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags)

    # Cap numeric features at 5?
    # String-distance
    Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0

    features['bias'] = 1
    # Add word and tag unigrams
    for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2,
              Ws0f1, Ws0f2):
        if w:
            features['w=%s' % w] = 1
    for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2,
              Ts0f1, Ts0f2):
        if t:
            features['t=%s' % t] = 1

    # Add word/tag pairs
    for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))):
        if w or t:
            features['%d w=%s, t=%s' % (i, w, t)] = 1

    # Add some bigrams
    features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1
    features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1
    features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1
    features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1
    features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1
    features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1
    features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1
    features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1

    # Add some tag trigrams
    trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),
                (Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1),
                (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2),
                (Ts0, Ts1, Ts1))
    for i, (t1, t2, t3) in enumerate(trigrams):
        if t1 or t2 or t3:
            features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1

    # Add some valency and distance features
    vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b))
    vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b))
    d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0),
         ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0))
    for i, (w_t, v_d) in enumerate(vw + vt + d):
        if w_t or v_d:
            features['val/d-%d %s %d' % (i, w_t, v_d)] = 1
    return features
class Perceptron(object):
    def __init__(self, classes=None):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.classes = classes
        self.weights = {}
        # The accumulated values, for the averaging. These will be keyed by
        # feature/clas tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/clas tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best class.'''
        scores = self.score(features)
        # Do a secondary alphabetic sort, for stability
        return max(self.classes, key=lambda clas: (scores[clas], clas))

    def score(self, features):
        all_weights = self.weights
        scores = dict((clas, 0) for clas in self.classes)
        for feat, value in features.items():
            if value == 0:
                continue
            if feat not in all_weights:
                continue
            weights = all_weights[feat]
            for clas, weight in weights.items():
                scores[clas] += value * weight
        return scores

    def update(self, truth, guess, features):
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)

    def average_weights(self):
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights

    def save(self, path):
        print "Saving model to %s" % path
        pickle.dump(self.weights, open(path, 'wb'))

    def load(self, path):
        self.weights = pickle.load(open(path, 'rb'))
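
# A minimal sketch (not part of the original module) of how the averaged
# perceptron above is driven in isolation: score a feature dict, update
# towards a gold class, then average the accumulated weights.  The feature
# names below are invented purely for illustration.
def _demo_perceptron():
    model = Perceptron(MOVES)
    feats = {'bias': 1, 'w=dog': 1, 't=NN': 1}
    for _ in range(5):
        guess = model.predict(feats)
        model.update(SHIFT, guess, feats)  # pretend SHIFT is always the gold class
    model.average_weights()
    return model.score(feats)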
class PerceptronTagger(object):
    '''Greedy Averaged Perceptron tagger'''
    model_loc = os.path.join(os.path.dirname(__file__), 'tagger.pickle')

    def __init__(self, classes=None, load=True):
        self.tagdict = {}
        if classes:
            self.classes = classes
        else:
            self.classes = set()
        self.model = Perceptron(self.classes)
        if load:
            self.load(PerceptronTagger.model_loc)

    def tag(self, words, tokenize=True):
        prev, prev2 = START
        tags = DefaultList('')
        context = START + [self._normalize(w) for w in words] + END
        for i, word in enumerate(words):
            tag = self.tagdict.get(word)
            if not tag:
                features = self._get_features(i, word, context, prev, prev2)
                tag = self.model.predict(features)
            tags.append(tag)
            prev2 = prev; prev = tag
        return tags

    def start_training(self, sentences):
        self._make_tagdict(sentences)
        self.model = Perceptron(self.classes)

    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at save_loc. nr_iter
        controls the number of Perceptron training iterations.'''
        self.start_training(sentences)
        for iter_ in range(nr_iter):
            for words, tags in sentences:
                self.train_one(words, tags)
            random.shuffle(sentences)
        self.end_training(save_loc)

    def save(self):
        # Pickle as a binary file
        pickle.dump((self.model.weights, self.tagdict, self.classes),
                    open(PerceptronTagger.model_loc, 'wb'), -1)

    def train_one(self, words, tags):
        prev, prev2 = START
        context = START + [self._normalize(w) for w in words] + END
        for i, word in enumerate(words):
            guess = self.tagdict.get(word)
            if not guess:
                feats = self._get_features(i, word, context, prev, prev2)
                guess = self.model.predict(feats)
                self.model.update(tags[i], guess, feats)
            prev2 = prev; prev = guess

    def load(self, loc):
        w_td_c = pickle.load(open(loc, 'rb'))
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes

    def _normalize(self, word):
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.'''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i-1])
        add('i-1 suffix', context[i-1][-3:])
        add('i-2 word', context[i-2])
        add('i+1 word', context[i+1])
        add('i+1 suffix', context[i+1][-3:])
        add('i+2 word', context[i+2])
        return features

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for sent in sentences:
            for word, tag in zip(sent[0], sent[1]):
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag


def _pc(n, d):
    return (float(n) / d) * 100


def train(parser, sentences, nr_iter):
    parser.tagger.start_training(sentences)
    for itn in range(nr_iter):
        corr = 0; total = 0
        random.shuffle(sentences)
        for words, gold_tags, gold_parse, gold_label in sentences:
            corr += parser.train_one(itn, words, gold_tags, gold_parse)
            if itn < 5:
                parser.tagger.train_one(words, gold_tags)
            total += len(words)
        print itn, '%.3f' % (float(corr) / float(total))
        if itn == 4:
            parser.tagger.model.average_weights()
    print 'Averaging weights'
    parser.model.average_weights()


def read_pos(loc):
    for line in open(loc):
        if not line.strip():
            continue
        words = DefaultList('')
        tags = DefaultList('')
        for token in line.split():
            if not token:
                continue
            word, tag = token.rsplit('/', 1)
            #words.append(normalize(word))
            words.append(word)
            tags.append(tag)
        pad_tokens(words); pad_tokens(tags)
        yield words, tags
def read_conll(loc):
    for sent_str in open(loc).read().strip().split('\n\n'):
        lines = [line.split() for line in sent_str.split('\n')]
        words = DefaultList(''); tags = DefaultList('')
        heads = [None]; labels = [None]
        for i, (word, pos, head, label) in enumerate(lines):
            words.append(intern(word))
            #words.append(intern(normalize(word)))
            tags.append(intern(pos))
            heads.append(int(head) + 1 if head != '-1' else len(lines) + 1)
            labels.append(label)
        pad_tokens(words); pad_tokens(tags)
        yield words, tags, heads, labels


def pad_tokens(tokens):
    tokens.insert(0, '')
    tokens.append('ROOT')


def main(model_dir, train_loc, heldout_in, heldout_gold):
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    input_sents = list(read_pos(heldout_in))
    parser = Parser(load=False)
    sentences = list(read_conll(train_loc))
    train(parser, sentences, nr_iter=15)
    parser.save()
    c = 0
    t = 0
    gold_sents = list(read_conll(heldout_gold))
    t1 = time.time()
    for (words, tags), (_, _, gold_heads, gold_labels) in zip(input_sents, gold_sents):
        _, heads = parser.parse(words)
        for i, w in list(enumerate(words))[1:-1]:
            if gold_labels[i] in ('P', 'punct'):
                continue
            if heads[i] == gold_heads[i]:
                c += 1
            t += 1
    t2 = time.time()
    print 'Parsing took %0.3f ms' % ((t2-t1)*1000.0)
    print c, t, float(c)/t


if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
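
# Usage sketch (an assumption based on main() above, not part of the original
# script): train on a CoNLL-style file, then evaluate on held-out data:
#
#     python <this file> MODEL_DIR TRAIN_CONLL HELDOUT_POS HELDOUT_CONLL
#
# read_conll() expects blank-line-separated sentences with one token per line
# and four whitespace-separated fields (word, pos, head, label); read_pos()
# expects one sentence per line of word/tag tokens joined by '/'.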