-
-
Save smhanov/94230b422c2100ae4218 to your computer and use it in GitHub Desktop.
| #!/usr/bin/python3 | |
| # By Steve Hanov, 2011. Released to the public domain. | |
| # Please see http://stevehanov.ca/blog/index.php?id=115 for the accompanying article. | |
| # | |
| # Based on Daciuk, Jan, et al. "Incremental construction of minimal acyclic finite-state automata." | |
| # Computational linguistics 26.1 (2000): 3-16. | |
| # | |
| # Updated 2014 to use DAWG as a mapping; see | |
| # Kowaltowski, T.; CL. Lucchesi (1993), "Applications of finite automata representing large vocabularies", | |
| # Software-Practice and Experience 1993 | |
| import sys | |
| import time | |
| DICTIONARY = "/usr/share/dict/words" | |
| QUERY = sys.argv[1:] | |
| # This class represents a node in the directed acyclic word graph (DAWG). It | |
| # has a list of edges to other nodes. It has functions for testing whether it | |
| # is equivalent to another node. Nodes are equivalent if they have identical | |
| # edges, and each identical edge leads to identical states. The __hash__ and | |
| # __eq__ functions allow it to be used as a key in a python dictionary. | |
| class DawgNode: | |
| NextId = 0 | |
| def __init__(self): | |
| self.id = DawgNode.NextId | |
| DawgNode.NextId += 1 | |
| self.final = False | |
| self.edges = {} | |
| # Number of end nodes reachable from this one. | |
| self.count = 0 | |
| def __str__(self): | |
| arr = [] | |
| if self.final: | |
| arr.append("1") | |
| else: | |
| arr.append("0") | |
| for (label, node) in self.edges.items(): | |
| arr.append( label ) | |
| arr.append( str( node.id ) ) | |
| return "_".join(arr) | |
| def __hash__(self): | |
| return self.__str__().__hash__() | |
| def __eq__(self, other): | |
| return self.__str__() == other.__str__() | |
| def numReachable(self): | |
| # if a count is already assigned, return it | |
| if self.count: return self.count | |
| # count the number of final nodes that are reachable from this one. | |
| # including self | |
| count = 0 | |
| if self.final: count += 1 | |
| for label, node in self.edges.items(): | |
| count += node.numReachable() | |
| self.count = count | |
| return count | |
| class Dawg: | |
| def __init__(self): | |
| self.previousWord = "" | |
| self.root = DawgNode() | |
| # Here is a list of nodes that have not been checked for duplication. | |
| self.uncheckedNodes = [] | |
| # Here is a list of unique nodes that have been checked for | |
| # duplication. | |
| self.minimizedNodes = {} | |
| # Here is the data associated with all the nodes | |
| self.data = [] | |
| def insert( self, word, data ): | |
| if word <= self.previousWord: | |
| raise Exception("Error: Words must be inserted in alphabetical " + | |
| "order.") | |
| # find common prefix between word and previous word | |
| commonPrefix = 0 | |
| for i in range( min( len( word ), len( self.previousWord ) ) ): | |
| if word[i] != self.previousWord[i]: break | |
| commonPrefix += 1 | |
| # Check the uncheckedNodes for redundant nodes, proceeding from last | |
| # one down to the common prefix size. Then truncate the list at that | |
| # point. | |
| self._minimize( commonPrefix ) | |
| self.data.append(data) | |
| # add the suffix, starting from the correct node mid-way through the | |
| # graph | |
| if len(self.uncheckedNodes) == 0: | |
| node = self.root | |
| else: | |
| node = self.uncheckedNodes[-1][2] | |
| for letter in word[commonPrefix:]: | |
| nextNode = DawgNode() | |
| node.edges[letter] = nextNode | |
| self.uncheckedNodes.append( (node, letter, nextNode) ) | |
| node = nextNode | |
| node.final = True | |
| self.previousWord = word | |
| def finish( self ): | |
| # minimize all uncheckedNodes | |
| self._minimize( 0 ); | |
| # go through entire structure and assign the counts to each node. | |
| self.root.numReachable() | |
| def _minimize( self, downTo ): | |
| # proceed from the leaf up to a certain point | |
| for i in range( len(self.uncheckedNodes) - 1, downTo - 1, -1 ): | |
| (parent, letter, child) = self.uncheckedNodes[i]; | |
| if child in self.minimizedNodes: | |
| # replace the child with the previously encountered one | |
| parent.edges[letter] = self.minimizedNodes[child] | |
| else: | |
| # add the state to the minimized nodes. | |
| self.minimizedNodes[child] = child; | |
| self.uncheckedNodes.pop() | |
| def lookup( self, word ): | |
| node = self.root | |
| skipped = 0 # keep track of number of final nodes that we skipped | |
| for letter in word: | |
| if letter not in node.edges: return None | |
| for label, child in sorted(node.edges.items()): | |
| if label == letter: | |
| if node.final: skipped += 1 | |
| node = child | |
| break | |
| skipped += child.count | |
| if node.final: | |
| return self.data[skipped] | |
| def nodeCount( self ): | |
| return len(self.minimizedNodes) | |
| def edgeCount( self ): | |
| count = 0 | |
| for node in self.minimizedNodes: | |
| count += len(node.edges) | |
| return count | |
| def display(self): | |
| stack = [self.root] | |
| done = set() | |
| while stack: | |
| node = stack.pop() | |
| if node.id in done: continue | |
| done.add(node.id) | |
| print("{}: ({})".format(node.id, node)) | |
| for label, child in node.edges.items(): | |
| print(" {} goto {}".format(label, child.id)) | |
| stack.append(child) | |
| if 0: | |
| dawg = Dawg() | |
| dawg.insert("cat", 0) | |
| dawg.insert("catnip", 1) | |
| dawg.insert("zcatnip", 2) | |
| dawg.finish() | |
| dawg.display() | |
| sys.exit() | |
| dawg = Dawg() | |
| WordCount = 0 | |
| words = open(DICTIONARY, "rt").read().split() | |
| words.sort() | |
| start = time.time() | |
| for word in words: | |
| WordCount += 1 | |
| # insert all words, using the reversed version as the data associated with | |
| # it | |
| dawg.insert(word, ''.join(reversed(word))) | |
| if ( WordCount % 100 ) == 0: print("{0}\r".format(WordCount), end="") | |
| dawg.finish() | |
| print("Dawg creation took {0} s".format(time.time()-start)) | |
| EdgeCount = dawg.edgeCount() | |
| print("Read {0} words into {1} nodes and {2} edges".format( | |
| WordCount, dawg.nodeCount(), EdgeCount)) | |
| print("This could be stored in as little as {0} bytes".format(EdgeCount * 4)) | |
| for word in QUERY: | |
| result = dawg.lookup(word) | |
| if result == None: | |
| print("{0} not in dictionary.".format(word)) | |
| else: | |
| print("{0} is in the dictionary and has data {1}".format(word, result)) |
For my memory efficient implementation in go, see:
https://godoc.org/github.com/smhanov/dawg
Instead of storing a map for edge node, this version uses one map for all edges.
For my memory efficient implementation in go, see:
https://godoc.org/github.com/smhanov/dawg
Instead of storing a map for edge node, this version uses one map for all edges.
@smhanov - how can I modify this if I'm dealing with sentences & I want the edges to be keyed on a token/word as opposed to a character?
@dhrushilbadani - Did you try it? Because it's python, I think it will work unmodified with sentences. Instead of using a word use a sentence split on spaces. Then fix any minor problems you find.
@smhanov - how can I modify this if I'm dealing with sentences & I want the edges to be keyed on a token/word as opposed to a character?
@smhanov makes sense, thanks for the reply! Also, what can one do when sorting the input strings is too expensive?
@dhrushilbadani The original paper "Incremental construction of minimal acyclic finite-state automata." also had an algorithm for non-sorted input, but at the time I thought it was too complicated for my blog entry.
Another option: The Unix sort command is magical and optimized for very large files. Usually when something is too expensive to sort, I will pipe it through "sort" and get the results.
I think the
__eq__() method is flawed: it depends on__str__()which itself depends on the arbitrary order of items in a dictionary.Changing line 41 to
should fix this.