Last active
          August 18, 2025 12:59 
        
      - 
      
- 
        Save smhanov/94230b422c2100ae4218 to your computer and use it in GitHub Desktop. 
Revisions
- 
        smhanov revised this gist Apr 7, 2019 . No changes.There are no files selected for viewing
- 
        smhanov revised this gist Apr 7, 2019 . No changes.There are no files selected for viewing
- 
        smhanov revised this gist Feb 10, 2017 . 1 changed file with 8 additions and 1 deletion.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,6 +1,13 @@ #!/usr/bin/python3 # By Steve Hanov, 2011. Released to the public domain. # Please see http://stevehanov.ca/blog/index.php?id=115 for the accompanying article. # # Based on Daciuk, Jan, et al. "Incremental construction of minimal acyclic finite-state automata." # Computational linguistics 26.1 (2000): 3-16. # # Updated 2014 to use DAWG as a mapping; see # Kowaltowski, T.; CL. Lucchesi (1993), "Applications of finite automata representing large vocabularies", # Software-Practice and Experience 1993 import sys import time 
- 
        smhanov revised this gist Feb 10, 2017 . 1 changed file with 4 additions and 5 deletions.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,6 +1,5 @@ #!/usr/bin/python3 # By Steve Hanov, 2011. Released to the public domain. # Updated 2014 to use DAWG as a mapping. import sys import time @@ -151,16 +150,16 @@ def edgeCount( self ): return count def display(self): stack = [self.root] done = set() while stack: node = stack.pop() if node.id in done: continue done.add(node.id) print("{}: ({})".format(node.id, node)) for label, child in node.edges.items(): print(" {} goto {}".format(label, child.id)) stack.append(child) if 0: dawg = Dawg() 
- 
        smhanov revised this gist Feb 10, 2017 . 1 changed file with 1 addition and 0 deletions.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,5 +1,6 @@ #!/usr/bin/python3 # By Steve Hanov, 2011. Released to the public domain. # Please see http://stevehanov.ca/blog/index.php?id=115 for the accompanying article. # Updated 2014 to use DAWG as a mapping. import sys import time 
- 
        smhanov revised this gist Feb 9, 2017 . 1 changed file with 5 additions and 2 deletions.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -151,14 +151,17 @@ def edgeCount( self ): def display(self): queue = [self.root] done = set() while queue: node = queue.pop(0) if node.id in done: continue done.add(node.id) print("{}: ({})".format(node.id, node)) for label, child in node.edges.items(): print(" {} goto {}".format(label, child.id)) queue.append(child) if 0: dawg = Dawg() dawg.insert("cat", 0) dawg.insert("catnip", 1) 
- 
        smhanov revised this gist Feb 9, 2017 . 1 changed file with 19 additions and 2 deletions.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -51,7 +51,7 @@ def numReachable(self): # including self count = 0 if self.final: count += 1 for label, node in self.edges.items(): count += node.numReachable() self.count = count @@ -149,7 +149,24 @@ def edgeCount( self ): count += len(node.edges) return count def display(self): queue = [self.root] while queue: node = queue.pop(0) print("{}:".format(node.id)) for label, child in node.edges.items(): print(" {} goto {}".format(label, child.id)) queue.append(child) if 1: dawg = Dawg() dawg.insert("cat", 0) dawg.insert("catnip", 1) dawg.insert("zcatnip", 2) dawg.finish() dawg.display() sys.exit() dawg = Dawg() WordCount = 0 words = open(DICTIONARY, "rt").read().split() 
- 
        smhanov revised this gist Nov 5, 2014 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -73,7 +73,7 @@ def __init__(self): self.data = [] def insert( self, word, data ): if word <= self.previousWord: raise Exception("Error: Words must be inserted in alphabetical " + "order.") 
- 
        smhanov renamed this gist Nov 5, 2014 . 1 changed file with 6 additions and 9 deletions.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,5 +1,6 @@ #!/usr/bin/python3 # By Steve Hanov, 2011. Released to the public domain. # Updated 2014 to use DAWG as a mapping. import sys import time @@ -47,15 +48,11 @@ def numReachable(self): if self.count: return self.count # count the number of final nodes that are reachable from this one. # including self count = 0 if self.final: count += 1 for label, node in sorted(self.edges.items()): count += node.numReachable() self.count = count return count 
- 
        smhanov created this gist Nov 5, 2014 .There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,181 @@ #!/usr/bin/python3 # By Steve Hanov, 2011. Released to the public domain. import sys import time DICTIONARY = "/usr/share/dict/words" QUERY = sys.argv[1:] # This class represents a node in the directed acyclic word graph (DAWG). It # has a list of edges to other nodes. It has functions for testing whether it # is equivalent to another node. Nodes are equivalent if they have identical # edges, and each identical edge leads to identical states. The __hash__ and # __eq__ functions allow it to be used as a key in a python dictionary. class DawgNode: NextId = 0 def __init__(self): self.id = DawgNode.NextId DawgNode.NextId += 1 self.final = False self.edges = {} # Number of end nodes reachable from this one. self.count = 0 def __str__(self): arr = [] if self.final: arr.append("1") else: arr.append("0") for (label, node) in self.edges.items(): arr.append( label ) arr.append( str( node.id ) ) return "_".join(arr) def __hash__(self): return self.__str__().__hash__() def __eq__(self, other): return self.__str__() == other.__str__() def numReachable(self): # if a count is already assigned, return it if self.count: return self.count # count the number of final nodes that are reachable from this one. if len(self.edges) == 0: # no children. One node is reachable. count = 1 else: # otherwise, the count the number of final-nodes, including self. count = 0 if self.final: count += 1 for label, node in sorted(self.edges.items()): count += node.numReachable() self.count = count return count class Dawg: def __init__(self): self.previousWord = "" self.root = DawgNode() # Here is a list of nodes that have not been checked for duplication. self.uncheckedNodes = [] # Here is a list of unique nodes that have been checked for # duplication. self.minimizedNodes = {} # Here is the data associated with all the nodes self.data = [] def insert( self, word, data ): if word < self.previousWord: raise Exception("Error: Words must be inserted in alphabetical " + "order.") # find common prefix between word and previous word commonPrefix = 0 for i in range( min( len( word ), len( self.previousWord ) ) ): if word[i] != self.previousWord[i]: break commonPrefix += 1 # Check the uncheckedNodes for redundant nodes, proceeding from last # one down to the common prefix size. Then truncate the list at that # point. self._minimize( commonPrefix ) self.data.append(data) # add the suffix, starting from the correct node mid-way through the # graph if len(self.uncheckedNodes) == 0: node = self.root else: node = self.uncheckedNodes[-1][2] for letter in word[commonPrefix:]: nextNode = DawgNode() node.edges[letter] = nextNode self.uncheckedNodes.append( (node, letter, nextNode) ) node = nextNode node.final = True self.previousWord = word def finish( self ): # minimize all uncheckedNodes self._minimize( 0 ); # go through entire structure and assign the counts to each node. self.root.numReachable() def _minimize( self, downTo ): # proceed from the leaf up to a certain point for i in range( len(self.uncheckedNodes) - 1, downTo - 1, -1 ): (parent, letter, child) = self.uncheckedNodes[i]; if child in self.minimizedNodes: # replace the child with the previously encountered one parent.edges[letter] = self.minimizedNodes[child] else: # add the state to the minimized nodes. self.minimizedNodes[child] = child; self.uncheckedNodes.pop() def lookup( self, word ): node = self.root skipped = 0 # keep track of number of final nodes that we skipped for letter in word: if letter not in node.edges: return None for label, child in sorted(node.edges.items()): if label == letter: if node.final: skipped += 1 node = child break skipped += child.count if node.final: return self.data[skipped] def nodeCount( self ): return len(self.minimizedNodes) def edgeCount( self ): count = 0 for node in self.minimizedNodes: count += len(node.edges) return count dawg = Dawg() WordCount = 0 words = open(DICTIONARY, "rt").read().split() words.sort() start = time.time() for word in words: WordCount += 1 # insert all words, using the reversed version as the data associated with # it dawg.insert(word, ''.join(reversed(word))) if ( WordCount % 100 ) == 0: print("{0}\r".format(WordCount), end="") dawg.finish() print("Dawg creation took {0} s".format(time.time()-start)) EdgeCount = dawg.edgeCount() print("Read {0} words into {1} nodes and {2} edges".format( WordCount, dawg.nodeCount(), EdgeCount)) print("This could be stored in as little as {0} bytes".format(EdgeCount * 4)) for word in QUERY: result = dawg.lookup(word) if result == None: print("{0} not in dictionary.".format(word)) else: print("{0} is in the dictionary and has data {1}".format(word, result))