dongyuwei · October 31, 2015 03:47 · Nov 5, 2014 · Nov 5, 2014 · Nov 5, 2014
diff --git a/dawg.py b/dawg.py
@@ -73,7 +73,7 @@ def __init__(self):
         self.data = []
 
     def insert( self, word, data ):
-        if word < self.previousWord:
+        if word <= self.previousWord:
             raise Exception("Error: Words must be inserted in alphabetical " +
                 "order.")
 

diff --git a/gistfile1.txt → dawg.py b/gistfile1.txt → dawg.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python3
 # By Steve Hanov, 2011. Released to the public domain.
+# Updated 2014 to use DAWG as a mapping.
 import sys
 import time
 
@@ -47,15 +48,11 @@ def numReachable(self):
         if self.count: return self.count
 
         # count the number of final nodes that are reachable from this one.
-        if len(self.edges) == 0:
-            # no children. One node is reachable.
-            count = 1
-        else:
-            # otherwise, the count the number of final-nodes, including self.
-            count = 0
-            if self.final: count += 1
-            for label, node in sorted(self.edges.items()):
-                count += node.numReachable()
+        # including self
+        count = 0
+        if self.final: count += 1
+        for label, node in sorted(self.edges.items()):
+            count += node.numReachable()
 
         self.count = count
         return count

diff --git a/gistfile1.txt b/gistfile1.txt
@@ -0,0 +1,181 @@
+#!/usr/bin/python3
+# By Steve Hanov, 2011. Released to the public domain.
+import sys
+import time
+
+DICTIONARY = "/usr/share/dict/words"
+QUERY = sys.argv[1:]
+
+# This class represents a node in the directed acyclic word graph (DAWG). It
+# has a list of edges to other nodes. It has functions for testing whether it
+# is equivalent to another node. Nodes are equivalent if they have identical
+# edges, and each identical edge leads to identical states. The __hash__ and
+# __eq__ functions allow it to be used as a key in a python dictionary.
+class DawgNode:
+    NextId = 0
+
+    def __init__(self):
+        self.id = DawgNode.NextId
+        DawgNode.NextId += 1
+        self.final = False
+        self.edges = {}
+
+        # Number of end nodes reachable from this one.
+        self.count = 0
+
+    def __str__(self):        
+        arr = []
+        if self.final: 
+            arr.append("1")
+        else:
+            arr.append("0")
+
+        for (label, node) in self.edges.items():
+            arr.append( label )
+            arr.append( str( node.id ) )
+
+        return "_".join(arr)
+
+    def __hash__(self):
+        return self.__str__().__hash__()
+
+    def __eq__(self, other):
+        return self.__str__() == other.__str__()
+
+    def numReachable(self):
+        # if a count is already assigned, return it
+        if self.count: return self.count
+
+        # count the number of final nodes that are reachable from this one.
+        if len(self.edges) == 0:
+            # no children. One node is reachable.
+            count = 1
+        else:
+            # otherwise, the count the number of final-nodes, including self.
+            count = 0
+            if self.final: count += 1
+            for label, node in sorted(self.edges.items()):
+                count += node.numReachable()
+
+        self.count = count
+        return count
+
+class Dawg:
+    def __init__(self):
+        self.previousWord = ""
+        self.root = DawgNode()
+
+        # Here is a list of nodes that have not been checked for duplication.
+        self.uncheckedNodes = []
+
+        # Here is a list of unique nodes that have been checked for
+        # duplication.
+        self.minimizedNodes = {}
+
+        # Here is the data associated with all the nodes
+        self.data = []
+
+    def insert( self, word, data ):
+        if word < self.previousWord:
+            raise Exception("Error: Words must be inserted in alphabetical " +
+                "order.")
+
+        # find common prefix between word and previous word
+        commonPrefix = 0
+        for i in range( min( len( word ), len( self.previousWord ) ) ):
+            if word[i] != self.previousWord[i]: break
+            commonPrefix += 1
+
+        # Check the uncheckedNodes for redundant nodes, proceeding from last
+        # one down to the common prefix size. Then truncate the list at that
+        # point.
+        self._minimize( commonPrefix )
+
+        self.data.append(data)
+
+        # add the suffix, starting from the correct node mid-way through the
+        # graph
+        if len(self.uncheckedNodes) == 0:
+            node = self.root
+        else:
+            node = self.uncheckedNodes[-1][2]
+
+        for letter in word[commonPrefix:]:
+            nextNode = DawgNode()
+            node.edges[letter] = nextNode
+            self.uncheckedNodes.append( (node, letter, nextNode) )
+            node = nextNode
+
+        node.final = True
+        self.previousWord = word
+
+    def finish( self ):
+        # minimize all uncheckedNodes
+        self._minimize( 0 );
+
+        # go through entire structure and assign the counts to each node.
+        self.root.numReachable()
+
+    def _minimize( self, downTo ):
+        # proceed from the leaf up to a certain point
+        for i in range( len(self.uncheckedNodes) - 1, downTo - 1, -1 ):
+            (parent, letter, child) = self.uncheckedNodes[i];
+            if child in self.minimizedNodes:
+                # replace the child with the previously encountered one
+                parent.edges[letter] = self.minimizedNodes[child]
+            else:
+                # add the state to the minimized nodes.
+                self.minimizedNodes[child] = child;
+            self.uncheckedNodes.pop()
+
+    def lookup( self, word ):
+        node = self.root
+        skipped = 0 # keep track of number of final nodes that we skipped
+        for letter in word:
+            if letter not in node.edges: return None
+            for label, child in sorted(node.edges.items()):
+                if label == letter: 
+                    if node.final: skipped += 1
+                    node = child
+                    break
+                skipped += child.count
+
+        if node.final:
+            return self.data[skipped]
+
+    def nodeCount( self ):
+        return len(self.minimizedNodes)
+
+    def edgeCount( self ):
+        count = 0
+        for node in self.minimizedNodes:
+            count += len(node.edges)
+        return count
+
+
+dawg = Dawg()
+WordCount = 0
+words = open(DICTIONARY, "rt").read().split()
+words.sort()
+start = time.time()    
+for word in words:
+    WordCount += 1
+    # insert all words, using the reversed version as the data associated with
+    # it
+    dawg.insert(word, ''.join(reversed(word)))
+    if ( WordCount % 100 ) == 0: print("{0}\r".format(WordCount), end="")
+dawg.finish()
+print("Dawg creation took {0} s".format(time.time()-start))
+
+EdgeCount = dawg.edgeCount()
+print("Read {0} words into {1} nodes and {2} edges".format(
+    WordCount, dawg.nodeCount(), EdgeCount))
+
+print("This could be stored in as little as {0} bytes".format(EdgeCount * 4))
+
+for word in QUERY:
+    result = dawg.lookup(word)
+    if result == None:
+        print("{0} not in dictionary.".format(word))
+    else:
+        print("{0} is in the dictionary and has data {1}".format(word, result))