Skip to content

Instantly share code, notes, and snippets.

@dongyuwei
Forked from smhanov/dawg.py
Created October 31, 2015 03:47
Show Gist options
  • Save dongyuwei/7732f641fc67efeaf877 to your computer and use it in GitHub Desktop.
Save dongyuwei/7732f641fc67efeaf877 to your computer and use it in GitHub Desktop.

Revisions

  1. @smhanov smhanov revised this gist Nov 5, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion dawg.py
    Original file line number Diff line number Diff line change
    @@ -73,7 +73,7 @@ def __init__(self):
    self.data = []

    def insert( self, word, data ):
    if word < self.previousWord:
    if word <= self.previousWord:
    raise Exception("Error: Words must be inserted in alphabetical " +
    "order.")

  2. @smhanov smhanov renamed this gist Nov 5, 2014. 1 changed file with 6 additions and 9 deletions.
    15 changes: 6 additions & 9 deletions gistfile1.txt → dawg.py
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,6 @@
    #!/usr/bin/python3
    # By Steve Hanov, 2011. Released to the public domain.
    # Updated 2014 to use DAWG as a mapping.
    import sys
    import time

    @@ -47,15 +48,11 @@ def numReachable(self):
    if self.count: return self.count

    # count the number of final nodes that are reachable from this one.
    if len(self.edges) == 0:
    # no children. One node is reachable.
    count = 1
    else:
    # otherwise, the count the number of final-nodes, including self.
    count = 0
    if self.final: count += 1
    for label, node in sorted(self.edges.items()):
    count += node.numReachable()
    # including self
    count = 0
    if self.final: count += 1
    for label, node in sorted(self.edges.items()):
    count += node.numReachable()

    self.count = count
    return count
  3. @smhanov smhanov created this gist Nov 5, 2014.
    181 changes: 181 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,181 @@
    #!/usr/bin/python3
    # By Steve Hanov, 2011. Released to the public domain.
    import sys
    import time

    DICTIONARY = "/usr/share/dict/words"
    QUERY = sys.argv[1:]

    # This class represents a node in the directed acyclic word graph (DAWG). It
    # has a list of edges to other nodes. It has functions for testing whether it
    # is equivalent to another node. Nodes are equivalent if they have identical
    # edges, and each identical edge leads to identical states. The __hash__ and
    # __eq__ functions allow it to be used as a key in a python dictionary.
    class DawgNode:
    NextId = 0

    def __init__(self):
    self.id = DawgNode.NextId
    DawgNode.NextId += 1
    self.final = False
    self.edges = {}

    # Number of end nodes reachable from this one.
    self.count = 0

    def __str__(self):
    arr = []
    if self.final:
    arr.append("1")
    else:
    arr.append("0")

    for (label, node) in self.edges.items():
    arr.append( label )
    arr.append( str( node.id ) )

    return "_".join(arr)

    def __hash__(self):
    return self.__str__().__hash__()

    def __eq__(self, other):
    return self.__str__() == other.__str__()

    def numReachable(self):
    # if a count is already assigned, return it
    if self.count: return self.count

    # count the number of final nodes that are reachable from this one.
    if len(self.edges) == 0:
    # no children. One node is reachable.
    count = 1
    else:
    # otherwise, the count the number of final-nodes, including self.
    count = 0
    if self.final: count += 1
    for label, node in sorted(self.edges.items()):
    count += node.numReachable()

    self.count = count
    return count

    class Dawg:
    def __init__(self):
    self.previousWord = ""
    self.root = DawgNode()

    # Here is a list of nodes that have not been checked for duplication.
    self.uncheckedNodes = []

    # Here is a list of unique nodes that have been checked for
    # duplication.
    self.minimizedNodes = {}

    # Here is the data associated with all the nodes
    self.data = []

    def insert( self, word, data ):
    if word < self.previousWord:
    raise Exception("Error: Words must be inserted in alphabetical " +
    "order.")

    # find common prefix between word and previous word
    commonPrefix = 0
    for i in range( min( len( word ), len( self.previousWord ) ) ):
    if word[i] != self.previousWord[i]: break
    commonPrefix += 1

    # Check the uncheckedNodes for redundant nodes, proceeding from last
    # one down to the common prefix size. Then truncate the list at that
    # point.
    self._minimize( commonPrefix )

    self.data.append(data)

    # add the suffix, starting from the correct node mid-way through the
    # graph
    if len(self.uncheckedNodes) == 0:
    node = self.root
    else:
    node = self.uncheckedNodes[-1][2]

    for letter in word[commonPrefix:]:
    nextNode = DawgNode()
    node.edges[letter] = nextNode
    self.uncheckedNodes.append( (node, letter, nextNode) )
    node = nextNode

    node.final = True
    self.previousWord = word

    def finish( self ):
    # minimize all uncheckedNodes
    self._minimize( 0 );

    # go through entire structure and assign the counts to each node.
    self.root.numReachable()

    def _minimize( self, downTo ):
    # proceed from the leaf up to a certain point
    for i in range( len(self.uncheckedNodes) - 1, downTo - 1, -1 ):
    (parent, letter, child) = self.uncheckedNodes[i];
    if child in self.minimizedNodes:
    # replace the child with the previously encountered one
    parent.edges[letter] = self.minimizedNodes[child]
    else:
    # add the state to the minimized nodes.
    self.minimizedNodes[child] = child;
    self.uncheckedNodes.pop()

    def lookup( self, word ):
    node = self.root
    skipped = 0 # keep track of number of final nodes that we skipped
    for letter in word:
    if letter not in node.edges: return None
    for label, child in sorted(node.edges.items()):
    if label == letter:
    if node.final: skipped += 1
    node = child
    break
    skipped += child.count

    if node.final:
    return self.data[skipped]

    def nodeCount( self ):
    return len(self.minimizedNodes)

    def edgeCount( self ):
    count = 0
    for node in self.minimizedNodes:
    count += len(node.edges)
    return count


    dawg = Dawg()
    WordCount = 0
    words = open(DICTIONARY, "rt").read().split()
    words.sort()
    start = time.time()
    for word in words:
    WordCount += 1
    # insert all words, using the reversed version as the data associated with
    # it
    dawg.insert(word, ''.join(reversed(word)))
    if ( WordCount % 100 ) == 0: print("{0}\r".format(WordCount), end="")
    dawg.finish()
    print("Dawg creation took {0} s".format(time.time()-start))

    EdgeCount = dawg.edgeCount()
    print("Read {0} words into {1} nodes and {2} edges".format(
    WordCount, dawg.nodeCount(), EdgeCount))

    print("This could be stored in as little as {0} bytes".format(EdgeCount * 4))

    for word in QUERY:
    result = dawg.lookup(word)
    if result == None:
    print("{0} not in dictionary.".format(word))
    else:
    print("{0} is in the dictionary and has data {1}".format(word, result))