Skip to content

Instantly share code, notes, and snippets.

@istepanov
Created September 10, 2013 08:23
Show Gist options
  • Save istepanov/6506508 to your computer and use it in GitHub Desktop.
Save istepanov/6506508 to your computer and use it in GitHub Desktop.

Revisions

  1. istepanov created this gist Sep 10, 2013.
    336 changes: 336 additions & 0 deletions lcs.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,336 @@
    #!/usr/bin/env python
    # -*- coding: utf-8

    """
    Search longest common substrings using generalized suffix trees built with Ukkonen's algorithm
    Author: Ilya Stepanov <code at ilyastepanov.com>
    (c) 2013
    """


    from __future__ import print_function
    import sys
    import re
    import argparse

    END_OF_STRING = sys.maxint


    class SuffixTreeNode:
    """
    Suffix tree node class. Actually, it also respresents a tree edge that points to this node.
    """
    new_identifier = 0

    def __init__(self, start=0, end=END_OF_STRING):
    self.identifier = SuffixTreeNode.new_identifier
    SuffixTreeNode.new_identifier += 1

    # suffix link is required by Ukkonen's algorithm
    self.suffix_link = None

    # child edges/nodes, each dict key represents the first letter of an edge
    self.edges = {}

    # stores reference to parent
    self.parent = None

    # bit vector shows to which strings this node belongs
    self.bit_vector = 0

    # edge info: start index and end index
    self.start = start
    self.end = end

    def add_child(self, key, start, end):
    """
    Create a new child node
    Agrs:
    key: a char that will be used during active edge searching
    start, end: node's edge start and end indices
    Returns:
    created child node
    """
    child = SuffixTreeNode(start=start, end=end)
    child.parent = self
    self.edges[key] = child
    return child

    def add_exisiting_node_as_child(self, key, node):
    """
    Add an existing node as a child
    Args:
    key: a char that will be used during active edge searching
    node: a node that will be added as a child
    """
    node.parent = self
    self.edges[key] = node

    def get_edge_length(self, current_index):
    """
    Get length of an edge that points to this node
    Args:
    current_index: index of current processing symbol (usefull for leaf nodes that have "infinity" end index)
    """
    return min(self.end, current_index + 1) - self.start

    def __str__(self):
    return 'id=' + str(self.identifier)


    class SuffixTree:
    """
    Generalized suffix tree
    """

    def __init__(self):
    # the root node
    self.root = SuffixTreeNode()

    # all strings are concatenaited together. Tree's nodes stores only indices
    self.input_string = ''

    # number of strings stored by this tree
    self.strings_count = 0

    # list of tree leaves
    self.leaves = []

    def append_string(self, input_string):
    """
    Add new string to the suffix tree
    """
    start_index = len(self.input_string)
    current_string_index = self.strings_count

    # each sting should have a unique ending
    input_string += '$' + str(current_string_index)

    # gathering 'em all together
    self.input_string += input_string
    self.strings_count += 1

    # these 3 variables represents current "active point"
    active_node = self.root
    active_edge = 0
    active_length = 0

    # shows how many
    remainder = 0

    # new leaves appended to tree
    new_leaves = []

    # main circle
    for index in range(start_index, len(self.input_string)):
    previous_node = None
    remainder += 1
    while remainder > 0:
    if active_length == 0:
    active_edge = index

    if self.input_string[active_edge] not in active_node.edges:
    # no edge starting with current char, so creating a new leaf node
    leaf_node = active_node.add_child(self.input_string[active_edge], index, END_OF_STRING)

    # a leaf node will always be leaf node belonging to only one string
    # (because each string has different termination)
    leaf_node.bit_vector = 1 << current_string_index
    new_leaves.append(leaf_node)

    # doing suffix link magic
    if previous_node is not None:
    previous_node.suffix_link = active_node
    previous_node = active_node
    else:
    # ok, we've got an active edge
    next_node = active_node.edges[self.input_string[active_edge]]

    # walking down through edges (if active_length is bigger than edge length)
    next_edge_length = next_node.get_edge_length(index)
    if active_length >= next_node.get_edge_length(index):
    active_edge += next_edge_length
    active_length -= next_edge_length
    active_node = next_node
    continue

    # current edge already contains the suffix we need to insert.
    # Increase the active_length and go forward
    if self.input_string[next_node.start + active_length] == self.input_string[index]:
    active_length += 1
    if previous_node is not None:
    previous_node.suffix_link = active_node
    previous_node = active_node
    break

    # splitting edge
    split_node = active_node.add_child(
    self.input_string[active_edge],
    next_node.start,
    next_node.start + active_length
    )
    next_node.start += active_length
    split_node.add_exisiting_node_as_child(self.input_string[next_node.start], next_node)
    leaf_node = split_node.add_child(self.input_string[index], index, END_OF_STRING)
    leaf_node.bit_vector = 1 << current_string_index
    new_leaves.append(leaf_node)

    # suffix link magic again
    if previous_node is not None:
    previous_node.suffix_link = split_node
    previous_node = split_node

    remainder -= 1

    # follow suffix link (if exists) or go to root
    if active_node == self.root and active_length > 0:
    active_length -= 1
    active_edge = index - remainder + 1
    else:
    active_node = active_node.suffix_link if active_node.suffix_link is not None else self.root

    # update leaves ends from "infinity" to actual string end
    for leaf in new_leaves:
    leaf.end = len(self.input_string)
    self.leaves.extend(new_leaves)

    def find_longest_common_substrings(self):
    """
    Search longest common substrings in the tree by locating lowest common ancestors what belong to all strings
    """

    # all bits are set
    success_bit_vector = 2 ** self.strings_count - 1

    lowest_common_ancestors = []

    # going up to the root
    for leaf in self.leaves:
    node = leaf
    while node.parent is not None:
    if node.bit_vector != success_bit_vector:
    # updating parent's bit vector
    node.parent.bit_vector |= node.bit_vector
    node = node.parent
    else:
    # hey, we've found a lowest common ancestor!
    lowest_common_ancestors.append(node)
    break

    longest_common_substrings = ['']
    longest_length = 0

    # need to filter the result array and get the longest common strings
    for common_ancestor in lowest_common_ancestors:
    common_substring = ''
    node = common_ancestor
    while node.parent is not None:
    label = self.input_string[node.start:node.end]
    common_substring = label + common_substring
    node = node.parent
    # remove unique endings ($<number>), we don't need them anymore
    common_substring = re.sub(r'(.*?)\$?\d*$', r'\1', common_substring)
    if len(common_substring) > longest_length:
    longest_length = len(common_substring)
    longest_common_substrings = [common_substring]
    elif len(common_substring) == longest_length and common_substring not in longest_common_substrings:
    longest_common_substrings.append(common_substring)

    return longest_common_substrings

    def to_graphviz(self, node=None, output=''):
    """
    Show the tree as graphviz string. For debugging purposes only
    """
    if node is None:
    node = self.root
    output = 'digraph G {edge [arrowsize=0.4,fontsize=10];'

    output +=\
    str(node.identifier) + '[label="' +\
    str(node.identifier) + '\\n' + '{0:b}'.format(node.bit_vector).zfill(self.strings_count) + '"'
    if node.bit_vector == 2 ** self.strings_count - 1:
    output += ',style="filled",fillcolor="red"'
    output += '];'
    if node.suffix_link is not None:
    output += str(node.identifier) + '->' + str(node.suffix_link.identifier) + '[style="dashed"];'

    for child in node.edges.values():
    label = self.input_string[child.start:child.end]
    output += str(node.identifier) + '->' + str(child.identifier) + '[label="' + label + '"];'
    output = self.to_graphviz(child, output)

    if node == self.root:
    output += '}'

    return output

    def __str__(self):
    return self.to_graphviz()


    def main():
    parser = argparse.ArgumentParser(
    description='Searching longest common substring. '
    'Uses Ukkonen\'s suffix tree algorithm and generalized suffix tree. '
    'Written by Ilya Stepanov (c) 2013'
    )
    parser.add_argument(
    'strings',
    metavar='STRING',
    nargs='*',
    help='String for searching',
    )
    parser.add_argument(
    '-f',
    '--file',
    help='Path for input file. First line should contain number of lines to search in'
    )
    parser.add_argument(
    '-d',
    '--debug',
    help='Debug mode: shows generalized suffix tree in output (graphviz)',
    action="store_true"
    )

    args = parser.parse_args()
    if not args.strings and not args.file:
    parser.print_help()
    exit()

    suffix_tree = SuffixTree()

    for s in args.strings:
    suffix_tree.append_string(s)

    if args.file:
    with open(args.file, 'rU') as f:
    first_line = f.readline().strip('\r\n')
    if first_line.isdigit():
    max_lines_count = int(first_line)
    else:
    max_lines_count = sys.maxint
    suffix_tree.append_string(first_line)

    for index, line in enumerate(f):
    if index >= max_lines_count:
    break
    suffix_tree.append_string(line.strip('\r\n'))

    lcs = suffix_tree.find_longest_common_substrings()
    for s in lcs:
    print(s)

    if args.debug:
    print(suffix_tree.to_graphviz())


    if __name__ == "__main__":
    main()