@@ -0,0 +1,336 @@
#!/usr/bin/env python
# -*- coding: utf-8
"""
Search longest common substrings using generalized suffix trees built with Ukkonen's algorithm
Author: Ilya Stepanov <code at ilyastepanov.com>
(c) 2013
"""
from __future__ import print_function
import sys
import re
import argparse
END_OF_STRING = sys .maxint
class SuffixTreeNode :
"""
Suffix tree node class. Actually, it also respresents a tree edge that points to this node.
"""
new_identifier = 0
def __init__ (self , start = 0 , end = END_OF_STRING ):
self .identifier = SuffixTreeNode .new_identifier
SuffixTreeNode .new_identifier += 1
# suffix link is required by Ukkonen's algorithm
self .suffix_link = None
# child edges/nodes, each dict key represents the first letter of an edge
self .edges = {}
# stores reference to parent
self .parent = None
# bit vector shows to which strings this node belongs
self .bit_vector = 0
# edge info: start index and end index
self .start = start
self .end = end
def add_child (self , key , start , end ):
"""
Create a new child node
Agrs:
key: a char that will be used during active edge searching
start, end: node's edge start and end indices
Returns:
created child node
"""
child = SuffixTreeNode (start = start , end = end )
child .parent = self
self .edges [key ] = child
return child
def add_exisiting_node_as_child (self , key , node ):
"""
Add an existing node as a child
Args:
key: a char that will be used during active edge searching
node: a node that will be added as a child
"""
node .parent = self
self .edges [key ] = node
def get_edge_length (self , current_index ):
"""
Get length of an edge that points to this node
Args:
current_index: index of current processing symbol (usefull for leaf nodes that have "infinity" end index)
"""
return min (self .end , current_index + 1 ) - self .start
def __str__ (self ):
return 'id=' + str (self .identifier )
class SuffixTree :
"""
Generalized suffix tree
"""
def __init__ (self ):
# the root node
self .root = SuffixTreeNode ()
# all strings are concatenaited together. Tree's nodes stores only indices
self .input_string = ''
# number of strings stored by this tree
self .strings_count = 0
# list of tree leaves
self .leaves = []
def append_string (self , input_string ):
"""
Add new string to the suffix tree
"""
start_index = len (self .input_string )
current_string_index = self .strings_count
# each sting should have a unique ending
input_string += '$' + str (current_string_index )
# gathering 'em all together
self .input_string += input_string
self .strings_count += 1
# these 3 variables represents current "active point"
active_node = self .root
active_edge = 0
active_length = 0
# shows how many
remainder = 0
# new leaves appended to tree
new_leaves = []
# main circle
for index in range (start_index , len (self .input_string )):
previous_node = None
remainder += 1
while remainder > 0 :
if active_length == 0 :
active_edge = index
if self .input_string [active_edge ] not in active_node .edges :
# no edge starting with current char, so creating a new leaf node
leaf_node = active_node .add_child (self .input_string [active_edge ], index , END_OF_STRING )
# a leaf node will always be leaf node belonging to only one string
# (because each string has different termination)
leaf_node .bit_vector = 1 << current_string_index
new_leaves .append (leaf_node )
# doing suffix link magic
if previous_node is not None :
previous_node .suffix_link = active_node
previous_node = active_node
else :
# ok, we've got an active edge
next_node = active_node .edges [self .input_string [active_edge ]]
# walking down through edges (if active_length is bigger than edge length)
next_edge_length = next_node .get_edge_length (index )
if active_length >= next_node .get_edge_length (index ):
active_edge += next_edge_length
active_length -= next_edge_length
active_node = next_node
continue
# current edge already contains the suffix we need to insert.
# Increase the active_length and go forward
if self .input_string [next_node .start + active_length ] == self .input_string [index ]:
active_length += 1
if previous_node is not None :
previous_node .suffix_link = active_node
previous_node = active_node
break
# splitting edge
split_node = active_node .add_child (
self .input_string [active_edge ],
next_node .start ,
next_node .start + active_length
)
next_node .start += active_length
split_node .add_exisiting_node_as_child (self .input_string [next_node .start ], next_node )
leaf_node = split_node .add_child (self .input_string [index ], index , END_OF_STRING )
leaf_node .bit_vector = 1 << current_string_index
new_leaves .append (leaf_node )
# suffix link magic again
if previous_node is not None :
previous_node .suffix_link = split_node
previous_node = split_node
remainder -= 1
# follow suffix link (if exists) or go to root
if active_node == self .root and active_length > 0 :
active_length -= 1
active_edge = index - remainder + 1
else :
active_node = active_node .suffix_link if active_node .suffix_link is not None else self .root
# update leaves ends from "infinity" to actual string end
for leaf in new_leaves :
leaf .end = len (self .input_string )
self .leaves .extend (new_leaves )
def find_longest_common_substrings (self ):
"""
Search longest common substrings in the tree by locating lowest common ancestors what belong to all strings
"""
# all bits are set
success_bit_vector = 2 ** self .strings_count - 1
lowest_common_ancestors = []
# going up to the root
for leaf in self .leaves :
node = leaf
while node .parent is not None :
if node .bit_vector != success_bit_vector :
# updating parent's bit vector
node .parent .bit_vector |= node .bit_vector
node = node .parent
else :
# hey, we've found a lowest common ancestor!
lowest_common_ancestors .append (node )
break
longest_common_substrings = ['' ]
longest_length = 0
# need to filter the result array and get the longest common strings
for common_ancestor in lowest_common_ancestors :
common_substring = ''
node = common_ancestor
while node .parent is not None :
label = self .input_string [node .start :node .end ]
common_substring = label + common_substring
node = node .parent
# remove unique endings ($<number>), we don't need them anymore
common_substring = re .sub (r'(.*?)\$?\d*$' , r'\1' , common_substring )
if len (common_substring ) > longest_length :
longest_length = len (common_substring )
longest_common_substrings = [common_substring ]
elif len (common_substring ) == longest_length and common_substring not in longest_common_substrings :
longest_common_substrings .append (common_substring )
return longest_common_substrings
def to_graphviz (self , node = None , output = '' ):
"""
Show the tree as graphviz string. For debugging purposes only
"""
if node is None :
node = self .root
output = 'digraph G {edge [arrowsize=0.4,fontsize=10];'
output += \
str (node .identifier ) + '[label="' + \
str (node .identifier ) + '\\ n' + '{0:b}' .format (node .bit_vector ).zfill (self .strings_count ) + '"'
if node .bit_vector == 2 ** self .strings_count - 1 :
output += ',style="filled",fillcolor="red"'
output += '];'
if node .suffix_link is not None :
output += str (node .identifier ) + '->' + str (node .suffix_link .identifier ) + '[style="dashed"];'
for child in node .edges .values ():
label = self .input_string [child .start :child .end ]
output += str (node .identifier ) + '->' + str (child .identifier ) + '[label="' + label + '"];'
output = self .to_graphviz (child , output )
if node == self .root :
output += '}'
return output
def __str__ (self ):
return self .to_graphviz ()
def main ():
parser = argparse .ArgumentParser (
description = 'Searching longest common substring. '
'Uses Ukkonen\' s suffix tree algorithm and generalized suffix tree. '
'Written by Ilya Stepanov (c) 2013'
)
parser .add_argument (
'strings' ,
metavar = 'STRING' ,
nargs = '*' ,
help = 'String for searching' ,
)
parser .add_argument (
'-f' ,
'--file' ,
help = 'Path for input file. First line should contain number of lines to search in'
)
parser .add_argument (
'-d' ,
'--debug' ,
help = 'Debug mode: shows generalized suffix tree in output (graphviz)' ,
action = "store_true"
)
args = parser .parse_args ()
if not args .strings and not args .file :
parser .print_help ()
exit ()
suffix_tree = SuffixTree ()
for s in args .strings :
suffix_tree .append_string (s )
if args .file :
with open (args .file , 'rU' ) as f :
first_line = f .readline ().strip ('\r \n ' )
if first_line .isdigit ():
max_lines_count = int (first_line )
else :
max_lines_count = sys .maxint
suffix_tree .append_string (first_line )
for index , line in enumerate (f ):
if index >= max_lines_count :
break
suffix_tree .append_string (line .strip ('\r \n ' ))
lcs = suffix_tree .find_longest_common_substrings ()
for s in lcs :
print (s )
if args .debug :
print (suffix_tree .to_graphviz ())
if __name__ == "__main__" :
main ()