Created
September 10, 2013 08:23
-
-
Save istepanov/6506508 to your computer and use it in GitHub Desktop.
Revisions
-
istepanov created this gist
Sep 10, 2013 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,336 @@ #!/usr/bin/env python # -*- coding: utf-8 """ Search longest common substrings using generalized suffix trees built with Ukkonen's algorithm Author: Ilya Stepanov <code at ilyastepanov.com> (c) 2013 """ from __future__ import print_function import sys import re import argparse END_OF_STRING = sys.maxint class SuffixTreeNode: """ Suffix tree node class. Actually, it also respresents a tree edge that points to this node. """ new_identifier = 0 def __init__(self, start=0, end=END_OF_STRING): self.identifier = SuffixTreeNode.new_identifier SuffixTreeNode.new_identifier += 1 # suffix link is required by Ukkonen's algorithm self.suffix_link = None # child edges/nodes, each dict key represents the first letter of an edge self.edges = {} # stores reference to parent self.parent = None # bit vector shows to which strings this node belongs self.bit_vector = 0 # edge info: start index and end index self.start = start self.end = end def add_child(self, key, start, end): """ Create a new child node Agrs: key: a char that will be used during active edge searching start, end: node's edge start and end indices Returns: created child node """ child = SuffixTreeNode(start=start, end=end) child.parent = self self.edges[key] = child return child def add_exisiting_node_as_child(self, key, node): """ Add an existing node as a child Args: key: a char that will be used during active edge searching node: a node that will be added as a child """ node.parent = self self.edges[key] = node def get_edge_length(self, current_index): """ Get length of an edge that points to this node Args: current_index: index of current processing symbol (usefull for leaf nodes that have "infinity" end index) """ return min(self.end, current_index + 1) - self.start def __str__(self): return 'id=' + str(self.identifier) class SuffixTree: """ Generalized suffix tree """ def __init__(self): # the root node self.root = SuffixTreeNode() # all strings are concatenaited together. Tree's nodes stores only indices self.input_string = '' # number of strings stored by this tree self.strings_count = 0 # list of tree leaves self.leaves = [] def append_string(self, input_string): """ Add new string to the suffix tree """ start_index = len(self.input_string) current_string_index = self.strings_count # each sting should have a unique ending input_string += '$' + str(current_string_index) # gathering 'em all together self.input_string += input_string self.strings_count += 1 # these 3 variables represents current "active point" active_node = self.root active_edge = 0 active_length = 0 # shows how many remainder = 0 # new leaves appended to tree new_leaves = [] # main circle for index in range(start_index, len(self.input_string)): previous_node = None remainder += 1 while remainder > 0: if active_length == 0: active_edge = index if self.input_string[active_edge] not in active_node.edges: # no edge starting with current char, so creating a new leaf node leaf_node = active_node.add_child(self.input_string[active_edge], index, END_OF_STRING) # a leaf node will always be leaf node belonging to only one string # (because each string has different termination) leaf_node.bit_vector = 1 << current_string_index new_leaves.append(leaf_node) # doing suffix link magic if previous_node is not None: previous_node.suffix_link = active_node previous_node = active_node else: # ok, we've got an active edge next_node = active_node.edges[self.input_string[active_edge]] # walking down through edges (if active_length is bigger than edge length) next_edge_length = next_node.get_edge_length(index) if active_length >= next_node.get_edge_length(index): active_edge += next_edge_length active_length -= next_edge_length active_node = next_node continue # current edge already contains the suffix we need to insert. # Increase the active_length and go forward if self.input_string[next_node.start + active_length] == self.input_string[index]: active_length += 1 if previous_node is not None: previous_node.suffix_link = active_node previous_node = active_node break # splitting edge split_node = active_node.add_child( self.input_string[active_edge], next_node.start, next_node.start + active_length ) next_node.start += active_length split_node.add_exisiting_node_as_child(self.input_string[next_node.start], next_node) leaf_node = split_node.add_child(self.input_string[index], index, END_OF_STRING) leaf_node.bit_vector = 1 << current_string_index new_leaves.append(leaf_node) # suffix link magic again if previous_node is not None: previous_node.suffix_link = split_node previous_node = split_node remainder -= 1 # follow suffix link (if exists) or go to root if active_node == self.root and active_length > 0: active_length -= 1 active_edge = index - remainder + 1 else: active_node = active_node.suffix_link if active_node.suffix_link is not None else self.root # update leaves ends from "infinity" to actual string end for leaf in new_leaves: leaf.end = len(self.input_string) self.leaves.extend(new_leaves) def find_longest_common_substrings(self): """ Search longest common substrings in the tree by locating lowest common ancestors what belong to all strings """ # all bits are set success_bit_vector = 2 ** self.strings_count - 1 lowest_common_ancestors = [] # going up to the root for leaf in self.leaves: node = leaf while node.parent is not None: if node.bit_vector != success_bit_vector: # updating parent's bit vector node.parent.bit_vector |= node.bit_vector node = node.parent else: # hey, we've found a lowest common ancestor! lowest_common_ancestors.append(node) break longest_common_substrings = [''] longest_length = 0 # need to filter the result array and get the longest common strings for common_ancestor in lowest_common_ancestors: common_substring = '' node = common_ancestor while node.parent is not None: label = self.input_string[node.start:node.end] common_substring = label + common_substring node = node.parent # remove unique endings ($<number>), we don't need them anymore common_substring = re.sub(r'(.*?)\$?\d*$', r'\1', common_substring) if len(common_substring) > longest_length: longest_length = len(common_substring) longest_common_substrings = [common_substring] elif len(common_substring) == longest_length and common_substring not in longest_common_substrings: longest_common_substrings.append(common_substring) return longest_common_substrings def to_graphviz(self, node=None, output=''): """ Show the tree as graphviz string. For debugging purposes only """ if node is None: node = self.root output = 'digraph G {edge [arrowsize=0.4,fontsize=10];' output +=\ str(node.identifier) + '[label="' +\ str(node.identifier) + '\\n' + '{0:b}'.format(node.bit_vector).zfill(self.strings_count) + '"' if node.bit_vector == 2 ** self.strings_count - 1: output += ',style="filled",fillcolor="red"' output += '];' if node.suffix_link is not None: output += str(node.identifier) + '->' + str(node.suffix_link.identifier) + '[style="dashed"];' for child in node.edges.values(): label = self.input_string[child.start:child.end] output += str(node.identifier) + '->' + str(child.identifier) + '[label="' + label + '"];' output = self.to_graphviz(child, output) if node == self.root: output += '}' return output def __str__(self): return self.to_graphviz() def main(): parser = argparse.ArgumentParser( description='Searching longest common substring. ' 'Uses Ukkonen\'s suffix tree algorithm and generalized suffix tree. ' 'Written by Ilya Stepanov (c) 2013' ) parser.add_argument( 'strings', metavar='STRING', nargs='*', help='String for searching', ) parser.add_argument( '-f', '--file', help='Path for input file. First line should contain number of lines to search in' ) parser.add_argument( '-d', '--debug', help='Debug mode: shows generalized suffix tree in output (graphviz)', action="store_true" ) args = parser.parse_args() if not args.strings and not args.file: parser.print_help() exit() suffix_tree = SuffixTree() for s in args.strings: suffix_tree.append_string(s) if args.file: with open(args.file, 'rU') as f: first_line = f.readline().strip('\r\n') if first_line.isdigit(): max_lines_count = int(first_line) else: max_lines_count = sys.maxint suffix_tree.append_string(first_line) for index, line in enumerate(f): if index >= max_lines_count: break suffix_tree.append_string(line.strip('\r\n')) lcs = suffix_tree.find_longest_common_substrings() for s in lcs: print(s) if args.debug: print(suffix_tree.to_graphviz()) if __name__ == "__main__": main()