Skip to content

Instantly share code, notes, and snippets.

@MarkTuddenham
Last active May 24, 2019 13:38
Show Gist options
  • Select an option

  • Save MarkTuddenham/069def4d209933bcfd977c8e4194f466 to your computer and use it in GitHub Desktop.

Select an option

Save MarkTuddenham/069def4d209933bcfd977c8e4194f466 to your computer and use it in GitHub Desktop.
Create simple regex expression from examples: 'simple_regex aa_bb_cc.py aa.pyc' -> aa*.py*
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Simple regex creator.
Creates regex expressions from examples.
"""
from typing import Set
import sys
def lcs(str1: str, str2: str) -> Set[str]:
"""Find longest common substring.
https://bogotobogo.com/python/python_longest_common_substring_lcs_algorithm_generalized_suffix_tree.php
"""
str1_len = len(str1)
str2_len = len(str2)
counter = [[0]*(str2_len+1) for x in range(str1_len+1)]
longest = 0
lcs_set: Set[str] = set()
for i in range(str1_len):
for j in range(str2_len):
if str1[i] == str2[j]:
count = counter[i][j] + 1
counter[i+1][j+1] = count
if count > longest:
lcs_set = set()
longest = count
lcs_set.add(str1[i-count+1:i+1])
elif count == longest:
lcs_set.add(str1[i-count+1:i+1])
return lcs_set
def split(str1: str, str2: str) -> str:
"""Find the least restrictive regex of two strings."""
# we only want to return * if at least one string has characters
if str1 is str2 is '':
return ''
ret = lcs(str1, str2)
try:
sub_str = next(iter(ret))
# if we split around 1 character matches
# then we get a load of potentially spurious rules
# e.g.
# split('total', 'example') -> *a*l*
# which is probably not right
if len(sub_str) is 1:
return '*'
front_str1, back_str1 = str1.split(sub_str)
front_str2, back_str2 = str2.split(sub_str)
return split(front_str1, front_str2) + sub_str + split(back_str1, back_str2)
except StopIteration as sub_str:
# No common substrings
return '*'
def regex(*args: str) -> str:
"""Find the least restrictive regex of n strings."""
return args[0] if len(args) is 1 else split(args[0], regex(*args[1:]))
if __name__ == "__main__":
print(regex(*sys.argv[1:]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment