Last active
May 24, 2019 13:38
-
-
Save MarkTuddenham/069def4d209933bcfd977c8e4194f466 to your computer and use it in GitHub Desktop.
Create simple regex expression from examples: 'simple_regex aa_bb_cc.py aa.pyc' -> aa*.py*
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
| """Simple regex creator. | |
| Creates regex expressions from examples. | |
| """ | |
| from typing import Set | |
| import sys | |
| def lcs(str1: str, str2: str) -> Set[str]: | |
| """Find longest common substring. | |
| https://bogotobogo.com/python/python_longest_common_substring_lcs_algorithm_generalized_suffix_tree.php | |
| """ | |
| str1_len = len(str1) | |
| str2_len = len(str2) | |
| counter = [[0]*(str2_len+1) for x in range(str1_len+1)] | |
| longest = 0 | |
| lcs_set: Set[str] = set() | |
| for i in range(str1_len): | |
| for j in range(str2_len): | |
| if str1[i] == str2[j]: | |
| count = counter[i][j] + 1 | |
| counter[i+1][j+1] = count | |
| if count > longest: | |
| lcs_set = set() | |
| longest = count | |
| lcs_set.add(str1[i-count+1:i+1]) | |
| elif count == longest: | |
| lcs_set.add(str1[i-count+1:i+1]) | |
| return lcs_set | |
| def split(str1: str, str2: str) -> str: | |
| """Find the least restrictive regex of two strings.""" | |
| # we only want to return * if at least one string has characters | |
| if str1 is str2 is '': | |
| return '' | |
| ret = lcs(str1, str2) | |
| try: | |
| sub_str = next(iter(ret)) | |
| # if we split around 1 character matches | |
| # then we get a load of potentially spurious rules | |
| # e.g. | |
| # split('total', 'example') -> *a*l* | |
| # which is probably not right | |
| if len(sub_str) is 1: | |
| return '*' | |
| front_str1, back_str1 = str1.split(sub_str) | |
| front_str2, back_str2 = str2.split(sub_str) | |
| return split(front_str1, front_str2) + sub_str + split(back_str1, back_str2) | |
| except StopIteration as sub_str: | |
| # No common substrings | |
| return '*' | |
| def regex(*args: str) -> str: | |
| """Find the least restrictive regex of n strings.""" | |
| return args[0] if len(args) is 1 else split(args[0], regex(*args[1:])) | |
| if __name__ == "__main__": | |
| print(regex(*sys.argv[1:])) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment