Skip to content

Instantly share code, notes, and snippets.

@EricDuminil
Forked from atiking/regexp-trie.py
Last active June 30, 2025 20:38
Show Gist options
  • Save EricDuminil/8faabc2f3de82b24e5a371b6dc0fd1e0 to your computer and use it in GitHub Desktop.
Save EricDuminil/8faabc2f3de82b24e5a371b6dc0fd1e0 to your computer and use it in GitHub Desktop.

Revisions

  1. EricDuminil revised this gist Jun 30, 2025. 1 changed file with 13 additions and 0 deletions.
    13 changes: 13 additions & 0 deletions trie.py
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,19 @@
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    #
    # Original Perl module: Regexp::Trie
    # Original Copyright (C) 2006 by Dan Kogai
    #
    # This Python translation is a derivative work based on Regexp::Trie
    # Copyright (c) 2010 by rex
    # Copyright (c) 2017 by fcicq, atiking and EricDuminil

    # This Python code is licensed under the Artistic License 2.0 (AL2.0).
    # A copy of the Artistic License 2.0 can be found at:
    # https://opensource.org/licenses/Artistic-2.0
    #
    # Original source: https://metacpan.org/pod/Regexp::Trie
    #
    #author: rex
    #blog: http://iregex.org
    #filename trie.py
  2. EricDuminil revised this gist Mar 14, 2017. 1 changed file with 3 additions and 2 deletions.
    5 changes: 3 additions & 2 deletions trie.py
    Original file line number Diff line number Diff line change
    @@ -3,7 +3,7 @@
    #
    #author: rex
    #blog: http://iregex.org
    #filename tr.py
    #filename trie.py
    #created: 2010-08-01 20:24
    #source uri: http://iregex.org/blog/trie-in-python.html

    @@ -14,7 +14,8 @@


    class Trie():
    """Regexp::Trie in python"""
    """Regexp::Trie in python. Creates a Trie out of a list of words. The trie can be exported to a Regexp pattern.
    The corresponding Regexp should match much faster than a simple Regexp union."""

    def __init__(self):
    self.data = {}
  3. EricDuminil renamed this gist Mar 14, 2017. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  4. EricDuminil revised this gist Mar 14, 2017. 1 changed file with 36 additions and 31 deletions.
    67 changes: 36 additions & 31 deletions regexp-trie.py
    Original file line number Diff line number Diff line change
    @@ -8,70 +8,75 @@
    #source uri: http://iregex.org/blog/trie-in-python.html

    # escape bug fix by fcicq @ 2012.8.19
    # python3 compatible by EricDuminil @ 2017.03.

    import re


    class Trie():
    """Regexp::Trie in python"""

    def __init__(self):
    self.data={}
    self.data = {}

    def add(self, word):
    ref=self.data
    ref = self.data
    for char in word:
    ref[char]=ref.has_key(char) and ref[char] or {}
    ref=ref[char]
    ref['']=1
    ref[char] = char in ref and ref[char] or {}
    ref = ref[char]
    ref[''] = 1

    def dump(self):
    return self.data

    def quote(self, char):
    return re.escape(char)

    def _regexp(self, pData):
    data=pData
    if data.has_key("") and len(data.keys())==1:
    def _pattern(self, pData):
    data = pData
    if "" in data and len(data.keys()) == 1:
    return None

    alt=[]
    cc=[]
    q=0
    alt = []
    cc = []
    q = 0
    for char in sorted(data.keys()):
    if isinstance(data[char],dict):
    if isinstance(data[char], dict):
    try:
    recurse=self._regexp(data[char])
    alt.append(self.quote(char)+recurse)
    recurse = self._pattern(data[char])
    alt.append(self.quote(char) + recurse)
    except:
    cc.append(self.quote(char))
    else:
    q=1
    cconly=not len(alt)>0
    q = 1
    cconly = not len(alt) > 0

    if len(cc)>0:
    if len(cc)==1:
    if len(cc) > 0:
    if len(cc) == 1:
    alt.append(cc[0])
    else:
    alt.append('['+''.join(cc)+']')
    alt.append('[' + ''.join(cc) + ']')

    if len(alt)==1:
    result=alt[0]
    if len(alt) == 1:
    result = alt[0]
    else:
    result="(?:"+"|".join(alt)+")"
    result = "(?:" + "|".join(alt) + ")"

    if q:
    if cconly:
    result+="?"
    result += "?"
    else:
    result="(?:%s)?" % result
    result = "(?:%s)?" % result
    return result
    def regexp(self):
    # return "(?-xism:%s)" % self._regexp(self.dump()) # fcicq: not sure what is it...
    return self._regexp(self.dump())

    def pattern(self):
    return self._pattern(self.dump())


    if __name__ == '__main__':
    a=Trie()
    t = Trie()

    for w in ['foobar', 'foobah', 'fooxar', 'foozap', 'fooza']:
    a.add(w)
    print a.regexp()
    for w in ['foobar', 'foobah', 'fooxar', 'foozap', 'fooza']:
    t.add(w)
    print(t.pattern())
    #=> "foo(?:ba[hr]|xar|zap?)"
  5. @atiking atiking revised this gist Nov 18, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion regexp-trie.py
    Original file line number Diff line number Diff line change
    @@ -46,7 +46,7 @@ def _regexp(self, pData):
    cc.append(self.quote(char))
    else:
    q=1
    cconly=len(alt) and 0 or 1 #if len, 0; else:0
    cconly=not len(alt)>0

    if len(cc)>0:
    if len(cc)==1:
  6. @fcicq fcicq revised this gist Aug 19, 2012. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions regexp-trie.py
    Original file line number Diff line number Diff line change
    @@ -8,7 +8,6 @@
    #source uri: http://iregex.org/blog/trie-in-python.html

    # escape bug fix by fcicq @ 2012.8.19
    # known issues: remove the prefix '(?-xism:(?:' and suffix '))' in order to work.

    import re
    class Trie():
    @@ -67,7 +66,8 @@ def _regexp(self, pData):
    result="(?:%s)?" % result
    return result
    def regexp(self):
    return "(?-xism:%s)" % self._regexp(self.dump())
    # return "(?-xism:%s)" % self._regexp(self.dump()) # fcicq: not sure what is it...
    return self._regexp(self.dump())

    if __name__ == '__main__':
    a=Trie()
  7. @fcicq fcicq revised this gist Aug 19, 2012. 1 changed file with 10 additions and 2 deletions.
    12 changes: 10 additions & 2 deletions regexp-trie.py
    Original file line number Diff line number Diff line change
    @@ -7,8 +7,13 @@
    #created: 2010-08-01 20:24
    #source uri: http://iregex.org/blog/trie-in-python.html

    # escape bug fix by fcicq @ 2012.8.19
    # known issues: remove the prefix '(?-xism:(?:' and suffix '))' in order to work.

    import re
    class Trie():
    """Regexp::Trie in python"""

    def __init__(self):
    self.data={}

    @@ -22,6 +27,9 @@ def add(self, word):
    def dump(self):
    return self.data

    def quote(self, char):
    return re.escape(char)

    def _regexp(self, pData):
    data=pData
    if data.has_key("") and len(data.keys())==1:
    @@ -34,9 +42,9 @@ def _regexp(self, pData):
    if isinstance(data[char],dict):
    try:
    recurse=self._regexp(data[char])
    alt.append(char+recurse)
    alt.append(self.quote(char)+recurse)
    except:
    cc.append(char)
    cc.append(self.quote(char))
    else:
    q=1
    cconly=len(alt) and 0 or 1 #if len, 0; else:0
  8. @fcicq fcicq created this gist Aug 19, 2012.
    69 changes: 69 additions & 0 deletions regexp-trie.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,69 @@
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    #
    #author: rex
    #blog: http://iregex.org
    #filename tr.py
    #created: 2010-08-01 20:24
    #source uri: http://iregex.org/blog/trie-in-python.html

    class Trie():
    """Regexp::Trie in python"""
    def __init__(self):
    self.data={}

    def add(self, word):
    ref=self.data
    for char in word:
    ref[char]=ref.has_key(char) and ref[char] or {}
    ref=ref[char]
    ref['']=1

    def dump(self):
    return self.data

    def _regexp(self, pData):
    data=pData
    if data.has_key("") and len(data.keys())==1:
    return None

    alt=[]
    cc=[]
    q=0
    for char in sorted(data.keys()):
    if isinstance(data[char],dict):
    try:
    recurse=self._regexp(data[char])
    alt.append(char+recurse)
    except:
    cc.append(char)
    else:
    q=1
    cconly=len(alt) and 0 or 1 #if len, 0; else:0

    if len(cc)>0:
    if len(cc)==1:
    alt.append(cc[0])
    else:
    alt.append('['+''.join(cc)+']')

    if len(alt)==1:
    result=alt[0]
    else:
    result="(?:"+"|".join(alt)+")"

    if q:
    if cconly:
    result+="?"
    else:
    result="(?:%s)?" % result
    return result
    def regexp(self):
    return "(?-xism:%s)" % self._regexp(self.dump())

    if __name__ == '__main__':
    a=Trie()

    for w in ['foobar', 'foobah', 'fooxar', 'foozap', 'fooza']:
    a.add(w)
    print a.regexp()