Skip to content

Instantly share code, notes, and snippets.

@axiak
Created November 5, 2014 02:25
Show Gist options
  • Select an option

  • Save axiak/8e7a920f53fa45a253a4 to your computer and use it in GitHub Desktop.

Select an option

Save axiak/8e7a920f53fa45a253a4 to your computer and use it in GitHub Desktop.

Revisions

  1. axiak created this gist Nov 5, 2014.
    77 changes: 77 additions & 0 deletions tlds.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,77 @@
    #!/usr/bin/env python
    import re
    from collections import defaultdict
    from urllib2 import urlopen


    def main():
        """Download the TLD list and print a regex for names ending in a TLD.

        The printed pattern is the literal prefix ``.+\\\\.`` followed by the
        alternation produced by ``build_regex`` and a closing ``$`` anchor.
        """
        # The Public Suffix List can be added as a further source:
        # https://publicsuffix.org/list/effective_tld_names.dat
        sources = (
            'http://data.iana.org/TLD/tlds-alpha-by-domain.txt',
        )

        domains = set()
        for url in sources:
            domains |= get_domains_from_url(url)

        suffix_pattern = build_regex(build_tree(domains))

        # NOTE(review): the raw-string prefix emits TWO backslashes before the
        # dot -- presumably so the output can be pasted into a string literal
        # that needs its own escaping; confirm with the consumer of the regex.
        print(r'.+\\.' + suffix_pattern + '$')


    # Sentinel characters wrapped around every domain before it is indexed:
    # START prefixes each domain and is the root key of the prefix tree, STOP
    # is appended and marks the point where a domain ends.
    # Presumably chosen because neither character can occur in a domain name
    # -- confirm before feeding in other data sources.
    START = ';'
    STOP = ':'


    def build_regex(tree, start=START):
        """Render the prefix tree below ``start`` as a regular-expression fragment.

        Args:
            tree: Mapping from a prefix (beginning with START) to the set of
                characters that may follow it, as produced by ``build_tree``.
                STOP in a set means a domain ends at that prefix.
            start: Prefix whose continuations should be rendered.

        Returns:
            A pattern string matching every suffix reachable from ``start``.
            A single continuation is emitted inline; several continuations
            become a non-capturing ``(?:a|b|...)`` group in which an empty
            alternative stands for "the domain may end here".
        """
        items = tree[start]
        if len(items) == 1:
            # Unpack the single element instead of calling iter(items).next(),
            # which exists only on Python 2 iterators (and required a local
            # that shadowed the builtin ``next``).
            (only,) = items
            if only == STOP:
                return ''
            return escape(only) + build_regex(tree, start + only)

        # Sets iterate in arbitrary order, so sort to make the generated
        # pattern reproducible between runs.  STOP sorts last so the empty
        # alternative is tried only after the longer ones.
        pattern = []
        for item in sorted(items, key=lambda c: (c == STOP, c)):
            if item == STOP:
                pattern.append('')
            else:
                pattern.append(escape(item) + build_regex(tree, start + item))
        return '(?:' + '|'.join(pattern) + ')'


    def build_tree(domains):
        """Index every domain by prefix.

        Each domain is wrapped as ``START + domain + STOP``; for every proper
        prefix of that wrapped string the character that follows it is
        recorded.

        Args:
            domains: Iterable of domain strings.

        Returns:
            A ``defaultdict(set)`` mapping each prefix to the set of characters
            seen immediately after it.
        """
        followers = defaultdict(set)
        for name in domains:
            wrapped = START + name + STOP
            for head, following in prefixes(wrapped):
                followers[head].add(following)
        return followers


    def is_ascii(s):
        """Return True when every character of ``s`` has a code point below 128.

        An empty ``s`` yields True.
        """
        for ch in s:
            if ord(ch) >= 128:
                return False
        return True


    def prefixes(input):
        """Yield ``(prefix, next_item)`` pairs for each non-empty proper prefix.

        For ``"abc"`` this produces ``("a", "b")`` and ``("ab", "c")``.  Inputs
        shorter than two items produce nothing.
        """
        # Walk the items from position 1 onward; everything before the current
        # position is the prefix that precedes it.
        for position, following in enumerate(input[1:], 1):
            yield input[:position], following


    def escape(item):
        """Return ``item`` escaped for literal use in a regular expression.

        Delegates to ``re.escape``.
        """
        escaped = re.escape(item)
        return escaped


    def get_domains_from_url(url):
        """Fetch a domain list from ``url`` and return it as a normalized set.

        Blank lines and lines starting with ``//``, ``#`` or ``!`` are skipped.
        Every remaining line is stripped, lower-cased, decoded as UTF-8,
        re-encoded with the ``idna`` codec, and then has any leading ``*`` and
        ``.`` characters removed.

        Args:
            url: Location of a text resource with one domain per line.

        Returns:
            A set of normalized domain strings.
        """
        f = urlopen(url)
        try:
            lines = f.readlines()
        finally:
            # Release the connection even when reading fails part-way.
            f.close()

        domains = set()
        for line in lines:
            # Strip once and reuse the result for both filtering and output.
            entry = line.strip()
            if not entry or entry.startswith(('//', '#', '!')):
                continue
            domains.add(
                entry.lower()
                .decode('utf8').encode('idna')
                # lstrip removes any run of '*' / '.' characters, which turns
                # wildcard entries such as "*.ck" into "ck".
                .lstrip('*.')
            )
        return domains


    # Generate and print the regex only when executed as a script, not when
    # the module is imported.
    if __name__ == "__main__":
        main()