#!/usr/bin/env python
import re
from collections import defaultdict
from urllib2 import urlopen


def main():
    """Download the IANA TLD list and print a regex built from it.

    The printed text is the literal prefix ``.+\\\\.`` followed by the
    pattern returned by ``build_regex`` and a trailing ``$`` anchor.

    NOTE(review): the raw-string prefix emits *two* backslashes before the
    dot -- presumably so the output can be pasted into another string
    literal; confirm this is intended.
    """
    domains = set()
    # Optional second source (Public Suffix List), currently disabled:
    #domains |= get_domains_from_url('https://publicsuffix.org/list/effective_tld_names.dat')
    domains.update(
        get_domains_from_url('http://data.iana.org/TLD/tlds-alpha-by-domain.txt'))

    suffix_tree = build_tree(domains)
    suffix_pattern = build_regex(suffix_tree)

    # Parenthesised single expression: prints the same text under Python 2's
    # print statement.
    print(r'.+\\.' + suffix_pattern + '$')


# Sentinel characters that build_tree wraps around every domain: START is the
# root prefix of the tree and STOP marks the point where a domain ends.
# NOTE(review): presumably chosen because neither character can occur in a
# domain name -- confirm.
START = ';'
STOP = ':'


def build_regex(tree, start=START):
    """Render the prefix tree below ``start`` as a regex fragment.

    Args:
        tree: mapping from a prefix string to the set of single characters
            that may follow it (see ``build_tree``).  ``STOP`` in a set
            means a domain ends at that prefix.
        start: prefix whose continuations are rendered; defaults to the
            root prefix ``START``.

    Returns:
        str: regex source for every continuation reachable from ``start``.
        A single continuation is emitted inline; several continuations
        become a non-capturing group ``(?:a|b|...)`` in which ``STOP``
        contributes an empty alternative.  A prefix with no recorded
        continuation yields ``(?:)``.
    """
    # Sets iterate in arbitrary order; sorting makes the generated pattern
    # reproducible from run to run.
    items = sorted(tree[start])

    if len(items) == 1:
        only = items[0]
        if only == STOP:
            return ''
        return escape(only) + build_regex(tree, start + only)

    pattern = []
    for item in items:
        if item == STOP:
            # End of a domain: nothing more needs to match on this branch.
            pattern.append('')
        else:
            pattern.append(escape(item) + build_regex(tree, start + item))
    return '(?:' + '|'.join(pattern) + ')'


def build_tree(domains):
    """Build a character prefix tree from an iterable of domain strings.

    Every domain is wrapped as ``START + domain + STOP``; for each proper
    prefix of that string, the character that follows it is recorded.

    Returns:
        defaultdict(set): prefix string -> set of characters seen directly
        after that prefix.
    """
    following_by_prefix = defaultdict(set)
    for domain in domains:
        marked = START + domain + STOP
        for head, following in prefixes(marked):
            following_by_prefix[head].add(following)
    return following_by_prefix


def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128.

    An empty ``s`` is considered ASCII.
    """
    for char in s:
        if ord(char) >= 128:
            return False
    return True


def prefixes(input):
    """Yield ``(prefix, next_item)`` pairs for each non-empty proper prefix.

    For ``'abc'`` this yields ``('a', 'b')`` and then ``('ab', 'c')``.
    Inputs of length 0 or 1 yield nothing.
    """
    for cut in range(1, len(input)):
        head = input[:cut]
        following = input[cut]
        yield head, following


def escape(item):
    """Return ``item`` escaped with ``re.escape`` for use inside a regex."""
    quoted = re.escape(item)
    return quoted


def get_domains_from_url(url):
    """Download a domain list and return its entries as a set.

    Each line of the resource at ``url`` is stripped of surrounding
    whitespace.  Blank lines and lines starting with ``//``, ``#`` or ``!``
    are skipped.  Every remaining line is lower-cased, decoded as UTF-8,
    encoded with the ``idna`` codec and then has leading ``*`` and ``.``
    characters removed.  Entries that end up empty are dropped.

    Args:
        url: location of the list, passed to ``urlopen``.

    Returns:
        set: the cleaned domain strings.
    """
    response = urlopen(url)
    try:
        lines = response.readlines()
    finally:
        # Release the connection even if reading fails part-way through.
        response.close()

    skipped_prefixes = ('//', '#', '!')
    domains = set()
    for line in lines:
        entry = line.strip()
        if not entry or entry.startswith(skipped_prefixes):
            continue
        # lstrip('*.') removes any run of leading '*' and '.' characters,
        # which turns wildcard rules such as '*.ck' into 'ck'.
        name = entry.lower().decode('utf8').encode('idna').lstrip('*.')
        # An empty name would make build_tree record STOP at the root, so
        # the generated regex would accept an empty suffix.
        if name:
            domains.add(name)
    return domains


# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()