#!/usr/bin/env python import re from collections import defaultdict from urllib2 import urlopen def main(): domains = set() #domains |= get_domains_from_url('https://publicsuffix.org/list/effective_tld_names.dat') domains |= get_domains_from_url('http://data.iana.org/TLD/tlds-alpha-by-domain.txt') tree = build_tree(domains) print r'.+\\.' + build_regex(tree) + '$' START = ';' STOP = ':' def build_regex(tree, start=START): items = tree[start] if len(items) == 1: next = iter(items).next() if next == STOP: return '' return escape(next) + build_regex(tree, start + next) else: pattern = [] for item in items: if item == STOP: pattern.append('') else: pattern.append(escape(item) + build_regex(tree, start + item)) return '(?:' + '|'.join(pattern) + ')' def build_tree(domains): tree = defaultdict(set) for domain in domains: for prefix, next in prefixes(START + domain + STOP): tree[prefix].add(next) return tree def is_ascii(s): return all(ord(c) < 128 for c in s) def prefixes(input): for i in range(1, len(input), 1): yield input[:i], input[i] def escape(item): return re.escape(item) def get_domains_from_url(url): f = urlopen(url) domains = f.readlines() f.close() return { domain.strip().lower() .decode('utf8').encode('idna') .lstrip('*.') for domain in domains if not domain.strip().startswith('//') and not domain.strip().startswith('#') and not domain.strip().startswith('!') and domain.strip() } if __name__ == '__main__': main()