Created
November 5, 2014 02:25
-
-
Save axiak/8e7a920f53fa45a253a4 to your computer and use it in GitHub Desktop.
Revisions
-
axiak created this gist
Nov 5, 2014. There are no files selected for viewing
#!/usr/bin/env python
"""Print a regular expression matching host names that end in a known TLD.

The script downloads a list of top-level domains, arranges them into a
character-level prefix tree and serialises that tree as one compact
alternation, e.g. ``(?:co(?:m|)|net)`` for ``com``, ``co`` and ``net``.

Runs on both Python 2.7 and Python 3.
"""
import re
from collections import defaultdict
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib2 import urlopen


def main():
    """Fetch the TLD list and print the generated pattern to stdout."""
    domains = set()
    # Alternative source (Public Suffix List), disabled by the original author:
    #domains |= get_domains_from_url('https://publicsuffix.org/list/effective_tld_names.dat')
    domains |= get_domains_from_url('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    tree = build_tree(domains)
    # NOTE(review): the raw string emits TWO backslashes before the dot, so the
    # output starts with `.+\\.`.  That is right if the pattern is meant to be
    # pasted into a string literal that needs its backslashes doubled, but as a
    # bare regex it demands a literal backslash.  Kept as-is -- confirm intent.
    # A single parenthesised argument prints identically on Python 2 and 3.
    print(r'.+\\.' + build_regex(tree) + '$')


# Sentinels wrapped around every domain before it is inserted into the tree.
# START is the root key; STOP marks "a domain ends here".
START = ';'
STOP = ':'


def build_regex(tree, start=START):
    """Serialise the prefix tree below *start* into a regex fragment.

    Args:
        tree: mapping from a prefix (beginning with START) to the set of
            characters that may follow it, as produced by ``build_tree``.
        start: prefix whose continuations should be rendered.

    Returns:
        The regex source for everything that can follow *start*.  A prefix
        with a single continuation is emitted inline; several continuations
        become a non-capturing group ``(?:a|b|...)``.
    """
    items = tree[start]
    if len(items) == 1:
        (only,) = items
        if only == STOP:
            return ''
        return escape(only) + build_regex(tree, start + only)

    # Sort so the output is reproducible regardless of set iteration order.
    # STOP (the empty alternative) goes last so longer names are tried first.
    pattern = []
    for item in sorted(items, key=lambda char: (char == STOP, char)):
        if item == STOP:
            pattern.append('')
        else:
            pattern.append(escape(item) + build_regex(tree, start + item))
    return '(?:' + '|'.join(pattern) + ')'


def build_tree(domains):
    """Build the prefix tree for *domains*.

    Args:
        domains: iterable of domain strings.

    Returns:
        A ``defaultdict(set)`` mapping every proper prefix of
        ``START + domain + STOP`` to the set of characters following it.
    """
    tree = defaultdict(set)
    for domain in domains:
        for prefix, following in prefixes(START + domain + STOP):
            tree[prefix].add(following)
    return tree


def is_ascii(s):
    """Return True when every character of *s* has a code point below 128."""
    return all(ord(c) < 128 for c in s)


def prefixes(input):
    """Yield ``(prefix, next_char)`` for each non-empty proper prefix.

    The parameter keeps its original name (which shadows the builtin) so
    that keyword callers continue to work.
    """
    for i in range(1, len(input)):
        yield input[:i], input[i]


def escape(item):
    """Escape *item* for literal use inside a regular expression."""
    return re.escape(item)


def parse_domains(lines):
    """Turn the raw lines of a TLD / public-suffix list into a set of names.

    Args:
        lines: iterable of lines, either bytes (assumed UTF-8) or text.

    Returns:
        A set of lower-case, IDNA-encoded (ASCII) domain strings.  Blank
        lines and lines starting with ``//``, ``#`` or ``!`` are skipped,
        and leading ``*`` / ``.`` characters are removed.

    Raises:
        UnicodeError: if a line is not valid UTF-8 or cannot be IDNA-encoded.
    """
    domains = set()
    for raw in lines:
        # Decode once at the boundary so the rest works on text everywhere.
        if isinstance(raw, bytes):
            raw = raw.decode('utf8')
        entry = raw.strip().lower()
        if not entry or entry.startswith(('//', '#', '!')):
            continue
        encoded = entry.encode('idna')
        if not isinstance(encoded, str):
            # Python 3: the codec returns bytes; go back to native text.
            encoded = encoded.decode('ascii')
        domain = encoded.lstrip('*.')
        if domain:  # a bare wildcard would otherwise yield an empty name
            domains.add(domain)
    return domains


def get_domains_from_url(url):
    """Download *url* and return the set of domains it lists.

    The response is closed even if reading fails.  See ``parse_domains``
    for the filtering and normalisation rules.
    """
    with closing(urlopen(url)) as response:
        lines = response.readlines()
    return parse_domains(lines)


if __name__ == '__main__':
    main()