axiak · November 5, 2014 02:25 · Nov 5, 2014
diff --git a/tlds.py b/tlds.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+import re
+from collections import defaultdict
+from urllib2 import urlopen
+
+
+def main():
+    """Print a regex built from the IANA top-level-domain list.
+
+    Downloads the list, folds it into a prefix tree with ``build_tree``
+    and writes the pattern rendered by ``build_regex`` to stdout.
+    """
+    # The Public Suffix List can be merged in the same way if needed:
+    #   https://publicsuffix.org/list/effective_tld_names.dat
+    iana_url = 'http://data.iana.org/TLD/tlds-alpha-by-domain.txt'
+    domains = set(get_domains_from_url(iana_url))
+
+    suffix_pattern = build_regex(build_tree(domains))
+
+    # NOTE(review): inside a raw string, '\\.' is two backslashes followed
+    # by '.', so the printed pattern starts with `.+\\.` rather than
+    # `.+\.`.  That looks intended for pasting into a string literal of
+    # another language -- confirm before using the output as a regex.
+    print(r'.+\\.' + suffix_pattern + '$')
+
+
+# Sentinel characters that build_tree() wraps around every domain, so the
+# prefix tree has a single root (START) and an explicit end-of-entry
+# marker (STOP) that build_regex() renders as an empty alternative.
+# NOTE(review): presumably safe because neither ';' nor ':' can occur in
+# a domain name -- confirm for any new data source.
+START = ';'
+STOP = ':'
+
+
+def build_regex(tree, start=START):
+    """Render the subtree of ``tree`` rooted at ``start`` as a regex fragment.
+
+    ``tree`` maps each prefix (beginning with ``START``) to the set of
+    characters that may follow it, as produced by ``build_tree``; ``STOP``
+    in that set marks the end of a complete entry.
+
+    A prefix with a single continuation is emitted inline.  Several
+    continuations become a non-capturing group ``(?:a|b|...)`` in which an
+    empty alternative means "an entry may end here".  Alternatives are
+    emitted in sorted order so the generated pattern is reproducible.
+
+    Returns the fragment as a string; ``''`` when the only continuation
+    is ``STOP``.
+    """
+    followers = tree[start]
+    if len(followers) == 1:
+        # Unpacking avoids the Python-2-only ``iterator.next()`` method
+        # and does not shadow the ``next`` builtin.
+        (only,) = followers
+        if only == STOP:
+            return ''
+        return escape(only) + build_regex(tree, start + only)
+
+    # Set iteration order is arbitrary; sorting keeps the output stable.
+    alternatives = [
+        '' if char == STOP else escape(char) + build_regex(tree, start + char)
+        for char in sorted(followers)
+    ]
+    return '(?:' + '|'.join(alternatives) + ')'
+
+
+def build_tree(domains):
+    """Build a prefix tree from an iterable of domain strings.
+
+    Every domain is wrapped as ``START + domain + STOP``.  The result is
+    a ``defaultdict(set)`` mapping each proper, non-empty prefix of that
+    wrapped string to the set of characters seen directly after it.
+    """
+    followers = defaultdict(set)
+    for name in domains:
+        framed = START + name + STOP
+        for head, char in prefixes(framed):
+            followers[head].add(char)
+    return followers
+
+
+def is_ascii(s):
+    """Return True when every character of ``s`` has an ordinal below 128.
+
+    An empty ``s`` is considered ASCII.
+    """
+    for char in s:
+        if ord(char) >= 128:
+            return False
+    return True
+
+
+def prefixes(input):
+    """Yield ``(prefix, next_item)`` for each proper, non-empty prefix.
+
+    For ``'abc'`` this produces ``('a', 'b')`` then ``('ab', 'c')``.
+    Inputs shorter than two items produce nothing.
+    """
+    for split in range(1, len(input)):
+        head, following = input[:split], input[split]
+        yield head, following
+
+
+def escape(item):
+    """Return ``item`` escaped with ``re.escape`` for use in a pattern."""
+    escaped = re.escape(item)
+    return escaped
+
+
+def get_domains_from_url(url):
+    """Download a domain list from ``url`` and return its entries as a set.
+
+    Each kept line is stripped and lower-cased, decoded as UTF-8 and
+    re-encoded with the ``idna`` codec, and has any leading ``*`` and
+    ``.`` characters removed.  Blank lines and lines starting with
+    ``//``, ``#`` or ``!`` are skipped.
+
+    Errors from ``urlopen`` or from decoding propagate to the caller.
+    """
+    response = urlopen(url)
+    try:
+        lines = response.readlines()
+    finally:
+        # Release the connection even if the read fails part-way.
+        response.close()
+
+    domains = set()
+    for line in lines:
+        entry = line.strip()
+        # NOTE(review): '//' and '#' are presumably comment markers and
+        # '!' a Public Suffix List exception rule -- confirm per source.
+        if not entry or entry.startswith(('//', '#', '!')):
+            continue
+        # lstrip('*.') removes a character set, not the literal prefix
+        # '*.', so every leading '*' or '.' is dropped.
+        domains.add(
+            entry.lower().decode('utf8').encode('idna').lstrip('*.')
+        )
+    return domains
+
+
+# Generate and print the pattern only when run as a script, not on import.
+if __name__ == '__main__':
+    main()
No results found