# coding=utf-8
# python3
"""Collect external resource URLs (scripts, anchors, stylesheets, CORS
origins) from a set of domains — sample driver for subdomain-takeover
checks."""
from urllib.parse import urlparse

import requests
import urllib3
from bs4 import BeautifulSoup

# Disable SSL insecure warnings (we deliberately use verify=False below)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Timeout (seconds) for all HTTP requests
GLOBAL_HTTP_TIMEOUT = 7

# Set User-Agent for "OPSEC" — blend in as an ordinary browser
UA = {
    'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
}


def normalize_url(domain, src):
    '''
    (Try to) Normalize URL to its absolute form.

    :param domain: bare domain name used to absolutize relative URLs
    :param src: raw src/href attribute value scraped from HTML
    :return: absolute URL string (scheme defaults to http)
    '''
    src = src.strip()
    src = src.rstrip('/')
    # Protocol-relative URL (//host/path)
    if src.startswith('//'):
        return 'http:{}'.format(src)
    # Root-relative URL (/path)
    if src.startswith('/'):
        return 'http://{}{}'.format(domain, src)
    # Query-only URL (?a=b)
    if src.startswith('?'):
        return 'http://{}/{}'.format(domain, src)
    # Explicitly relative URL (./path) — drop the leading dot
    if src.startswith('./'):
        return 'http://{}{}'.format(domain, src[1:])
    # Absolute URL — tuple form of startswith replaces the 'or' chain
    if src.startswith(('https://', 'http://')):
        return src
    # Else let's hope it is relative URL
    return 'http://{}/{}'.format(domain, src)


def extract_javascript(domain, source_code):
    '''
    Extract and normalize external javascript files from HTML.

    :param domain: domain used to absolutize relative script URLs
    :param source_code: raw HTML text
    :return: de-duplicated list of absolute script URLs
    '''
    tree = BeautifulSoup(source_code, 'html.parser')
    scripts = [normalize_url(domain, s.get('src'))
               for s in tree.find_all('script') if s.get('src')]
    return list(set(scripts))


def extract_links(domain, source_code):
    '''
    Extract and normalize links (<a href>) in HTML file.

    :param domain: domain used to absolutize relative URLs
    :param source_code: raw HTML text
    :return: de-duplicated list of absolute link URLs
    '''
    tree = BeautifulSoup(source_code, 'html.parser')
    hrefs = [normalize_url(domain, s.get('href'))
             for s in tree.find_all('a') if s.get('href')]
    return list(set(hrefs))


def extract_styles(domain, source_code):
    '''
    Extract and normalize CSS (<link href>) in HTML file.

    :param domain: domain used to absolutize relative URLs
    :param source_code: raw HTML text
    :return: de-duplicated list of absolute stylesheet URLs
    '''
    tree = BeautifulSoup(source_code, 'html.parser')
    hrefs = [normalize_url(domain, s.get('href'))
             for s in tree.find_all('link') if s.get('href')]
    return list(set(hrefs))


def extract_cors(headers):
    '''
    Extract allowed CORS origins from response headers.

    Returns an empty list when the header is absent or when any origin
    is allowed ('*'), since a wildcard yields no concrete domains.

    :param headers: response headers mapping
    :return: list of origin strings (possibly empty)
    '''
    # Bug fix: the header is optional — don't KeyError when it's missing.
    raw = headers.get('Access-Control-Allow-Origin', '')
    cors = [origin.strip() for origin in raw.split(',') if origin.strip()]
    if '*' in cors:
        # Use your imagination here
        return []
    return cors


def extract_domain(url):
    '''Extracts domain name from given URL'''
    return urlparse(url).netloc


if __name__ == '__main__':
    # This is sample of intended functionality
    # ----
    # Note that there is a missing functionality for showing
    # origin domain name where takeover was discovered (if any)
    # ----
    domains = []  # Database retrieval
    results = {}
    for d in domains:
        # Trying both HTTP and HTTPS where HTTPS has higher priority
        # (Thus second in the list)
        for prefix in ['http://', 'https://']:
            # Bug fix: reset r each iteration — previously a failed request
            # left r unbound (NameError on first iteration) or stale
            # (re-processing the previous response on later ones).
            r = None
            try:
                r = requests.get('{}{}'.format(prefix, d),
                                 timeout=GLOBAL_HTTP_TIMEOUT,
                                 verify=False,
                                 headers=UA)
            except requests.exceptions.RequestException:
                # Connection refused / NXDOMAIN / timeout / ...
                # (narrowed from a bare except that also swallowed
                # KeyboardInterrupt/SystemExit)
                pass
            if r is None:
                continue
            urls = extract_javascript(d, r.text)
            urls += extract_links(d, r.text)
            urls += extract_styles(d, r.text)
            urls += extract_cors(r.headers)
            # takeoverable = subdomain_takeover.check(
            #     [extract_domain(u) for u in urls])
            # ...