# coding=utf-8
# python3
"""Collect external resource URLs (scripts, anchors, stylesheets, CORS
origins) from a set of domains — sample driver for subdomain-takeover
checks."""
from urllib.parse import urlparse

import requests
import urllib3
from bs4 import BeautifulSoup

# Disable SSL insecure warnings (we deliberately use verify=False below)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Timeout (seconds) for all HTTP requests
GLOBAL_HTTP_TIMEOUT = 7

# Set User-Agent for "OPSEC" — blend in as an ordinary browser
UA = {
    'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
}


def normalize_url(domain, src):
    '''
    (Try to) Normalize URL to its absolute form.

    :param domain: bare domain name used to absolutize relative URLs
    :param src: raw src/href attribute value scraped from HTML
    :return: absolute URL string (scheme defaults to http)
    '''
    src = src.strip()
    src = src.rstrip('/')
    # Protocol-relative URL (//host/path)
    if src.startswith('//'):
        return 'http:{}'.format(src)
    # Root-relative URL (/path)
    if src.startswith('/'):
        return 'http://{}{}'.format(domain, src)
    # Query-only URL (?a=b)
    if src.startswith('?'):
        return 'http://{}/{}'.format(domain, src)
    # Explicitly relative URL (./path) — drop the leading dot
    if src.startswith('./'):
        return 'http://{}{}'.format(domain, src[1:])
    # Absolute URL — tuple form of startswith replaces the 'or' chain
    if src.startswith(('https://', 'http://')):
        return src
    # Else let's hope it is relative URL
    return 'http://{}/{}'.format(domain, src)


def extract_javascript(domain, source_code):
    '''
    Extract and normalize external javascript files from HTML.

    :param domain: domain used to absolutize relative script URLs
    :param source_code: raw HTML text
    :return: de-duplicated list of absolute script URLs
    '''
    tree = BeautifulSoup(source_code, 'html.parser')
    scripts = [normalize_url(domain, s.get('src'))
               for s in tree.find_all('script') if s.get('src')]
    return list(set(scripts))


def extract_links(domain, source_code):
    '''
    Extract and normalize links (<a href>) in HTML file.

    :param domain: domain used to absolutize relative URLs
    :param source_code: raw HTML text
    :return: de-duplicated list of absolute link URLs
    '''
    tree = BeautifulSoup(source_code, 'html.parser')
    hrefs = [normalize_url(domain, s.get('href'))
             for s in tree.find_all('a') if s.get('href')]
    return list(set(hrefs))


def extract_styles(domain, source_code):
    '''
    Extract and normalize CSS (<link href>) in HTML file.

    :param domain: domain used to absolutize relative URLs
    :param source_code: raw HTML text
    :return: de-duplicated list of absolute stylesheet URLs
    '''
    tree = BeautifulSoup(source_code, 'html.parser')
    hrefs = [normalize_url(domain, s.get('href'))
             for s in tree.find_all('link') if s.get('href')]
    return list(set(hrefs))


def extract_cors(headers):
    '''
    Extract allowed CORS origins from response headers.

    Returns an empty list when the header is absent or when any origin
    is allowed ('*'), since a wildcard yields no concrete domains.

    :param headers: response headers mapping
    :return: list of origin strings (possibly empty)
    '''
    # Bug fix: the header is optional — don't KeyError when it's missing.
    raw = headers.get('Access-Control-Allow-Origin', '')
    cors = [origin.strip() for origin in raw.split(',') if origin.strip()]
    if '*' in cors:
        # Use your imagination here
        return []
    return cors


def extract_domain(url):
    '''Extracts domain name from given URL'''
    return urlparse(url).netloc


if __name__ == '__main__':
    # This is sample of intended functionality
    # ----
    # Note that there is a missing functionality for showing
    # origin domain name where takeover was discovered (if any)
    # ----
    domains = []  # Database retrieval
    results = {}
    for d in domains:
        # Trying both HTTP and HTTPS where HTTPS has higher priority
        # (Thus second in the list)
        for prefix in ['http://', 'https://']:
            # Bug fix: reset r each iteration — previously a failed request
            # left r unbound (NameError on first iteration) or stale
            # (re-processing the previous response on later ones).
            r = None
            try:
                r = requests.get('{}{}'.format(prefix, d),
                                 timeout=GLOBAL_HTTP_TIMEOUT,
                                 verify=False,
                                 headers=UA)
            except requests.exceptions.RequestException:
                # Connection refused / NXDOMAIN / timeout / ...
                # (narrowed from a bare except that also swallowed
                # KeyboardInterrupt/SystemExit)
                pass
            if r is None:
                continue
            urls = extract_javascript(d, r.text)
            urls += extract_links(d, r.text)
            urls += extract_styles(d, r.text)
            urls += extract_cors(r.headers)
            # takeoverable = subdomain_takeover.check(
            #     [extract_domain(u) for u in urls])
            # ...