"""Build a JSON model of AWS CloudFormation resource types.

Downloads the resource-type reference index of the CloudFormation User Guide,
follows the link of every service listed there, and prints a JSON document of
the form ``{provider: {"services": {service: [resource_type, ...]}}}`` to
stdout.

The scrape runs at module level, so it starts as soon as the file is executed
or imported.
"""
import json
from time import sleep
from urllib import request
from urllib.parse import urljoin

import bs4

BASE_URL = "https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide"

# provider -> {'services': {service name -> [resource type, ...]}}
service_model = {}


def get_cf_service_resource_types(url: str) -> list:
    """Return the link content of every list item on a service page.

    Args:
        url: Address of the page to download and parse.

    Returns:
        The first child node of the ``<a>`` found inside the first ``<p>`` of
        each ``<li class="listitem">``, in document order. List items that
        have no such link, or whose link is empty, are skipped.
    """
    with request.urlopen(url) as fd:
        html = fd.read().decode('utf-8')
    child_soup = bs4.BeautifulSoup(html, 'html.parser')

    resource_types = []
    for item in child_soup.find_all('li', attrs={'class': 'listitem'}):
        # Guard each hop: a list item without <p><a>...</a></p> would
        # otherwise abort the whole run with an AttributeError on None.
        link = item.p.a if item.p is not None else None
        if link is not None and link.contents:
            resource_types.append(link.contents[0])
    return resource_types


with request.urlopen(f"{BASE_URL}/aws-template-resource-type-ref.html") as fd:
    aws_cf_services = fd.read().decode('utf-8')

services_soup = bs4.BeautifulSoup(aws_cf_services, 'html.parser')
highlights = services_soup.find(name='div', attrs={'class': 'highlights'})
if highlights is None or highlights.ul is None:
    raise RuntimeError(
        "Could not find the service list (div.highlights > ul) on the index page"
    )
services_ul = highlights.ul

# Iterate the direct <li> children only: `.children` also yields the text
# nodes between the tags, and those have no `.a` attribute.
for service in services_ul.find_all('li', recursive=False):
    service_link = service.a
    href = service_link.get('href') if service_link is not None else None
    if not href:
        continue

    # urljoin resolves './x.html', '../x.html' and absolute links correctly;
    # str.lstrip('./') strips a set of characters rather than a prefix.
    child_url = urljoin(f"{BASE_URL}/", href)

    # Short pause between requests to avoid hammering the documentation site.
    sleep(0.1)

    for resource_identifier in get_cf_service_resource_types(child_url):
        # Link content that is markup rather than plain text cannot be an
        # identifier.
        if not isinstance(resource_identifier, str):
            continue

        # Only identifiers of the exact form Provider::Service::Type are kept.
        parts = resource_identifier.split('::')
        if len(parts) != 3:
            continue
        service_provider, service_name, resource_type = parts

        provider_services = service_model.setdefault(
            service_provider, {'services': {}}
        )['services']
        provider_services.setdefault(service_name, []).append(resource_type)

print(json.dumps(service_model))