Skip to content

Instantly share code, notes, and snippets.

@obar1
Last active July 30, 2025 06:34
Show Gist options
  • Select an option

  • Save obar1/a6a7c656def69cf1179807f4303b345b to your computer and use it in GitHub Desktop.

Select an option

Save obar1/a6a7c656def69cf1179807f4303b345b to your computer and use it in GitHub Desktop.
simple skillboost html page downloader
import subprocess
import sys
import requests
from pathlib import Path
# Filename prefix for the saved HTML page; main() builds "<REPO><path_id>.html".
REPO = "paths_"
# Location of the wkhtmltopdf executable that generate_pdf() runs.
# Hard-coded to one Windows checkout; adjust it for the local machine.
WKHTMLTOPDF_PATH = r"C:\git\py_fetch_skillboost\wkhtmltox\bin\wkhtmltopdf.exe"
def fetch_and_save_html(path_id, input_path_html, timeout=30):
    """Download a learning-path page and save it wrapped in a small HTML shell.

    The page at ``https://partner.cloudskillsboost.google/paths/<path_id>`` is
    fetched and its body is embedded in a minimal HTML document that also
    links back to the original URL. The result is written to
    ``input_path_html`` as UTF-8.

    Args:
        path_id: Identifier appended to the paths URL.
        input_path_html: Destination file for the wrapped HTML.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Raises:
        RuntimeError: If the response contains the reCAPTCHA notice, in which
            case nothing is written.

    HTTP and network errors are reported on stdout and the function returns
    without writing a file.
    """
    url = f"https://partner.cloudskillsboost.google/paths/{path_id}"

    # Only the network call can raise the requests exceptions handled here,
    # so keep the try body limited to it.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err} - Status code: {response.status_code}")
        return
    except requests.exceptions.RequestException as err:
        print(f"Error fetching the page: {err}")
        return

    # A reCAPTCHA-protected page does not carry the real content; refuse to
    # save it so no misleading PDF is produced from it.
    if "This site is protected by reCAPTCHA and the Google" in response.text:
        raise RuntimeError("Warning: Page may be protected by reCAPTCHA. PDF conversion might not work properly.")

    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Saved Page {path_id}</title>
</head>
<body>
<p>Original page: <a href="{url}" target="_blank">{url}</a></p>
<hr>
{response.text}
</body>
</html>"""
    Path(input_path_html).write_text(html_content, encoding='utf-8')
    print(f"Page saved successfully as '{input_path_html}'")
def generate_pdf(input_path_html):
    """Convert a saved HTML file to PDF with the wkhtmltopdf executable.

    The PDF is written next to the input, named by appending ``.pdf`` to
    ``input_path_html``. If the executable configured in
    ``WKHTMLTOPDF_PATH`` does not exist, a message is printed and nothing is
    converted. Conversion failures and any other errors are printed rather
    than raised.

    Args:
        input_path_html: Path (string) of the HTML file to convert.
    """
    try:
        converter = Path(WKHTMLTOPDF_PATH)
        if converter.exists():
            pdf_target = input_path_html + '.pdf'
            command = [WKHTMLTOPDF_PATH, input_path_html, pdf_target]
            # check=True turns a non-zero exit status into CalledProcessError.
            subprocess.run(command, check=True)
            print(f"PDF saved to: {pdf_target}")
        else:
            print(f"wkhtmltopdf not found at: {WKHTMLTOPDF_PATH}")
    except subprocess.CalledProcessError as failure:
        print(f"PDF generation failed: {failure}")
    except Exception as problem:
        print(f"Unexpected error: {problem}")
def main():
    """Command-line entry point: fetch one path page and convert it to PDF.

    Expects exactly one argument, an integer path id. The page is saved as
    ``<REPO><path_id>.html`` and then converted to PDF.

    Exits with status 1 when the argument is missing or not an integer, when
    no HTML file is present after the fetch step, or when an unexpected error
    occurs, so calling scripts can detect failure.
    """
    if len(sys.argv) != 2:
        print("Pass an id please")
        sys.exit(1)

    raw_id = sys.argv[1]
    try:
        path_id = int(raw_id)
    except ValueError:
        print(f"Invalid id {raw_id!r}: expected an integer")
        sys.exit(1)

    try:
        input_path_html = f"{REPO}{path_id}.html"
        fetch_and_save_html(path_id, input_path_html)
        # The fetch step reports HTTP/network errors by printing only, so
        # check that an HTML file is present before launching the converter.
        if not Path(input_path_html).exists():
            print(f"No HTML file at '{input_path_html}'; skipping PDF generation")
            sys.exit(1)
        generate_pdf(input_path_html)
    except Exception as e:
        # Top-level boundary: report the problem and signal failure.
        print(e)
        sys.exit(1)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment