Skip to content

Instantly share code, notes, and snippets.

@tonyfast
Created January 25, 2025 07:16
Show Gist options
  • Select an option

  • Save tonyfast/4b0acd39945adbc85744edd1cb449f9d to your computer and use it in GitHub Desktop.

Select an option

Save tonyfast/4b0acd39945adbc85744edd1cb449f9d to your computer and use it in GitHub Desktop.

Revisions

  1. tonyfast created this gist Jan 25, 2025.
    435 changes: 435 additions & 0 deletions 2025-01-24-cold-docs.ipynb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,435 @@
    {
    "cells": [
    {
    "cell_type": "code",
    "execution_count": 1,
    "id": "bc87bb4d-16f9-4ed6-875b-5b786bce9402",
    "metadata": {},
    "outputs": [],
    "source": [
    "import polars\n",
    "from nbformat import v4\n",
    "\n",
    "# Gather every notebook in the syllabus docs folder as an anyio path.\n",
    "input = [\n",
    "    anyio.Path(match)\n",
    "    for match in glob.glob(str(pathlib.Path(\"~/Documents/syllabus/docs/*.ipynb\").expanduser()))\n",
    "]\n",
    "# One row per notebook: the path object plus its plain-string form.\n",
    "files = polars.Series(\"path\", input).to_frame()\n",
    "files = files.with_columns(\n",
    "    file=files[\"path\"].map_elements(lambda entry: str(entry._path), polars.String)\n",
    ")\n",
    "\n",
    "def enumerate_iterable(series, name=\"id\", start=0):\n",
    "    \"\"\"Tag each struct inside a nested series with its position.\n",
    "\n",
    "    Every element of ``series`` is iterated and each item gains an integer\n",
    "    field called ``name`` that counts up from ``start``. The result dtype is\n",
    "    the series' own container type wrapping the original struct fields plus\n",
    "    the new ``Int64`` field.\n",
    "    \"\"\"\n",
    "    numbered_fields = series.dtype.inner.fields + [polars.Field(name, polars.Int64)]\n",
    "    container = series.dtype.base_type()\n",
    "    target_type = container(polars.Struct(numbered_fields))\n",
    "\n",
    "    def number_items(items):\n",
    "        return [{**item, name: index} for index, item in enumerate(items, start)]\n",
    "\n",
    "    return series.map_elements(number_items, target_type)\n",
    "\n",
    "# MIME types in the order used by nbconvert's display priority template:\n",
    "# https://github.com/jupyter/nbconvert/blob/5f508ebad9471876f53a59c737bd5f47b2b4c163/share/templates/base/display_priority.j2\n",
    "display_priority = [\n",
    "    \"text/html\",\n",
    "    \"text/markdown\",\n",
    "    \"image/svg+xml\",\n",
    "    \"image/png\",\n",
    "    \"image/jpeg\",\n",
    "    \"text/plain\",\n",
    "    \"application/pdf\",\n",
    "    \"text/latex\",\n",
    "    \"text/vnd.mermaid\",\n",
    "    \"application/javascript\",\n",
    "    \"application/vnd.jupyter.widget-view+json\",\n",
    "]\n",
    "\n",
    "async def read_text(path):\n",
    "    \"\"\"Return notebook JSON text for ``path``.\n",
    "\n",
    "    ``.ipynb`` files are returned verbatim. ``.md`` and ``.py`` files are\n",
    "    wrapped in a new single-cell notebook (markdown or code respectively) and\n",
    "    serialized with ``json.dumps``. Anything else, including arguments that\n",
    "    are not paths, yields an empty notebook.\n",
    "    \"\"\"\n",
    "    if isinstance(path, pathlib.Path):\n",
    "        # pathlib's read_text() is synchronous and cannot be awaited, so\n",
    "        # wrap plain paths in their async counterpart first.\n",
    "        path = anyio.Path(path)\n",
    "    if isinstance(path, anyio.Path):\n",
    "        if path.suffix == \".ipynb\":\n",
    "            return await path.read_text()\n",
    "        if path.suffix in (\".md\", \".py\"):\n",
    "            new_cell = v4.new_markdown_cell if path.suffix == \".md\" else v4.new_code_cell\n",
    "            lines = (await path.read_text()).splitlines(True)\n",
    "            return json.dumps(v4.new_notebook(cells=[new_cell(lines)]))\n",
    "    return json.dumps(v4.new_notebook())"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 14,
    "id": "d263521e-7f72-447e-bb12-e7cfc8ecaa87",
    "metadata": {},
    "outputs": [],
    "source": [
    "import bs4\n",
    "# All HTML parsing in this notebook goes through the lxml parser.\n",
    "Soup = partial(bs4.BeautifulSoup, features=\"lxml\")\n",
    "TEMPLATE = Soup(Path(\"~/Documents/refnb/packages/refnb-core/refnb-core/index.html\").expanduser().read_text())\n",
    "# Reusable fragments, each taken from a <template> element of the page.\n",
    "CELLS = TEMPLATE.select_one(\"template.cells\").select_one(\"tbody\")\n",
    "CELL = TEMPLATE.select_one(\"template.cell\").select_one(\"tr\")\n",
    "OUTPUTS = TEMPLATE.select_one(\"template.outputs\").select_one(\"details\")\n",
    "OUTPUT = TEMPLATE.select_one(\"template.output\").select_one(\"tr\")\n",
    "assert CELLS and CELL and OUTPUTS and OUTPUT, \"bad selector\"\n",
    "\n",
    "def clone(el):\n",
    "    \"\"\"Return a deep copy of a BeautifulSoup node.\n",
    "\n",
    "    Strings are rebuilt with their own class. Tags are recreated with the\n",
    "    same builder, name, namespace and prefix, then given copies of the\n",
    "    attributes and recursively cloned children.\n",
    "    \"\"\"\n",
    "    from bs4 import NavigableString, Tag\n",
    "\n",
    "    if isinstance(el, NavigableString):\n",
    "        return type(el)(el)\n",
    "\n",
    "    duplicate = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)\n",
    "    # List-valued attributes are copied so the clone never shares them.\n",
    "    duplicate.attrs = {\n",
    "        key: ([*value] if isinstance(value, list) else value)\n",
    "        for key, value in el.attrs.items()\n",
    "    }\n",
    "    duplicate.can_be_empty_element = el.can_be_empty_element\n",
    "    duplicate.hidden = el.hidden\n",
    "    for child in el.contents:\n",
    "        duplicate.append(clone(child))\n",
    "    return duplicate"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 15,
    "id": "3e2d9459-5e40-4acf-8c6b-d04ef8699cef",
    "metadata": {},
    "outputs": [],
    "source": [
    "def display_dispatch(t, v, metadata):\n",
    "    \"\"\"Yield the nodes that render one MIME bundle entry.\n",
    "\n",
    "    ``t`` is the MIME type and ``v`` its payload, joined with ``\"\".join``\n",
    "    before use. ``metadata`` is accepted but not used here. MIME types that\n",
    "    do not start with ``text`` yield nothing.\n",
    "    \"\"\"\n",
    "    if t == \"text/plain\":\n",
    "        yield \"\".join(v)\n",
    "    elif t in (\"text/markdown\", \"text/html\"):\n",
    "        markup = \"\".join(v)\n",
    "        if t == \"text/markdown\":\n",
    "            markup = get_markdown().render(markup)\n",
    "        body = Soup(markup).body\n",
    "        # Parsing empty or head-only markup produces no <body> at all.\n",
    "        if body is not None:\n",
    "            # Snapshot the children: appending a node elsewhere removes it\n",
    "            # from this body, which makes a live iterator skip siblings.\n",
    "            yield from list(body.children)\n",
    "    elif t.startswith(\"text\"):\n",
    "        # highlight from mimetype\n",
    "        yield highlight(\"\".join(v))"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 16,
    "id": "eac11e7e-6ec9-4097-8f8f-ed3f93c81d9f",
    "metadata": {},
    "outputs": [],
    "source": [
    "# Attribute names that set_ids() rewrites when their value contains \":\" placeholders.\n",
    "IDREFS = [\"aria-labelledby\", \"aria-describedby\", \"aria-owns\", \"aria-controls\", \"for\", \"form\", \"id\"]\n",
    "def _render_outputs(cell):\n",
    "    \"\"\"Build the outputs table for one cell from its ``outputs`` list.\"\"\"\n",
    "    details = clone(OUTPUTS)\n",
    "    # NOTE(review): only the inner table is returned and attached to the row;\n",
    "    # the surrounding <details> clone is discarded - confirm this is intended.\n",
    "    outputs = details.select_one(\"table\")\n",
    "    count = str(cell[\"execution_count\"] or \"\")\n",
    "    for output in cell[\"outputs\"]:\n",
    "        kind = output[\"output_type\"]\n",
    "        if kind in {\"display_data\", \"execute_result\"}:\n",
    "            body = TEMPLATE.new_tag(\"tbody\")\n",
    "            body.attrs.setdefault(\"class\", []).append(kind)\n",
    "            data = output[\"data\"]\n",
    "            # Known MIME types first in priority order, then any others.\n",
    "            for t in itertools.chain(\n",
    "                filter(data.__contains__, display_priority),\n",
    "                filter(lambda x: x not in display_priority, data),\n",
    "            ):\n",
    "                v = data[t] or \"<body></body>\"\n",
    "                # we can include ALL the bundles OR the preferred one\n",
    "                entry = clone(OUTPUT)\n",
    "                entry.select_one(\"td.execution_count output\").append(count)\n",
    "                entry.select_one(\"td.output_type label\").append(t)\n",
    "                if t not in entry[\"class\"]:\n",
    "                    entry[\"class\"].append(t)\n",
    "                entry.select_one(\"td.data\").extend(display_dispatch(t, v, output.get(\"metadata\", {})))\n",
    "                # TODO: populate td.metadata\n",
    "                body.append(entry)\n",
    "            outputs.append(body)\n",
    "        elif kind == \"stream\":\n",
    "            # stdout/stderr\n",
    "            entry = clone(OUTPUT)\n",
    "            # \"class\" holds a list of tokens: append the token, because\n",
    "            # += with a string would add its characters one by one.\n",
    "            entry[\"class\"].append(kind)\n",
    "            entry.select_one(\"td.name\").append(output[\"name\"])\n",
    "            entry.select_one(\"td.text samp\").append(\"\".join(output[\"text\"]))\n",
    "            outputs.append(entry)\n",
    "        elif kind == \"error\":\n",
    "            entry = clone(OUTPUT)\n",
    "            entry[\"class\"].append(kind)\n",
    "            entry.select_one(\"td.ename\").append(output[\"ename\"])\n",
    "            entry.select_one(\"td.evalue samp\").append(\"\".join(output[\"evalue\"]))\n",
    "            traceback = output[\"traceback\"]\n",
    "            if not isinstance(traceback, str):\n",
    "                # Traceback entries are separate lines; keep them separated.\n",
    "                traceback = \"\\n\".join(traceback)\n",
    "            entry.select_one(\"td.traceback samp\").append(traceback)\n",
    "            entry.select_one(\"td.execution_count output\").append(count)\n",
    "            outputs.append(entry)\n",
    "    return outputs\n",
    "\n",
    "\n",
    "def populate(nb):\n",
    "    \"\"\"Render one notebook record into a copy of the HTML template.\n",
    "\n",
    "    ``nb`` is a mapping with a ``\"cells\"`` list and a ``\"file\"`` entry. Each\n",
    "    cell becomes a cloned template row filled with its source, outputs and\n",
    "    metadata-derived classes. The rows are inserted before the cells table\n",
    "    footer, the table of contents is injected, and the populated copy is\n",
    "    returned. Markdown cells get a synthetic ``display_data`` output written\n",
    "    into ``cell[\"outputs\"]``.\n",
    "    \"\"\"\n",
    "    tpl = clone(TEMPLATE)\n",
    "    table = tpl.select_one(\"main.notebook table.cells\")\n",
    "    footer = table.select_one(\"tfoot\")\n",
    "    cells = clone(CELLS)\n",
    "\n",
    "    for cell in nb[\"cells\"]:\n",
    "        metadata = cell[\"metadata\"]\n",
    "        # A \"name\" in the cell metadata takes precedence over the cell id.\n",
    "        cell_id = metadata.get(\"name\") or cell[\"id\"]\n",
    "        row = clone(CELL)\n",
    "        row[\"class\"].append(cell[\"cell_type\"])\n",
    "        # link back to the document\n",
    "        row.select_one(\"th.doc a\").append(str(nb[\"file\"]))\n",
    "        row.select_one(\"th.cell a\").append(str(cell[\"cell\"]))\n",
    "        row.select_one(\"th.id input\").attrs[\"value\"] = cell_id\n",
    "        row.select_one(\"td.execution_count output\").append(str(cell[\"execution_count\"] or \"\"))\n",
    "        row.select_one(f\"td.cell_type option[value={cell['cell_type']}]\").attrs[\"selected\"] = True\n",
    "        source = \"\".join(cell[\"source\"])\n",
    "        if cell[\"cell_type\"] == \"markdown\":\n",
    "            # Markdown cells are displayed through a synthetic output.\n",
    "            cell[\"outputs\"] = [dict(data={\"text/markdown\": source}, output_type=\"display_data\")]\n",
    "        row.select_one(\"td.source textarea\").append(source)\n",
    "        row.select_one(\"td.source section.highlight\").append(source)\n",
    "        # TODO: populate td.form form and td.metadata\n",
    "        if cell.get(\"outputs\"):\n",
    "            row.select_one(\"td.outputs\").append(_render_outputs(cell))\n",
    "\n",
    "        # Use truthiness rather than key membership so that keys present\n",
    "        # with an empty or None value are ignored instead of crashing.\n",
    "        if metadata.get(\"slide_type\"):\n",
    "            row[\"class\"].append(metadata[\"slide_type\"])\n",
    "        execution = metadata.get(\"execution\")\n",
    "        if execution:\n",
    "            row.select_one(\"td.started_at time\").append(execution[\"iopub.execute_input\"])\n",
    "            row.select_one(\"td.completed_at time\").append(execution[\"iopub.execute_reply\"])\n",
    "            # TODO: do the math for the time (td.elapsed output time)\n",
    "        if metadata.get(\"collapsed\"):\n",
    "            row.select_one(\"td.outputs\")[\"class\"].append(\"collapsed\")\n",
    "        if metadata.get(\"scrolled\"):\n",
    "            row.select_one(\"td.outputs\")[\"class\"].append(\"scrolled\")\n",
    "        jupyter = metadata.get(\"jupyter\")\n",
    "        if jupyter:\n",
    "            if jupyter.get(\"source_hidden\"):\n",
    "                row.select_one(\"td.source\")[\"hidden\"] = \"\"\n",
    "            if jupyter.get(\"outputs_hidden\"):\n",
    "                row.select_one(\"td.outputs\")[\"hidden\"] = \"\"\n",
    "        row[\"class\"].extend(map(slugify.slugify, metadata.get(\"tags\") or ()))\n",
    "\n",
    "        set_ids(row, cell_id)\n",
    "        cells.append(row)\n",
    "    footer.insert_before(cells)\n",
    "    inject_toc(tpl)\n",
    "    return tpl"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 17,
    "id": "25ce0453-de81-4a43-a7d4-4407108c9f1f",
    "metadata": {},
    "outputs": [],
    "source": [
    "import slugify"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 18,
    "id": "e8366ea6-bacd-4bc6-b698-62b5a0b04058",
    "metadata": {},
    "outputs": [],
    "source": [
    "def inject_toc(document):\n",
    "    \"\"\"Fill the headings table of contents of ``document`` in place.\n",
    "\n",
    "    One row, cloned from the ``<template>`` row inside\n",
    "    ``table.toc.headings``, is appended per heading found under\n",
    "    ``table.cells``. A heading without an ``id`` attribute receives a slug of\n",
    "    its text so that the row can link to it.\n",
    "    \"\"\"\n",
    "    table = document.select_one(\"table.toc.headings\")\n",
    "    ROW = table.select_one(\"template tr\")\n",
    "    tbody = TEMPLATE.new_tag(\"tbody\")\n",
    "    # Scope every heading level to the cells table: in a selector list the\n",
    "    # \"table.cells\" prefix only applies to the item it is written on.\n",
    "    headings = \", \".join(f\"table.cells h{level}\" for level in range(1, 7))\n",
    "    for h in document.select(headings):\n",
    "        row = clone(ROW)\n",
    "        a = row.select_one(\"td.heading>a\")\n",
    "        heading = h.get_text()\n",
    "        # Look the attribute up explicitly: ``in`` on a tag tests its\n",
    "        # children, so it cannot tell whether an id is already set.\n",
    "        if not h.get(\"id\"):\n",
    "            h[\"id\"] = slugify.slugify(heading)\n",
    "        a.append(heading)\n",
    "        a[\"href\"] = \"#\" + h[\"id\"]\n",
    "        row.select_one(\"th.level\").append(h.name[1])\n",
    "        # TODO: fill td.description>p\n",
    "        tbody.append(row)\n",
    "    table.append(tbody)"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 19,
    "id": "b95ae161-508d-402f-bb5c-dd6b3a75de2d",
    "metadata": {},
    "outputs": [],
    "source": [
    "# CSS selector list matching any element that carries one of the IDREFS attributes.\n",
    "idref_selection = \",\".join(f\"[{attribute}]\" for attribute in IDREFS)"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 20,
    "id": "2a5eb1ec-7785-421b-bd7a-101b07eea936",
    "metadata": {},
    "outputs": [],
    "source": [
    "def set_ids(selection, id=\"\"):\n",
    "    \"\"\"Resolve \":\" placeholders in the IDREFS attributes under ``selection``.\n",
    "\n",
    "    For ``selection`` itself and every descendant carrying one of the\n",
    "    ``IDREFS`` attributes, a value of exactly \":\" becomes ``id``. In any\n",
    "    other string value, each whitespace-separated token that starts with \":\"\n",
    "    becomes ``id``, a dash, and the rest of the token. Values that are not\n",
    "    strings are left untouched.\n",
    "    \"\"\"\n",
    "    def resolve(token):\n",
    "        return f\"{id}-{token[1:]}\" if token.startswith(\":\") else token\n",
    "\n",
    "    selector = \",\".join(f\"[{name}]\" for name in IDREFS)\n",
    "    for element in [selection, *selection.select(selector)]:\n",
    "        for name in IDREFS:\n",
    "            if name not in element.attrs:\n",
    "                continue\n",
    "            current = element.attrs[name]\n",
    "            if current == \":\":\n",
    "                element[name] = id\n",
    "            elif isinstance(current, str):\n",
    "                element[name] = \" \".join(map(resolve, current.split()))"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 21,
    "id": "6ffa0606-3682-4fb2-95a5-d54a065d0c4b",
    "metadata": {},
    "outputs": [],
    "source": [
    "@functools.lru_cache(maxsize=1)\n",
    "def get_markdown():\n",
    "    \"\"\"Return the shared ``MarkdownIt`` instance, created on first call.\"\"\"\n",
    "    import markdown_it\n",
    "\n",
    "    return markdown_it.MarkdownIt()\n",
    "\n",
    "def highlight(source, lang=\"python\", attrs=None):\n",
    "    \"\"\"Return a ``<pre>`` element holding ``source`` highlighted as ``lang``.\n",
    "\n",
    "    Pygments renders the markup, which is then parsed so that callers get a\n",
    "    tag rather than a string. When no lexer is registered under ``lang`` the\n",
    "    source is escaped into a plain ``<pre><code>`` block instead. ``attrs``\n",
    "    is currently unused.\n",
    "    \"\"\"\n",
    "    import pygments\n",
    "    # Import the submodules explicitly; ``import pygments`` alone does not\n",
    "    # guarantee that ``pygments.lexers`` / ``pygments.formatters`` are loaded.\n",
    "    from pygments.formatters import HtmlFormatter\n",
    "    from pygments.lexers import get_lexer_by_name\n",
    "    from pygments.util import ClassNotFound\n",
    "\n",
    "    try:\n",
    "        lexer = get_lexer_by_name(lang)\n",
    "    except ClassNotFound:\n",
    "        return Soup(f\"\"\"<pre><code class=\"{lang}\">{html.escape(source)}</code></pre>\"\"\").pre\n",
    "    return Soup(pygments.highlight(source, lexer, HtmlFormatter())).pre"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 22,
    "id": "74ffa475-94ca-4c5c-a4b1-7835fb264786",
    "metadata": {
    "scrolled": true
    },
    "outputs": [],
    "source": [
    "# Read every file concurrently; each result is notebook JSON text.\n",
    "contents = await asyncio.gather(*(read_text(path) for path in files[\"path\"]))\n",
    "# Decode the JSON into columns and attach the file name to each notebook.\n",
    "# NOTE: also attaching path=files[\"path\"] causes a panic because it holds Python objects.\n",
    "contents = (\n",
    "    polars.Series(\"contents\", contents, strict=False)\n",
    "    .str.json_decode()\n",
    "    .struct.unnest()\n",
    "    .with_columns(file=files[\"file\"])\n",
    ")\n",
    "# Number the cells of each notebook, starting at 1, in a \"cell\" field.\n",
    "contents = contents.with_columns(cells=enumerate_iterable(contents[\"cells\"], \"cell\", 1))\n",
    "CONTENTS_COLUMNS = list(contents.columns)\n",
    "# Render each row with populate() and store the result in an \"html\" column.\n",
    "contents = contents.with_columns(\n",
    "    contents.map_rows(\n",
    "        lambda values: (populate(dict(zip(CONTENTS_COLUMNS, values))),)\n",
    "    ).rename({\"column_0\": \"html\"})\n",
    ")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 23,
    "id": "558a6247-bd4c-4205-ad39-ff07ddc8e67d",
    "metadata": {},
    "outputs": [],
    "source": [
    "# One row per cell, with the cell struct spread into columns.\n",
    "cells = contents.select([\"file\", \"cells\"]).explode(\"cells\").unnest(\"cells\")\n",
    "# Collapse each cell's source lines into a single string.\n",
    "cells = cells.with_columns(source=cells[\"source\"].map_elements(\"\".join, polars.String))\n",
    "# One row per output, then one row per output that has a data bundle.\n",
    "outputs = cells.select([\"file\", \"id\", \"outputs\"]).explode(\"outputs\").unnest(\"outputs\")\n",
    "displays = outputs.select([\"file\", \"id\", \"data\"]).drop_nulls().unnest(\"data\")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 24,
    "id": "bdc6206b-1e66-4cc2-a2b3-3fb2842412a1",
    "metadata": {},
    "outputs": [
    {
    "data": {
    "text/html": [
    "\n",
    " <iframe\n",
    " width=\"100%\"\n",
    " height=\"600\"\n",
    " src=\"test.html\"\n",
    " frameborder=\"0\"\n",
    " allowfullscreen\n",
    " \n",
    " ></iframe>\n",
    " "
    ],
    "text/plain": [
    "<IPython.lib.display.IFrame at 0x781ba2539190>"
    ]
    },
    "execution_count": 24,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "# Write the first rendered notebook to disk and preview it inline.\n",
    "test = pathlib.Path(\"test.html\")\n",
    "test.write_text(contents[\"html\"][0].body.prettify())\n",
    "IFrame(str(test), width=\"100%\", height=600)"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "4c979859-e516-40e4-9247-32749879efb5",
    "metadata": {},
    "outputs": [],
    "source": []
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "33e17f3f-a1fa-4963-aa62-d8902ed596ca",
    "metadata": {},
    "outputs": [],
    "source": []
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "599eafd7-8410-4a7d-84d7-7797efedf724",
    "metadata": {},
    "outputs": [],
    "source": []
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "e6ee2397-0d24-48fd-82c1-e12a7b867a8d",
    "metadata": {},
    "outputs": [],
    "source": []
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "3f3c50ca-6ab2-476c-83d3-8a554b8c62d6",
    "metadata": {},
    "outputs": [],
    "source": []
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "802e5732-024f-4ec1-a467-8ed38ec02738",
    "metadata": {},
    "outputs": [],
    "source": []
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "57e8e84d-6a69-4f46-b4b4-69a3b0eb1c2a",
    "metadata": {},
    "outputs": [],
    "source": []
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "0950074d-be87-4b44-ab15-54ea19c0e521",
    "metadata": {},
    "outputs": [],
    "source": []
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python [conda env:p311] *",
    "language": "python",
    "name": "conda-env-p311-py"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.11.6"
    }
    },
    "nbformat": 4,
    "nbformat_minor": 5
    }