tonyfast · January 25, 2025 07:16 · Jan 25, 2025
diff --git a/2025-01-24-cold-docs.ipynb b/2025-01-24-cold-docs.ipynb
@@ -0,0 +1,435 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "bc87bb4d-16f9-4ed6-875b-5b786bce9402",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import polars\n",
+    "from nbformat import v4\n",
+    "\n",
+    "# every notebook in the syllabus docs folder, wrapped as an anyio.Path for async reads\n",
+    "input = list(map(anyio.Path, glob.glob(str(pathlib.Path(\"~/Documents/syllabus/docs/*.ipynb\").expanduser()))))\n",
+    "files = polars.Series(\"path\", input).to_frame()\n",
+    "files = files.with_columns(\n",
+    "    # str() gives the path text through the public interface instead of the private ``_path``\n",
+    "    file=files[\"path\"].map_elements(lambda x: str(x), polars.String)\n",
+    ")\n",
+    "\n",
+    "def enumerate_iterable(series, name=\"id\", start=0):\n",
+    "    \"\"\"Number the structs held in each element of ``series``.\n",
+    "\n",
+    "    Every struct gains an Int64 field called ``name`` holding its position\n",
+    "    within its element, counted from ``start``.\n",
+    "    \"\"\"\n",
+    "    numbered = polars.Struct([*series.dtype.inner.fields, polars.Field(name, polars.Int64)])\n",
+    "    target_type = series.dtype.base_type()(numbered)\n",
+    "\n",
+    "    def number(items):\n",
+    "        return [{**body, name: i} for i, body in enumerate(items, start)]\n",
+    "\n",
+    "    return series.map_elements(number, target_type)\n",
+    "\n",
+    "# https://github.com/jupyter/nbconvert/blob/5f508ebad9471876f53a59c737bd5f47b2b4c163/share/templates/base/display_priority.j2\n",
+    "display_priority = \"\"\"text/html text/markdown image/svg+xml image/png image/jpeg text/plain application/pdf\n",
+    "text/latex text/vnd.mermaid application/javascript application/vnd.jupyter.widget-view+json\"\"\".strip().split()\n",
+    "\n",
+    "async def read_text(path):\n",
+    "    \"\"\"Return notebook JSON text for ``path``.\n",
+    "\n",
+    "    ``.ipynb`` files are returned as they are on disk.  ``.md`` and ``.py``\n",
+    "    files are wrapped in a new notebook with a single markdown or code cell.\n",
+    "    Any other suffix, or a non-path argument, gives an empty notebook.\n",
+    "    \"\"\"\n",
+    "    if isinstance(path, (anyio.Path, pathlib.Path)):\n",
+    "        # pathlib.Path.read_text is synchronous and cannot be awaited, so\n",
+    "        # normalise to anyio.Path before reading\n",
+    "        path = anyio.Path(path)\n",
+    "        if path.suffix == \".ipynb\":\n",
+    "            return await path.read_text()\n",
+    "        new_cell = {\".md\": v4.new_markdown_cell, \".py\": v4.new_code_cell}.get(path.suffix)\n",
+    "        if new_cell is not None:\n",
+    "            lines = (await path.read_text()).splitlines(True)\n",
+    "            return json.dumps(v4.new_notebook(cells=[new_cell(lines)]))\n",
+    "    return json.dumps(v4.new_notebook())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "d263521e-7f72-447e-bb12-e7cfc8ecaa87",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import bs4\n",
+    "Soup = partial(bs4.BeautifulSoup, features=\"lxml\")\n",
+    "TEMPLATE = Soup(Path(\"~/Documents/refnb/packages/refnb-core/refnb-core/index.html\").expanduser().read_text())\n",
+    "CELLS = TEMPLATE.select_one(\"template.cells\").select_one(\"tbody\")\n",
+    "CELL = TEMPLATE.select_one(\"template.cell\").select_one(\"tr\")\n",
+    "OUTPUTS = TEMPLATE.select_one(\"template.outputs\").select_one(\"details\")\n",
+    "OUTPUT = TEMPLATE.select_one(\"template.output\").select_one(\"tr\")\n",
+    "assert all((CELLS, CELL, OUTPUTS, OUTPUT)), \"bad selector\"\n",
+    "\n",
+    "def clone(el):\n",
+    "    \"\"\"Return a recursive copy of a bs4 tag or string that is not attached to any tree.\"\"\"\n",
+    "    from bs4 import NavigableString, Tag\n",
+    "\n",
+    "    if isinstance(el, NavigableString):\n",
+    "        return type(el)(el)\n",
+    "\n",
+    "    duplicate = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)\n",
+    "    # list-valued attributes get a fresh list so edits to the copy stay local\n",
+    "    duplicate.attrs = {k: [*v] if isinstance(v, list) else v for k, v in el.attrs.items()}\n",
+    "    duplicate.can_be_empty_element = el.can_be_empty_element\n",
+    "    duplicate.hidden = el.hidden\n",
+    "    for child in el.contents:\n",
+    "        duplicate.append(clone(child))\n",
+    "    return duplicate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "3e2d9459-5e40-4acf-8c6b-d04ef8699cef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def display_dispatch(t, v, metadata):\n",
+    "    \"\"\"Yield the renderable pieces for one entry of an output's MIME bundle.\n",
+    "\n",
+    "    ``t`` is the mimetype and ``v`` its payload (a string or a list of string\n",
+    "    chunks).  ``text/plain`` yields the raw text, ``text/markdown`` and\n",
+    "    ``text/html`` yield parsed nodes, any other ``text*`` type yields a\n",
+    "    highlighted ``<pre>``.  Other mimetypes yield nothing.  ``metadata`` is\n",
+    "    accepted but not used yet.\n",
+    "    \"\"\"\n",
+    "    text = \"\".join(v)\n",
+    "    if t == \"text/plain\":\n",
+    "        yield text\n",
+    "    elif t in (\"text/markdown\", \"text/html\"):\n",
+    "        markup = get_markdown().render(text) if t == \"text/markdown\" else text\n",
+    "        body = Soup(markup).body\n",
+    "        # the parser may produce no <body> at all for empty markup\n",
+    "        if body is not None:\n",
+    "            # snapshot the children: the caller re-parents each node, and moving\n",
+    "            # nodes while iterating the live child list can skip siblings\n",
+    "            yield from list(body.children)\n",
+    "    elif t.startswith(\"text\"):\n",
+    "        # TODO: choose the lexer from the mimetype; highlight() defaults to python\n",
+    "        yield highlight(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "eac11e7e-6ec9-4097-8f8f-ed3f93c81d9f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "IDREFS = (\"aria-labelledby aria-describedby aria-owns aria-controls for form id\".split())\n",
+    "def populate(nb):\n",
+    "    \"\"\"Fill a copy of TEMPLATE with one table row per notebook cell.\n",
+    "\n",
+    "    ``nb`` is a mapping with a ``\"file\"`` entry and a ``\"cells\"`` list.  Each cell\n",
+    "    supplies ``id``, ``cell``, ``cell_type``, ``source``, ``metadata``,\n",
+    "    ``execution_count`` and optionally ``outputs``.  Markdown cells have their\n",
+    "    ``outputs`` replaced in place by a synthetic ``text/markdown`` display so\n",
+    "    they are rendered like any other output.  Returns the populated copy.\n",
+    "    \"\"\"\n",
+    "    tpl = clone(TEMPLATE)\n",
+    "    table = tpl.select_one(\"main.notebook table.cells\")\n",
+    "    footer = table.select_one(\"tfoot\")\n",
+    "    cells = clone(CELLS)\n",
+    "\n",
+    "    for cell in nb[\"cells\"]:\n",
+    "        metadata = cell[\"metadata\"]\n",
+    "        # a ``name`` in the cell metadata takes precedence over the cell id\n",
+    "        cell_id = metadata.get(\"name\") or cell[\"id\"]\n",
+    "        execution_count = str(cell[\"execution_count\"] or \"\")\n",
+    "        row = clone(CELL)\n",
+    "        row[\"class\"].append(cell[\"cell_type\"])\n",
+    "        # link back to the document\n",
+    "        row.select_one(\"th.doc a\").append(str(nb[\"file\"]))\n",
+    "        row.select_one(\"th.cell a\").append(str(cell[\"cell\"]))\n",
+    "        row.select_one(\"th.id input\").attrs[\"value\"] = cell_id\n",
+    "        row.select_one(\"td.execution_count output\").append(execution_count)\n",
+    "        row.select_one(F\"td.cell_type option[value={cell['cell_type']}]\").attrs[\"selected\"] = True\n",
+    "        source = \"\".join(cell[\"source\"])\n",
+    "        if cell[\"cell_type\"] == \"markdown\":\n",
+    "            # markdown has no outputs of its own: render the source as a display\n",
+    "            cell[\"outputs\"] = [dict(data={\"text/markdown\": source}, output_type=\"display_data\")]\n",
+    "        row.select_one(\"td.source textarea\").append(source)\n",
+    "        row.select_one(\"td.source section.highlight\").append(source)\n",
+    "        # TODO: td.form and the row's td.metadata are not populated yet\n",
+    "        if cell.get(\"outputs\"):\n",
+    "            details = clone(OUTPUTS)\n",
+    "            outputs = details.select_one(\"table\")\n",
+    "            for output in cell[\"outputs\"]:\n",
+    "                output_type = output[\"output_type\"]\n",
+    "                if output_type in {\"display_data\", \"execute_result\"}:\n",
+    "                    body = TEMPLATE.new_tag(\"tbody\")\n",
+    "                    body.attrs.setdefault(\"class\", []).append(output_type)\n",
+    "                    data = output[\"data\"]\n",
+    "                    # mimetypes with a known display priority first, then the rest\n",
+    "                    for t in itertools.chain(\n",
+    "                        filter(data.__contains__, display_priority),\n",
+    "                        filter(lambda x: x not in display_priority, data),\n",
+    "                    ):\n",
+    "                        v = data[t] or \"<body></body>\"\n",
+    "                        # we can include ALL the bundles OR the preferred one\n",
+    "                        entry = clone(OUTPUT)\n",
+    "                        entry.select_one(\"td.execution_count output\").append(execution_count)\n",
+    "                        entry.select_one(\"td.output_type label\").append(t)\n",
+    "                        if t not in entry[\"class\"]:\n",
+    "                            entry[\"class\"].append(t)\n",
+    "                        # materialise the rendered nodes before moving them into the cell:\n",
+    "                        # re-parenting while the generator is still running can skip nodes\n",
+    "                        rendered = list(display_dispatch(t, v, output.get(\"metadata\", {})))\n",
+    "                        entry.select_one(\"td.data\").extend(rendered)\n",
+    "                        # TODO: the entry's td.metadata is not populated yet\n",
+    "                        body.append(entry)\n",
+    "                    outputs.append(body)\n",
+    "                elif output_type == \"stream\":\n",
+    "                    # stdout/stderr\n",
+    "                    entry = clone(OUTPUT)\n",
+    "                    # class is a list: append the name (+= with a string adds one item per character)\n",
+    "                    entry[\"class\"].append(output_type)\n",
+    "                    entry.select_one(\"td.name\").append(output[\"name\"])\n",
+    "                    entry.select_one(\"td.text samp\").append(\"\".join(output[\"text\"]))\n",
+    "                    outputs.append(entry)\n",
+    "                elif output_type == \"error\":\n",
+    "                    entry = clone(OUTPUT)\n",
+    "                    entry[\"class\"].append(output_type)\n",
+    "                    entry.select_one(\"td.ename\").append(output[\"ename\"])\n",
+    "                    entry.select_one(\"td.evalue samp\").append(\"\".join(output[\"evalue\"]))\n",
+    "                    entry.select_one(\"td.traceback samp\").append(\"\".join(output[\"traceback\"]))\n",
+    "                    entry.select_one(\"td.execution_count output\").append(execution_count)\n",
+    "                    outputs.append(entry)\n",
+    "            # NOTE(review): only the inner table is attached, so the cloned <details>\n",
+    "            # wrapper is dropped; confirm that is intended\n",
+    "            row.select_one(\"td.outputs\").append(outputs)\n",
+    "\n",
+    "        # metadata keys can be present but empty, so test the value rather than the key\n",
+    "        if metadata.get(\"slide_type\"):\n",
+    "            row[\"class\"].append(metadata[\"slide_type\"])\n",
+    "        execution = metadata.get(\"execution\")\n",
+    "        if execution:\n",
+    "            for selector, key in (\n",
+    "                (\"td.started_at time\", \"iopub.execute_input\"),\n",
+    "                (\"td.completed_at time\", \"iopub.execute_reply\"),\n",
+    "            ):\n",
+    "                stamp = execution.get(key)\n",
+    "                if stamp:\n",
+    "                    row.select_one(selector).append(stamp)\n",
+    "            # TODO: do the math for the time and fill td.elapsed\n",
+    "        if metadata.get(\"collapsed\"):\n",
+    "            row.select_one(\"td.outputs\")[\"class\"].append(\"collapsed\")\n",
+    "        if metadata.get(\"scrolled\"):\n",
+    "            row.select_one(\"td.outputs\")[\"class\"].append(\"scrolled\")\n",
+    "        jupyter = metadata.get(\"jupyter\")\n",
+    "        if jupyter:\n",
+    "            if jupyter.get(\"source_hidden\"):\n",
+    "                row.select_one(\"td.source\")[\"hidden\"] = \"\"\n",
+    "            if jupyter.get(\"outputs_hidden\"):\n",
+    "                row.select_one(\"td.outputs\")[\"hidden\"] = \"\"\n",
+    "        row[\"class\"].extend(map(slugify.slugify, metadata.get(\"tags\") or ()))\n",
+    "\n",
+    "        set_ids(row, cell_id)\n",
+    "        cells.append(row)\n",
+    "    footer.insert_before(cells)\n",
+    "    inject_toc(tpl)\n",
+    "    return tpl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "25ce0453-de81-4a43-a7d4-4407108c9f1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import slugify"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "e8366ea6-bacd-4bc6-b698-62b5a0b04058",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def inject_toc(document):\n",
+    "    \"\"\"Append a table of contents for the cell headings to ``document``.\n",
+    "\n",
+    "    One row, cloned from the ``template tr`` inside ``table.toc.headings``, is\n",
+    "    added per heading found under ``table.cells``.  Headings without an ``id``\n",
+    "    attribute are given one slugified from their text so the row can link to\n",
+    "    them.  ``document`` is modified in place; nothing is returned.\n",
+    "    \"\"\"\n",
+    "    table = document.select_one(\"table.toc.headings\")\n",
+    "    ROW = table.select_one(\"template tr\")\n",
+    "    tbody = TEMPLATE.new_tag(\"tbody\")\n",
+    "    # :is() keeps every heading level scoped to table.cells; in a plain comma\n",
+    "    # list only the first selector would carry the table.cells prefix\n",
+    "    for h in document.select(\"table.cells :is(h1, h2, h3, h4, h5, h6)\"):\n",
+    "        row = clone(ROW)\n",
+    "        a = row.select_one(\"td.heading>a\")\n",
+    "        heading = h.get_text()\n",
+    "        # ``in`` on a tag searches its children, so attributes need has_attr\n",
+    "        if not h.has_attr(\"id\"):\n",
+    "            h[\"id\"] = slugify.slugify(heading)\n",
+    "        a.append(heading)\n",
+    "        a[\"href\"] = \"#\" + h[\"id\"]\n",
+    "        row.select_one(\"th.level\").append(h.name[1])\n",
+    "        # TODO: td.description>p is not populated yet\n",
+    "        tbody.append(row)\n",
+    "    table.append(tbody)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "b95ae161-508d-402f-bb5c-dd6b3a75de2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "idref_selection = \",\".join(map(\"[{}]\".format, IDREFS))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "2a5eb1ec-7785-421b-bd7a-101b07eea936",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def set_ids(selection, id=\"\"):\n",
+    "    \"\"\"Expand ``:`` placeholders in the id-reference attributes under ``selection``.\n",
+    "\n",
+    "    For ``selection`` and every descendant carrying an attribute listed in\n",
+    "    IDREFS: a value of exactly ``:`` becomes ``id``; otherwise each\n",
+    "    whitespace-separated token starting with ``:`` becomes ``{id}-{rest}`` and\n",
+    "    other tokens are kept.  Elements are modified in place.\n",
+    "    \"\"\"\n",
+    "    selector = \",\".join(F\"[{name}]\" for name in IDREFS)\n",
+    "    for node in [selection, *selection.select(selector)]:\n",
+    "        for idref in IDREFS:\n",
+    "            if idref not in node.attrs:\n",
+    "                continue\n",
+    "            value = node.attrs[idref]\n",
+    "            if value == \":\":\n",
+    "                node[idref] = id\n",
+    "            elif isinstance(value, str):\n",
+    "                tokens = value.split()\n",
+    "                node[idref] = \" \".join(\n",
+    "                    F\"{id}-{token[1:]}\" if token.startswith(\":\") else token for token in tokens\n",
+    "                )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "6ffa0606-3682-4fb2-95a5-d54a065d0c4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@functools.lru_cache(maxsize=1)\n",
+    "def get_markdown():\n",
+    "    \"\"\"Return the ``MarkdownIt`` renderer, created once and cached.\"\"\"\n",
+    "    import markdown_it\n",
+    "\n",
+    "    return markdown_it.MarkdownIt()\n",
+    "\n",
+    "def highlight(source, lang=\"python\", attrs=None):\n",
+    "    \"\"\"Return a ``<pre>`` element holding ``source`` highlighted as ``lang``.\n",
+    "\n",
+    "    Pygments renders the markup.  When it has no lexer for ``lang`` the source\n",
+    "    is HTML-escaped into a plain ``<pre><code>`` block instead.  ``attrs`` is\n",
+    "    accepted but not used yet.\n",
+    "    \"\"\"\n",
+    "    # the lexers/formatters subpackages must be imported explicitly;\n",
+    "    # ``import pygments`` alone does not load them\n",
+    "    from pygments import highlight as pygments_highlight\n",
+    "    from pygments.formatters import HtmlFormatter\n",
+    "    from pygments.lexers import get_lexer_by_name\n",
+    "    from pygments.util import ClassNotFound\n",
+    "\n",
+    "    try:\n",
+    "        lexer = get_lexer_by_name(lang)\n",
+    "    except ClassNotFound:\n",
+    "        # unknown language: emit the escaped source without highlighting\n",
+    "        return Soup(f\"\"\"<pre><code class=\"{lang}\">{html.escape(source)}</code></pre>\"\"\").pre\n",
+    "    # pygments returns an HTML string; parse it so the <pre> element can be returned\n",
+    "    return Soup(pygments_highlight(source, lexer, HtmlFormatter())).pre"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "74ffa475-94ca-4c5c-a4b1-7835fb264786",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "contents = await asyncio.gather(*map(read_text, files[\"path\"]))\n",
+    "contents = polars.Series(\n",
+    "    \"contents\", contents, strict=False\n",
+    ").str.json_decode().struct.unnest().with_columns(\n",
+    "    file=files[\"file\"]\n",
+    "    # , path=files[\"path\"] # causes a panic cause its a python object\n",
+    ")\n",
+    "contents = contents.with_columns(cells=enumerate_iterable(contents[\"cells\"], \"cell\", 1))\n",
+    "CONTENTS_COLUMNS = [*contents.columns]\n",
+    "contents = contents.with_columns(\n",
+    "    contents.map_rows(lambda x: (populate(dict(zip(CONTENTS_COLUMNS, x))),)).rename({\"column_0\": \"html\"})\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "558a6247-bd4c-4205-ad39-ff07ddc8e67d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cells = contents[[\"file\", \"cells\"]].explode(\"cells\").unnest(\"cells\")\n",
+    "cells = cells.with_columns(source=cells[\"source\"].map_elements(\"\".join, polars.String))\n",
+    "outputs = cells[[\"file\", \"id\", \"outputs\"]].explode(\"outputs\").unnest(\"outputs\")\n",
+    "displays = outputs[[\"file\", \"id\", \"data\"]].drop_nulls().unnest(\"data\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "bdc6206b-1e66-4cc2-a2b3-3fb2842412a1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "        <iframe\n",
+       "            width=\"100%\"\n",
+       "            height=\"600\"\n",
+       "            src=\"test.html\"\n",
+       "            frameborder=\"0\"\n",
+       "            allowfullscreen\n",
+       "            \n",
+       "        ></iframe>\n",
+       "        "
+      ],
+      "text/plain": [
+       "<IPython.lib.display.IFrame at 0x781ba2539190>"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test = pathlib.Path(\"test.html\")\n",
+    "test.write_text(contents[\"html\"][0].body.prettify())\n",
+    "IFrame(\"test.html\", width=\"100%\", height=600)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c979859-e516-40e4-9247-32749879efb5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "33e17f3f-a1fa-4963-aa62-d8902ed596ca",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "599eafd7-8410-4a7d-84d7-7797efedf724",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6ee2397-0d24-48fd-82c1-e12a7b867a8d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f3c50ca-6ab2-476c-83d3-8a554b8c62d6",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "802e5732-024f-4ec1-a467-8ed38ec02738",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57e8e84d-6a69-4f46-b4b4-69a3b0eb1c2a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0950074d-be87-4b44-ab15-54ea19c0e521",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:p311] *",
+   "language": "python",
+   "name": "conda-env-p311-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
No results found