Revisions

  1. @manics manics revised this gist Jul 13, 2025. 1 changed file with 175 additions and 6 deletions.
    181 changes: 175 additions & 6 deletions k8tre-demo-audiotrack.ipynb
    @@ -22,12 +22,12 @@
    "outputs": [],
    "source": [
    "lines = [\n",
    " \"CATER is designed using infrastructure-as-code, and is deployed using a git-ops workflow. We're using ArgoCD which is an open-source git-ops tools designed for Kubernetes. This means all infrastructure is fully reproducible, and all approved code updates are automatically deployed by ArgoCD.\",\n",
    " \"CATER is designed using infrastructure-as-code, and is deployed using a git-ops workflow. We\"re using ArgoCD which is an open-source git-ops tools designed for Kubernetes. This means all infrastructure is fully reproducible, and all approved code updates are automatically deployed by ArgoCD.\",\n",
    " \"CATER consists of a set of applications (or components). By default we install everything required to run a T.R.E., but all components can be disabled or replaced by another implementation.\",\n",
    " \"For the first three months work has focussed on this backend work of writing and deploying components, but we've now started integrating those components so they can be used through a frontend.\",\n",
    " \"For the first three months work has focussed on this backend work of writing and deploying components, but we\"ve now started integrating those components so they can be used through a frontend.\",\n",
    "\n",
    " \"You login to CATER using Keycloak. A username and password are used in this demo, but multi-factor authentication can be easily added, and Keycloak can federate with other identity providers using SAML or O.I.C.D.\",\n",
    " \"In this demo we're using JupyterHub as a control plane for researcher workspaces.\",\n",
    " \"In this demo we\"re using JupyterHub as a control plane for researcher workspaces.\",\n",
    " \"You can see a list of projects and workspace types. This demo only has Ubuntu Mate desktops.\",\n",
    " \"When you launch a workspace a new Kubernetes pod is created, project storage is mounted, and users are given access via Apache Guacamole which is an open-source remote desktop gateway.\",\n",
    " \"As you can see you have a full desktop via a web browser\",\n",
    @@ -74,7 +74,7 @@
    " s = r[\"AudioStream\"]\n",
    "\n",
    " with wave.open(outfile, \"wb\") as wav:\n",
    " wav.setparams((1, 2, 16000, 0, 'NONE', 'NONE'))\n",
    " wav.setparams((1, 2, 16000, 0, \"NONE\", \"NONE\"))\n",
    " wav.writeframes(s.read())\n"
    ]
    },
    @@ -90,18 +90,187 @@
    " speak(text, f\"{i:02d}.wav\")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "5784e676-37aa-4e92-99bb-c34727329b32",
    "metadata": {},
    "outputs": [],
    "source": []
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "07110973",
    "metadata": {},
    "outputs": [],
    "source": [
    "import ffmpeg\n",
    "import os\n"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "9afc0e1e-34d8-4928-bba8-4ceb89f5f57d",
    "metadata": {},
    "outputs": [],
    "source": [
    "# Define input file paths\n",
    "audio_files = [\"00.wav\", \"01.wav\", \"02.wav\", \"03.wav\", \"04.wav\", \"05.wav\", \"06.wav\", \"07.wav\"]\n",
    "output_file = \"concatenated_with_gaps.wav\"\n",
    "# audio_codec = \"libmp3lame\" # Or \"aac\", \"pcm_s16le\", etc. based on desired output\n",
    "audio_codec = \"pcm_s16le\"\n",
    "\n",
    "# Define the gap duration in seconds\n",
    "START_DURATION = 2.0\n",
    "GAP_DURATION = 2.0\n",
    "END_DURATION = 2.0"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "752cf3cc-645a-4777-a535-89b2b6d761c4",
    "metadata": {},
    "outputs": [],
    "source": [
    "# --- Step 1: Get audio file properties for silence generation ---\n",
    "# We need to match the sample rate and channel layout of the real audio files\n",
    "# to ensure consistent silence. We\"ll just probe the first audio file.\n",
    "def get_audio_properties(file_path):\n",
    " \"\"\"\n",
    " Gets sample rate and channel layout of an audio file using ffprobe.\n",
    " \"\"\"\n",
    " try:\n",
    " probe = ffmpeg.probe(file_path)\n",
    " audio_stream = next((s for s in probe[\"streams\"] if s[\"codec_type\"] == \"audio\"), None)\n",
    " if audio_stream:\n",
    " return {\n",
    " \"sample_rate\": int(audio_stream[\"sample_rate\"]),\n",
    " \"channel_layout\": audio_stream.get(\"channel_layout\", \"mono\"), # Default to mono if not found\n",
    " \"duration\": float(audio_stream[\"duration\"]),\n",
    " }\n",
    " else:\n",
    " raise ValueError(f\"No audio stream found for: {file_path}\")\n",
    " except ffmpeg.Error as e:\n",
    " print(f\"Error probing {file_path}: {e.stderr.decode()}\")\n",
    " raise\n",
    " except Exception as e:\n",
    " print(f\"An unexpected error occurred while probing {file_path}: {e}\")\n",
    " raise"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "0149e5cd-a562-458a-a2d7-4b9ff4a24144",
    "metadata": {},
    "outputs": [],
    "source": [
    "# Properties for all audio files must match\n",
    "audio_props = [get_audio_properties(f) for f in audio_files]\n",
    "sample_rates = set(p[\"sample_rate\"] for p in audio_props)\n",
    "if len(sample_rates) != 1:\n",
    " print(f\"Multiple sample rates found: {sample_rates}\")\n",
    "channel_layouts = set(p[\"channel_layout\"] for p in audio_props)\n",
    "if len(channel_layouts) != 1:\n",
    " print(f\"Multiple channel layouts found: {channel_layouts}\")\n",
    "\n",
    "first_audio_props = audio_props[0]\n",
    "SAMPLE_RATE = next(iter(sample_rates))\n",
    "CHANNEL_LAYOUT = next(iter(channel_layouts))\n",
    "\n",
    "print(f\"Using sample rate: {SAMPLE_RATE} Hz, channel layout: {CHANNEL_LAYOUT} for silence.\")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "ff6db6c4-e646-4e3b-8bcc-adf1699431f1",
    "metadata": {},
    "outputs": [],
    "source": [
    "# --- Step 2: Build the filtergraph including silence ---\n",
    "graph_inputs = []\n",
    "\n",
    "# Insert gap at start\n",
    "silence_input = ffmpeg.input(\n",
    " f\"anullsrc=r={SAMPLE_RATE}:cl={CHANNEL_LAYOUT}\",\n",
    " f=\"lavfi\",\n",
    " t=START_DURATION,\n",
    ").audio\n",
    "graph_inputs.append(silence_input)\n",
    "\n",
    "for i, audio_file in enumerate(audio_files):\n",
    " graph_inputs.append(ffmpeg.input(audio_file).audio)\n",
    "\n",
    " # Add silence between tracks\n",
    " if i < len(audio_files) - 1:\n",
    " silence_input = ffmpeg.input(\n",
    " f\"anullsrc=r={SAMPLE_RATE}:cl={CHANNEL_LAYOUT}\",\n",
    " f=\"lavfi\", # \"lavfi\" is for libavfilter inputs like anullsrc\n",
    " t=GAP_DURATION,\n",
    " ).audio\n",
    " graph_inputs.append(silence_input)\n",
    "\n",
    "# Insert gap at end\n",
    "silence_input = ffmpeg.input(\n",
    " f\"anullsrc=r={SAMPLE_RATE}:cl={CHANNEL_LAYOUT}\",\n",
    " f=\"lavfi\",\n",
    " t=END_DURATION,\n",
    ").audio\n",
    "graph_inputs.append(silence_input)\n",
    "\n",
    "\n",
    "# Need to tell the concat filter how many inputs it has in total.\n",
    "total_inputs_for_concat = len(graph_inputs)\n",
    "\n",
    "# Apply the concat filter\n",
    "concatenated_audio = ffmpeg.filter(\n",
    " graph_inputs,\n",
    " \"concat\",\n",
    " n=total_inputs_for_concat,\n",
    " v=0, # No video streams\n",
    " a=1 # One audio stream per input\n",
    ")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "360882f9-5fba-476f-a741-539e3df749cb",
    "metadata": {},
    "outputs": [],
    "source": [
    "# --- Step 3: Define the output and run ---\n",
    "output_stream = ffmpeg.output(\n",
    " concatenated_audio,\n",
    " output_file,\n",
    " acodec=audio_codec,\n",
    ")\n",
    "\n",
    "try:\n",
    " ffmpeg.run(output_stream, overwrite_output=True)\n",
    " print(f\"Audio files concatenated to {output_file} with {GAP_DURATION}-second gaps successfully!\")\n",
    "\n",
    "except ffmpeg.Error as e:\n",
    " print(e)\n",
    " raise"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "2e8f960a-afa5-49c0-b423-c9eabb6d1b51",
    "metadata": {},
    "outputs": [],
    "source": []
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "cloud",
    "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
    },
    @@ -115,7 +284,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.13.3"
    "version": "3.13.5"
    }
    },
    "nbformat": 4,
  2. @manics manics created this gist Jul 10, 2025.
    123 changes: 123 additions & 0 deletions k8tre-demo-audiotrack.ipynb
    @@ -0,0 +1,123 @@
    {
    "cells": [
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "13e39ceb",
    "metadata": {},
    "outputs": [],
    "source": [
    "import boto3\n",
    "import wave\n",
    "\n",
    "# boto3.setup_default_session(profile_name=\"...\")\n",
    "polly = boto3.client(\"polly\")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "a670d769",
    "metadata": {},
    "outputs": [],
    "source": [
    "lines = [\n",
    " \"CATER is designed using infrastructure-as-code, and is deployed using a git-ops workflow. We're using ArgoCD which is an open-source git-ops tools designed for Kubernetes. This means all infrastructure is fully reproducible, and all approved code updates are automatically deployed by ArgoCD.\",\n",
    " \"CATER consists of a set of applications (or components). By default we install everything required to run a T.R.E., but all components can be disabled or replaced by another implementation.\",\n",
    " \"For the first three months work has focussed on this backend work of writing and deploying components, but we've now started integrating those components so they can be used through a frontend.\",\n",
    "\n",
    " \"You login to CATER using Keycloak. A username and password are used in this demo, but multi-factor authentication can be easily added, and Keycloak can federate with other identity providers using SAML or O.I.C.D.\",\n",
    " \"In this demo we're using JupyterHub as a control plane for researcher workspaces.\",\n",
    " \"You can see a list of projects and workspace types. This demo only has Ubuntu Mate desktops.\",\n",
    " \"When you launch a workspace a new Kubernetes pod is created, project storage is mounted, and users are given access via Apache Guacamole which is an open-source remote desktop gateway.\",\n",
    " \"As you can see you have a full desktop via a web browser\",\n",
    "]"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "bf70124e",
    "metadata": {},
    "outputs": [],
    "source": [
    "voices = polly.describe_voices()[\"Voices\"]"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "47c5c74e",
    "metadata": {},
    "outputs": [],
    "source": [
    "# voice = \"Brian\"\n",
    "voice = \"Amy\"\n",
    "# voice = \"Emma\""
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "6d166646",
    "metadata": {},
    "outputs": [],
    "source": [
    "def speak(text, outfile):\n",
    " r = polly.synthesize_speech(\n",
    " Engine=\"neural\",\n",
    " LanguageCode=\"en-GB\",\n",
    " OutputFormat=\"pcm\",\n",
    " Text=text,\n",
    " VoiceId=voice,\n",
    " )\n",
    " s = r[\"AudioStream\"]\n",
    "\n",
    " with wave.open(outfile, \"wb\") as wav:\n",
    " wav.setparams((1, 2, 16000, 0, 'NONE', 'NONE'))\n",
    " wav.writeframes(s.read())\n"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "f62929a0",
    "metadata": {},
    "outputs": [],
    "source": [
    "for i, text in enumerate(lines):\n",
    " print(text)\n",
    " speak(text, f\"{i:02d}.wav\")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "07110973",
    "metadata": {},
    "outputs": [],
    "source": []
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "cloud",
    "language": "python",
    "name": "python3"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.13.3"
    }
    },
    "nbformat": 4,
    "nbformat_minor": 5
    }
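
For context, two reduced sketches of what the notebook above does. Neither is part of the gist; any file name other than the generated clips (00.wav, 01.wav, ...) is an assumption made for illustration.

The original Jul 10 notebook requests raw PCM from Amazon Polly and wraps it in a WAV container with wave.setparams((1, 2, 16000, 0, "NONE", "NONE")); those parameters (mono, 16-bit samples, 16 kHz) match what Polly's "pcm" OutputFormat returns at its default sample rate. A quick check that a generated clip carries that header:

    import wave

    # Sketch: inspect one of the clips written by speak() above.
    # Polly's "pcm" output is signed 16-bit, mono, 16 kHz by default,
    # which is what setparams((1, 2, 16000, ...)) declares.
    with wave.open("00.wav", "rb") as wav:
        print(wav.getnchannels(), wav.getsampwidth(), wav.getframerate())
        # expected: 1 2 16000

The Jul 13 revision joins the clips with ffmpeg-python: lavfi anullsrc inputs generate the silent gaps, and the concat filter stitches clips and gaps into a single stream. A cut-down version of that graph (two clips, one 2-second gap, hypothetical output name joined.wav) that prints the underlying ffmpeg command instead of running it:

    import ffmpeg

    SAMPLE_RATE = 16000       # must match the Polly clips
    CHANNEL_LAYOUT = "mono"
    GAP_DURATION = 2.0        # seconds of silence between clips

    # A lavfi anullsrc input produces the silence, as in the notebook.
    silence = ffmpeg.input(
        f"anullsrc=r={SAMPLE_RATE}:cl={CHANNEL_LAYOUT}", f="lavfi", t=GAP_DURATION
    ).audio
    parts = [ffmpeg.input("00.wav").audio, silence, ffmpeg.input("01.wav").audio]
    joined = ffmpeg.filter(parts, "concat", n=len(parts), v=0, a=1)
    out = ffmpeg.output(joined, "joined.wav", acodec="pcm_s16le")

    # ffmpeg-python's compile() returns the argv list without executing it.
    print(" ".join(ffmpeg.compile(out, overwrite_output=True)))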