{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "name": "pyannote.ipynb", "gpuType": "A100", "machine_shape": "hm", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "source": [ "import locale\n", "\n", "def getpreferredencoding(do_setlocale = True):\n", " return \"UTF-8\"\n", "\n", "locale.getpreferredencoding = getpreferredencoding" ], "metadata": { "id": "rudtRKL9P7KK" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "lbtdzoCjO7DU" }, "outputs": [], "source": [ "!pip install \\\n", " git+https://github.com/pyannote/pyannote-audio.git@7379f1c82be093078354449100e1a84cbdfbafdf \\\n", " git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8 \\\n", " torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 \\\n", " --extra-index-url https://download.pytorch.org/whl/cu118" ] }, { "cell_type": "code", "source": [ "import torch\n", "\n", "torch.cuda.is_available()" ], "metadata": { "id": "J_Ss89GhRlvq" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "\n", "drive.mount('/content/gdrive')" ], "metadata": { "id": "10-Kt8ghQFK1" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ], "metadata": { "id": "KOvvVyKSPJck" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import soundfile as sf\n", "\n", "sf.available_formats()" ], "metadata": { "id": "oNpve47cPJ9n" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "---" ], "metadata": { "id": "75IRpeQnxJYW" } }, { "cell_type": "code", "source": [ "import whisper\n", "import torch\n", "\n", "device = torch.device(\"cuda\")\n", "\n", "model = whisper.load_model(\"large\", device=device)" ], "metadata": { "id": "x0CLqnvDwg8P" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from pyannote.audio import Pipeline\n", "from pyannote.audio.pipelines.utils.hook import ProgressHook\n", "\n", "#device = torch.device(\"cuda\")\n", "\n", "speaker_diarization = Pipeline.from_pretrained(\n", " \"pyannote/speaker-diarization@2.1\",\n", " use_auth_token=True\n", ")\n", "\n", "speaker_diarization.to(device)" ], "metadata": { "id": "3rKnEyruPKwu" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!nvidia-smi" ], "metadata": { "id": "yzH_lXFOv2ur" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!cp \"/content/gdrive/MyDrive/Recordings/Day 1/Session 1.ogg\" /content/target.ogg" ], "metadata": { "id": "CHGdyOCFxCiQ" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "---" ], "metadata": { "id": "05L9PUV9xb8k" } }, { "cell_type": "code", "source": [ "with ProgressHook() as hook:\n", " who_speaks_when = speaker_diarization(\n", " \"/content/target.ogg\",\n", " num_speakers=2,\n", " #min_speakers=5,\n", " #max_speakers=9,\n", " hook=hook\n", " )" ], "metadata": { "id": "PEH4cv5VPMfC" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from pyannote.core import Segment\n", "from pyannote.audio import Audio\n", "\n", 
"speakers = who_speaks_when.rename_labels({\n", " #\"SPEAKER_00\": \"David\",\n", " #\"SPEAKER_01\": \"Stan\",\n", "})\n", "\n", "crop = Segment(0, 999999999)\n", "#crop = Segment(5 * 60.0, 10 * 60.0)\n", "audio = Audio(sample_rate=16000, mono=\"downmix\")\n", "\n", "def float_to_timestamp(float_time):\n", " hours, remainder = divmod(float_time, 3600)\n", " minutes, seconds = divmod(remainder, 60)\n", " return \"{:02}:{:02}:{:04.1f}\".format(int(hours), int(minutes), seconds)\n", "\n", "for segment, _, speaker in speakers.crop(crop).itertracks(yield_label=True):\n", " waveform, sample_rate = audio.crop(\"/content/target.ogg\", segment)\n", " text = model.transcribe(waveform.squeeze().numpy(), language=\"en\", initial_prompt=\"\"\"\n", "A recorded conversation between Company A consisting of Stan & David and Client consisting of … discussing a new project requirements and demonstrating … and current standard operating procedures in …, including Campaign Planning and Digital Marketing, Customer Service, IT Process, Finance.\n", "\"\"\".strip())[\"text\"]\n", " print(f\"{float_to_timestamp(segment.start)}-{float_to_timestamp(segment.end)} {speaker.strip()}: {text.strip()}\")" ], "metadata": { "id": "JbC9Qj0k5fuF" }, "execution_count": null, "outputs": [] } ] }