{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "name": "pyannote.ipynb", "gpuType": "A100", "machine_shape": "hm", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "source": [ "import locale\n", "\n", "def getpreferredencoding(do_setlocale = True):\n", " return \"UTF-8\"\n", "\n", "locale.getpreferredencoding = getpreferredencoding" ], "metadata": { "id": "rudtRKL9P7KK" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "lbtdzoCjO7DU" }, "outputs": [], "source": [ "!pip install \\\n", " git+https://github.com/pyannote/pyannote-audio.git@7379f1c82be093078354449100e1a84cbdfbafdf \\\n", " git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8 \\\n", " torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 \\\n", " --extra-index-url https://download.pytorch.org/whl/cu118" ] }, { "cell_type": "code", "source": [ "import torch\n", "\n", "torch.cuda.is_available()" ], "metadata": { "id": "J_Ss89GhRlvq" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "\n", "drive.mount('/content/gdrive')" ], "metadata": { "id": "10-Kt8ghQFK1" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ], "metadata": { "id": "KOvvVyKSPJck" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import soundfile as sf\n", "\n", "sf.available_formats()" ], "metadata": { "id": "oNpve47cPJ9n" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "---" ], "metadata": { "id": "75IRpeQnxJYW" } }, { "cell_type": "code", "source": [ "import whisper\n", "import torch\n", "\n", "device = torch.device(\"cuda\")\n", "\n", "model = whisper.load_model(\"large\", device=device)" ], "metadata": { "id": "x0CLqnvDwg8P" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from pyannote.audio import Pipeline\n", "from pyannote.audio.pipelines.utils.hook import ProgressHook\n", "\n", "#device = torch.device(\"cuda\")\n", "\n", "speaker_diarization = Pipeline.from_pretrained(\n", " \"pyannote/speaker-diarization@2.1\",\n", " use_auth_token=True\n", ")\n", "\n", "speaker_diarization.to(device)" ], "metadata": { "id": "3rKnEyruPKwu" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!nvidia-smi" ], "metadata": { "id": "yzH_lXFOv2ur" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!cp \"/content/gdrive/MyDrive/Recordings/Day 1/Session 1.ogg\" /content/target.ogg" ], "metadata": { "id": "CHGdyOCFxCiQ" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "---" ], "metadata": { "id": "05L9PUV9xb8k" } }, { "cell_type": "code", "source": [ "with ProgressHook() as hook:\n", " who_speaks_when = speaker_diarization(\n", " \"/content/target.ogg\",\n", " num_speakers=2,\n", " #min_speakers=5,\n", " #max_speakers=9,\n", " hook=hook\n", " )" ], "metadata": { "id": "PEH4cv5VPMfC" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from pyannote.core import Segment\n", "from pyannote.audio import Audio\n", "\n", 
"speakers = who_speaks_when.rename_labels({\n", " #\"SPEAKER_00\": \"David\",\n", " #\"SPEAKER_01\": \"Stan\",\n", "})\n", "\n", "crop = Segment(0, 999999999)\n", "#crop = Segment(5 * 60.0, 10 * 60.0)\n", "audio = Audio(sample_rate=16000, mono=\"downmix\")\n", "\n", "def float_to_timestamp(float_time):\n", " hours, remainder = divmod(float_time, 3600)\n", " minutes, seconds = divmod(remainder, 60)\n", " return \"{:02}:{:02}:{:04.1f}\".format(int(hours), int(minutes), seconds)\n", "\n", "for segment, _, speaker in speakers.crop(crop).itertracks(yield_label=True):\n", " waveform, sample_rate = audio.crop(\"/content/target.ogg\", segment)\n", " text = model.transcribe(waveform.squeeze().numpy(), language=\"en\", initial_prompt=\"\"\"\n", "A recorded conversation between Company A consisting of Stan & David and Client consisting of … discussing a new project requirements and demonstrating … and current standard operating procedures in …, including Campaign Planning and Digital Marketing, Customer Service, IT Process, Finance.\n", "\"\"\".strip())[\"text\"]\n", " print(f\"{float_to_timestamp(segment.start)}-{float_to_timestamp(segment.end)} {speaker.strip()}: {text.strip()}\")" ], "metadata": { "id": "JbC9Qj0k5fuF" }, "execution_count": null, "outputs": [] } ] }