Skip to content

Instantly share code, notes, and snippets.

@huseinzol05
Created March 9, 2025 07:12
Show Gist options
  • Save huseinzol05/e570d6ecb5fe62ccd27cf462719bcbe4 to your computer and use it in GitHub Desktop.
Save huseinzol05/e570d6ecb5fe62ccd27cf462719bcbe4 to your computer and use it in GitHub Desktop.

Revisions

  1. huseinzol05 created this gist Mar 9, 2025.
    369 changes: 369 additions & 0 deletions upload-audio.ipynb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,369 @@
    {
    "cells": [
    {
    "cell_type": "code",
    "execution_count": 4,
    "id": "83272ed4",
    "metadata": {},
    "outputs": [
    {
    "data": {
    "text/plain": [
    "2222136"
    ]
    },
    "execution_count": 4,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "from glob import glob\n",
    "import os\n",
    "\n",
    "repository = 'mesolitica/pseudolabel-malaysian-youtube-whisper-large-v3-timestamp'\n",
    "folder = 'output-audio'\n",
    "files = glob(f'{folder}/*.mp3')\n",
    "len(files)"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 2,
    "id": "3d83ced9",
    "metadata": {},
    "outputs": [],
    "source": [
    "import zipfile\n",
    "import mp\n",
    "import time\n",
    "from huggingface_hub import HfFileSystem\n",
    "from huggingface_hub import HfApi\n",
    "from tqdm import tqdm\n",
    "api = HfApi()\n",
    "\n",
    "partition_size = 5e+9"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 5,
    "id": "de709a3a",
    "metadata": {},
    "outputs": [],
    "source": [
    "def loop(files):\n",
    " files, index = files\n",
    " current_index = 0\n",
    " api = HfApi()\n",
    " fs = HfFileSystem()\n",
    " total = 0\n",
    " temp = []\n",
    " for i in tqdm(range(len(files))):\n",
    " s = os.stat(files[i]).st_size\n",
    " if s + total >= partition_size:\n",
    " part_name = f\"{folder}-{index}-{current_index}.zip\"\n",
    " \n",
    " with zipfile.ZipFile(part_name, 'w', zipfile.ZIP_DEFLATED) as zipf:\n",
    " for f in temp:\n",
    " zipf.write(f, arcname=f)\n",
    "\n",
    " while True:\n",
    " try:\n",
    " api.upload_file(\n",
    " path_or_fileobj=part_name,\n",
    " path_in_repo=part_name,\n",
    " repo_id=repository,\n",
    " repo_type=\"dataset\",\n",
    " )\n",
    " break\n",
    " except:\n",
    " time.sleep(60)\n",
    "\n",
    " os.remove(part_name)\n",
    " \n",
    " current_index += 1\n",
    " temp = [files[i]]\n",
    " total = s\n",
    " else:\n",
    " temp.append(files[i])\n",
    " total += s\n",
    " \n",
    " if len(temp):\n",
    " part_name = f\"{folder}-{index}-{current_index}.zip\"\n",
    "\n",
    " with zipfile.ZipFile(part_name, 'w', zipfile.ZIP_DEFLATED) as zipf:\n",
    " for f in temp:\n",
    " zipf.write(f, arcname=f)\n",
    "\n",
    " while True:\n",
    " try:\n",
    " api.upload_file(\n",
    " path_or_fileobj=part_name,\n",
    " path_in_repo=part_name,\n",
    " repo_id=repository,\n",
    " repo_type=\"dataset\",\n",
    " )\n",
    " break\n",
    " except:\n",
    " time.sleep(60)\n",
    "\n",
    " os.remove(part_name)"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 6,
    "id": "d4efaf5d",
    "metadata": {},
    "outputs": [
    {
    "name": "stderr",
    "output_type": "stream",
    "text": [
    "100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 172371.02it/s]\n"
    ]
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "97f9290861bf4ac1ad1be838378eb598",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-0-0.zip: 0%| | 0.00/153M [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    }
    ],
    "source": [
    "loop((files[:1000], 0))"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "813cf6f5",
    "metadata": {},
    "outputs": [
    {
    "name": "stderr",
    "output_type": "stream",
    "text": [
    " 5%|████▏ | 20248/370356 [00:00<00:01, 202455.09it/s]"
    ]
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "3c1ae58009d94ae0a41ff21c354a3e6b",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-0-0.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "26427efd0dd944a195f83d34e43855da",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-4-0.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "856ca3866217441fb251b39dac735eba",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-1-0.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "9406188d85424ee7a079c24d66b34262",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-2-0.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "f284f14e517b487cbf954e1774436a98",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-3-0.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "5b0847a698f54fa684df830dbdbdbee6",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-5-0.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "name": "stderr",
    "output_type": "stream",
    "text": [
    " 15%|███████████▌ | 54627/370356 [42:13<4:04:06, 21.56it/s]"
    ]
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "cb547761fbdb4a71bb80e99a8e1527b2",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-3-1.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "1c240fff0510491692fb5c86748f5bcf",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-0-1.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "e1d08915f05f464cb65942f8dd64e158",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-4-1.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "246f0857d3b845b8a07e95bccc5f3fee",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-1-1.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "2ea03d565ad64cdab08a03a6a54f899e",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-2-1.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "c521032fde644e8d9b19e166a3ff19b0",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "output-audio-5-1.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    }
    ],
    "source": [
    "mp.multiprocessing(files, loop, cores = 6, returned = False)"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "id": "4b2fc596",
    "metadata": {},
    "outputs": [],
    "source": []
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "python3.10",
    "language": "python",
    "name": "python3.10"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.10.15"
    }
    },
    "nbformat": 4,
    "nbformat_minor": 5
    }