import argparse
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional

from faster_whisper import WhisperModel

# model_size = "large-v2"  # Would download faster-whisper-large-v2 from Hugging Face
model_size = "faster-whisper-large-v2-ct2"  # Load the local large-v2 model

# Run on GPU with FP16:
# model = WhisperModel(model_size, device="cuda", compute_type="float16")
# or run on GPU with INT8:
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU with INT8:
# model = WhisperModel(model_size, device="cpu", compute_type="int8")


def srt_format_timestamp(seconds: float) -> str:
    """Format a time offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    Taken from:
    https://github.com/openai/whisper/discussions/98#discussioncomment-3726175

    Args:
        seconds: Non-negative offset from the start of the audio.

    Returns:
        The offset rendered as zero-padded ``hours:minutes:seconds,milliseconds``.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000

    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000

    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000

    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def transcribe_audio(audio_path: str, audio_lang: Optional[str]) -> None:
    """Transcribe *audio_path* with faster-whisper and write an SRT file.

    Each segment is printed to the console as it is produced; the full
    transcription is written to ``<audio stem>_<YYYY-mm-dd_HH-MM>.srt`` in
    the current working directory.

    Args:
        audio_path: Path to the audio file to transcribe.
        audio_lang: Language code of the audio, or ``None`` to auto-detect.
    """
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, language=audio_lang, beam_size=5)

    if audio_lang is None:
        print(f"Detected language '{info.language}' with probability {info.language_probability}")

    now_time = datetime.now().strftime("%Y-%m-%d_%H-%M")
    file_name_no_suffix = Path(audio_path).with_suffix("").name
    srt_file_name = f"{file_name_no_suffix}_{now_time}.srt"

    # Collect segment chunks and join once at the end instead of repeated
    # string += (which is quadratic in the number of segments).
    srt_parts = []
    for segment in segments:
        start_time = srt_format_timestamp(segment.start)
        end_time = srt_format_timestamp(segment.end)
        srt_parts.append(
            f"{segment.id}\n{start_time} --> {end_time}\n{segment.text.strip()}\n\n"
        )
        print(f"[{segment.id} - {start_time} --> {end_time}] {segment.text.strip()}")

    # Explicit UTF-8: subtitles routinely contain non-ASCII characters, and
    # the platform default encoding (e.g. cp1252 on Windows) may fail to
    # encode them.
    Path(srt_file_name).write_text("".join(srt_parts), encoding="utf-8")


def main() -> None:
    """Parse command-line arguments and run the transcription."""
    parser = argparse.ArgumentParser(
        usage="%(prog)s [audio file]",
        description="Transcribe the given audio file using faster-whisper-large-v2 model",
        allow_abbrev=False,
    )
    parser.add_argument("audio_file", type=str)
    parser.add_argument(
        "lang",
        type=str,
        nargs="?",
        default=None,
        help="Specify the audio language. Will auto-detect if not specified",
    )
    args = parser.parse_args()
    transcribe_audio(args.audio_file, args.lang)


# Guard so importing this module does not parse sys.argv or start a transcription.
if __name__ == "__main__":
    main()