Created
May 13, 2025 14:27
-
-
Save nglehuy/a3ef6a7bc34d2d0f19d94a82a3313477 to your computer and use it in GitHub Desktop.
Extract subtitles using OpenAI Whisper, add subtitles to video
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import math | |
| import os | |
| from faster_whisper import WhisperModel | |
| import ffmpeg | |
| import re | |
| import fire | |
# Export PYTHONUNBUFFERED=1 into this process's environment.
# NOTE(review): CPython reads this variable at interpreter start-up, so setting
# it here presumably only affects child processes that inherit the
# environment, not the buffering of this already-running script -- confirm.
os.environ.update(PYTHONUNBUFFERED="1")
def extract_audio(input_video, input_video_name):
    """Extract the audio of a video into a WAV file, reusing an existing one.

    Args:
        input_video: Path of the video handed to ffmpeg as input.
        input_video_name: Name used to build the output file name
            ``audio-<input_video_name>.wav`` (relative to the working
            directory).

    Returns:
        The path of the WAV file.
    """
    wav_path = f"audio-{input_video_name}.wav"
    # A file that already exists under this name is returned as-is; ffmpeg is
    # only invoked when no such file is present.
    if not os.path.exists(wav_path):
        ffmpeg.input(input_video).output(wav_path).run(overwrite_output=True)
    return wav_path
def transcribe(audio, language="vi", model_size="medium"):
    """Transcribe an audio file with faster-whisper.

    Args:
        audio: Audio input forwarded to ``WhisperModel.transcribe``.
        language: Language code forwarded to the model. Defaults to ``"vi"``,
            the value that was previously hard-coded.
        model_size: Model identifier passed to ``WhisperModel``. Defaults to
            ``"medium"``, the value that was previously hard-coded.

    Returns:
        A ``(language, segments)`` tuple: the language reported by the model
        and the segments object returned by ``WhisperModel.transcribe``.
    """
    model = WhisperModel(model_size)
    # NOTE(review): per the faster-whisper documentation ``segments`` is a lazy
    # generator, so the actual decoding presumably happens while the caller
    # iterates over it -- confirm before consuming it more than once.
    segments, info = model.transcribe(
        audio, language=language, log_progress=True, multilingual=True
    )
    print("Transcription language", info.language)
    return info.language, segments
def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is first rounded to whole milliseconds and then split with
    integer arithmetic, so a fraction that rounds up carries into the seconds
    (e.g. ``1.9996`` becomes ``00:00:02,000`` rather than ``...01,1000``).
    Every field is zero-padded, including the seconds.

    Args:
        seconds: Non-negative duration in seconds (int or float). Negative
            values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"01:01:01,250"``.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    whole_seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
def generate_subtitle_file(language, segments, input_video_name):
    """Write transcription segments to an SRT file and return its path.

    Args:
        language: Language tag embedded in the output file name.
        segments: Iterable of objects exposing ``start``, ``end`` and ``text``.
        input_video_name: Name embedded in the output file name.

    Returns:
        The path ``sub2-<input_video_name>.<language>.srt``.
    """
    srt_path = f"sub2-{input_video_name}.{language}.srt"
    with open(srt_path, "w", encoding="utf-8") as out:
        # Cues are numbered from 1 in iteration order.
        for number, seg in enumerate(segments, start=1):
            cue = "".join(
                (
                    f"{number} \n",
                    f"{format_time(seg.start)} --> {format_time(seg.end)} \n",
                    f"{seg.text} \n",
                    "\n",
                )
            )
            out.write(cue)
            # Flushed after every cue so the file on disk grows as segments
            # are produced.
            out.flush()
    return srt_path
def add_subtitle_to_video(input_video, subtitle_file):
    """Render a video with a subtitle file applied through ffmpeg's ``vf`` option.

    The result is written to ``output-<video base name>.mp4`` in the working
    directory, overwriting any existing file of that name.

    Args:
        input_video: Path of the source video.
        subtitle_file: Path inserted verbatim into the ``subtitles=`` filter
            string.
    """
    stem, _extension = os.path.splitext(os.path.basename(input_video))
    rendered_path = f"output-{stem}.mp4"
    (
        ffmpeg.input(input_video)
        .output(rendered_path, vf=f"subtitles={subtitle_file}")
        .run(overwrite_output=True)
    )
def extract_subtitles(input_video: str):
    """Run the full pipeline: extract audio, transcribe it, write an SRT file.

    Args:
        input_video: Path of the video to process. Its base name without
            extension is used to name the intermediate and output files.
    """
    stem = os.path.splitext(os.path.basename(input_video))[0]
    wav_path = extract_audio(input_video, stem)
    language, segments = transcribe(audio=wav_path)
    generate_subtitle_file(
        input_video_name=stem, segments=segments, language=language
    )
def srt_to_dict(srt_file):
    """Parse an SRT file into a list of cue dictionaries.

    Cues are separated by blank lines; runs of three or more newlines are
    collapsed first. A block with fewer than three lines is skipped. A count
    of parsed cues and the set of indexes missing from ``1..last index`` are
    printed as a diagnostic.

    Args:
        srt_file: Path of the SRT file. It is decoded as UTF-8; a leading
            byte-order mark, if present, is ignored.

    Returns:
        A list of ``{"index": int, "start": str, "end": str, "text": str}``
        dictionaries in file order. Empty when the file holds no valid cue.

    Raises:
        ValueError: If a cue's first line is not an integer or its second
            line contains no ``-->`` separator.
    """
    # "utf-8-sig" reads plain UTF-8 too, but drops a BOM that would otherwise
    # end up glued to the first cue index and break int().
    with open(srt_file, "r", encoding="utf-8-sig") as file:
        raw = file.read().strip()
    blocks = re.sub(r"\n{3,}", "\n\n", raw).split("\n\n")

    subtitles = []
    indexes = set()
    for block in blocks:
        lines = block.split("\n")
        if len(lines) < 3:
            continue
        index = int(lines[0].strip())
        start, arrow, end = lines[1].partition("-->")
        if not arrow:
            raise ValueError(f"cue {index}: malformed time range {lines[1]!r}")
        subtitles.append(
            {
                "index": index,
                # Stripped so stray spaces around the arrow or at the end of
                # the line do not leak into the timestamps.
                "start": start.strip(),
                "end": end.strip(),
                "text": " ".join(lines[2:]).strip(),
            }
        )
        indexes.add(index)

    # Guarded: with no cues there is no "last index" to look at.
    missing = set()
    if subtitles:
        missing = set(range(1, subtitles[-1]["index"] + 1)) - indexes
    print(len(subtitles), missing)
    return subtitles
def reindex_srt_file(srt_file, output_srt_file):
    """Rewrite an SRT file with cue indexes renumbered consecutively from 1.

    Args:
        srt_file: Path of the SRT file to read.
        output_srt_file: Path of the renumbered SRT file to write.
    """
    # The input is parsed completely before the output file is opened.
    cues = srt_to_dict(srt_file)
    for position, cue in enumerate(cues, start=1):
        cue["index"] = position

    with open(output_srt_file, "w", encoding="utf-8") as out:
        for cue in cues:
            out.write(
                f"{cue['index']}\n"
                f"{cue['start']} --> {cue['end']}\n"
                f"{cue['text']}\n\n"
            )
def convert_srt_to_ass(input_srt, output_ass):
    """Run ffmpeg with ``input_srt`` as input and ``output_ass`` as output.

    Args:
        input_srt: Path of the subtitle file to read.
        output_ass: Path of the file ffmpeg should write.
    """
    # NOTE(review): unlike the other ffmpeg calls in this file, no
    # overwrite_output flag is passed here -- confirm the intended behaviour
    # when ``output_ass`` already exists.
    conversion = ffmpeg.output(ffmpeg.input(input_srt), output_ass)
    ffmpeg.run(conversion)
if __name__ == "__main__":
    # Map each command name to the function that fire should dispatch to.
    commands = {
        "extract_subtitles": extract_subtitles,
        "add_subtitle_to_video": add_subtitle_to_video,
        "srt_to_dict": srt_to_dict,
        "reindex_srt_file": reindex_srt_file,
        "convert_srt_to_ass": convert_srt_to_ass,
    }
    fire.Fire(commands)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment