Last active
June 17, 2025 12:35
-
-
Save mholtzhausen/beafda24a3bc9e4799b102bdc3df348d to your computer and use it in GitHub Desktop.
Revisions
-
mholtzhausen revised this gist
Jun 16, 2025. 1 changed file with 1 addition and 2 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -6,8 +6,7 @@ reading from stdin and writing to stdout. Example Usage: yt-dlp --skip-download --sub-langs en --convert-subs vtt -o ~/.tmp.subtitle <video_url> | vtt2txt; cat ~/.tmp.subtitle.en.vtt | vtt2txt | fabric -ps extract_wisdom """ import sys -
mholtzhausen revised this gist
Jun 16, 2025. No changes. There are no files selected for viewing
-
mholtzhausen created this gist
Jun 16, 2025. There are no files selected for viewing
#!/usr/bin/env python3
"""
Convert YouTube subtitles (vtt) to human readable text.

This script is designed to be used as a command-line tool,
reading from stdin and writing to stdout.

Example Usage:
    youtube-dl --skip-download --convert-subs vtt -o - <video_url> | vtt2txt
    cat my_subtitle_file.vtt | vtt2txt
"""
import sys
import re

# WebVTT cue span tags (<c>, <i>, <b>, <u>, <v>, <lang>, <ruby>, <rt>), with
# optional ".class" suffixes or an annotation, and their closing forms.
_SPAN_TAG_RE = re.compile(r'</?(?:c|i|b|u|v|lang|ruby|rt)(?:[.\s][^>]*)?>')

# Inline word-timing tags such as <00:00:01.520>; the hours field is optional.
_INLINE_TIMESTAMP_RE = re.compile(r'<(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}>')

# Cue timing line: "[HH:]MM:SS.mmm --> <end time and cue settings>".
# Group 1 is the (optional) hours field, group 2 the minutes field.
_CUE_TIMING_RE = re.compile(r'(?:(\d{2,}):)?(\d{2}):\d{2}\.\d{3} --> .*')

# Lines consisting of whitespace only (left behind once tags are stripped).
_WHITESPACE_ONLY_RE = re.compile(r'^\s+$', re.MULTILINE)

# The short "HH:MM" marker that remove_tags() substitutes for a cue timing line.
# NOTE: a caption whose whole text is e.g. "12:34" is indistinguishable from a
# marker and will be treated as one.
_MARKER_RE = re.compile(r'^\d{2,}:\d{2}$')


def _timing_to_marker(match):
    """Turn a cue timing match into its "HH:MM" marker ("00" if no hours)."""
    hours = match.group(1) or '00'
    return hours + ':' + match.group(2)


def remove_tags(text):
    """
    Remove vtt markup tags.

    Strips cue span tags and inline timestamps, replaces each cue timing
    line with a short "HH:MM" marker, and empties whitespace-only lines.

    :param text: VTT content as a single string.
    :return: the cleaned text.
    """
    text = _SPAN_TAG_RE.sub('', text)
    text = _INLINE_TIMESTAMP_RE.sub('', text)
    # Extract timestamp, only keep HH:MM
    text = _CUE_TIMING_RE.sub(_timing_to_marker, text)
    # Clean up lines that might be empty after tag removal
    return _WHITESPACE_ONLY_RE.sub('', text)


def remove_header(lines):
    """
    Remove vtt file header lines.

    Everything before the first cue timing line is dropped. Both raw timing
    lines (containing "-->") and the "HH:MM" markers produced by
    remove_tags() are recognised, so the function works before or after
    tag removal.

    :param lines: list of lines of the VTT file.
    :return: a new list starting at the first cue; a copy of the whole
        input when no cue is found.
    """
    start_pos = 0
    for i, line in enumerate(lines):
        if '-->' in line or _MARKER_RE.match(line.strip()):
            start_pos = i
            break
    # Return lines from the first subtitle entry onwards
    return lines[start_pos:]


def merge_duplicates(lines):
    """
    Remove duplicated subtitles. Duplicates are always adjacent.

    Blank lines are skipped. A timestamp marker is emitted only when it
    differs from the previous marker; a text line only when it differs
    from the previously emitted text line (markers in between are ignored
    for that comparison).

    :param lines: iterable of lines, as produced by remove_tags().
    :return: generator of de-duplicated lines.
    """
    last_timestamp = ''
    last_cap = ''
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Skip empty or whitespace-only lines
            continue
        if _MARKER_RE.match(stripped):
            if stripped != last_timestamp:
                yield stripped
                last_timestamp = stripped
        elif stripped != last_cap:
            yield line
            last_cap = stripped


def merge_short_lines(lines, width=80):
    """
    Merge consecutive short subtitle lines into a single line up to a
    certain width.

    A timestamp marker flushes the pending text and is itself emitted with
    a leading newline. A single line longer than ``width`` is emitted
    unchanged on its own; empty strings are never yielded.

    :param lines: iterable of text lines and "HH:MM" markers.
    :param width: maximum length of a merged line (default 80).
    :return: generator of merged lines.
    """
    buffer = ''
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        # If it's a timestamp, flush the buffer and then emit the timestamp
        if _MARKER_RE.match(stripped):
            if buffer:
                yield buffer
                buffer = ''
            yield '\n' + stripped
            continue
        if not buffer:
            buffer = stripped
        elif len(buffer) + 1 + len(stripped) <= width:
            buffer += ' ' + stripped
        else:
            yield buffer
            buffer = stripped
    # Yield any remaining text in the buffer
    if buffer:
        yield buffer


def process_vtt(text):
    """
    Main processing pipeline for the VTT content.

    :param text: complete VTT file content.
    :return: plain caption text, one merged line per row, with no header,
        markup, or timestamps.
    """
    # 1. Remove the header first, while the cue timing lines still contain
    #    "-->" (remove_tags() rewrites them to bare "HH:MM" markers).
    lines = remove_header(text.splitlines())

    # 2. Tag and metadata removal
    lines = remove_tags('\n'.join(lines)).splitlines()

    # 3. Remove duplicate lines
    lines = merge_duplicates(lines)

    # 4. Merge short lines for better readability
    lines = merge_short_lines(lines)

    # 5. Final cleanup: markers only served to break paragraphs, drop them
    processed_lines = [
        line for line in lines
        if line and not _MARKER_RE.match(line.strip())
    ]
    return '\n'.join(processed_lines).strip()


def main():
    """
    Reads from stdin, processes the VTT content, and prints to stdout.

    Exits with status 1 and a usage hint when stdin is a terminal
    (i.e. nothing was piped in).
    """
    # Check if there is any input from stdin
    if sys.stdin.isatty():
        print("Usage: cat your_file.vtt | vtt2txt", file=sys.stderr)
        sys.exit(1)

    # Read the entire VTT content from stdin
    vtt_content = sys.stdin.read()

    # Process the content
    plain_text = process_vtt(vtt_content)

    # Print the final, clean text to stdout
    print(plain_text)


if __name__ == "__main__":
    main()