mholtzhausen · June 17, 2025 12:35 · Jun 16, 2025 · Jun 16, 2025 · Jun 16, 2025
diff --git a/vtt2txt.py b/vtt2txt.py
@@ -6,8 +6,7 @@
 reading from stdin and writing to stdout.
 
 Example Usage:
-youtube-dl --skip-download --convert-subs vtt -o - <video_url> | vtt2txt
-cat my_subtitle_file.vtt | vtt2txt
+yt-dlp --skip-download --write-subs --write-auto-subs --sub-langs en --convert-subs vtt -o ~/.tmp.subtitle <video_url>; cat ~/.tmp.subtitle.en.vtt | vtt2txt | fabric -ps extract_wisdom
 """
 
 import sys

diff --git a/vtt2txt.py b/vtt2txt.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+
+"""
+Convert YouTube subtitles (VTT) to human-readable text.
+This script is designed to be used as a command-line tool,
+reading from stdin and writing to stdout.
+
+Example Usage:
+youtube-dl --skip-download --convert-subs vtt -o - <video_url> | vtt2txt
+cat my_subtitle_file.vtt | vtt2txt
+"""
+
+import sys
+import re
+
+
+def remove_tags(text):
+    """
+    Strip WebVTT inline markup and shorten cue timing lines to ``HH:MM``.
+
+    Removes ``</c>``, ``<c>``/``<c.colorXXXX>`` and inline time stamps. The
+    hour field of a timestamp is optional, as the WebVTT format allows.
+    Lines left holding only whitespace are emptied.
+    """
+    tags = (
+        r'</c>',
+        r'<c(\.color\w+)?>',
+        r'<(\d{2}:)?\d{2}:\d{2}\.\d{3}>',
+    )
+    for pat in tags:
+        text = re.sub(pat, '', text)
+
+    # "[HH:]MM:SS.mmm --> ..." becomes "HH:MM"; a missing hour is written
+    # as "00" so later stages still see the two-field form.
+    timing = r'(?:(\d{2}):)?(\d{2}):\d{2}\.\d{3} --> .*'
+    text = re.sub(
+        timing, lambda m: (m.group(1) or '00') + ':' + m.group(2), text)
+
+    return re.sub(r'^\s+$', '', text, flags=re.MULTILINE)
+
+def remove_header(lines):
+    """
+    Drop everything that precedes the first cue timing line.
+
+    A timing line is any line containing '-->'. If none is present, the
+    input lines are returned in full.
+    """
+    first_cue = next(
+        (idx for idx, entry in enumerate(lines) if '-->' in entry),
+        0,
+    )
+    # An index of 0 (no timing line found) keeps every line
+    return lines[first_cue:]
+
+
+def merge_duplicates(lines):
+    """
+    Yield lines minus blanks and adjacent repeats (whitespace-insensitive).
+    Timestamps (HH:MM) and captions are each compared with the last one kept.
+    """
+    last_timestamp = ''
+    last_cap = ''
+    for line in lines:
+        stripped = line.strip()
+        if not stripped:  # blank or whitespace-only
+            continue
+        # Classify on the stripped text, as merge_short_lines does, so an
+        # indented timestamp is not mistaken for caption text.
+        if re.match(r'^\d{2}:\d{2}$', stripped):
+            if stripped != last_timestamp:
+                yield line
+                last_timestamp = stripped
+        elif stripped != last_cap:
+            yield line
+            last_cap = stripped
+
+
+def merge_short_lines(lines, width=80):
+    """
+    Join consecutive caption lines while the row stays below `width` chars.
+
+    A timestamp line (HH:MM) flushes the pending row and is then yielded on
+    its own with a leading newline. A line too long to join an empty buffer
+    becomes its own row instead of being preceded by an empty string.
+    """
+    buffer = ''
+    for line in lines:
+        text = line.strip()
+        if re.match(r'^\d{2}:\d{2}$', text):
+            if buffer:
+                yield buffer.strip()
+            buffer = ''
+            yield '\n' + text
+        elif len(buffer) + len(line) < width:
+            buffer += ' ' + text
+        else:
+            if buffer:  # nothing to flush when the buffer is still empty
+                yield buffer.strip()
+            buffer = text
+    if buffer:  # flush the final pending row
+        yield buffer.strip()
+
+def process_vtt(text):
+    """
+    Run the full VTT-to-text pipeline and return the caption text.
+
+    Timestamp lines are used only as row separators while merging; the
+    final filter drops them, so the result holds caption text only.
+    """
+    # 1. Drop the header first: remove_header() looks for '-->', which
+    #    remove_tags() rewrites away, so it must see the raw lines.
+    lines = remove_header(text.splitlines())
+
+    # 2. Strip markup and shorten cue timing lines to HH:MM
+    lines = remove_tags('\n'.join(lines)).splitlines()
+
+    # 3. Drop blank lines and repeated captions
+    lines = merge_duplicates(lines)
+
+    # 4. Merge short lines for better readability
+    lines = merge_short_lines(lines)
+
+    # 5. Discard empty rows and the timestamp rows emitted by step 4
+    timestamp_only = re.compile(r'^\s*\d{2}:\d{2}\s*$')
+    kept = [row for row in lines if row and not timestamp_only.match(row)]
+    return '\n'.join(kept).strip()
+
+
+def main():
+    """
+    Command-line entry point: VTT on stdin, plain text on stdout.
+
+    When stdin is a terminal (nothing piped in), a usage hint is written
+    to stderr and the process exits with status 1.
+    """
+    if not sys.stdin.isatty():
+        # Piped input: read all of it, convert, and emit the result
+        raw = sys.stdin.read()
+        converted = process_vtt(raw)
+        print(converted)
+        return
+
+    # Interactive terminal: nothing to read, so show usage and exit non-zero
+    print("Usage: cat your_file.vtt | vtt2txt", file=sys.stderr)
+    sys.exit(1)
+
+
+if __name__ == "__main__":  # only when run as a script, not on import
+    main()