Skip to content

Instantly share code, notes, and snippets.

@mholtzhausen
Last active June 17, 2025 12:35
Show Gist options
  • Save mholtzhausen/beafda24a3bc9e4799b102bdc3df348d to your computer and use it in GitHub Desktop.
Save mholtzhausen/beafda24a3bc9e4799b102bdc3df348d to your computer and use it in GitHub Desktop.

Revisions

  1. mholtzhausen revised this gist Jun 16, 2025. 1 changed file with 1 addition and 2 deletions.
    3 changes: 1 addition & 2 deletions vtt2txt.py
    Original file line number Diff line number Diff line change
    @@ -6,8 +6,7 @@
    reading from stdin and writing to stdout.
    Example Usage:
    youtube-dl --skip-download --convert-subs vtt -o - <video_url> | vtt2txt
    cat my_subtitle_file.vtt | vtt2txt
    yt-dlp --skip-download --sub-langs en --convert-subs vtt -o ~/.tmp.subtitle <video_url> | vtt2txt; cat ~/.tmp.subtitle.en.vtt | vtt2txt | fabric -ps extract_wisdom
    """

    import sys
  2. mholtzhausen revised this gist Jun 16, 2025. No changes.
  3. mholtzhausen created this gist Jun 16, 2025.
    149 changes: 149 additions & 0 deletions vtt2txt.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,149 @@
    #!/usr/bin/env python3

    """
    Convert YouTube subtitles (VTT) to human-readable text.
    This script is designed to be used as a command-line tool,
    reading from stdin and writing to stdout.
    Example Usage:
    youtube-dl --skip-download --convert-subs vtt -o - <video_url> | vtt2txt
    cat my_subtitle_file.vtt | vtt2txt
    """

    import sys
    import re


    def remove_tags(text):
        """
        Strip WebVTT cue markup and shorten cue timing lines to ``HH:MM``.

        Three things happen, in order:

        1. Cue span tags (``<c>``, ``<i>``, ``<b>``, ``<u>``, ``<v ...>``,
           ``<lang ...>``, ``<ruby>``, ``<rt>``, with optional ``.class`` or
           annotation parts) and inline timestamp tags are deleted.
        2. Every cue timing line (``[HH:]MM:SS.mmm --> ...``) is replaced by
           an ``HH:MM`` marker; a missing hours field is reported as ``00``.
        3. Lines that contain only whitespace are emptied.

        :param text: raw VTT content as a single string.
        :return: the text with markup removed and timing lines shortened.
        """
        tags = [
            # Opening or closing cue span tag, optionally followed by
            # ".class" parts or an annotation such as a speaker name.
            r'</?(?:c|i|b|u|v|lang|ruby|rt)(?:[.\s][^<>]*)?>',
            # Inline timestamp tag; the hours field is optional in WebVTT.
            r'<(?:\d{2}:)?\d{2}:\d{2}\.\d{3}>',
        ]

        for pat in tags:
            text = re.sub(pat, '', text)

        def _to_marker(match):
            # Timing lines without an hours field are treated as hour 00 so
            # the marker always has the HH:MM shape later stages look for.
            return '{}:{}'.format(match.group(1) or '00', match.group(2))

        # Collapse "start --> end [settings]" to the HH:MM of the start time.
        text = re.sub(
            r'(?:(\d{2}):)?(\d{2}):\d{2}\.\d{3} --> .*',
            _to_marker,
            text,
        )

        # Clean up lines that might be whitespace-only after tag removal
        text = re.sub(r'^\s+$', '', text, flags=re.MULTILINE)
        return text

    def remove_header(lines):
        """
        Drop everything that precedes the first cue timing line.

        A cue timing line is recognised by the '-->' separator it contains.
        When no line contains that separator, the input is returned whole.
        """
        # Index of the first line holding '-->', or 0 when there is none.
        first_cue = next(
            (index for index, entry in enumerate(lines) if '-->' in entry),
            0,
        )
        return lines[first_cue:]


    def merge_duplicates(lines):
        """
        Yield lines with adjacent repeats filtered out.

        Blank lines are dropped. An ``HH:MM`` marker line is yielded only when
        it differs from the previously seen marker; any other line is yielded
        only when its stripped text differs from the previously seen text.
        """
        prev_stamp = ''
        prev_text = ''
        for entry in lines:
            stripped = entry.strip()
            if not stripped:
                # Nothing but whitespace: ignore.
                continue

            if re.match(r'^\d{2}:\d{2}$', entry):
                # Marker line: emit once per distinct value.
                if entry == prev_stamp:
                    continue
                prev_stamp = entry
                yield entry
                continue

            # Caption text: compare ignoring surrounding whitespace.
            if stripped == prev_text:
                continue
            prev_text = stripped
            yield entry


    def merge_short_lines(lines, width=80):
        """
        Merge consecutive short subtitle lines into longer lines.

        Text lines are accumulated in a buffer, separated by single spaces,
        until adding the next one would reach ``width``; the buffer is then
        emitted and restarted with that line. An ``HH:MM`` marker line flushes
        the buffer and is emitted on its own, prefixed with a newline.

        Blank input lines are ignored and an empty buffer is never emitted, so
        the generator does not produce empty strings.

        :param lines: iterable of marker lines and caption text lines.
        :param width: length limit used when deciding whether to keep merging
            (defaults to 80).
        :return: generator of merged text lines and newline-prefixed markers.
        """
        buffer = ''
        for line in lines:
            stripped = line.strip()
            if not stripped:
                # Whitespace-only input carries no text worth buffering.
                continue

            # A marker ends the current paragraph: flush, then emit the marker.
            if re.match(r'^\d{2}:\d{2}$', stripped):
                if buffer:
                    yield buffer.strip()
                    buffer = ''
                yield '\n' + stripped
                continue

            # The length test deliberately keeps the established arithmetic
            # (raw line length against the current buffer) so that output for
            # ordinary input does not shift.
            if len(buffer) + len(line) < width:
                buffer += ' ' + stripped
            else:
                # Only flush when there is something buffered; an over-long
                # first line must not produce an empty output line.
                if buffer:
                    yield buffer.strip()
                buffer = stripped

        # Yield any remaining text in the buffer
        if buffer:
            yield buffer.strip()

    def process_vtt(text):
        """
        Main processing pipeline for the VTT content.

        Steps: drop the file header, strip markup, remove adjacent duplicate
        lines, merge short lines, and finally discard the ``HH:MM`` marker
        lines so that only caption text remains.

        :param text: complete VTT document as a string.
        :return: the caption text, one merged line per output line.
        """
        # 1. Remove the header first. remove_header() finds the first cue by
        #    its '-->' separator, and remove_tags() rewrites those timing
        #    lines, so the header has to go before the tags are stripped;
        #    otherwise the separator is gone and the header is kept.
        lines = remove_header(text.splitlines())

        # 2. Tag and timing-line clean-up on what is left
        text = remove_tags('\n'.join(lines))
        lines = text.splitlines()

        # 3. Remove duplicate lines
        lines = merge_duplicates(lines)

        # 4. Merge short lines for better readability
        lines = merge_short_lines(lines)

        # 5. Final cleanup: drop empty lines and marker-only lines
        marker_only = re.compile(r'^\s*\d{2}:\d{2}\s*$')
        processed_lines = [
            line for line in lines
            if line and not marker_only.match(line)
        ]

        return '\n'.join(processed_lines).strip()


    def main():
        """
        Command-line entry point: convert VTT read from stdin to plain text.

        When stdin is a terminal (nothing was piped in), a usage hint is
        written to stderr and the process exits with status 1. Otherwise all
        of stdin is read, run through process_vtt(), and the result is
        printed to stdout.
        """
        if not sys.stdin.isatty():
            # Piped input: slurp it, convert it, emit it.
            print(process_vtt(sys.stdin.read()))
            return

        # Interactive terminal with no piped data: explain usage and fail.
        print("Usage: cat your_file.vtt | vtt2txt", file=sys.stderr)
        sys.exit(1)


    # Run the converter only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == "__main__":
    main()