Skip to content

Instantly share code, notes, and snippets.

@0xack13
Forked from danuker/youtube_summarizer.py
Created September 2, 2025 17:27
Show Gist options
  • Select an option

  • Save 0xack13/c0a00d08f9164df706733b62e2bc1255 to your computer and use it in GitHub Desktop.

Select an option

Save 0xack13/c0a00d08f9164df706733b62e2bc1255 to your computer and use it in GitHub Desktop.

Revisions

  1. @danuker danuker revised this gist Sep 2, 2025. 1 changed file with 3 additions and 2 deletions.
    5 changes: 3 additions & 2 deletions youtube_summarizer.py
    Original file line number Diff line number Diff line change
    @@ -12,12 +12,13 @@

    # A script that downloads the transcript of a YouTube video, summarizes it,
    # and then lets you chat with the language model about it.
    # It uses a local llama.cpp LLM running at http://127.0.0.1:8080.
    # and the yt-dlp (`pip install -U yt-dlp`).
    # It uses a local llama.cpp LLM running at http://127.0.0.1:8080,
    # and the yt-dlp package (`pip install -U yt-dlp`).

    # The ETA is tuned for llama.cpp running Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf
    # from Unsloth, on my Intel(R) Core(TM) i9-10900 CPU @ 2.80GHz
    # with 64GB of RAM, which starts at about 10-ish tokens/second, but gets slower as more
    # history builds up.

    import yt_dlp
    import requests
  2. @danuker danuker created this gist Sep 2, 2025.
    337 changes: 337 additions & 0 deletions youtube_summarizer.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,337 @@
    #!/usr/bin/env python3

    # The MIT License (MIT)

    # Copyright © 2025 Dan Gheorghe Haiduc
    #
    # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
    #
    # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
    #
    # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

    # A script that downloads the transcript of a YouTube video, summarizes it,
    # and then lets you chat with the language model about it.
    # It uses a local llama.cpp LLM running at http://127.0.0.1:8080,
    # and the yt-dlp package (`pip install -U yt-dlp`).

    # The ETA is tuned for llama.cpp running Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf
    # from Unsloth, on my Intel(R) Core(TM) i9-10900 CPU @ 2.80GHz
    # with 64GB of RAM, which starts at about 10-ish tokens/second, but gets slower as more
    # history builds up.

    import yt_dlp
    import requests
    import json
    import sys
    import re
    import time
    from typing import Optional, List, Dict
    from datetime import datetime, timedelta
    import urllib.parse


    DEBUG = False

    def debug(msg):
    """
    Print but only if DEBUG is on.
    """
    if DEBUG:
    print(msg)

def download_transcript(video_url: str) -> Optional[str]:
    """
    Download the transcript of a YouTube video using yt-dlp.

    Prefers English automatic captions (VTT if available), falling back to
    yt-dlp's requested_subtitles. Returns the transcript as plain text, or
    None when no English captions are found or any step fails.
    """
    try:
        # Try with different options to get subtitles
        ydl_opts = {
            'skip_download': True,
            'writeautomaticsub': True,
            'writesubtitles': True,
            'subtitleslangs': ['en'],
            'subtitlesformat': 'vtt',
            'verbose': False,
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Metadata only; the media itself is never downloaded.
            info = ydl.extract_info(video_url, download=False)
            video_id = info['id']
            debug(f"Video ID: {video_id}")

            # Check what's available in the info dict
            debug(f"Available keys: {[k for k in info.keys() if 'sub' in k.lower() or 'capt' in k.lower()]}")

            # Try to get automatic captions (this is what we're missing)
            automatic_captions = info.get('automatic_captions', {})
            debug(f"Automatic captions languages: {list(automatic_captions.keys())}")

            if 'en' in automatic_captions:
                debug("English automatic captions found")
                # Get the English automatic captions
                en_captions = automatic_captions['en']
                debug(f"Available English caption formats: {len(en_captions)}")

                # Try to find a VTT format (or any usable format)
                vtt_caption = None
                for caption in en_captions:
                    debug(f"Caption format: {caption}")
                    if caption.get('ext') == 'vtt':
                        vtt_caption = caption
                        break

                # If no VTT found, use first available
                if not vtt_caption:
                    vtt_caption = en_captions[0]

                caption_url = vtt_caption['url']
                debug(f"Using caption URL: {caption_url}")

                # Download subtitle content
                response = requests.get(caption_url)
                response.raise_for_status()

                # If response is M3U8, we need to parse it and fetch individual segments
                if response.text.strip().startswith('#EXTM3U'):
                    debug("Detected M3U8 playlist, parsing segments...")
                    return parse_m3u8_playlist(response.text, caption_url)
                else:
                    # Regular VTT file
                    vtt_content = response.text
                    plain_text = parse_vtt(vtt_content)
                    return plain_text
            else:
                debug("No English automatic captions found")
                # Check if we can use requested_subtitles
                requested_subtitles = info.get('requested_subtitles', {})
                debug(f"Requested subtitles: {requested_subtitles}")

                if 'en' in requested_subtitles:
                    # This should be the case where we requested English subtitles
                    sub_url = requested_subtitles['en']['url']
                    debug(f"Using requested subtitle URL: {sub_url}")
                    response = requests.get(sub_url)
                    response.raise_for_status()
                    vtt_content = response.text
                    plain_text = parse_vtt(vtt_content)
                    return plain_text
                else:
                    debug("No requested subtitles found either")
                    return None

    except Exception as e:
        # Broad catch is deliberate: report the failure (with traceback) and
        # let the caller see None instead of crashing the script.
        print(f"Error downloading transcript: {e}")
        import traceback
        traceback.print_exc()
        return None


    def parse_m3u8_playlist(m3u8_content: str, base_url: str) -> str:
    """
    Parse M3U8 playlist and download all segments to reconstruct transcript
    """
    lines = m3u8_content.strip().split('\n')
    segments = []

    # Extract segment URLs
    segment_urls = []
    for i, line in enumerate(lines):
    if line.startswith('#EXTINF:'):
    # Next line should be the segment URL
    if i + 1 < len(lines):
    segment_url = lines[i + 1]
    if not segment_url.startswith('#'):
    segment_urls.append(segment_url)

    # Download each segment and concatenate
    all_text = []
    for segment_url in segment_urls:
    try:
    # Handle relative URLs
    if not segment_url.startswith('http'):
    parsed_base = urllib.parse.urlparse(base_url)
    segment_url = urllib.parse.urljoin(f"{parsed_base.scheme}://{parsed_base.netloc}", segment_url)

    debug(f"Downloading segment: {segment_url[:100]}...")
    segment_response = requests.get(segment_url, timeout=30)
    segment_response.raise_for_status()

    # Parse this segment's VTT content
    segment_text = parse_vtt(segment_response.text)
    all_text.append(segment_text)

    except Exception as e:
    print(f"Error downloading segment {segment_url}: {e}")
    continue

    # Combine all text and deduplicate
    combined_text = '\n'.join(all_text)
    return deduplicate_lines(combined_text)


    def deduplicate_lines(text: str) -> str:
    """
    Remove duplicate consecutive lines
    """
    lines = text.split('\n')
    final_lines = []
    prev_line = ""
    for line in lines:
    if line.strip() and line != prev_line:
    final_lines.append(line.strip())
    prev_line = line
    return '\n'.join(final_lines)


    def parse_vtt(vtt_content: str) -> str:
    """
    Parse VTT subtitle format to plain text, handling YouTube's specific format
    """
    lines = vtt_content.strip().split('\n')
    text_lines = []

    # YouTube's VTT format has special syntax like <00:00:00.440><c> I </c>
    # We need to extract the text content between the tags

    for line in lines:
    # Skip VTT metadata lines
    if line.startswith('WEBVTT') or line.startswith('Kind:') or line.startswith('Language:'):
    continue
    # Skip empty lines
    if not line.strip():
    continue
    # Skip timing lines (lines with '-->' in them)
    if ' --> ' in line:
    continue

    # Handle YouTube's special VTT format with timing tags
    # Pattern: <00:00:00.440><c> I </c>
    # We want to extract just the text content
    if '<' in line and '>' in line:
    # Remove all HTML-like tags but preserve text content
    # This handles YouTube's format like <00:00:00.440><c> I </c>
    cleaned_line = re.sub(r'<[^>]+>', '', line)
    # Remove extra whitespace
    cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
    if cleaned_line:
    text_lines.append(cleaned_line)
    else:
    # Regular text line
    text_lines.append(line.strip())

    # Join lines but remove duplicate consecutive lines
    final_lines = []
    prev_line = ""
    for line in text_lines:
    if line != prev_line:
    final_lines.append(line)
    prev_line = line

    return '\n'.join(final_lines)


    def chat_with_llm(messages: List[Dict[str, str]], llm_url: str = "http://127.0.0.1:8080", stream: bool = True) -> str:
    payload = {
    "messages": messages,
    "temperature": 0.6,
    "max_tokens": 2000,
    "stream": stream
    }
    try:
    response = requests.post(f"{llm_url}/chat/completions", json=payload, stream=True)
    response.raise_for_status()
    full_reply = ""
    for line in response.iter_lines():
    if line:
    decoded_line = line.decode("utf-8")
    if decoded_line.startswith("data:"):
    data_str = decoded_line[5:].strip()
    if data_str == "[DONE]":
    break
    try:
    data = json.loads(data_str)
    content = data['choices'][0]['delta'].get('content', '')
    if content:
    print(content, end='', flush=True)
    full_reply += content
    except Exception:
    continue
    print("")
    return full_reply
    except Exception as e:
    print(f"Error contacting LLM: {e}")
    return ""


    def summarize_with_llm(text: str, llm_url: str = "http://127.0.0.1:8080") -> Optional[str]:
    messages = [
    {"role": "system", "content": "You are a helpful assistant that summarizes YouTube video transcripts."},
    {"role": "user", "content": f"Summarize the following YouTube video transcript:\n\n{text}"}
    ]
    return chat_with_llm(messages, llm_url)


    def interactive_console(initial_summary: str, llm_url: str = "http://127.0.0.1:8080"):
    print("\nEntering interactive chat mode. Type 'exit' to quit.")
    conversation = [
    {"role": "system", "content": "You are a helpful assistant that answers questions about the YouTube video."},
    {"role": "assistant", "content": initial_summary}
    ]
    while True:
    user_input = input("\n> ")
    if user_input.lower() in ["exit", "quit"]:
    print("Goodbye!")
    break
    conversation.append({"role": "user", "content": user_input})
    reply = chat_with_llm(conversation, llm_url)
    conversation.append({"role": "assistant", "content": reply})


    def main():
    start = time.time()
    if len(sys.argv) != 2:
    print("Usage: python youtube_summarizer.py <youtube_url>")
    sys.exit(1)

    video_url = sys.argv[1]

    print("Downloading transcript...")
    transcript = download_transcript(video_url)

    if not transcript:
    print("Failed to download transcript")
    sys.exit(1)

    print("Transcript downloaded successfully")
    print(f"Transcript length: {len(transcript)} characters")

    # Show first 200 chars to verify content
    print("\nFirst 200 characters of transcript:")
    print(transcript[:200] + "..." if len(transcript) > 200 else transcript)

    # Tuned to llama.cpp running Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf
    # from Unsloth, on my Intel(R) Core(TM) i9-10900 CPU @ 2.80GHz
    # with 64GB of RAM
    estimated_minutes = len(transcript)*.0003032768268323+.054881491
    start_time = datetime.now()
    completion_time = start_time + timedelta(minutes=estimated_minutes)
    print(f"ETA of completion: {completion_time.strftime('%Y-%m-%d %H:%M:%S')}")

    print("\nSummarizing with LLM...")
    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)
    summary = summarize_with_llm(transcript)
    print("="*50)

    if not summary:
    print("Failed to get summary from LLM")
    sys.exit(1)

    end = time.time()
    print(f"Took {(end-start)/60:.2f} minutes.")
    interactive_console(summary)


    if __name__ == "__main__":
    main()