Skip to content

Instantly share code, notes, and snippets.

@sir-wabbit
Created September 26, 2025 22:49
Show Gist options
  • Select an option

  • Save sir-wabbit/dd48e585c025fd95d59babcea76ee97a to your computer and use it in GitHub Desktop.

Select an option

Save sir-wabbit/dd48e585c025fd95d59babcea76ee97a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Get a full YouTube transcript in English (manual or auto) by URL/ID.
- Tries native English first; if absent, translates another language to English.
- Supports TXT/SRT/VTT/CSV/JSON outputs.
- Optional: preserve HTML formatting, proxy config.
Requires: youtube-transcript-api >= 1.2.0
pip install -U youtube-transcript-api
Usage examples:
python yt_transcript.py "https://www.youtube.com/watch?v=dQw4w9WgXcQ" -f txt
python yt_transcript.py https://youtu.be/dQw4w9WgXcQ -f srt -o out.srt
python yt_transcript.py <ID> -f txt --timestamps
python yt_transcript.py <ID> --webshare-user USER --webshare-pass PASS
"""
import argparse
import csv
import io
import json
import re
import sys
from urllib.parse import parse_qs, urlparse
# ---- youtube-transcript-api (instance API; v1.2.0+)
import youtube_transcript_api as yta
from youtube_transcript_api import YouTubeTranscriptApi
# Exception types exported at the package top level by youtube-transcript-api.
# A name that the installed release does not export is replaced by its own
# stand-in class which the library never raises, so a handler written against
# it simply never matches.  Two tempting fallbacks are deliberately avoided:
#   * ``Exception`` would silently turn ``except X`` into a catch-all;
#   * an empty tuple works for a bare ``except X`` but makes Python raise
#     TypeError once it is nested inside ``except (A, B)``.
def _library_exception(name: str):
    """Return exception class ``name`` from the library, or a unique stand-in.

    The stand-in is a fresh ``Exception`` subclass carrying the same name, so
    it can be used both in ``except`` clauses and in ``raise`` statements
    without ever matching an unrelated error.
    """
    found = getattr(yta, name, None)
    if found is None:
        found = type(name, (Exception,), {})
    return found


TranscriptsDisabled = _library_exception("TranscriptsDisabled")
NoTranscriptFound = _library_exception("NoTranscriptFound")
RequestBlocked = _library_exception("RequestBlocked")
IpBlocked = _library_exception("IpBlocked")
AgeRestricted = _library_exception("AgeRestricted")
VideoUnplayable = _library_exception("VideoUnplayable")
PoTokenRequired = _library_exception("PoTokenRequired")

# Proxy config types (optional): both names are None when the installed
# release does not ship the proxies module, and build_api() then skips proxies.
try:
    from youtube_transcript_api.proxies import WebshareProxyConfig, GenericProxyConfig
except ImportError:  # older lib without proxy support
    WebshareProxyConfig = GenericProxyConfig = None
# A YouTube video ID is exactly 11 characters from the URL-safe alphabet.
ID_RE = re.compile(r"[0-9A-Za-z_-]{11}")


def extract_video_id(url_or_id: str) -> str:
    """Return the 11-character video ID contained in ``url_or_id``.

    Accepted inputs:
      * a bare 11-character ID;
      * ``youtu.be/<id>`` short links;
      * ``youtube.com`` / ``youtube-nocookie.com`` URLs of the form
        ``/watch?v=<id>``, ``/shorts/<id>``, ``/embed/<id>``, ``/v/<id>`` or
        ``/live/<id>``;
      * any other string containing ``v=<id>``.
    URLs may be given with or without a scheme (``https://``).

    Raises:
        ValueError: if no video ID can be found in the input.
    """
    s = url_or_id.strip()
    if ID_RE.fullmatch(s):
        return s

    p = urlparse(s)
    if not p.netloc:
        # urlparse() only recognises a host after "//", so a pasted
        # "youtu.be/<id>" ends up entirely in .path.  Retry with a scheme.
        try:
            p = urlparse(f"https://{s}")
        except ValueError:
            pass  # keep the first parse; the generic fallback may still match
    host = (p.netloc or "").lower()

    if "youtu.be" in host:
        vid = p.path.lstrip("/").split("/")[0]
        if ID_RE.fullmatch(vid):
            return vid

    if "youtube.com" in host or "youtube-nocookie.com" in host:
        # Path-style URLs first; "/watch" never matches this pattern, so
        # watch URLs fall through to the query-string check below.
        m = re.match(r"^/(shorts|embed|v|live)/([0-9A-Za-z_-]{11})", p.path)
        if m:
            return m.group(2)
        v = parse_qs(p.query).get("v", [None])[0]
        if v and ID_RE.fullmatch(v):
            return v

    # Last resort: any "v=<id>" anywhere in the raw input.
    m = re.search(r"(?<=v=)[0-9A-Za-z_-]{11}", s)
    if m:
        return m.group(0)
    raise ValueError(f"Could not extract a video ID from: {url_or_id!r}")
def _to_srt_timestamp(seconds: float) -> str:
ms = int(round(seconds * 1000))
h, rem = divmod(ms, 3_600_000); m, rem = divmod(rem, 60_000); s, ms = divmod(rem, 1_000)
return f"{h:02}:{m:02}:{s:02},{ms:03}"
def _to_vtt_timestamp(seconds: float) -> str:
ms = int(round(seconds * 1000))
h, rem = divmod(ms, 3_600_000); m, rem = divmod(rem, 60_000); s, ms = divmod(rem, 1_000)
return f"{h:02}:{m:02}:{s:02}.{ms:03}"
def segments_to_txt(segments, with_timestamps: bool) -> str:
    """Render transcript segments as plain text, always newline-terminated.

    Without timestamps, the stripped segment texts are joined by single
    spaces into one line and segments that are empty after stripping are
    dropped.  With timestamps, every segment becomes its own
    ``[HH:MM:SS.mmm] text`` line.
    """
    if not with_timestamps:
        stripped = (seg["text"].strip() for seg in segments)
        return " ".join(text for text in stripped if text) + "\n"

    stamped = [
        f"[{_to_vtt_timestamp(seg['start'])}] {seg['text']}".strip()
        for seg in segments
    ]
    return "\n".join(stamped) + "\n"
def segments_to_srt(segments) -> str:
    """Render transcript segments as SubRip (SRT) text.

    Cues are numbered from 1 and separated by a blank line.  A segment whose
    end would not fall after its start (missing or non-positive duration) is
    given a 1 ms length.
    """
    cues = []
    for number, seg in enumerate(segments, start=1):
        begin = seg["start"]
        finish = begin + seg.get("duration", 0.0)
        if finish <= begin:
            finish = begin + 0.001
        timing = f"{_to_srt_timestamp(begin)} --> {_to_srt_timestamp(finish)}"
        # The trailing "" makes each cue end in a newline, so joining cues
        # with "\n" leaves exactly one blank line between them.
        cues.append("\n".join((str(number), timing, seg["text"], "")))
    return "\n".join(cues).strip() + "\n"
def segments_to_vtt(segments) -> str:
    """Render transcript segments as WebVTT text.

    The output starts with the ``WEBVTT`` header followed by a blank line;
    cues are separated by blank lines.  A segment whose end would not fall
    after its start (missing or non-positive duration) is given a 1 ms length.
    """
    chunks = ["WEBVTT\n"]
    for seg in segments:
        begin = seg["start"]
        finish = begin + seg.get("duration", 0.0)
        if finish <= begin:
            finish = begin + 0.001
        timing = f"{_to_vtt_timestamp(begin)} --> {_to_vtt_timestamp(finish)}"
        # Each chunk ends in a newline; joining with "\n" yields the blank
        # separator line between the header and the cues.
        chunks.append("\n".join((timing, seg["text"], "")))
    return "\n".join(chunks).strip() + "\n"
def segments_to_csv(segments) -> str:
    """Render transcript segments as CSV with a ``start,end,text`` header.

    ``start`` and ``end`` are seconds with three decimals; ``end`` is
    ``start`` plus the segment's duration (0 when the key is missing).
    Quoting and row terminators are left to the ``csv`` module defaults.
    """

    def rows():
        yield ("start", "end", "text")
        for seg in segments:
            begin = seg["start"]
            finish = begin + seg.get("duration", 0.0)
            yield (format(begin, ".3f"), format(finish, ".3f"), seg["text"])

    sink = io.StringIO()
    csv.writer(sink).writerows(rows())
    return sink.getvalue()
def build_api(args) -> YouTubeTranscriptApi:
    """Create the API client, routing it through a proxy when one is requested.

    Args:
        args: parsed command-line namespace; the attributes read here are
            ``webshare_user``, ``webshare_pass``, ``webshare_locations``,
            ``http_proxy``, ``https_proxy`` and ``socks_proxy``.

    Returns:
        A ``YouTubeTranscriptApi`` instance.  Webshare credentials take
        precedence over generic proxy URLs.  No proxy is configured when none
        was requested or when the proxy config classes could not be imported.
    """
    proxy_config = None
    if args.webshare_user and args.webshare_pass and WebshareProxyConfig:
        proxy_config = WebshareProxyConfig(
            proxy_username=args.webshare_user,
            proxy_password=args.webshare_pass,
            filter_ip_locations=(args.webshare_locations or None),
        )
    elif (args.http_proxy or args.https_proxy or args.socks_proxy) and GenericProxyConfig:
        # GenericProxyConfig accepts only http_url / https_url (there is no
        # socks_url keyword).  A SOCKS proxy is therefore supplied as the URL
        # for whichever scheme has no explicit proxy of its own.
        proxy_config = GenericProxyConfig(
            http_url=args.http_proxy or args.socks_proxy or None,
            https_url=args.https_proxy or args.socks_proxy or None,
        )
    return YouTubeTranscriptApi(proxy_config=proxy_config)
def fetch_english_transcript(api: YouTubeTranscriptApi, video_id: str, preserve_formatting: bool):
    """Fetch an English transcript, translating another language if needed.

    A native English transcript is requested first.  If none exists, the
    video's transcripts are listed and the translatable ones are tried in
    order (manually created before auto-generated) until one can be
    translated to English.

    Args:
        api: client used for all requests.
        video_id: 11-character YouTube video ID.
        preserve_formatting: forwarded to the library's ``fetch`` calls.

    Returns:
        ``(segments, meta)`` where ``segments`` is the fetched transcript's
        ``to_raw_data()`` result and ``meta`` is a dict with the keys
        ``translated``, ``origin_lang`` and ``origin_type``.

    Raises:
        NoTranscriptFound: if no transcript can be translated to English.
        Other library errors from listing or fetching propagate unchanged.
    """
    # Fast path: direct English (manual preferred by the lib)
    try:
        ft = api.fetch(video_id, languages=["en"], preserve_formatting=preserve_formatting)
    except NoTranscriptFound:
        pass
    else:
        return ft.to_raw_data(), {"translated": False, "origin_lang": "en", "origin_type": None}

    # Fallback: list and translate something to English
    tlist = api.list(video_id)
    manual = [t for t in tlist if not t.is_generated and t.is_translatable]
    auto = [t for t in tlist if t.is_generated and t.is_translatable]

    # Errors meaning "this particular transcript cannot become English".
    # They are looked up by name so a release that lacks one is tolerated.
    untranslatable = tuple(
        exc
        for exc in (
            getattr(yta, "NotTranslatable", None),
            getattr(yta, "TranslationLanguageNotAvailable", None),
        )
        if exc is not None
    )

    for t in manual + auto:
        try:
            tr = t.translate("en")
        except untranslatable:
            continue  # "en" is not offered for this one; try the next
        ft = tr.fetch(preserve_formatting=preserve_formatting)
        return ft.to_raw_data(), {
            "translated": True,
            "origin_lang": t.language_code,
            "origin_type": "auto" if t.is_generated else "manual",
        }

    # The library's NoTranscriptFound takes (video_id, requested language
    # codes, transcript list); calling it with a lone message string would
    # raise TypeError instead of the intended error.
    raise NoTranscriptFound(video_id, ["en"], tlist)
def main():
    """Command-line entry point: parse arguments, fetch, format and emit.

    The transcript is written to ``--output`` when given, otherwise to
    stdout.  The process exits with status 2 when no video ID can be
    extracted from the argument and with status 1 when the transcript cannot
    be fetched; in both cases a message is printed to stderr.
    """
    ap = argparse.ArgumentParser(description="Get a full YouTube transcript in English (auto or manual) by URL/ID.")
    ap.add_argument("url", help="YouTube URL or 11-char video ID")
    ap.add_argument("-o", "--output", help="Output file (default: stdout)")
    ap.add_argument("-f", "--format", choices=["txt", "srt", "json", "vtt", "csv"], default="txt", help="Output format")
    ap.add_argument("--timestamps", action="store_true", help="Include timestamps in TXT output")
    ap.add_argument("--preserve-formatting", action="store_true", help="Keep HTML tags like <i>/<b> if present")
    # Optional proxy knobs (see README “Working around IP bans”)
    ap.add_argument("--webshare-user", help="Webshare rotating residential proxy username")
    ap.add_argument("--webshare-pass", help="Webshare rotating residential proxy password")
    ap.add_argument("--webshare-locations", nargs="+", help="Limit Webshare IPs to ISO country codes, e.g. de us")
    ap.add_argument("--http-proxy", help="Generic HTTP proxy URL")
    ap.add_argument("--https-proxy", help="Generic HTTPS proxy URL")
    ap.add_argument("--socks-proxy", help="Generic SOCKS proxy URL")
    args = ap.parse_args()

    try:
        video_id = extract_video_id(args.url)
    except ValueError as e:
        print(str(e), file=sys.stderr)
        sys.exit(2)

    api = build_api(args)
    try:
        # The metadata half of the result is not used by the CLI.
        segments, _meta = fetch_english_transcript(api, video_id, preserve_formatting=args.preserve_formatting)
    except (TranscriptsDisabled, AgeRestricted):
        print("Transcripts are unavailable for this video (disabled or age-restricted).", file=sys.stderr)
        sys.exit(1)
    except (RequestBlocked, IpBlocked):
        print("YouTube is blocking your IP. Use rotating residential proxies (see README).", file=sys.stderr)
        sys.exit(1)
    except (VideoUnplayable,):
        print("The video is unplayable.", file=sys.stderr)
        sys.exit(1)
    except PoTokenRequired:
        print("A PO token is required for this transcript (library limitation).", file=sys.stderr)
        sys.exit(1)
    except NoTranscriptFound:
        print("No transcript found (English or translatable).", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report anything unexpected instead of a traceback.
        print(f"Failed to fetch transcript: {e}", file=sys.stderr)
        sys.exit(1)

    if args.format == "txt":
        out = segments_to_txt(segments, with_timestamps=args.timestamps)
    elif args.format == "srt":
        out = segments_to_srt(segments)
    elif args.format == "vtt":
        out = segments_to_vtt(segments)
    elif args.format == "csv":
        out = segments_to_csv(segments)
    elif args.format == "json":
        out = json.dumps(segments, ensure_ascii=False, indent=2)
    else:
        raise AssertionError("unknown format")

    if args.output:
        # The csv module already emits "\r\n" row terminators.  newline=""
        # stops text mode from translating the "\n" again, which would give
        # "\r\r\n" on Windows.  Other formats keep the platform default.
        newline = "" if args.format == "csv" else None
        with open(args.output, "w", encoding="utf-8", newline=newline) as f:
            f.write(out)
    else:
        sys.stdout.write(out)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment