Created
September 26, 2025 22:49
-
-
Save sir-wabbit/dd48e585c025fd95d59babcea76ee97a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Get a full YouTube transcript in English (manual or auto) by URL/ID. | |
| - Tries native English first; if absent, translates another language to English. | |
| - Supports TXT/SRT/VTT/CSV/JSON outputs. | |
| - Optional: preserve HTML formatting, proxy config. | |
| Requires: youtube-transcript-api >= 1.2.0 | |
| pip install -U youtube-transcript-api | |
| Usage examples: | |
| python yt_transcript.py "https://www.youtube.com/watch?v=dQw4w9WgXcQ" -f txt | |
| python yt_transcript.py https://youtu.be/dQw4w9WgXcQ -f srt -o out.srt | |
| python yt_transcript.py <ID> -f txt --timestamps | |
| python yt_transcript.py <ID> --webshare-user USER --webshare-pass PASS | |
| """ | |
| import argparse | |
| import csv | |
| import io | |
| import json | |
| import re | |
| import sys | |
| from urllib.parse import parse_qs, urlparse | |
| # ---- youtube-transcript-api (instance API; v1.2.0+) | |
| import youtube_transcript_api as yta | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
# Exception aliases, resolved defensively in case the installed library does
# not export one of these names.


class _MissingNoTranscriptFound(Exception):
    """Stand-in used only when the library does not export NoTranscriptFound."""


# NoTranscriptFound is raised by this script as well as caught, so its fallback
# has to be a real exception class. A private subclass is used instead of
# Exception itself so that "except NoTranscriptFound" never swallows unrelated
# errors.
NoTranscriptFound = getattr(yta, "NoTranscriptFound", _MissingNoTranscriptFound)

# The remaining names are only ever used in "except" clauses. An empty tuple
# matches nothing there, so a missing name simply turns its handler into a no-op.
TranscriptsDisabled = getattr(yta, "TranscriptsDisabled", tuple())
RequestBlocked = getattr(yta, "RequestBlocked", tuple())
IpBlocked = getattr(yta, "IpBlocked", tuple())
AgeRestricted = getattr(yta, "AgeRestricted", tuple())
VideoUnplayable = getattr(yta, "VideoUnplayable", tuple())
PoTokenRequired = getattr(yta, "PoTokenRequired", tuple())
| # Proxy config types (optional) | |
| try: | |
| from youtube_transcript_api.proxies import WebshareProxyConfig, GenericProxyConfig | |
| except Exception: # older lib or not needed | |
| WebshareProxyConfig = GenericProxyConfig = None | |
# Matches a bare video ID: exactly 11 characters from [0-9A-Za-z_-].
ID_RE = re.compile(r"[0-9A-Za-z_-]{11}")


def extract_video_id(url_or_id: str) -> str:
    """Return the 11-character video ID found in *url_or_id*.

    Accepted inputs:
      * a bare 11-character ID;
      * ``youtu.be/<id>`` short links;
      * ``youtube.com`` / ``youtube-nocookie.com`` URLs using ``/watch?v=<id>``,
        ``/shorts/<id>``, ``/embed/<id>``, ``/v/<id>`` or ``/live/<id>``;
      * as a last resort, any string containing ``v=<id>``.

    URLs may be given with or without a scheme.

    Raises:
        ValueError: if no video ID can be found.
    """
    s = url_or_id.strip()
    if ID_RE.fullmatch(s):
        return s

    # urlparse() only fills in netloc when the input has a scheme (or starts
    # with "//"), so scheme-less input such as "youtu.be/<id>" gets one here.
    has_scheme = "://" in s or s.startswith("//")
    try:
        p = urlparse(s if has_scheme else "https://" + s)
    except ValueError:
        # Malformed netloc (e.g. unbalanced brackets); fall through to the
        # last-resort regex and the uniform error message below.
        p = None

    if p is not None:
        host = (p.netloc or "").lower()

        if "youtu.be" in host:
            vid = p.path.lstrip("/").split("/")[0]
            if ID_RE.fullmatch(vid):
                return vid

        if "youtube.com" in host or "youtube-nocookie.com" in host:
            # Path-embedded IDs take precedence over a "v" query parameter.
            m = re.match(r"^/(shorts|embed|v|live)/([0-9A-Za-z_-]{11})", p.path)
            if m:
                return m.group(2)
            v = parse_qs(p.query).get("v", [None])[0]
            if v and ID_RE.fullmatch(v):
                return v

    # Last resort: anything that looks like "v=<id>" anywhere in the input.
    m = re.search(r"(?<=v=)[0-9A-Za-z_-]{11}", s)
    if m:
        return m.group(0)

    raise ValueError(f"Could not extract a video ID from: {url_or_id!r}")
| def _to_srt_timestamp(seconds: float) -> str: | |
| ms = int(round(seconds * 1000)) | |
| h, rem = divmod(ms, 3_600_000); m, rem = divmod(rem, 60_000); s, ms = divmod(rem, 1_000) | |
| return f"{h:02}:{m:02}:{s:02},{ms:03}" | |
| def _to_vtt_timestamp(seconds: float) -> str: | |
| ms = int(round(seconds * 1000)) | |
| h, rem = divmod(ms, 3_600_000); m, rem = divmod(rem, 60_000); s, ms = divmod(rem, 1_000) | |
| return f"{h:02}:{m:02}:{s:02}.{ms:03}" | |
def segments_to_txt(segments, with_timestamps: bool) -> str:
    """Render *segments* as plain text, always ending with a newline.

    Without timestamps, the stripped, non-empty segment texts are joined with
    single spaces into one line. With timestamps, each segment becomes its own
    ``[HH:MM:SS.mmm] text`` line.
    """
    if not with_timestamps:
        stripped = (seg["text"].strip() for seg in segments)
        return " ".join(text for text in stripped if text) + "\n"

    stamped = [
        f"[{_to_vtt_timestamp(seg['start'])}] {seg['text']}".strip()
        for seg in segments
    ]
    return "\n".join(stamped) + "\n"
def segments_to_srt(segments) -> str:
    """Render *segments* as SubRip (SRT) text.

    Cues are numbered from 1. A cue whose end would not fall after its start
    (missing or non-positive duration) is given an end one millisecond later.
    """
    cue_lines = []
    for index, seg in enumerate(segments, start=1):
        begin = seg["start"]
        finish = begin + seg.get("duration", 0.0)
        if finish <= begin:
            finish = begin + 0.001
        cue_lines.extend(
            [
                str(index),
                f"{_to_srt_timestamp(begin)} --> {_to_srt_timestamp(finish)}",
                seg["text"],
                "",  # blank line separating cues
            ]
        )
    return "\n".join(cue_lines).strip() + "\n"
def segments_to_vtt(segments) -> str:
    """Render *segments* as WebVTT text, starting with the ``WEBVTT`` header.

    A cue whose end would not fall after its start (missing or non-positive
    duration) is given an end one millisecond later.
    """
    doc = ["WEBVTT", ""]
    for seg in segments:
        begin = seg["start"]
        finish = begin + seg.get("duration", 0.0)
        if finish <= begin:
            finish = begin + 0.001
        doc.extend(
            [
                f"{_to_vtt_timestamp(begin)} --> {_to_vtt_timestamp(finish)}",
                seg["text"],
                "",  # blank line separating cues
            ]
        )
    return "\n".join(doc).strip() + "\n"
def segments_to_csv(segments) -> str:
    """Render *segments* as CSV text with a ``start,end,text`` header row.

    ``start`` and ``end`` are written in seconds with three decimals; ``end``
    is ``start`` plus the segment's ``duration`` (0.0 when absent).
    """
    buffer = io.StringIO()
    writer = csv.writer(buffer)
    writer.writerow(["start", "end", "text"])
    writer.writerows(
        (
            f"{seg['start']:.3f}",
            f"{seg['start'] + seg.get('duration', 0.0):.3f}",
            seg["text"],
        )
        for seg in segments
    )
    return buffer.getvalue()
def build_api(args) -> YouTubeTranscriptApi:
    """Create the API client, attaching a proxy configuration if one was requested.

    Reads ``webshare_user``, ``webshare_pass``, ``webshare_locations``,
    ``http_proxy``, ``https_proxy`` and ``socks_proxy`` from *args*.
    Webshare credentials take precedence over the generic proxy URLs. With no
    proxy options the client is created with ``proxy_config=None``.
    """
    wants_webshare = bool(args.webshare_user and args.webshare_pass)
    wants_generic = bool(args.http_proxy or args.https_proxy or args.socks_proxy)

    proxy_config = None
    if wants_webshare and WebshareProxyConfig:
        proxy_config = WebshareProxyConfig(
            proxy_username=args.webshare_user,
            proxy_password=args.webshare_pass,
            filter_ip_locations=(args.webshare_locations or None),
        )
    elif wants_generic and GenericProxyConfig:
        # Per the library's documented signature, GenericProxyConfig accepts
        # only http_url and https_url; passing a socks_url keyword fails with
        # TypeError. A SOCKS proxy URL is therefore supplied through whichever
        # of the two slots the user left empty.
        proxy_config = GenericProxyConfig(
            http_url=args.http_proxy or args.socks_proxy or None,
            https_url=args.https_proxy or args.socks_proxy or None,
        )
    elif wants_webshare or wants_generic:
        # The proxy classes could not be imported; say so instead of silently
        # connecting directly.
        print(
            "Warning: proxy options were given but this version of "
            "youtube-transcript-api has no proxy support; connecting directly.",
            file=sys.stderr,
        )
    return YouTubeTranscriptApi(proxy_config=proxy_config)
def fetch_english_transcript(api: YouTubeTranscriptApi, video_id: str, preserve_formatting: bool):
    """Fetch an English transcript, translating another language if necessary.

    Tries a native English transcript first. If none exists, lists the
    available transcripts and translates the first translatable one to
    English, preferring manually created transcripts over generated ones.

    Returns:
        ``(segments, meta)`` where *segments* is the transcript's raw data and
        *meta* is a dict with ``translated``, ``origin_lang`` and
        ``origin_type`` (``"manual"``, ``"auto"``, or ``None`` on the direct
        English path).

    Raises:
        NoTranscriptFound: if there is neither an English transcript nor one
            that can be translated to English. Other library errors propagate.
    """
    # Fast path: direct English (manual preferred by the lib)
    try:
        ft = api.fetch(video_id, languages=["en"], preserve_formatting=preserve_formatting)
        return ft.to_raw_data(), {"translated": False, "origin_lang": "en", "origin_type": None}
    except NoTranscriptFound:
        pass

    # Fallback: list and translate something to English
    tlist = api.list(video_id)
    manual = [t for t in tlist if not t.is_generated and t.is_translatable]
    auto = [t for t in tlist if t.is_generated and t.is_translatable]
    candidates = manual + auto
    if not candidates:
        # Per the library's documented signature, NoTranscriptFound takes
        # (video_id, requested_language_codes, transcript_data); constructing
        # it from a single message string raises TypeError instead.
        raise NoTranscriptFound(video_id, ["en"], tlist)

    source = candidates[0]
    ft = source.translate("en").fetch(preserve_formatting=preserve_formatting)
    return ft.to_raw_data(), {
        "translated": True,
        "origin_lang": source.language_code,
        "origin_type": "auto" if source.is_generated else "manual",
    }
def main():
    """Command-line entry point: parse arguments, fetch the transcript, write it out.

    Exit status is 2 when no video ID can be extracted from the argument and 1
    when the transcript cannot be fetched. Output goes to ``--output`` if given,
    otherwise to stdout.
    """
    ap = argparse.ArgumentParser(description="Get a full YouTube transcript in English (auto or manual) by URL/ID.")
    ap.add_argument("url", help="YouTube URL or 11-char video ID")
    ap.add_argument("-o", "--output", help="Output file (default: stdout)")
    ap.add_argument("-f", "--format", choices=["txt", "srt", "json", "vtt", "csv"], default="txt", help="Output format")
    ap.add_argument("--timestamps", action="store_true", help="Include timestamps in TXT output")
    ap.add_argument("--preserve-formatting", action="store_true", help="Keep HTML tags like <i>/<b> if present")
    # Optional proxy knobs (see README "Working around IP bans")
    ap.add_argument("--webshare-user", help="Webshare rotating residential proxy username")
    ap.add_argument("--webshare-pass", help="Webshare rotating residential proxy password")
    ap.add_argument("--webshare-locations", nargs="+", help="Limit Webshare IPs to ISO country codes, e.g. de us")
    ap.add_argument("--http-proxy", help="Generic HTTP proxy URL")
    ap.add_argument("--https-proxy", help="Generic HTTPS proxy URL")
    ap.add_argument("--socks-proxy", help="Generic SOCKS proxy URL")
    args = ap.parse_args()

    try:
        video_id = extract_video_id(args.url)
    except ValueError as e:
        print(str(e), file=sys.stderr)
        sys.exit(2)

    api = build_api(args)
    try:
        # The metadata half of the result is not used by the CLI.
        segments, _meta = fetch_english_transcript(api, video_id, preserve_formatting=args.preserve_formatting)
    except (TranscriptsDisabled, AgeRestricted):
        print("Transcripts are unavailable for this video (disabled or age-restricted).", file=sys.stderr)
        sys.exit(1)
    except (RequestBlocked, IpBlocked):
        print("YouTube is blocking your IP. Use rotating residential proxies (see README).", file=sys.stderr)
        sys.exit(1)
    except (VideoUnplayable,):
        print("The video is unplayable.", file=sys.stderr)
        sys.exit(1)
    except PoTokenRequired:
        print("A PO token is required for this transcript (library limitation).", file=sys.stderr)
        sys.exit(1)
    except NoTranscriptFound:
        print("No transcript found (English or translatable).", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report anything unexpected and exit non-zero.
        print(f"Failed to fetch transcript: {e}", file=sys.stderr)
        sys.exit(1)

    if args.format == "txt":
        out = segments_to_txt(segments, with_timestamps=args.timestamps)
    elif args.format == "srt":
        out = segments_to_srt(segments)
    elif args.format == "vtt":
        out = segments_to_vtt(segments)
    elif args.format == "csv":
        out = segments_to_csv(segments)
    elif args.format == "json":
        # Trailing newline for consistency with every other output format.
        out = json.dumps(segments, ensure_ascii=False, indent=2) + "\n"
    else:
        raise AssertionError("unknown format")

    if args.output:
        # The csv module already emits "\r\n" row terminators; newline=""
        # stops text mode from translating them again (which would produce
        # "\r\r\n" on Windows). Other formats keep the platform default.
        newline = "" if args.format == "csv" else None
        with open(args.output, "w", encoding="utf-8", newline=newline) as f:
            f.write(out)
    else:
        sys.stdout.write(out)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment