Last active
November 9, 2024 23:51
-
-
Save Bluscream/00cba0c357adc5e7ecfc6234759f4be2 to your computer and use it in GitHub Desktop.
Youtube Subtitles fetcher
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # pip install requests bbpb | |
| import base64, json | |
| import requests, blackboxprotobuf | |
| from typing import Dict, Any | |
class YouTubeTranscriptDownloader:
    """Download YouTube transcripts via the ``youtubei/v1/get_transcript`` endpoint.

    The request ``params`` value is a base64-encoded protobuf message that
    carries the video ID and a nested, base64-encoded language selector.
    """

    url = 'https://www.youtube.com/youtubei/v1/get_transcript'
    headers = {
        'Content-Type': 'application/json'
    }
    context = {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240313'
        }
    }
    # Seconds to wait for the HTTP request before giving up; without a timeout
    # a stalled connection would block the caller forever. Override on the
    # class or an instance to change it.
    timeout = 30

    @staticmethod
    def _with_field_names(typedef: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
        """Return a copy of ``typedef`` where every field entry has a ``'name'`` key.

        Some blackboxprotobuf releases read ``info['name']`` for each typedef
        entry while encoding and fail with ``KeyError: 'name'`` when it is
        missing. An empty name is used as the default; a name supplied by the
        caller is kept. Only top-level entries are handled and the input dict
        is not modified.
        """
        return {number: {'name': '', **info} for number, info in typedef.items()}

    @staticmethod
    def encode_protobuf(message: Dict[str, Any], typedef: Dict[str, Dict[str, str]]) -> str:
        """Encode a message dict with the given typedef to base64-encoded protobuf.

        Args:
            message (Dict[str, Any]): Message dictionary to encode, keyed by
                field number (as a string).
            typedef (Dict[str, Dict[str, str]]): Mirror of ``message`` whose
                values describe each field's type, e.g. ``{'type': 'string'}``.
                A missing ``'name'`` entry is filled in with ``''``.

        Returns:
            str: base64-encoded protobuf message (ASCII text).
        """
        named_typedef = YouTubeTranscriptDownloader._with_field_names(typedef)
        data = blackboxprotobuf.encode_message(message, named_typedef)
        return base64.b64encode(data).decode('ascii')

    def get(self, videoId: str, lang="en", automatic=True):
        """Get a transcript in YouTube's proprietary JSON form for a video by its ID.

        Args:
            videoId (str): YouTube video ID.
            lang (str, optional): 2-letter language code. Defaults to "en".
            automatic (bool, optional): Whether to get the automatically
                generated captions. Defaults to True.

        Returns:
            The decoded JSON body of the endpoint's response.
        """
        # Inner message: field 2 is the language; field 1 is set to 'asr' only
        # when automatic captions are requested (presumably the caption track
        # kind -- not verifiable from this file).
        if automatic:
            lang_dict = {'1': 'asr', '2': lang}
            lang_type = {'1': {'type': 'string'}, '2': {'type': 'string'}}
        else:
            lang_dict = {'2': lang}
            lang_type = {'2': {'type': 'string'}}

        # Outer message: field 1 is the video ID, field 2 is the inner message,
        # embedded as its base64 text rather than as a nested protobuf message.
        message = {
            '1': videoId,
            '2': YouTubeTranscriptDownloader.encode_protobuf(lang_dict, lang_type),
        }
        msg_type = {'1': {'type': 'string'}, '2': {'type': 'string'}}
        params = YouTubeTranscriptDownloader.encode_protobuf(message, msg_type)

        payload = {
            'context': self.context,
            'params': params
        }
        print("Getting transcript for video", videoId, "in language", lang, "(automatic)" if automatic else "")
        response = requests.post(self.url, headers=self.headers, json=payload, timeout=self.timeout)
        return response.json()
if __name__ == "__main__":  # Example usage
    import sys

    downloader = YouTubeTranscriptDownloader()

    # The video ID is the last command-line argument when one is given;
    # otherwise the user is asked for it.
    if len(sys.argv) > 1:
        videoId = sys.argv[-1]
    else:
        videoId = input("Video ID:")

    # An empty answer selects English / manually created captions.
    lang = input("Language (2 letter lowercase):") or "en"
    automatic = bool(input("Automaticly generated? (empty=No)"))

    result = downloader.get(videoId, lang, automatic)
    if result:
        # Show the pretty-printed response and keep a copy on disk.
        txt = json.dumps(result, indent=4)
        print(txt)
        with open("transcript.json", 'w') as f:
            f.write(txt)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "$schema": "http://json-schema.org/draft-06/schema#", | |
| "$ref": "#/definitions/TranscriptResponse", | |
| "definitions": { | |
| "TranscriptResponse": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "responseContext": { | |
| "$ref": "#/definitions/ResponseContext" | |
| }, | |
| "actions": { | |
| "type": "array", | |
| "items": { | |
| "$ref": "#/definitions/Action" | |
| } | |
| }, | |
| "trackingParams": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "TranscriptResponse" | |
| }, | |
| "Action": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "clickTrackingParams": { | |
| "type": "string" | |
| }, | |
| "updateEngagementPanelAction": { | |
| "$ref": "#/definitions/UpdateEngagementPanelAction" | |
| } | |
| }, | |
| "required": [], | |
| "title": "Action" | |
| }, | |
| "UpdateEngagementPanelAction": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "targetId": { | |
| "type": "string" | |
| }, | |
| "content": { | |
| "$ref": "#/definitions/UpdateEngagementPanelActionContent" | |
| } | |
| }, | |
| "required": [], | |
| "title": "UpdateEngagementPanelAction" | |
| }, | |
| "UpdateEngagementPanelActionContent": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "transcriptRenderer": { | |
| "$ref": "#/definitions/TranscriptRenderer" | |
| } | |
| }, | |
| "required": [], | |
| "title": "UpdateEngagementPanelActionContent" | |
| }, | |
| "TranscriptRenderer": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "trackingParams": { | |
| "type": "string" | |
| }, | |
| "content": { | |
| "$ref": "#/definitions/TranscriptRendererContent" | |
| } | |
| }, | |
| "required": [], | |
| "title": "TranscriptRenderer" | |
| }, | |
| "TranscriptRendererContent": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "transcriptSearchPanelRenderer": { | |
| "$ref": "#/definitions/TranscriptSearchPanelRenderer" | |
| } | |
| }, | |
| "required": [], | |
| "title": "TranscriptRendererContent" | |
| }, | |
| "TranscriptSearchPanelRenderer": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "header": { | |
| "$ref": "#/definitions/Header" | |
| }, | |
| "body": { | |
| "$ref": "#/definitions/Body" | |
| }, | |
| "footer": { | |
| "$ref": "#/definitions/Footer" | |
| }, | |
| "trackingParams": { | |
| "type": "string" | |
| }, | |
| "targetId": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "TranscriptSearchPanelRenderer" | |
| }, | |
| "Body": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "transcriptSegmentListRenderer": { | |
| "$ref": "#/definitions/TranscriptSegmentListRenderer" | |
| } | |
| }, | |
| "required": [], | |
| "title": "Body" | |
| }, | |
| "TranscriptSegmentListRenderer": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "initialSegments": { | |
| "type": "array", | |
| "items": { | |
| "$ref": "#/definitions/InitialSegment" | |
| } | |
| }, | |
| "noResultLabel": { | |
| "$ref": "#/definitions/NoResultLabel" | |
| }, | |
| "retryLabel": { | |
| "$ref": "#/definitions/NoResultLabel" | |
| }, | |
| "touchCaptionsEnabled": { | |
| "type": "boolean" | |
| } | |
| }, | |
| "required": [], | |
| "title": "TranscriptSegmentListRenderer" | |
| }, | |
| "InitialSegment": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "transcriptSegmentRenderer": { | |
| "$ref": "#/definitions/TranscriptSegmentRenderer" | |
| } | |
| }, | |
| "required": [], | |
| "title": "InitialSegment" | |
| }, | |
| "TranscriptSegmentRenderer": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "startMs": { | |
| "type": "string", | |
| "format": "integer" | |
| }, | |
| "endMs": { | |
| "type": "string", | |
| "format": "integer" | |
| }, | |
| "snippet": { | |
| "$ref": "#/definitions/NoResultLabel" | |
| }, | |
| "startTimeText": { | |
| "$ref": "#/definitions/StartTimeText" | |
| }, | |
| "trackingParams": { | |
| "type": "string" | |
| }, | |
| "accessibility": { | |
| "$ref": "#/definitions/Accessibility" | |
| }, | |
| "targetId": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "TranscriptSegmentRenderer" | |
| }, | |
| "Accessibility": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "accessibilityData": { | |
| "$ref": "#/definitions/AccessibilityData" | |
| } | |
| }, | |
| "required": [], | |
| "title": "Accessibility" | |
| }, | |
| "AccessibilityData": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "label": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "AccessibilityData" | |
| }, | |
| "NoResultLabel": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "runs": { | |
| "type": "array", | |
| "items": { | |
| "$ref": "#/definitions/Run" | |
| } | |
| } | |
| }, | |
| "required": [], | |
| "title": "NoResultLabel" | |
| }, | |
| "Run": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "text": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "Run" | |
| }, | |
| "StartTimeText": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "simpleText": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "StartTimeText" | |
| }, | |
| "Footer": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "transcriptFooterRenderer": { | |
| "$ref": "#/definitions/TranscriptFooterRenderer" | |
| } | |
| }, | |
| "required": [], | |
| "title": "Footer" | |
| }, | |
| "TranscriptFooterRenderer": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "languageMenu": { | |
| "$ref": "#/definitions/LanguageMenu" | |
| } | |
| }, | |
| "required": [], | |
| "title": "TranscriptFooterRenderer" | |
| }, | |
| "LanguageMenu": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "sortFilterSubMenuRenderer": { | |
| "$ref": "#/definitions/SortFilterSubMenuRenderer" | |
| } | |
| }, | |
| "required": [], | |
| "title": "LanguageMenu" | |
| }, | |
| "SortFilterSubMenuRenderer": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "subMenuItems": { | |
| "type": "array", | |
| "items": { | |
| "$ref": "#/definitions/SubMenuItem" | |
| } | |
| }, | |
| "trackingParams": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "SortFilterSubMenuRenderer" | |
| }, | |
| "SubMenuItem": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "title": { | |
| "type": "string" | |
| }, | |
| "selected": { | |
| "type": "boolean" | |
| }, | |
| "continuation": { | |
| "$ref": "#/definitions/Continuation" | |
| }, | |
| "trackingParams": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "SubMenuItem" | |
| }, | |
| "Continuation": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "reloadContinuationData": { | |
| "$ref": "#/definitions/ReloadContinuationData" | |
| } | |
| }, | |
| "required": [], | |
| "title": "Continuation" | |
| }, | |
| "ReloadContinuationData": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "continuation": { | |
| "type": "string" | |
| }, | |
| "clickTrackingParams": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "ReloadContinuationData" | |
| }, | |
| "Header": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "transcriptSearchBoxRenderer": { | |
| "$ref": "#/definitions/TranscriptSearchBoxRenderer" | |
| } | |
| }, | |
| "required": [], | |
| "title": "Header" | |
| }, | |
| "TranscriptSearchBoxRenderer": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "formattedPlaceholder": { | |
| "$ref": "#/definitions/NoResultLabel" | |
| }, | |
| "accessibility": { | |
| "$ref": "#/definitions/Accessibility" | |
| }, | |
| "clearButton": { | |
| "$ref": "#/definitions/ClearButton" | |
| }, | |
| "onTextChangeCommand": { | |
| "$ref": "#/definitions/OnTextChangeCommand" | |
| }, | |
| "trackingParams": { | |
| "type": "string" | |
| }, | |
| "searchButton": { | |
| "$ref": "#/definitions/SearchButton" | |
| } | |
| }, | |
| "required": [], | |
| "title": "TranscriptSearchBoxRenderer" | |
| }, | |
| "ClearButton": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "buttonRenderer": { | |
| "$ref": "#/definitions/ClearButtonButtonRenderer" | |
| } | |
| }, | |
| "required": [], | |
| "title": "ClearButton" | |
| }, | |
| "ClearButtonButtonRenderer": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "icon": { | |
| "$ref": "#/definitions/Icon" | |
| }, | |
| "trackingParams": { | |
| "type": "string" | |
| }, | |
| "accessibilityData": { | |
| "$ref": "#/definitions/Accessibility" | |
| } | |
| }, | |
| "required": [], | |
| "title": "ClearButtonButtonRenderer" | |
| }, | |
| "Icon": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "iconType": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "Icon" | |
| }, | |
| "OnTextChangeCommand": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "clickTrackingParams": { | |
| "type": "string" | |
| }, | |
| "commandMetadata": { | |
| "$ref": "#/definitions/CommandMetadata" | |
| }, | |
| "getTranscriptEndpoint": { | |
| "$ref": "#/definitions/GetTranscriptEndpoint" | |
| } | |
| }, | |
| "required": [], | |
| "title": "OnTextChangeCommand" | |
| }, | |
| "CommandMetadata": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "webCommandMetadata": { | |
| "$ref": "#/definitions/WebCommandMetadata" | |
| } | |
| }, | |
| "required": [], | |
| "title": "CommandMetadata" | |
| }, | |
| "WebCommandMetadata": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "sendPost": { | |
| "type": "boolean" | |
| }, | |
| "apiUrl": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "WebCommandMetadata" | |
| }, | |
| "GetTranscriptEndpoint": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "params": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "GetTranscriptEndpoint" | |
| }, | |
| "SearchButton": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "buttonRenderer": { | |
| "$ref": "#/definitions/SearchButtonButtonRenderer" | |
| } | |
| }, | |
| "required": [], | |
| "title": "SearchButton" | |
| }, | |
| "SearchButtonButtonRenderer": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "trackingParams": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "SearchButtonButtonRenderer" | |
| }, | |
| "ResponseContext": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "visitorData": { | |
| "type": "string" | |
| }, | |
| "serviceTrackingParams": { | |
| "type": "array", | |
| "items": { | |
| "$ref": "#/definitions/ServiceTrackingParam" | |
| } | |
| }, | |
| "mainAppWebResponseContext": { | |
| "$ref": "#/definitions/MainAppWebResponseContext" | |
| }, | |
| "webResponseContextExtensionData": { | |
| "$ref": "#/definitions/WebResponseContextExtensionData" | |
| } | |
| }, | |
| "required": [], | |
| "title": "ResponseContext" | |
| }, | |
| "MainAppWebResponseContext": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "loggedOut": { | |
| "type": "boolean" | |
| }, | |
| "trackingParam": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "MainAppWebResponseContext" | |
| }, | |
| "ServiceTrackingParam": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "service": { | |
| "type": "string" | |
| }, | |
| "params": { | |
| "type": "array", | |
| "items": { | |
| "$ref": "#/definitions/Param" | |
| } | |
| } | |
| }, | |
| "required": [], | |
| "title": "ServiceTrackingParam" | |
| }, | |
| "Param": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "key": { | |
| "type": "string" | |
| }, | |
| "value": { | |
| "type": "string" | |
| } | |
| }, | |
| "required": [], | |
| "title": "Param" | |
| }, | |
| "WebResponseContextExtensionData": { | |
| "type": "object", | |
| "additionalProperties": false, | |
| "properties": { | |
| "hasDecorated": { | |
| "type": "boolean" | |
| } | |
| }, | |
| "required": [], | |
| "title": "WebResponseContextExtensionData" | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
When executed inside a virtual environment, I get the error below:
From .../lib/python3.12/site-packages/blackboxprotobuf/lib/types/length_delim.py", line 56, in encode_message
if info['name'] == field_number and field_number != '':
~~~~^^^^^^^^
KeyError: 'name'