# Requirements:
# - ImageMagick binary
# - Windows.Media.Ocr.Cli binary
# - VideoSubFinder binary
#
# Official GCloud Translate Setup:
# First 500k characters / mo is free: https://cloud.google.com/translate/pricing
# Install Python Module: pip install google-cloud-translate
# Setup Google Cloud account and billing information: https://cloud.google.com/
# Make a new project and enable "Cloud Translation API": https://console.cloud.google.com/apis/dashboard
# Install GCloud CLI: https://cloud.google.com/sdk/docs/install
# Setup authentication: https://cloud.google.com/docs/authentication/provide-credentials-adc#on-prem

import argparse
import glob
import os
import shutil
import subprocess
import sys
import time

from google.cloud import translate


def gcloud_API_translate_text(textList, project_id, target_language_code):
    """Translate a list of strings with the Google Cloud Translation API.

    The strings are sent in chunks because a single request accepts at most
    1024 strings.

    Args:
        textList: Non-blank strings to translate.
        project_id: Google Cloud project ID used to build the request parent.
        target_language_code: Language code to translate into (e.g. "en-US").

    Returns:
        A list of translation objects (each exposing ``translated_text``),
        in the same order as ``textList``. Empty when ``textList`` is empty.

    Raises:
        RuntimeError: If a response does not contain exactly one translation
            per submitted string, since results could then no longer be
            paired with their source lines.
    """
    if not textList:
        # Nothing to translate; avoid creating a client (and needing
        # credentials) for a no-op.
        return []

    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"
    MAX_STRINGS = 1024  # GCloud API has hard limit of 1024 lines per request

    translations = []
    # Step by chunk size rather than by the number of results returned, so a
    # short or empty response can neither loop forever nor shift later
    # translations onto the wrong source lines.
    for start in range(0, len(textList), MAX_STRINGS):
        contents = textList[start:start + MAX_STRINGS]
        response = client.translate_text(
            request={
                "parent": parent,
                "contents": contents,
                "mime_type": "text/plain",
                "target_language_code": target_language_code,
            }
        )
        if len(response.translations) != len(contents):
            raise RuntimeError(
                "Translation API returned "
                + str(len(response.translations))
                + " results for "
                + str(len(contents))
                + " input strings"
            )
        translations.extend(response.translations)
    return translations


def _unit_fraction(value):
    """argparse ``type`` callable: parse ``value`` as a float within [0.0, 1.0]."""
    try:
        fraction = float(value)
    except ValueError:
        raise argparse.ArgumentTypeError(repr(value) + " is not a number") from None
    if not 0.0 <= fraction <= 1.0:
        raise argparse.ArgumentTypeError(repr(value) + " is not in the range [0.0, 1.0]")
    return fraction


def _for_each_image(action_label, image_paths, test_run, handle_image):
    """Call ``handle_image(path)`` per image while printing progress and an ETA.

    Args:
        action_label: Text shown before "image i/N" on the progress line.
        image_paths: Paths of the images to process.
        test_run: When true, stop after the first 11 images.
        handle_image: Callable invoked with each image path.
    """
    num_images = len(image_paths)
    total_elapsed_s = 0
    estimate_text = "Estimated time remaining: Unknown"
    for index, image_path in enumerate(image_paths):
        if test_run and index > 10:
            break

        progress_text = (
            action_label + " image " + str(index + 1) + "/" + str(num_images)
            + ". " + estimate_text
            + ". Filename: " + os.path.basename(image_path)
        )
        # Clear the line, then redraw it in place (carriage return, no newline).
        print(end=LINE_CLEAR)
        print(progress_text, end='\r')

        start_time = time.time()
        handle_image(image_path)
        total_elapsed_s += time.time() - start_time

        # Estimate the remaining time from the running average per image.
        num_done = index + 1
        average_time = total_elapsed_s / float(num_done)
        estimate_text = (
            "Estimated time remaining: "
            + str(average_time * (num_images - num_done)) + "s"
        )


os.system("")  # enables ansi escape characters in terminal
LINE_CLEAR = '\x1b[2K'  # <-- ANSI sequence

parser = argparse.ArgumentParser(description="Takes as input a video with hardsubs, and will generate translated softsubs in the target language.")
parser.add_argument('video_path', help='Path to input video')
parser.add_argument('-pid', '--project_id', help='Google Cloud Project ID', required=True)
parser.add_argument('-t', '--tmp_dir', help='Path to tmp dir', default='tmp')
parser.add_argument('-te', '--top_edge_offset', help='How much of top of video to ignore', type=_unit_fraction, default=0.75)
parser.add_argument('-be', '--bottom_edge_offset', help='How much of bottom of video to ignore', type=_unit_fraction, default=0.0)
parser.add_argument('-l', '--language', help='Language to translate to', default='en-US', choices=['en-US', 'zh-CN', 'zh-TW', 'ja', 'ko'])
parser.add_argument('--test_run', action='store_true', help='Runs a few times and displays debug info')
parser.add_argument('--skip_cleanup', action='store_true', help='Does not delete temporary files')
parser.add_argument('--skip_extract', action='store_true', help='Skips extracting subtitle images step (uses cached temporary files)')
args = parser.parse_args()

if args.top_edge_offset + args.bottom_edge_offset >= 1.0:
    parser.error("top_edge_offset + bottom_edge_offset must be less than 1.0, otherwise no part of the video is left to scan")

video_path = args.video_path
video_filename = os.path.splitext(video_path)[0]
tmp_dir = args.tmp_dir
txt_images_folder = os.path.join(tmp_dir, "TXTImages")
txt_results_folder = os.path.join(tmp_dir, "TXTResults")
target_language_code = args.language
project_id = args.project_id
test_run = args.test_run
skip_cleanup = args.skip_cleanup or test_run
skip_extract = args.skip_extract

# Convert the "fraction to ignore" options into the bounds passed to
# VideoSubFinder. The defaults (0.75 / 0.0) yield the previously hard-coded
# 0.25 / 0.0, i.e. the upper bound is expressed as a distance from the bottom
# of the frame. Rounding keeps float noise such as 0.30000000000000004 out of
# the command line.
top_video_image_percent_end = round(1.0 - args.top_edge_offset, 6)
bottom_video_image_percent_end = round(args.bottom_edge_offset, 6)

# Generate raw images of the subtitles
if not skip_extract:
    print("Extracting subtitle images with VideoSubFinder (takes quite a long time) ...")
    startTime = time.time()
    subprocess.run([
        "VideoSubFinderWXW.exe",
        "--clear_dirs",
        "--run_search",
        "--create_cleared_text_images",
        "--input_video", video_path,
        "--output_dir", tmp_dir,
        "--num_threads", str(4),
        "--num_ocr_threads", str(4),
        "--top_video_image_percent_end", str(top_video_image_percent_end),
        "--bottom_video_image_percent_end", str(bottom_video_image_percent_end)
    ], capture_output=True)
    endTime = time.time()
    print("Completed! Took "+str(endTime - startTime)+"s")

# Enumerate all the images
imagePaths = []
if os.path.isdir(txt_images_folder):
    filetypes = ('*.jpg', '*.jpeg', '*.png')
    for filetype in filetypes:
        imagePaths.extend(glob.glob(os.path.join(txt_images_folder, filetype)))
else:
    print("ERROR: Invalid paths provided!\n")
    parser.print_help()
    sys.exit(2)


# Reduce image size, because OCR has a max image size
def _shrink_image(image_path):
    """Shrink the image in place to at most 200px tall using ImageMagick."""
    # Leaving some padding and not doing "-trim" seems to be important
    subprocess.run(["magick", "convert", image_path, "-resize", "x200>", image_path], capture_output=True)


_for_each_image("Preprocessing", imagePaths, test_run, _shrink_image)

# Run Windows OCR on the images and collect the recognized text
baseNameList = []
textList = []


def _ocr_image(image_path):
    """OCR one image and record its text under the image's base name."""
    result = subprocess.run(["Windows.Media.Ocr.Cli.exe", image_path], capture_output=True)
    # Replace undecodable bytes instead of raising, so that one odd frame
    # cannot abort the whole run.
    ocr_text = result.stdout.decode("utf-8", errors="replace")

    # Replace carriage returns with spaces.
    # This leads to more naturally flowing translations most of the time.
    ocr_text = " ".join(ocr_text.splitlines()).strip()

    # GCloud API throws error if we give it blanks
    if ocr_text:
        filename, _ = os.path.splitext(image_path)
        baseNameList.append(os.path.basename(filename))
        textList.append(ocr_text)


_for_each_image("Running OCR on", imagePaths, test_run, _ocr_image)
totalCharacterCount = sum(len(text) for text in textList)

# Additionally writeout to file for easier debugging
original_text_file = os.path.join(tmp_dir, "original_text.txt")
with open(original_text_file, 'w', encoding="utf-8") as f:
    for index, text in enumerate(textList):
        f.write(str(index)+": "+text+"\n")
print('')

# Translate via Google Cloud Translation API and output to disk
print("Translating text...")
translations = gcloud_API_translate_text(textList, project_id, target_language_code)
print("# of characters translated: " + str(totalCharacterCount))
print("# of lines translated: " + str(len(translations)))

# Make sure the results folder exists before writing into it (for example
# when reusing a cached tmp dir via --skip_extract).
os.makedirs(txt_results_folder, exist_ok=True)
for basename, original_text, translation in zip(baseNameList, textList, translations):
    out_filepath = os.path.join(txt_results_folder, basename) + '.txt'
    translated_text = translation.translated_text
    if test_run:
        print("Orignal text: " + repr(original_text))
        print("Translated text: " + repr(translated_text))
    with open(out_filepath, 'w', encoding="utf-8") as f:
        f.write(translated_text)

# Additionally writeout to file for easier debugging
with open(os.path.join(tmp_dir, "translated_text.txt"), 'w', encoding="utf-8") as f:
    for index, translation in enumerate(translations):
        f.write(str(index)+": "+translation.translated_text+"\n")

# Generate subtitle file
srt_path = video_filename+".srt"
print("Generating softsubs with VideoSubFinder... Output file: "+srt_path)
startTime = time.time()
subprocess.run([
    "VideoSubFinderWXW.exe",
    "--create_sub_from_txt_results", srt_path,
    "--output_dir", tmp_dir
], capture_output=True)
endTime = time.time()
print("Completed! Took "+str(endTime - startTime)+"s")

# Cleanup if needed
if not skip_cleanup:
    shutil.rmtree(tmp_dir, ignore_errors=True)