#!/usr/bin/env python3
"""split_ffmpeg.py

Split an audio file into multiple files, using ffmpeg, with no loss in
quality (pure stream copy, no re-encoding).

Uses chapter metadata to decide at which timestamps to split the file, so
this script can only split files with such metadata included. Chapter
metadata should be visible from 'ffprobe <file>' output; if not, this
script will be useless. Example metadata is of the form:

    Chapter #0:0: start 0.000000, end 1079.000000
      Metadata:
        title : Chapter One
    Chapter #0:1: start 1079.000000, end 2040.000000
      Metadata:
        title : Chapter Two
    ...

By default, the chapter files are written into a temporary directory under
/tmp. You may specify an alternative output directory with '--outdir',
which will be created if it does not exist. Note that this script will
never overwrite files ('ffmpeg -n'), so you must delete conflicting files
manually (or specify some other empty/nonexistent directory).

The input file basename is used to name the output files. You can change
this behaviour with flag '--use-title', in which case the title of each
chapter, if available in the chapter metadata, is used instead (this is
not useful if your metadata is crappy, for example).

Work is done in parallel with the help of a thread pool. You may specify
how many parallel jobs you want with '--concurrency'. The default
concurrency is equal to the number of cores available (although this kind
of processing is more IO-bound than CPU-bound).

Dependencies:
  - Python 3.5 or newer
  - ffmpeg and ffprobe installed; otherwise the python3 stdlib suffices.

Author: Markus H (MawKKe) ekkwam@gmail.com
Date: 2018-07
"""

import sys
import os
import re
import subprocess as sub
import argparse
import tempfile
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count


def parseChapters(filename):
    """Read chapter metadata from *filename* with ffprobe.

    Returns the decoded ffprobe JSON document as a dict (expected to
    contain a "chapters" list), or None if ffprobe failed or could not
    be executed.
    """
    command = ["ffprobe", "-i", filename,
               "-v", "error",
               "-print_format", "json",
               "-show_chapters"]
    try:
        # ffmpeg & ffprobe write output into stderr, except when
        # using -show_XXXX and -print_format. Strange.
        p = sub.run(command, stdout=sub.PIPE, stderr=sub.PIPE)
        # had we run ffmpeg instead of ffprobe, this would throw since
        # ffmpeg without an output file exits with exitcode != 0
        p.check_returncode()
    except FileNotFoundError as e:
        # BUGFIX: the original did not handle a missing ffprobe binary;
        # sub.run raises FileNotFoundError before 'p' is even bound.
        print("ERROR: could not execute ffprobe:", e)
        return None
    except sub.CalledProcessError as e:
        print("ERROR: ", e)
        # decode for readable diagnostics instead of printing raw bytes
        print("FFPROBE-STDOUT: ", p.stdout.decode('utf8', errors='replace'))
        print("FFPROBE-STDERR: ", p.stderr.decode('utf8', errors='replace'))
        return None

    # .decode() will most likely explode if the ffprobe json output (chapter
    # metadata) was written with some weird encoding, and even more so if the
    # data contains text in multiple different text encodings... TODO?
    # https://stackoverflow.com/questions/10009753/python-dealing-with-mixed-encoding-files
    return json.loads(p.stdout.decode('utf8'))


def ffmpeg_split(wi):
    """Run one ffmpeg stream-copy job.

    *wi* is a work item: a 4-tuple (infile, start, end, outfile) where
    start/end are the chapter timestamps as strings (ffprobe's
    start_time/end_time fields).

    Returns a dict: {'ok': bool, 'outfile': str, 'item': obj} where 'item'
    is the CompletedProcess on success or the CalledProcessError on failure.
    """
    infile, start, end, outfile = wi
    # NOTE:
    # '-nostdin' should prevent your terminal becoming all messed up during
    # the pool processing. But if it does, you can fix it with 'reset' and/or
    # 'stty sane'. If corruption still occurs, let me know (email is at the
    # top of the file).
    cmd = ["ffmpeg", "-nostdin",
           "-i", infile,
           "-v", "error",
           "-map_chapters", "-1",   # do not copy chapter metadata into pieces
           "-vn",                   # drop any video streams (e.g. cover art)
           "-c", "copy",            # lossless: no re-encoding
           "-ss", start,
           "-to", end,
           "-n",                    # never overwrite existing files
           outfile]
    s = sub.run(cmd, stdout=sub.PIPE, stderr=sub.PIPE)
    try:
        s.check_returncode()
        return {'ok': True, 'outfile': outfile, 'item': s}
    except sub.CalledProcessError as e:
        return {'ok': False, 'outfile': outfile, 'item': e}


def main(argv):
    """CLI entry point. Returns 0 on success, -1 on error."""
    p = argparse.ArgumentParser()
    p.add_argument("--infile", required=True, help="Input file")
    p.add_argument("--concurrency", required=False, default=cpu_count(),
                   help="Number of concurrent processes", type=int)
    p.add_argument("--use-title", required=False, dest='use_title',
                   action='store_true',
                   help="includes chapter title in the filenames")
    p.add_argument("--outdir", required=False,
                   help="Output directory. If omitted, files are written "
                        "into a new /tmp/ffmpeg-split-XXX directory.")
    args = p.parse_args(argv[1:])

    fbase, fext = os.path.splitext(os.path.basename(args.infile))
    if 0 in [len(fbase), len(fext)]:
        print("Something is wrong, basename or file extension is empty")
        return -1
    if fext.startswith("."):
        fext = fext[1:]

    info = parseChapters(args.infile)
    # a missing or empty "chapters" list means there is nothing to split
    if info is None or not info.get("chapters"):
        print("Could not parse chapters, exiting...")
        return -1

    if args.outdir:
        os.makedirs(args.outdir, exist_ok=True)
        outdir = args.outdir
    else:
        outdir = tempfile.mkdtemp(prefix="ffmpeg-split-")

    print("Output directory:", outdir)

    def validate_chapter(ch):
        # 'start'/'end' are ffprobe's integer timebase values; zero or
        # negative duration chapters would make ffmpeg produce garbage.
        start = ch['start']
        end = ch['end']
        if (end - start) <= 0:
            print("WARNING: chapter {0} duration is zero or negative "
                  "(start: {1}, end: {2}), skipping..."
                  .format(ch['id'], start, end))
            return None
        return ch

    chapters = list(filter(None, (validate_chapter(ch)
                                  for ch in info["chapters"])))

    def outf(n, tags):
        # chapter title is only used when requested AND present AND non-empty
        fmt = "{0}/{1} - chapter {2}.{3}"
        if args.use_title and tags and tags.get("title", False):
            return fmt.format(outdir, tags["title"], n, fext)
        return fmt.format(outdir, fbase, n, fext)

    # WIS = WorkItemS, list of 4-tuples: (infile, start, end, outfile)
    WIS = [(args.infile, ch["start_time"], ch["end_time"],
            outf(ch["id"] + 1, ch.get("tags", None)))
           for ch in chapters]

    print("Total: {0} chapters, concurrency: {1}".format(
        len(WIS), args.concurrency))

    errors = 0
    with ThreadPoolExecutor(max_workers=args.concurrency) as pool:
        futs = []
        for wi in WIS:
            print("Submitting:", wi)
            futs.append(pool.submit(ffmpeg_split, wi))
        for fut in as_completed(futs):
            try:
                # this looks nasty as hell, but hey, this is what they do
                # in the python docs..
                res = fut.result()
            except Exception as e:
                # BUGFIX: an unexpected exception is a failed chapter too;
                # the original forgot to count it, skewing the summary.
                errors += 1
                print("Hmmm... general exception:", e)
            else:
                item = res["item"]
                if res['ok']:
                    print("SUCCESS: {0}".format(res["outfile"]))
                else:
                    errors += 1
                    print("FAILURE: {0}".format(res["outfile"]))
                    # CalledProcessError has 'cmd' (CompletedProcess
                    # calls it 'args', because...?)
                    print("Command: {0}".format(item.cmd))
                    print("FFMPEG-STDOUT:", item.stdout)
                    print("FFMPEG-STDERR:", item.stderr)
                print("-" * 20)

    print()
    if errors > 0:
        n = len(chapters)
        print("WARNING: there were errors, {0} out of {1} chapters "
              "were processed correctly".format(n - errors, n))
    else:
        print("All valid chapters were successfully processed")
    print("Output directory:", outdir)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))