Skip to content

Instantly share code, notes, and snippets.

@argaiv91
Forked from MawKKe/split_ffmpeg.py
Last active February 4, 2020 19:58
Show Gist options
  • Save argaiv91/2caf014c61a35cf48b43a4f7074a1a86 to your computer and use it in GitHub Desktop.
Save argaiv91/2caf014c61a35cf48b43a4f7074a1a86 to your computer and use it in GitHub Desktop.
Split audio file with ffmpeg based on chapter metadata
#!/usr/bin/env python3
import sys
import os
import re
import subprocess as sub
import argparse
import tempfile
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count
# split_ffmpeg.py
#
# Split audio file into multiple files, using ffmpeg, with no loss in quality.
#
# Uses chapter metadata to decide at which timestamps to split the file. Obviously this script
# will only be able to split files with such metadata included. Chapter metadata should be
# visible from 'ffprobe <file>' output. If not, this script will be useless. Example metadata is
# the form:
#
# Chapter #0:0: start 0.000000, end 1079.000000
# Metadata:
# title : Chapter One
# Chapter #0:1: start 1079.000000, end 2040.000000
# Metadata:
# title : Chapter Two
# Chapter #0:2: start 2040.000000, end 2878.000000
# Metadata:
# title : Chapter Three
# Chapter #0:3: start 2878.000000, end 3506.000000
# Metadata:
# title : Chapter Four
# Chapter #0:4: start 3506.000000, end 4696.000000
# Metadata:
# title : Chapter Five
# Chapter #0:5: start 4696.000000, end 5741.000000
# Metadata:
# title : Chapter Six
# Chapter #0:6: start 5741.000000, end 7131.000000
# ...
#
# By default, the chapter files will be written into a temporary directory under /tmp.
# You may specify alternative output directory with '--outdir <path>', which will be created if it
# does not exist. Note that this script will never overwrite files, so you must delete conflicting
# files manually (or specify some other empty/nonexistent directory)
#
# The input file basename will be used to name the output files. You can change this behaviour with
# the flag '--use-title', in which case the chapter titles, if they are available in the chapter
# metadata, will be used instead (this is not useful if your metadata is crappy, for example).
#
# Work is done in parallel with the help of a thread pool. You may specify
# how many parallel jobs you want with command line param '--concurrency'.
# The default concurrency is equal to the number of cores available (although I think this
# might be silly since this kind of processing isn't so much cpu-bound as it is IO-bound).
#
# Dependencies:
#
# - Python 3.5 or newer
# - Obviously you need ffmpeg (and ffprobe) installed. Otherwise python3 stdlib should suffice.
#
# Author: Markus H (MawKKe) [email protected]
# Date: 2018-07
#
def parseChapters(filename):
    """Read the chapter metadata of a media file with ffprobe.

    Runs ``ffprobe -show_chapters`` with JSON output and parses what it
    prints on stdout.

    Args:
        filename: Path of the media file to inspect.

    Returns:
        The parsed ffprobe JSON output, or None if ffprobe could not be
        started, exited with a non-zero status, or printed output that
        could not be decoded as UTF-8 JSON. Diagnostics are printed to
        stdout in every failure case.
    """
    command = ["ffprobe", '-i', filename, "-v", "error", "-print_format", "json", "-show_chapters"]

    try:
        # ffmpeg & ffprobe write output into stderr, except when
        # using -show_XXXX and -print_format. Strange.
        p = sub.run(command, stdout=sub.PIPE, stderr=sub.PIPE)
    except OSError as e:
        # e.g. FileNotFoundError when ffprobe is not installed / not on PATH
        print("ERROR: could not run ffprobe:", e)
        return None

    try:
        p.check_returncode()
    except sub.CalledProcessError as e:
        print("ERROR: ", e)
        print("FFPROBE-STDOUT: ", p.stdout)
        print("FFPROBE-STDERR: ", p.stderr)
        return None

    try:
        # .decode() fails if the chapter metadata was written with some other
        # (or mixed) text encoding; json.loads() fails on malformed output.
        # Both UnicodeDecodeError and json.JSONDecodeError derive from ValueError.
        # TODO? https://stackoverflow.com/questions/10009753/python-dealing-with-mixed-encoding-files
        return json.loads(p.stdout.decode('utf8'))
    except ValueError as e:
        print("ERROR: could not decode ffprobe output:", e)
        print("FFPROBE-STDOUT: ", p.stdout)
        return None
def main(argv):
    """Split an audio file into one file per chapter, using ffmpeg.

    Parses the command line, reads the chapter list via parseChapters(),
    and runs one ffmpeg_split() job per valid chapter on a thread pool.

    Args:
        argv: Full argument vector, program name at index 0 (e.g. sys.argv).

    Returns:
        0 if every valid chapter was written, 1 if at least one chapter
        failed, -1 if the arguments or the input file could not be used.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--infile", required=True, help="Input file")
    p.add_argument("--concurrency", required=False, default=cpu_count(), help="Number of concurrent processes", type=int)
    p.add_argument("--use-title", required=False, dest='use_title', action='store_true',
                   help="includes chapter title in the filenames")
    p.add_argument("--outdir", required=False,
                   help="Output directory. If omitted, files are written into a new /tmp/ffmpeg-split-XXX directory.")
    args = p.parse_args(argv[1:])

    fbase, fext = os.path.splitext(os.path.basename(args.infile))
    if not fbase or not fext:
        print("Something is wrong, basename or file extension is empty")
        return -1
    if fext.startswith("."):
        fext = fext[1:]

    # ThreadPoolExecutor raises ValueError for max_workers <= 0; fail early
    # with a readable message instead of a traceback after probing the file.
    if args.concurrency < 1:
        print("Invalid --concurrency value {0}, must be at least 1".format(args.concurrency))
        return -1

    info = parseChapters(args.infile)
    if not info or not info.get("chapters"):
        print("Could not parse chapters, exiting...")
        return -1

    if args.outdir:
        os.makedirs(args.outdir, exist_ok=True)
        outdir = args.outdir
    else:
        outdir = tempfile.mkdtemp(prefix="ffmpeg-split-")
    print("Output directory:", outdir)

    def validate_chapter(ch):
        # Returns the chapter unchanged, or None if its duration is not positive.
        start = ch['start']
        end = ch['end']
        if (end - start) <= 0:
            print("WARNING: chapter {0} duration is zero or negative (start: {1}, end: {2}), skipping...".format(ch['id'], start, end))
            return None
        return ch

    chapters = [ch for ch in info["chapters"] if validate_chapter(ch) is not None]

    def outf(n, tags):
        # Builds the output path for chapter number n.
        fmt = "{0}/{1} - chapter {2}.{3}"
        if args.use_title and tags and tags.get("title", False):
            # A title containing a path separator would make the output path
            # point into a nonexistent directory, so neutralize separators.
            title = str(tags["title"])
            for sep in (os.sep, os.altsep):
                if sep:
                    title = title.replace(sep, "_")
            return fmt.format(outdir, title, n, fext)
        return fmt.format(outdir, fbase, n, fext)

    # WIS = WorkItemS, list of 4-tuples: (infile, start_time, end_time, outfile)
    WIS = [(args.infile, ch["start_time"], ch["end_time"], outf(ch["id"] + 1, ch.get("tags", None))) for ch in chapters]

    print("Total: {0} chapters, concurrency: {1}".format(len(WIS), args.concurrency))

    errors = 0
    with ThreadPoolExecutor(max_workers=args.concurrency) as pool:
        # Maps each future to the work item it was submitted with.
        futs = {}
        for wi in WIS:
            print("Submitting:", wi)
            futs[pool.submit(ffmpeg_split, wi)] = wi

        for fut in as_completed(futs):
            try:
                res = fut.result()
            except Exception as e:
                # The job itself blew up (e.g. ffmpeg is not installed); this
                # chapter was not written, so it must count as an error.
                errors += 1
                print("Hmmm... general exeption:", e)
                print("Work item:", futs[fut])
            else:
                item = res["item"]
                if res['ok']:
                    print("SUCCESS: {0}".format(res["outfile"]))
                else:
                    errors += 1
                    # On failure 'item' is a CalledProcessError, which exposes
                    # 'cmd' (CompletedProcess would have 'args' instead).
                    print("FAILURE: {0}".format(res["outfile"]))
                    print("Command: {0}".format(item.cmd))
                    print("FFMPEG-STDOUT:", item.stdout)
                    print("FFMPEG-STDERR:", item.stderr)
                    print("-" * 20)
                    print()

    if errors > 0:
        n = len(chapters)
        print("WARNING: there were errors, {0} out of {1} chapters were processed correctly".format(n - errors, n))
    else:
        print("All valid chapters were successfully processed")
    print("Output directory:", outdir)
    # Report failure through the exit status as well as on stdout.
    return 1 if errors else 0
def ffmpeg_split(wi):
    """Cut one chapter out of the input file with ffmpeg (stream copy).

    Args:
        wi: Work item, a 4-tuple (infile, start, end, outfile). start and
            end are passed verbatim to ffmpeg's -ss / -to options.

    Returns:
        A dict with keys 'ok' (bool), 'outfile' (the requested output path)
        and 'item': the CompletedProcess on success, or the
        CalledProcessError when ffmpeg exited with a non-zero status.
    """
    source, begin, finish, target = wi

    # NOTE:
    # '-nostdin' should keep ffmpeg from messing up the terminal while the
    # pool is running. If the terminal still ends up garbled, 'reset' and/or
    # 'stty sane' will fix it. If corruption still occurs, let the author know
    # (email is at the top of the file).
    command = ["ffmpeg", "-nostdin"]
    command += ["-i", source]
    command += ["-v", "error"]
    command += ["-map_chapters", "-1"]
    command += ["-vn"]
    command += ["-c", "copy"]
    command += ["-ss", begin]
    command += ["-to", finish]
    command += ["-n", target]

    try:
        # check=True raises the same CalledProcessError that
        # CompletedProcess.check_returncode() would.
        finished = sub.run(command, stdout=sub.PIPE, stderr=sub.PIPE, check=True)
    except sub.CalledProcessError as failure:
        return {'ok': False, 'outfile': target, 'item': failure}
    return {'ok': True, 'outfile': target, 'item': finished}
# Script entry point: run main() with the process arguments and use its
# return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment