argaiv91 · February 4, 2020 19:58
diff --git a/split_ffmpeg.py b/split_ffmpeg.py
 #!/usr/bin/env python3

 import sys
 import os
 import re
 import subprocess as sub
 import argparse
 import tempfile
 import json
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from multiprocessing import cpu_count

 # split_ffmpeg.py
 #
 #   Split audio file into multiple files, using ffmpeg, with no loss in quality.
 #
 #   Uses chapter metadata to decide at which timestamps to split the file. Obviously this script
 #   will only be able to split files with such metadata included. Chapter metadata should be
 #   visible from 'ffprobe <file>' output. If not, this script will be useless. Example metadata is
 #   of the form:
 #
 #        Chapter #0:0: start 0.000000, end 1079.000000
 #        Metadata:
 #          title           : Chapter One
 #        Chapter #0:1: start 1079.000000, end 2040.000000
 #        Metadata:
 #          title           : Chapter Two
 #        Chapter #0:2: start 2040.000000, end 2878.000000
 #        Metadata:
 #          title           : Chapter Three
 #        Chapter #0:3: start 2878.000000, end 3506.000000
 #        Metadata:
 #          title           : Chapter Four
 #        Chapter #0:4: start 3506.000000, end 4696.000000
 #        Metadata:
 #          title           : Chapter Five
 #        Chapter #0:5: start 4696.000000, end 5741.000000
 #        Metadata:
 #          title           : Chapter Six
 #        Chapter #0:6: start 5741.000000, end 7131.000000
 #        ...
 #
 #   By default, the chapter files will be written into a temporary directory under /tmp.
 #   You may specify alternative output directory with '--outdir <path>', which will be created if it
 #   does not exist. Note that this script will never overwrite files, so you must delete conflicting
 #   files manually (or specify some other empty/nonexistent directory)
 #
 #   The input file basename will be used to name the output files. You can change this behaviour with
 #   flag '--use-title', in which case the chapter titles, if they are available in the chapter
 #   metadata, will be used in the filenames instead (this is not useful if your metadata is crappy, for example).
 #
 #   Work is done in parallel with the help of a thread pool. You may specify
 #   how many parallel jobs you want with command line param '--concurrency'.
 #   The default concurrency is equal to the number of cores available (although I think this
 #   might be silly since this kind of processing isn't so much cpu-bound as it is IO-bound).
 #
 # Dependencies:
 #
 #   - Python 3.5 or newer
 #   - Obviously you need ffmpeg (and ffprobe) installed. Otherwise python3 stdlib should suffice.
 #
 # Author: Markus H (MawKKe) [email protected]
 # Date:   2018-07
 #

 def parseChapters(filename):
    """Return the chapter metadata of ``filename`` as reported by ffprobe.

    Runs ``ffprobe -print_format json -show_chapters`` on the file and returns
    the decoded JSON document (the caller reads its "chapters" entry).

    Returns None, after printing a diagnostic, when ffprobe cannot be started,
    exits with a non-zero status, or prints something that is not UTF-8
    encoded JSON.
    """
    command = [ "ffprobe", '-i', filename, "-v", "error", "-print_format", "json", "-show_chapters"]

    try:
        # ffmpeg & ffprobe write output into stderr, except when
        # using -show_XXXX and -print_format, so the JSON arrives on stdout.
        p = sub.run(command, stdout=sub.PIPE, stderr=sub.PIPE)
    except OSError as e:
        # Typically FileNotFoundError: ffprobe is not installed or not on PATH.
        print("ERROR: could not run ffprobe:", e)
        return None

    try:
        p.check_returncode()
    except sub.CalledProcessError as e:
        print("ERROR: ", e)
        print("FFPROBE-STDOUT: ", p.stdout)
        print("FFPROBE-STDERR: ", p.stderr)
        return None

    try:
        # .decode() fails if the chapter metadata was written in some other
        # encoding. Both UnicodeDecodeError and json.JSONDecodeError are
        # subclasses of ValueError, so one handler covers either failure.
        return json.loads(p.stdout.decode('utf8'))
    except ValueError as e:
        print("ERROR: could not parse ffprobe output:", e)
        return None




 def main(argv):
    """Split the audio file named on the command line into one file per chapter.

    ``argv`` is the full argument vector; ``argv[0]`` (the program name) is
    ignored. Chapter metadata is read with parseChapters() and every chapter
    with a positive duration is handed to ffmpeg_split() on a thread pool.

    Returns:
        -1 if the input name has an empty basename or extension, or if no
           chapters could be read;
         1 if at least one chapter could not be written;
         0 if every valid chapter was written.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--infile", required=True, help="Input file")
    p.add_argument("--concurrency", required=False, default=cpu_count(), help="Number of concurrent processes", type=int)
    p.add_argument("--use-title", required=False, dest='use_title', action='store_true',
            help="includes chapter title in the filenames")
    p.add_argument("--outdir", required=False,
            help="Output directory. If omitted, files are written into a new /tmp/ffmpeg-split-XXX directory.")

    args = p.parse_args(argv[1:])

    fbase, fext = os.path.splitext(os.path.basename(args.infile))

    # Drop the leading dot *before* checking for emptiness, so that a name
    # such as "book." (extension ".") is rejected as well.
    if fext.startswith("."):
        fext = fext[1:]

    if not fbase or not fext:
        print("Something is wrong, basename or file extension is empty")
        return -1

    info = parseChapters(args.infile)
    if info is None or not info.get("chapters"):
        print("Could not parse chapters, exiting...")
        return -1

    if args.outdir:
        os.makedirs(args.outdir, exist_ok=True)
        outdir = args.outdir
    else:
        outdir = tempfile.mkdtemp(prefix="ffmpeg-split-")

    print("Output directory:", outdir)

    def is_valid(ch):
        # Chapters with zero or negative duration cannot be extracted.
        start = ch['start']
        end = ch['end']
        if (end - start) <= 0:
            print("WARNING: chapter {0} duration is zero or negative (start: {1}, end: {2}), skipping...".format(ch['id'], start, end))
            return False
        return True

    chapters = [ch for ch in info["chapters"] if is_valid(ch)]

    def safe_component(text):
        # Chapter titles are free-form metadata. A path separator inside one
        # would point the output file into a different directory.
        for sep in ("/", os.sep, os.altsep):
            if sep:
                text = text.replace(sep, "_")
        return text

    def outf(n, tags):
        name = fbase
        if args.use_title and tags and tags.get("title"):
            name = safe_component(tags["title"])
        return os.path.join(outdir, "{0} - chapter {1}.{2}".format(name, n, fext))

    # WIS = WorkItemS, list of 4-tuples: (infile, start_time, end_time, outfile)
    WIS = [(args.infile, ch["start_time"], ch["end_time"], outf(ch["id"] + 1, ch.get("tags", None))) for ch in chapters]

    print("Total: {0} chapters, concurrency: {1}".format(len(WIS), args.concurrency))

    errors = 0
    with ThreadPoolExecutor(max_workers=args.concurrency) as pool:
        # Map each future back to its work item so a crashed job can be named.
        futs = {}
        for wi in WIS:
            print("Submitting:", wi)
            futs[pool.submit(ffmpeg_split, wi)] = wi

        for fut in as_completed(futs):
            try:
                res = fut.result()
            except Exception as e:
                # The job itself raised (for example ffmpeg could not be
                # started). That chapter was not written, so count it as failed.
                errors += 1
                print("Hmmm... general exeption:", e)
                print("FAILURE: {0}".format(futs[fut][3]))
                print("-" * 20)
            else:
                if res['ok']:
                    print("SUCCESS: {0}".format(res["outfile"]))
                else:
                    # On failure 'item' is the CalledProcessError, which
                    # exposes the command line as 'cmd'.
                    item = res["item"]
                    errors += 1
                    print("FAILURE: {0}".format(res["outfile"]))
                    print("Command: {0}".format(item.cmd))
                    print("FFMPEG-STDOUT:", item.stdout)
                    print("FFMPEG-STDERR:", item.stderr)
                    print("-" * 20)

    print()
    if errors > 0:
        n = len(chapters)
        print("WARNING: there were errors, {0} out of {1} chapters were processed correctly".format(n-errors, n))
    else:
        print("All valid chapters were successfully processed")

    print("Output directory:", outdir)
    # Report failure to the calling shell when any chapter was not written.
    return 1 if errors else 0

 def ffmpeg_split(wi):
    """Extract a single chapter from the input file by running ffmpeg.

    ``wi`` is a work item of the form (infile, start, end, outfile).

    Returns a dict with the keys 'ok', 'outfile' and 'item'. On success 'ok'
    is True and 'item' is the CompletedProcess; when ffmpeg exits with a
    non-zero status 'ok' is False and 'item' is the CalledProcessError.
    """
    source, begin, finish, outfile = wi

    # NOTE:
    # '-nostdin' should keep ffmpeg from messing up the terminal while the
    # pool is running. If the terminal gets corrupted anyway, 'reset' and/or
    # 'stty sane' will fix it (and let the author know, email is at the top
    # of the file).
    argv = ["ffmpeg", "-nostdin"]
    argv += ["-i", source]
    argv += ["-v", "error"]
    argv += ["-map_chapters", "-1"]
    argv += ["-vn"]
    argv += ["-c", "copy"]
    argv += ["-ss", begin]
    argv += ["-to", finish]
    argv += ["-n"]
    argv += [outfile]

    proc = sub.run(argv, stdout=sub.PIPE, stderr=sub.PIPE)

    try:
        proc.check_returncode()
    except sub.CalledProcessError as failure:
        return {'ok': False, 'outfile': outfile, 'item': failure}
    return {'ok': True,  'outfile': outfile, 'item': proc}


 if __name__ == '__main__':
    # Script entry point: the value returned by main() becomes the exit status.
    raise SystemExit(main(sys.argv))
	#!/usr/bin/env python3

	import sys
	import os
	import re
	import subprocess as sub
	import argparse
	import tempfile
	import json
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from multiprocessing import cpu_count

	# split_ffmpeg.py
	#
	# Split audio file into multiple files, using ffmpeg, with no loss in quality.
	#
	# Uses chapter metadata to decide at which timestamps to split the file. Obviously this script
	# will only be able to split files with such metadata included. Chapter metadata should be
	# visible from 'ffprobe <file>' output. If not, this script will be useless. Example metadata is
	# of the form:
	#
	# Chapter #0:0: start 0.000000, end 1079.000000
	# Metadata:
	# title : Chapter One
	# Chapter #0:1: start 1079.000000, end 2040.000000
	# Metadata:
	# title : Chapter Two
	# Chapter #0:2: start 2040.000000, end 2878.000000
	# Metadata:
	# title : Chapter Three
	# Chapter #0:3: start 2878.000000, end 3506.000000
	# Metadata:
	# title : Chapter Four
	# Chapter #0:4: start 3506.000000, end 4696.000000
	# Metadata:
	# title : Chapter Five
	# Chapter #0:5: start 4696.000000, end 5741.000000
	# Metadata:
	# title : Chapter Six
	# Chapter #0:6: start 5741.000000, end 7131.000000
	# ...
	#
	# By default, the chapter files will be written into a temporary directory under /tmp.
	# You may specify alternative output directory with '--outdir <path>', which will be created if it
	# does not exist. Note that this script will never overwrite files, so you must delete conflicting
	# files manually (or specify some other empty/nonexistent directory)
	#
	# The input file basename will be used to name the output files. You can change this behaviour with
	# flag '--use-title', in which case the chapter titles, if they are available in the chapter
	# metadata, will be used in the filenames instead (this is not useful if your metadata is crappy, for example).
	#
	# Work is done in parallel with the help of a thread pool. You may specify
	# how many parallel jobs you want with command line param '--concurrency'.
	# The default concurrency is equal to the number of cores available (although I think this
	# might be silly since this kind of processing isn't so much cpu-bound as it is IO-bound).
	#
	# Dependencies:
	#
	# - Python 3.5 or newer
	# - Obviously you need ffmpeg (and ffprobe) installed. Otherwise python3 stdlib should suffice.
	#
	# Author: Markus H (MawKKe) [email protected]
	# Date: 2018-07
	#

	def parseChapters(filename):
	    """Return the chapter metadata of ``filename`` as reported by ffprobe.

	    Runs ``ffprobe -print_format json -show_chapters`` on the file and returns
	    the decoded JSON document (the caller reads its "chapters" entry).

	    Returns None, after printing a diagnostic, when ffprobe cannot be started,
	    exits with a non-zero status, or prints something that is not UTF-8
	    encoded JSON.
	    """
	    command = [ "ffprobe", '-i', filename, "-v", "error", "-print_format", "json", "-show_chapters"]

	    try:
	        # ffmpeg & ffprobe write output into stderr, except when
	        # using -show_XXXX and -print_format, so the JSON arrives on stdout.
	        p = sub.run(command, stdout=sub.PIPE, stderr=sub.PIPE)
	    except OSError as e:
	        # Typically FileNotFoundError: ffprobe is not installed or not on PATH.
	        print("ERROR: could not run ffprobe:", e)
	        return None

	    try:
	        p.check_returncode()
	    except sub.CalledProcessError as e:
	        print("ERROR: ", e)
	        print("FFPROBE-STDOUT: ", p.stdout)
	        print("FFPROBE-STDERR: ", p.stderr)
	        return None

	    try:
	        # .decode() fails if the chapter metadata was written in some other
	        # encoding. Both UnicodeDecodeError and json.JSONDecodeError are
	        # subclasses of ValueError, so one handler covers either failure.
	        return json.loads(p.stdout.decode('utf8'))
	    except ValueError as e:
	        print("ERROR: could not parse ffprobe output:", e)
	        return None




	def main(argv):
	    """Split the audio file named on the command line into one file per chapter.

	    ``argv`` is the full argument vector; ``argv[0]`` (the program name) is
	    ignored. Chapter metadata is read with parseChapters() and every chapter
	    with a positive duration is handed to ffmpeg_split() on a thread pool.

	    Returns:
	        -1 if the input name has an empty basename or extension, or if no
	           chapters could be read;
	         1 if at least one chapter could not be written;
	         0 if every valid chapter was written.
	    """
	    p = argparse.ArgumentParser()
	    p.add_argument("--infile", required=True, help="Input file")
	    p.add_argument("--concurrency", required=False, default=cpu_count(), help="Number of concurrent processes", type=int)
	    p.add_argument("--use-title", required=False, dest='use_title', action='store_true',
	            help="includes chapter title in the filenames")
	    p.add_argument("--outdir", required=False,
	            help="Output directory. If omitted, files are written into a new /tmp/ffmpeg-split-XXX directory.")

	    args = p.parse_args(argv[1:])

	    fbase, fext = os.path.splitext(os.path.basename(args.infile))

	    # Drop the leading dot *before* checking for emptiness, so that a name
	    # such as "book." (extension ".") is rejected as well.
	    if fext.startswith("."):
	        fext = fext[1:]

	    if not fbase or not fext:
	        print("Something is wrong, basename or file extension is empty")
	        return -1

	    info = parseChapters(args.infile)
	    if info is None or not info.get("chapters"):
	        print("Could not parse chapters, exiting...")
	        return -1

	    if args.outdir:
	        os.makedirs(args.outdir, exist_ok=True)
	        outdir = args.outdir
	    else:
	        outdir = tempfile.mkdtemp(prefix="ffmpeg-split-")

	    print("Output directory:", outdir)

	    def is_valid(ch):
	        # Chapters with zero or negative duration cannot be extracted.
	        start = ch['start']
	        end = ch['end']
	        if (end - start) <= 0:
	            print("WARNING: chapter {0} duration is zero or negative (start: {1}, end: {2}), skipping...".format(ch['id'], start, end))
	            return False
	        return True

	    chapters = [ch for ch in info["chapters"] if is_valid(ch)]

	    def safe_component(text):
	        # Chapter titles are free-form metadata. A path separator inside one
	        # would point the output file into a different directory.
	        for sep in ("/", os.sep, os.altsep):
	            if sep:
	                text = text.replace(sep, "_")
	        return text

	    def outf(n, tags):
	        name = fbase
	        if args.use_title and tags and tags.get("title"):
	            name = safe_component(tags["title"])
	        return os.path.join(outdir, "{0} - chapter {1}.{2}".format(name, n, fext))

	    # WIS = WorkItemS, list of 4-tuples: (infile, start_time, end_time, outfile)
	    WIS = [(args.infile, ch["start_time"], ch["end_time"], outf(ch["id"] + 1, ch.get("tags", None))) for ch in chapters]

	    print("Total: {0} chapters, concurrency: {1}".format(len(WIS), args.concurrency))

	    errors = 0
	    with ThreadPoolExecutor(max_workers=args.concurrency) as pool:
	        # Map each future back to its work item so a crashed job can be named.
	        futs = {}
	        for wi in WIS:
	            print("Submitting:", wi)
	            futs[pool.submit(ffmpeg_split, wi)] = wi

	        for fut in as_completed(futs):
	            try:
	                res = fut.result()
	            except Exception as e:
	                # The job itself raised (for example ffmpeg could not be
	                # started). That chapter was not written, so count it as failed.
	                errors += 1
	                print("Hmmm... general exeption:", e)
	                print("FAILURE: {0}".format(futs[fut][3]))
	                print("-" * 20)
	            else:
	                if res['ok']:
	                    print("SUCCESS: {0}".format(res["outfile"]))
	                else:
	                    # On failure 'item' is the CalledProcessError, which
	                    # exposes the command line as 'cmd'.
	                    item = res["item"]
	                    errors += 1
	                    print("FAILURE: {0}".format(res["outfile"]))
	                    print("Command: {0}".format(item.cmd))
	                    print("FFMPEG-STDOUT:", item.stdout)
	                    print("FFMPEG-STDERR:", item.stderr)
	                    print("-" * 20)

	    print()
	    if errors > 0:
	        n = len(chapters)
	        print("WARNING: there were errors, {0} out of {1} chapters were processed correctly".format(n-errors, n))
	    else:
	        print("All valid chapters were successfully processed")

	    print("Output directory:", outdir)
	    # Report failure to the calling shell when any chapter was not written.
	    return 1 if errors else 0

	def ffmpeg_split(wi):
	    """Extract a single chapter from the input file by running ffmpeg.

	    ``wi`` is a work item of the form (infile, start, end, outfile).

	    Returns a dict with the keys 'ok', 'outfile' and 'item'. On success 'ok'
	    is True and 'item' is the CompletedProcess; when ffmpeg exits with a
	    non-zero status 'ok' is False and 'item' is the CalledProcessError.
	    """
	    infile, start, end, outfile = wi

	    # NOTE:
	    # '-nostdin' param should prevent your terminal becoming all messed up during the pool processing.
	    # But if it does, you can fix it with 'reset' and/or 'stty sane'.
	    # If corruption still occurs, let me know (email is at the top of the file).
	    cmd = [
	        "ffmpeg",
	        "-nostdin",
	        "-i", infile,
	        "-v", "error",
	        "-map_chapters", "-1",
	        "-vn",
	        "-c", "copy",
	        "-ss", start,
	        "-to", end,
	        "-n",
	        outfile,
	    ]

	    s = sub.run(cmd, stdout=sub.PIPE, stderr=sub.PIPE)

	    try:
	        s.check_returncode()
	    except sub.CalledProcessError as e:
	        return {'ok': False, 'outfile': outfile, 'item': e}
	    return {'ok': True, 'outfile': outfile, 'item': s}


	if __name__ == '__main__':
	    # Script entry point: the value returned by main() becomes the exit status.
	    sys.exit(main(sys.argv))