Last active
July 20, 2025 10:16
-
-
Save daniel-j/613a506a0ec9c7037897c4b3afa8e41e to your computer and use it in GitHub Desktop.
Converts images in a directory to a comic/manga EPUB3 ebook. Can be used to convert extracted CBZ/CBR to EPUB3.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import sys | |
| from os import listdir, path | |
| from lxml import etree | |
| from html import escape | |
| from uuid import uuid4 | |
| import argparse | |
| import datetime | |
| import zipfile | |
| import imagesize | |
# Command-line interface: every option has a default; the image directory and
# the output filename are required positionals.
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--title', help='Title of the story', default="Unknown Title")
parser.add_argument('-a', '--author', help='Author of the story', default="Unknown Author")
# default=[] keeps args.subject iterable; with action='append' and no default
# argparse leaves the attribute as None when -s is never passed.
parser.add_argument('-s', '--subject', help='Subject of the story. Can be used multiple times.', action='append', default=[])
parser.add_argument('-d', '--direction', help='Reading direction (ltr or rtl, default: ltr)', default='ltr')
parser.add_argument('-i', '--storyid', help='Story id (default: random)', default='urn:uuid:' + str(uuid4()))
# Reject levels outside the documented 0-9 range up front instead of failing
# later while the archive is being written. metavar keeps the help text compact.
parser.add_argument('-l', '--level', help='Compression level [0-9] (default: 9)', default=9, type=int, choices=range(10), metavar='LEVEL')
parser.add_argument('directory', help='Path to directory with images')
parser.add_argument('output', help='Output EPUB filename')
args = parser.parse_args()

# Anything other than 'rtl' is treated as left-to-right.
if args.direction != 'rtl':
    args.direction = 'ltr'
# Zero-padded three-digit page index used in manifest ids and file names.
UID_FORMAT = '{:03d}'

# XML namespaces used when building the OPF package document.
NAMESPACES = {'OPF': 'http://www.idpf.org/2007/opf',
              'DC': 'http://purl.org/dc/elements/1.1/'}

# Archive path and content of the container file; its rootfile entry points
# at OEBPS/content.opf.
CONTAINER_PATH = 'META-INF/container.xml'
CONTAINER_XML = '''<?xml version='1.0' encoding='utf-8'?>
<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
<rootfiles>
<rootfile media-type="application/oebps-package+xml" full-path="OEBPS/content.opf"/>
</rootfiles>
</container>
'''

# Stylesheet linked from every page document: removes all margins/padding and
# stretches the element with id "image" to the full page.
IMAGESTYLE_CSS = '''
@page {
padding: 0;
margin: 0;
}
html,
body {
padding: 0;
margin: 0;
height: 100%;
}
#image {
width: 100%;
height: 100%;
display: block;
margin: 0;
padding: 0;
}
'''

# Accepted image file extensions (without the dot) mapped to their media type.
# Lookups elsewhere in the script use the extension exactly as it appears in
# the file name, so matching is case-sensitive.
IMAGE_TYPES = {
    'jpeg': 'image/jpeg',
    'jpg': 'image/jpeg',
    'png': 'image/png'
}
def image2xhtml(imgfile, width, height, title, epubtype = 'bodymatter', lang = 'en'):
    """Return the XHTML page that displays a single image.

    The image is wrapped in an SVG element whose viewBox equals the image
    size, and the same size is written to the viewport meta tag.

    Parameters:
        imgfile:  href of the image, relative to the page document.
        width:    image width, used for the viewport, viewBox and <image>.
        height:   image height, used likewise.
        title:    text of the <title> element.
        epubtype: value of the epub:type attribute on <body>.
        lang:     value of the lang attribute on <html>.

    Returns:
        The complete document as a str.
    """
    # Every caller-supplied string is escaped, including the two that end up
    # in attribute values (epubtype, lang), so no input can break the markup.
    content = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="{lang}">
<head>
<meta name="viewport" content="width={width}, height={height}"/>
<title>{title}</title>
<link rel="stylesheet" type="text/css" href="imagestyle.css"/>
</head>
<body epub:type="{epubtype}">
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" id="image" version="1.1" viewBox="0 0 {width} {height}"><image width="{width}" height="{height}" xlink:href="{filename}"/></svg>
</body>
</html>
'''.format(width=width, height=height,
           filename=escape(imgfile), title=escape(title),
           epubtype=escape(epubtype), lang=escape(lang))
    return content
def createOpf(title, author, bookId, imageFiles):
    """Build the OPF package document (content.opf) for the book.

    Besides its parameters, this function reads the module-level ``args``
    (direction, subject, directory) and opens the first image to record its
    size as ``original-resolution``.

    Parameters:
        title:      text of dc:title.
        author:     text of dc:creator.
        bookId:     text of dc:identifier (the package's unique identifier).
        imageFiles: ordered image file names inside ``args.directory``; must
                    contain at least one entry, the first being the cover.

    Returns:
        The serialized XML produced by ``etree.tostring`` with a UTF-8 XML
        declaration.

    Raises:
        KeyError: if an image extension is not a key of IMAGE_TYPES.
    """
    dc = '{' + NAMESPACES['DC'] + '}'
    package_attributes = {'xmlns': NAMESPACES['OPF'],
                          'unique-identifier': 'bookId',
                          'version': '3.0',
                          'prefix': 'rendition: http://www.idpf.org/vocab/rendition/#',
                          'dir': args.direction}
    nsmap = {'dc': NAMESPACES['DC'], 'opf': NAMESPACES['OPF']}
    root = etree.Element('package', package_attributes)

    # metadata
    metadata = etree.SubElement(root, 'metadata', nsmap=nsmap)
    el = etree.SubElement(metadata, 'meta', {'property': 'dcterms:modified'})
    # The trailing 'Z' marks the value as UTC, so the clock must be read in
    # UTC rather than local time.
    el.text = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    el = etree.SubElement(metadata, dc + 'identifier', {'id': 'bookId'})
    el.text = bookId
    el = etree.SubElement(metadata, dc + 'title')
    el.text = title
    el = etree.SubElement(metadata, dc + 'creator', {'id': 'creator'})
    el.text = author
    el = etree.SubElement(metadata, dc + 'language')
    el.text = 'en'
    # args.subject is None when -s was never given on the command line.
    for subject in args.subject or []:
        el = etree.SubElement(metadata, dc + 'subject')
        el.text = subject
    etree.SubElement(metadata, 'meta', {'name': 'cover', 'content': 'img-' + UID_FORMAT.format(0)})
    el = etree.SubElement(metadata, 'meta', {'property': 'rendition:layout'})
    el.text = 'pre-paginated'
    el = etree.SubElement(metadata, 'meta', {'property': 'rendition:orientation'})
    el.text = 'portrait'
    el = etree.SubElement(metadata, 'meta', {'property': 'rendition:spread'})
    el.text = 'landscape'
    # The first image's size is recorded as the book's original resolution.
    width, height = imagesize.get(path.join(args.directory, imageFiles[0]))
    etree.SubElement(metadata, 'meta', {'name': 'original-resolution', 'content': str(width) + 'x' + str(height)})

    # manifest
    manifest = etree.SubElement(root, 'manifest')
    etree.SubElement(manifest, 'item', {
        'href': 'imagestyle.css',
        'id': 'imagestyle',
        'media-type': 'text/css'
    })
    for i, img in enumerate(imageFiles):
        uid = UID_FORMAT.format(i)
        ext = path.splitext(img)[1]
        imgattrs = {
            'href': 'images/page-' + uid + ext,
            'id': 'img-' + uid,
            'media-type': IMAGE_TYPES[ext[1:]],
        }
        if i == 0:
            # The first image doubles as the cover image.
            imgattrs['properties'] = 'cover-image'
        etree.SubElement(manifest, 'item', imgattrs)
        etree.SubElement(manifest, 'item', {
            'href': 'page-' + uid + '.xhtml',
            'id': 'page-' + uid,
            'media-type': 'application/xhtml+xml',
            'properties': 'svg'
        })
    etree.SubElement(manifest, 'item', {
        'href': 'toc.ncx',
        'id': 'ncxtoc',
        'media-type': 'application/x-dtbncx+xml',
    })
    etree.SubElement(manifest, 'item', {
        'href': 'toc.xhtml',
        'id': 'toc',
        'media-type': 'application/xhtml+xml',
        'properties': 'nav'
    })

    # spine
    spine = etree.SubElement(root, 'spine', {
        'toc': 'ncxtoc',
        'page-progression-direction': args.direction
    })
    for i, img in enumerate(imageFiles):
        uid = UID_FORMAT.format(i)
        # Pages alternate sides; which parity lands on the right depends on
        # the reading direction.
        props = 'page-spread-left'
        if (i % 2 == 0 and args.direction == 'rtl') or (i % 2 != 0 and args.direction == 'ltr'):
            props = 'page-spread-right'
        etree.SubElement(spine, 'itemref', {
            'idref': 'page-' + uid,
            'properties': props
        })

    tree_str = etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True)
    return tree_str
def createNcx(title, author, bookId):
    """Return the NCX table of contents (toc.ncx) as a str.

    The navigation map holds a single entry, labelled with the book title,
    that points at the first page (page-000.xhtml).

    Parameters:
        title:  book title, used for docTitle and the navPoint label.
        author: book author, used for docAuthor.
        bookId: unique identifier written to the dtb:uid meta element.
    """
    # bookId can come from the command line and lands in an attribute value,
    # so it is escaped like the other two fields.
    return '''<?xml version="1.0" encoding="utf-8" standalone="no"?>
<ncx:ncx xmlns:ncx="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<ncx:head>
<ncx:meta name="dtb:uid" content="{bookId}"/>
<ncx:meta name="dtb:depth" content="-1"/>
<ncx:meta name="dtb:totalPageCount" content="0"/>
<ncx:meta name="dtb:maxPageNumber" content="0"/>
</ncx:head>
<ncx:docTitle>
<ncx:text>{title}</ncx:text>
</ncx:docTitle>
<ncx:docAuthor>
<ncx:text>{author}</ncx:text>
</ncx:docAuthor>
<ncx:navMap>
<ncx:navPoint id="p01" playOrder="1">
<ncx:navLabel>
<ncx:text>{title}</ncx:text>
</ncx:navLabel>
<ncx:content src="page-000.xhtml"/>
</ncx:navPoint>
</ncx:navMap>
</ncx:ncx>
'''.format(title=escape(title), author=escape(author), bookId=escape(bookId))
def createNav(title, pageCount):
    """Return the EPUB3 navigation document (toc.xhtml) as a str.

    The document has a one-entry table of contents, labelled with the book
    title and pointing at page-000.xhtml, followed by a page list with one
    link per page numbered from 1.

    Parameters:
        title:     book title, used for <title> and the single TOC entry.
        pageCount: number of pages to list.
    """
    # One <li> per page: file names are zero-based, visible numbers one-based.
    page_items = '\n'.join(
        ' <li><a href="page-{uid}.xhtml">{pageNumber}</a></li>'.format(
            uid=UID_FORMAT.format(index), pageNumber=index + 1)
        for index in range(pageCount)
    )
    template = '''<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.idpf.org/epub/30/schema/epub-nav-30.rnc" type="application/relax-ng-compact-syntax"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en">
<head>
<title>{title}</title>
</head>
<body>
<section class="frontmatter" epub:type="frontmatter toc">
<h1>Table of Contents</h1>
<nav epub:type="toc" id="toc">
<ol>
<li epub:type="chapter" id="toc-01">
<a href="page-000.xhtml">{title}</a>
</li>
</ol>
</nav>
<nav epub:type="page-list">
<ol>
{pages}
</ol>
</nav>
</section>
</body>
</html>'''
    return template.format(pages=page_items, title=escape(title))
# Collect the pages: regular files in the input directory whose extension is
# a key of IMAGE_TYPES, ordered by file name.
imageFiles = sorted(
    f for f in listdir(args.directory)
    if path.isfile(path.join(args.directory, f))
    and path.splitext(f)[1][1:] in IMAGE_TYPES
)

if not imageFiles:
    print('Too few images:', len(imageFiles))
    sys.exit(1)

pageTotal = len(imageFiles)
print('Found ' + str(pageTotal) + ' pages.')

# The requested deflate level is applied by temporarily overriding zlib's
# default level; try/finally guarantees it is restored, and the with-block
# guarantees the archive is closed, even when a step below raises.
prev_compression = zipfile.zlib.Z_DEFAULT_COMPRESSION
zipfile.zlib.Z_DEFAULT_COMPRESSION = args.level
try:
    with zipfile.ZipFile(args.output, 'w', zipfile.ZIP_DEFLATED) as output:
        # The mimetype entry is written first and stored uncompressed.
        output.writestr('mimetype', 'application/epub+zip', compress_type=zipfile.ZIP_STORED)
        output.writestr(CONTAINER_PATH, CONTAINER_XML)
        output.writestr('OEBPS/content.opf', createOpf(args.title, args.author, args.storyid, imageFiles))
        output.writestr('OEBPS/toc.ncx', createNcx(args.title, args.author, args.storyid))
        output.writestr('OEBPS/toc.xhtml', createNav(args.title, pageTotal))
        output.writestr('OEBPS/imagestyle.css', IMAGESTYLE_CSS)

        for i, img in enumerate(imageFiles):
            uid = UID_FORMAT.format(i)
            # The first image is the cover; the rest are titled by index.
            if i == 0:
                title = 'Cover'
                epubtype = 'cover'
            else:
                title = 'Page ' + str(i)
                epubtype = 'bodymatter'

            source = path.join(args.directory, img)
            target = 'images/page-' + uid + path.splitext(img)[1]
            width, height = imagesize.get(source)
            print(str(round(i/pageTotal*100)) + '%', 'Processing page ' + str(i+1) + ' of ' + str(pageTotal) + ': ' + img, '(' + str(width) + 'x' + str(height) + ')')

            html = image2xhtml(target, width, height, title, epubtype, 'en')
            output.writestr('OEBPS/page-{uid}.xhtml'.format(uid=uid), html)
            output.write(source, 'OEBPS/' + target)
finally:
    zipfile.zlib.Z_DEFAULT_COMPRESSION = prev_compression

print('Complete! Saved EPUB as ' + args.output)
Thank you very much for providing this script. It is the best tool I have used for lossless conversion to EPUB files. I would like to make a suggestion: could you add a language tag option to this script? That way it could support different language tags. Thanks.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for this script @daniel-j! Especially for manga, it's the only way I found that results in a nicely formatted EPUB to read in Apple Books (Calibre completely messes up the layout, I'm guessing because their EPUB conversion doesn't support fixed-layout).
It'd be great if the script would also check if there are landscape images and split them in half, otherwise Apple Books only shows half of the image. I created my own script for this to run before yours: https://gist.github.com/imkh/1e349de95879d22445550f3ac222fc0f
If I have a small request: how difficult would it be to add support for nested chapters in the table of contents file? Something like:
I tried to add it myself but my Python skills are just too limited 😬
Thanks!