Last active
May 8, 2025 06:33
-
-
Save robmcelhinney/b16f7db3a31330bb8d342c7ae03435b2 to your computer and use it in GitHub Desktop.
Convert epub to a .txt file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| from pathlib import Path | |
| import ebooklib | |
| from ebooklib import epub | |
| from bs4 import BeautifulSoup | |
| import argparse | |
# Approach adapted from:
# https://medium.com/@zazazakaria18/turn-your-ebook-to-text-with-python-in-seconds-2a1e42804913

# Names of parent elements whose text nodes must not appear in the output.
# 'style' is listed so that inline CSS is not written out as book text.
blocklist = [
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    'style',
]
def main(argv=None):
    """Parse command-line options and convert each requested EPUB file.

    Args:
        argv: Optional list of argument strings. When ``None`` argparse
            falls back to ``sys.argv[1:]``, so calling ``main()`` with no
            arguments behaves as a normal script entry point.

    Raises:
        SystemExit: via ``parser.error`` (exit status 2) when no
            ``-f/--files`` option was supplied.
    """
    parser = argparse.ArgumentParser(
        description='Convert epub to txt',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('-f', '--files', action='append', help='epub file')
    # Real defaults are declared here so the help formatter can display them.
    parser.add_argument('-s', '--src', action='store', default='',
                        help='Source location')
    parser.add_argument('-d', '--dest', action='store', default='.',
                        help='Destination location')
    args = parser.parse_args(argv)

    if not args.files:
        # parser.error prints usage to stderr and exits non-zero, unlike the
        # interactive-only quit() helper which exits with status 0.
        parser.error('Must provide file')

    for file_name in args.files:
        file_path = os.path.join(args.src, file_name)
        add_file_to_json(file_path, args.dest)
def add_file_to_json(filename, dest):
    """Convert one EPUB file into a UTF-8 text file inside ``dest``.

    Despite the name, the output is plain text: the file is named after the
    EPUB's stem with a ``.txt`` suffix, and the text of each document item
    is written followed by a newline.

    Args:
        filename: Path of the EPUB file to read.
        dest: Directory in which the ``.txt`` file is created.
    """
    book = epub.read_epub(filename)
    stem = Path(filename).stem
    target = os.path.join(dest, stem + '.txt')

    with open(target, 'w+', encoding='utf-8') as out:
        # Gather the raw content of every document item before writing.
        documents = [
            entry.get_content()
            for entry in book.get_items()
            if entry.get_type() == ebooklib.ITEM_DOCUMENT
        ]
        for markup in documents:
            out.write(chapter_to_text(markup) + '\n')
def chapter_to_text(chap):
    """Return the visible text of one chapter's HTML markup.

    Text nodes whose parent element name is in ``blocklist`` and nodes that
    consist only of whitespace are skipped. A blank line (``'\\n\\n'``) is
    inserted before a text node unless the previously emitted node ends with
    a space or the current node starts with one, so fragments already
    separated by a space stay on the same line.

    Args:
        chap: Chapter markup, passed unchanged to ``BeautifulSoup``.

    Returns:
        The concatenated text as a single string.
    """
    soup = BeautifulSoup(chap, 'html.parser')
    pieces = []
    prev = ''
    # ``string=True`` is the current spelling of the deprecated ``text=True``.
    for node in soup.find_all(string=True):
        if node.parent.name in blocklist or node.isspace():
            continue
        text = str(node)
        if not (prev.endswith(' ') or text.startswith(' ')):
            pieces.append('\n\n')
        pieces.append(text)
        # NOTE(review): assumes `prev` tracks the last *emitted* node — confirm
        # against the original indentation.
        prev = text
    # Joining once avoids repeated string concatenation in the loop.
    return ''.join(pieces)
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment