Created
February 10, 2025 09:32
-
-
Save kylehowells/e4eabe7d6c59b1d451f0a7ad91f18268 to your computer and use it in GitHub Desktop.
Revisions
-
kylehowells created this gist
Feb 10, 2025. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,122 @@ # Get one language: # https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=jwpub&alllangs=0&langwritten=S # Get all languages: # https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=epub&alllangs=1&langwritten=E import requests def get_media_links(language_code: str, format: str, document: str) -> list[str]: response = requests.get(f'https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub={document}&fileformat={format}&alllangs=0&langwritten={language_code}') return response.json() # MARK: - Download def download(langCode: str|None=None, docID: str|None=None): language_code = langCode or 'E' # format = 'jwpub' format = 'epub' document = docID or 'nwt' media_links = get_media_links(language_code, format, document) print(media_links) publication_name = media_links['pubName'] print("Publication Name: ", publication_name) file_url = media_links['files'][language_code][format.upper()][0]['file']['url'] print("File URL: ", file_url) file_name = file_url.split('/')[-1] print(f"Downloading: {file_name}") response = requests.get(file_url) with open(file_name, 'wb') as f: f.write(response.content) print(f"Downloaded: {file_name}") # MARK: - Main if __name__ == '__main__': import sys langCode = None docID = None if len(sys.argv) > 1: langCode = sys.argv[1] if len(sys.argv) > 2: docID = sys.argv[2] download(langCode=langCode, docID=docID) # MARK: - Notes """" { "pubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)", "parentPubName": "La Biblia. 
Traducción del Nuevo Mundo (revisión del 2019)", "booknum": 0, "pub": "nwt", "issue": "", "formattedDate": "", "fileformat": [ "JWPUB" ], "track": null, "specialty": "", "pubImage": { "url": "", "modifiedDatetime": "", "checksum": null }, "languages": { "S": { "name": "español", "direction": "ltr", "locale": "es", "script": "ROMAN" } }, "files": { "S": { "JWPUB": [ { "title": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)", "file": { "url": "https://cfp2.jw-cdn.org/a/03c55e4/2/o/nwt_S.jwpub", "stream": "https://jw.org", "modifiedDatetime": "2024-08-15 14:38:30", "checksum": "31bef50c135d9940e97ebc47fb99cc44" }, "filesize": 37413926, "trackImage": { "url": "", "modifiedDatetime": "", "checksum": null }, "markers": null, "label": "0p", "track": 0, "hasTrack": false, "pub": "nwt", "docid": 0, "booknum": 0, "mimetype": "application/octet-stream", "edition": "", "editionDescr": "Normal", "format": "", "formatDescr": "Normal", "specialty": "", "specialtyDescr": "", "subtitled": false, "frameWidth": 0, "frameHeight": 0, "frameRate": 0, "duration": 0, "bitRate": 0 } ] } } } """ This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,459 @@ import json import ebooklib from ebooklib import epub from pathlib import Path import os from bs4 import BeautifulSoup, Tag from bs4.element import Tag, NavigableString import re from dataclasses import dataclass, asdict # MARK: - Helper Function def read_item(item: epub.EpubItem) -> BeautifulSoup: print(item.get_name()) content = item.get_content() soup = BeautifulSoup(content, 'html.parser') return soup # MARK: - Extract Book into HTML Files def extract_book(book_path: str) -> None: """ Extracts content from an EPUB book and processes its documents. 
Args: book_path: str - Path to the EPUB file to process """ # Convert book path to Path object for proper handling path_obj = Path(book_path) basename = path_obj.stem # Create output directory if it doesn't exist output_dir = Path(basename) output_dir.mkdir(exist_ok=True) book = epub.read_epub(book_path) for item in book.get_items(): if item.get_type() == ebooklib.ITEM_DOCUMENT: print('==================================') file_name = item.get_name() print('NAME : ', file_name) print('----------------------------------') content = item.get_content() # Write content to file in the output directory output_path = output_dir / file_name os.makedirs(os.path.dirname(output_path), exist_ok=True) with open(output_path, 'wb') as f: f.write(content) print('==================================') # MARK: - Extract Verse Info @dataclass class BibleVerse: # The chapter number chapter: int # The verse number verse: int # The text of the verse text: str @dataclass class BibleParagraph: # verses don't mean a whole sentence, they can be a part of a sentence. # Group into full paragraphs. verses: list[BibleVerse] # The text of the paragraph text: str @dataclass class BibleChapter: # The chapter number number: int # The grouped paragraphs paragraphs: list[BibleParagraph] # The individual verses verses: list[BibleVerse] # The text whole of the chapter text: str def __init__(self): self.verses = [] self.paragraphs = [] self.text = "" self.number = 0 @dataclass class BibleBook: # The name of the book of the Bible name: str # The chapters of the book of the Bible chapters: list[BibleChapter] def __init__(self): self.name = "" self.chapters = [] @dataclass class Bible: # The translation name name: str # The books of the Bible books: list[BibleBook] def __init__(self): self.name = "" self.books = [] class BibleEncoder(json.JSONEncoder): """ Custom JSON encoder for Bible classes. 
""" def default(self, obj): if isinstance(obj, (Bible, BibleBook, BibleChapter)): return asdict(obj) return super().default(obj) # MARK: - Extract Bible def extract_verses(book_file_name: str) -> Bible: """ Extracts verses from a book file. Args: book_file_name: str - Name of the book file to process Returns: Bible - The parsed Bible object """ book = epub.read_epub(book_file_name) bible = Bible() bible.name = get_title(book) bible.books = [] toc_items = list_toc(book) for book_name, book_link in toc_items: print(f"{book_name} -> {book_link}") chapter_links = extract_chapter_links(book_link, book) bible_book = BibleBook() bible_book.name = book_name bible_book.chapters = [] for chapter_num, chapter_link in chapter_links: chapter = extract_chapter_verses(chapter_link, book) print(f"Found {len(chapter.paragraphs)} paragraphs") print(f"Found {len(chapter.verses)} verses") print(f"Found {sum([len(paragraph.verses) for paragraph in chapter.paragraphs])} p.verses") print(f"Found {len(chapter.text)} text") chapter.number = int(chapter_num) bible_book.chapters.append(chapter) # return bible.books.append(bible_book) print(f"Found {len(bible.books)} books") # Write bible to json file using the custom encoder path_obj = Path(book_file_name) basename = path_obj.stem with open(f"{basename}.json", "w") as f: json.dump(bible, f, cls=BibleEncoder, indent=2) return bible def get_title(book: epub.EpubBook) -> str: """ Extracts the title from the EPUB book. """ return book.get_metadata('DC', 'title')[0][0] def list_toc(book: epub.EpubBook) -> list[tuple[str, str]]: """ Lists all table of contents entries from the EPUB book. 
Args: book_path: str - Path to the EPUB file to process Returns: list[tuple[str, str]] - List of tuples containing the (title, link) of each TOC entry """ toc_items = [] # Get the table of contents for item in book.get_items(): if item.get_type() == ebooklib.ITEM_DOCUMENT: if item.get_name().startswith('toc'): soup = read_item(item) # Find the TOC navigation list nav_list = soup.find('nav', attrs={'epub:type': 'toc'}).find('ol') # Extract all links and their text for link in nav_list.find_all('a'): href = link.get('href') text = link.get_text(strip=True) toc_items.append((text, href)) # Get just the links to Bible Book Chapter Lists TOCs book_tocs = [] for item in toc_items: name = item[0] link = item[1] if link.startswith('biblechapternav'): book_tocs.append(item) print(f"{name} -> {link}") return book_tocs def extract_chapter_links(book_file_name: str, book: epub.EpubBook) -> list[tuple[str, str]]: """ Extracts chapter links from the bible chapter navigation file. Args: book_file_name: str - Name of the navigation file to process (e.g. 
biblechapternav1.xhtml) book: epub.EpubBook - The epub book object containing the file Returns: list[tuple[str, str]] - List of tuples containing the (chapter number, link) of each chapter """ print(f"Extracting chapter verses from {book_file_name}") item = book.get_item_with_href(book_file_name) if item is None: print(f"Could not find item with href {book_file_name}") return soup = read_item(item) # Get the book name from the h2 heading book_heading = soup.find('h2', class_='w_navigation w_bibleChapter') if book_heading: book_name = book_heading.get_text(strip=True) print(f"Processing book: {book_name}") # Find all chapter links in the table chapter_links = soup.find_all('a') chapters = [] for link in chapter_links: if link.parent.name == 'td': # Only process links inside table cells href = link.get('href') chapter_num = link.get_text(strip=True) chapters.append((chapter_num, href)) print(f"Chapter {chapter_num}: {href}") return chapters def join_parts(parts: list[str]) -> str: result = [] for i, part in enumerate(parts): part = clean_text(part) # print unicode character values for the first 3 chars in this string # for char in part[:3]: # print(f"Char: '{char}' - {ord(char)}") # # Add the last 3 chars of the part to the result # for char in part[-3:]: # print(f"Char: '{char}' - {ord(char)}") if i > 0: # If previous part ends with space and current starts with space # strip spaces from both and add single space previous_part = clean_text(result[-1]) if previous_part.endswith(' ') and part.startswith(' '): result[-1] = previous_part.rstrip() result.append(' ' + part.lstrip()) else: result.append(clean_text(part)) else: result.append(clean_text(part)) return ''.join(result) def extract_chapter_verses(chapter_link: str, book: epub.EpubBook) -> BibleChapter | None: """ Extracts verses from a chapter file in the EPUB book. 
Args: chapter_link: str - Link to the chapter file to process book: epub.EpubBook - The EPUB book object Returns: list[BibleVerse] - List of parsed Bible verses """ item = book.get_item_with_href(chapter_link) if item is None: print(f"Could not find item with href {chapter_link}") return None soup = read_item(item) print(f"Processing chapter: {chapter_link}") # Get all p tags with class sb verse_paragraph_elements = soup.find_all('p', class_='sb') print(f"Found {len(verse_paragraph_elements)} verse paragraph elements") # <p id="p2" data-pid="2" class="p2 sb"><span id="pos48"/><span id="chapter1"/><span id="chapter1_verse1"/><span class="w_ch"><strong>1</strong> </span>In the beginning God created the heavens and the earth.</p> # List of all the paragraph's (lists of verses) in the chapter verse_paragraphs: list[list[str]] = [] # <p id="p7" data-pid="7" class="p7 sb"> # <span id="pos1609"/> # <span id="chapter1_verse14"/><strong><sup>14</sup></strong> Then God said: “Let there be luminaries<span id="footnotesource5"/><a epub:type="noteref" href="#footnote5">*</a> in the expanse of the heavens to make a division between the day and the night, and they will serve as signs for seasons and for days and years. # <span id="chapter1_verse15"/><strong><sup>15</sup></strong> They will serve as luminaries in the expanse of the heavens to shine upon the earth.” And it was so. # <span id="chapter1_verse16"/><strong><sup>16</sup></strong> And God went on to make the two great luminaries, the greater luminary for dominating the day and the lesser luminary for dominating the night, and also the stars. # <span id="chapter1_verse17"/><strong><sup>17</sup></strong> Thus God put them in the expanse of the heavens to shine upon the earth <span id="chapter1_verse18"/><strong><sup>18</sup></strong> and to dominate by day and by night and to make a division between the light and the darkness. Then God saw that it was good. 
# <span id="chapter1_verse19"/><strong><sup>19</sup></strong> And there was evening and there was morning, a fourth day. # </p> for verse_paragraph in verse_paragraph_elements: verses_in_paragraph: list[tuple[int, int, str]] = [] # List of verses in the paragraph (chapter, verse, text) single_verse_sections: list[str] = [] # List of sections of text which make up a single verse current_chapter: int = 0 current_verse: int = 0 for child in verse_paragraph.children: # print(f"Child: '{child}' - '{child.name}'") # if child is not str and child.name is not None: # print(f"{child.name} - {child.get('id', '')} - {child.get('class', '')} - {child.children}") if child.name == 'span' and child.get('id', '').startswith('pos'): continue if child.name == 'span' and child.get('id', '').startswith('chapter') and not '_verse' in child.get('id', ''): current_verse = child.get_text(strip=True) # print(f"Chapter: {current_chapter} - Verse: '{current_verse}'") continue if child.name == 'span' and child.get('id', '').startswith('chapter') and '_verse' in child.get('id', ''): child_id = child.get('id', '') # Found the end of this verse if len(single_verse_sections) > 0: combined_verse_text = join_parts(single_verse_sections) verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text)) single_verse_sections = [] current_chapter = child_id.split('_')[0][7:] current_verse = child_id.split('_')[1][5:] # print(f"Chapter: {current_chapter} - Verse: '{current_verse}'") continue if child.name == 'span' and 'w_ch' in child.get('class', []): # print("Verse marker: ", child.get_text(strip=True)) continue if child.name is None and child.string is not None: # print(f"Child: '{child}' - '{child.string}'") text = child.string # count = len(text) # stripped_count = len(text.strip()) # if count != stripped_count: # print(f"Stripped count: {stripped_count} - count: {count}") #text = text.lstrip() single_verse_sections.append(text) continue if child.name == 'strong' and 
isinstance(child, Tag) and len(child.contents) > 0 and isinstance(child.contents[0], Tag) and child.contents[0].name == 'sup': # print("Verse number: ", child.get_text(strip=True)) continue if child.name == 'strong' and len(child.contents) > 0 and child.contents[0].name == 'span' and 'altsize': span_child = child.contents[0] if span_child.get('class', []) and len(span_child.contents) > 0 and span_child.contents[0].name == 'sup': # print("Verse number: ", child.get_text(strip=True)) continue if child.name == 'span' and child.get('id', '').startswith('footnotesource'): continue if child.name == 'a' and child.get('epub:type', '') == 'noteref': continue if child.name == 'span' and "pageNum" in child.get('class', []): continue if child.name == 'em' and child.string is not None: single_verse_sections.append(child.string) continue if child.name == 'strong' and child.string is not None: strong_text = child.string.strip() if strong_text == current_verse: continue print(f"ERROR: Unknown child: {child}") return None # Add the last verse to the list combined_verse_text = join_parts(single_verse_sections) verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text)) # Add the paragraph to the list verse_paragraphs.append(verses_in_paragraph) print(f"Found {len(verse_paragraphs)} paragraphs, containing {sum([len(paragraph) for paragraph in verse_paragraphs])} verses") # # Print the verses # print('----------------------------------') # print("\n\n") # print("\n\n") # for paragraph in verse_paragraphs: # for verse in paragraph: # print(f"Chapter: {verse[0]} - Verse: {verse[1]} - Text: {verse[2]}") # print("\n\n") # print("\n\n") # print('----------------------------------') # print("\n\n") # print("\n\n") # for paragraph in verse_paragraphs: # text = join_parts([verse[2] for verse in paragraph]) # print(text) # print() # print("\n\n") # print("\n\n") # print('----------------------------------') chapter = BibleChapter() for verses in verse_paragraphs: text = 
join_parts([verse[2] for verse in verses]) verses = [BibleVerse(int(verse[0]), int(verse[1]), verse[2]) for verse in verses] paragraph = BibleParagraph(verses, clean_text_extra(text.strip())) chapter.paragraphs.append(paragraph) chapter.verses.extend(verses) chapter.text = clean_text_extra('\n'.join([paragraph.text.strip() for paragraph in chapter.paragraphs])) # print(f"Chapter: {chapter.number} - \n\n{chapter.text}\n\n") return chapter def clean_text(text: str) -> str: """ Cleans the text by removing leading and trailing spaces and replacing multiple spaces with a single space. """ text = text.replace(chr(160), ' ') # Replace non-breaking space with space text = text.replace(u"\u201D", '"') # Replace right double quotation mark with double quotation mark text = text.replace(u"\u201C", '"') # Replace left double quotation mark with double quotation mark text = text.replace(u"\u2019", "'") # Replace right single quotation mark with single quotation mark text = text.replace(u"\u2018", "'") # Replace left single quotation mark with single quotation mark text = text.replace(u"\u02b9", "'") # Replace right single quotation mark with single quotation mark return text def clean_text_extra(text: str) -> str: """Remove extra stuff from the summary text, but leave this stuff in the verses themselves""" text = text.replace(u"\u00b7", '') # remove * dot in the middle of words for punctuation # text = text.replace(u"\u2014", '-') # remove em dash return text # MARK: - Main if __name__ == '__main__': import sys file_name = None default_file = "nwt_E.epub" try: file_name = sys.argv[1] if len(sys.argv) > 1 else default_file except IndexError: print(f"Using default file: {default_file}") except FileNotFoundError: print(f"Error: Could not find file '{file_name}'") sys.exit(1) except Exception as e: print(f"Error processing file: {str(e)}") sys.exit(1) file = file_name or default_file # extract_book(file) extract_verses(file)