kylehowells · February 10, 2025 09:32 · Feb 10, 2025
diff --git a/download_nwt.py b/download_nwt.py
@@ -0,0 +1,122 @@
+
+# Get one language:
+# https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=jwpub&alllangs=0&langwritten=S
+
+# Get all languages:
+# https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=epub&alllangs=1&langwritten=E
+
+
+import requests
+
+
+def get_media_links(language_code: str, format: str, document: str) -> dict:
+    """
+    Queries the GETPUBMEDIALINKS API for one publication in one language.
+
+    Args:
+        language_code: str - Value sent as `langwritten` (e.g. 'E', 'S')
+        format: str - Value sent as `fileformat` (e.g. 'epub', 'jwpub')
+        document: str - Value sent as `pub` (e.g. 'nwt')
+    Returns:
+        dict - The decoded JSON response (see the sample in the notes at the end of this file)
+    Raises:
+        requests.HTTPError - If the server answers with a 4xx/5xx status
+    """
+    # Let requests build and escape the query string instead of formatting it by hand
+    response = requests.get(
+        'https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS',
+        params={
+            'booknum': 0,
+            'output': 'json',
+            'pub': document,
+            'fileformat': format,
+            'alllangs': 0,
+            'langwritten': language_code,
+        },
+        timeout=30,  # requests waits forever by default
+    )
+    # Fail here with a clear HTTP error rather than later while decoding an error page
+    response.raise_for_status()
+    return response.json()
+
+# MARK: - Download
+
+def download(langCode: str|None=None, docID: str|None=None) -> None:
+    """
+    Downloads the EPUB of a publication into the current working directory.
+
+    Args:
+        langCode: str | None - Language code to request; defaults to 'E'
+        docID: str | None - Publication id to request; defaults to 'nwt'
+    Raises:
+        requests.HTTPError - If the file download answers with a 4xx/5xx status
+        KeyError / IndexError - If the API response has no file for the language/format
+    """
+    language_code = langCode or 'E'
+    # 'jwpub' is the other format shown in the sample response at the end of this file
+    file_format = 'epub'
+    document = docID or 'nwt'
+    media_links = get_media_links(language_code, file_format, document)
+    print(media_links)
+    publication_name = media_links['pubName']
+    print("Publication Name: ", publication_name)
+    # The response keys the file list by language code, then by upper-cased format
+    file_url = media_links['files'][language_code][file_format.upper()][0]['file']['url']
+    print("File URL: ", file_url)
+    file_name = file_url.split('/')[-1]
+    print(f"Downloading: {file_name}")
+    # Stream to disk so the whole publication is never held in memory at once
+    with requests.get(file_url, stream=True, timeout=60) as response:
+        # Check the status before creating the file so an error page is never saved
+        response.raise_for_status()
+        with open(file_name, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=1024 * 1024):
+                f.write(chunk)
+    print(f"Downloaded: {file_name}")
+
+
+# MARK: - Main
+
+if __name__ == '__main__':
+    import sys
+
+    # Optional positional arguments: <language code> <publication id>
+    cli_args = sys.argv[1:]
+    langCode = cli_args[0] if len(cli_args) > 0 else None
+    docID = cli_args[1] if len(cli_args) > 1 else None
+
+    download(langCode=langCode, docID=docID)
+
+
+# MARK: - Notes
+
+"""
+{
+  "pubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
+  "parentPubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
+  "booknum": 0,
+  "pub": "nwt",
+  "issue": "",
+  "formattedDate": "",
+  "fileformat": [
+    "JWPUB"
+  ],
+  "track": null,
+  "specialty": "",
+  "pubImage": {
+    "url": "",
+    "modifiedDatetime": "",
+    "checksum": null
+  },
+  "languages": {
+    "S": {
+      "name": "español",
+      "direction": "ltr",
+      "locale": "es",
+      "script": "ROMAN"
+    }
+  },
+  "files": {
+    "S": {
+      "JWPUB": [
+        {
+          "title": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
+          "file": {
+            "url": "https://cfp2.jw-cdn.org/a/03c55e4/2/o/nwt_S.jwpub",
+            "stream": "https://jw.org",
+            "modifiedDatetime": "2024-08-15 14:38:30",
+            "checksum": "31bef50c135d9940e97ebc47fb99cc44"
+          },
+          "filesize": 37413926,
+          "trackImage": {
+            "url": "",
+            "modifiedDatetime": "",
+            "checksum": null
+          },
+          "markers": null,
+          "label": "0p",
+          "track": 0,
+          "hasTrack": false,
+          "pub": "nwt",
+          "docid": 0,
+          "booknum": 0,
+          "mimetype": "application/octet-stream",
+          "edition": "",
+          "editionDescr": "Normal",
+          "format": "",
+          "formatDescr": "Normal",
+          "specialty": "",
+          "specialtyDescr": "",
+          "subtitled": false,
+          "frameWidth": 0,
+          "frameHeight": 0,
+          "frameRate": 0,
+          "duration": 0,
+          "bitRate": 0
+        }
+      ]
+    }
+  }
+}
+"""
diff --git a/epub_to_json.py b/epub_to_json.py
@@ -0,0 +1,459 @@
+import json
+import os
+import re
+from dataclasses import asdict, dataclass, field, is_dataclass
+from pathlib import Path
+
+import ebooklib
+from bs4 import BeautifulSoup, Tag
+from bs4.element import Tag, NavigableString
+from ebooklib import epub
+
+
+# MARK: - Helper Function
+
+def read_item(item: epub.EpubItem) -> BeautifulSoup:
+    """
+    Prints the item's name and parses its content with the 'html.parser' backend.
+
+    Args:
+        item: epub.EpubItem - The EPUB item to read
+    Returns:
+        BeautifulSoup - The parsed document
+    """
+    print(item.get_name())
+    return BeautifulSoup(item.get_content(), 'html.parser')
+
+
+# MARK: - Extract Book into HTML Files
+
+def extract_book(book_path: str) -> None:
+    """
+    Writes every document item of an EPUB to disk, unchanged.
+
+    The files go into a directory named after the EPUB (file name without
+    extension), created relative to the current working directory. Each
+    item keeps the name (including sub-directories) it has inside the EPUB.
+
+    Args:
+        book_path: str - Path to the EPUB file to process
+    """
+    # Output directory shares the EPUB's base name
+    destination = Path(Path(book_path).stem)
+    destination.mkdir(exist_ok=True)
+
+    for entry in epub.read_epub(book_path).get_items():
+        # Only documents are dumped; other item types are ignored
+        if entry.get_type() != ebooklib.ITEM_DOCUMENT:
+            continue
+
+        entry_name = entry.get_name()
+        print('==================================')
+        print('NAME : ', entry_name)
+        print('----------------------------------')
+        payload = entry.get_content()
+
+        # Item names may contain sub-directories, so create the parents first
+        target = destination / entry_name
+        target.parent.mkdir(parents=True, exist_ok=True)
+        target.write_bytes(payload)
+
+        print('==================================')
+
+
+# MARK: - Extract Verse Info
+
+@dataclass
+class BibleVerse:
+    """A single verse, identified by its chapter and verse number."""
+
+    chapter: int  # chapter number
+    verse: int  # verse number
+    text: str  # text of the verse
+
+@dataclass
+class BibleParagraph:
+    """
+    A run of verses that belong together as one paragraph.
+
+    A verse is not necessarily a whole sentence (it can be a fragment of one),
+    so verses are grouped into full paragraphs here.
+    """
+
+    verses: list[BibleVerse]  # the verses making up the paragraph
+    text: str  # text of the paragraph
+
+@dataclass
+class BibleChapter:
+    """
+    One chapter, available as grouped paragraphs, individual verses and plain text.
+
+    Every field has a default, so `BibleChapter()` still builds an empty
+    chapter, and fields can now also be passed to the constructor.
+    """
+
+    # The chapter number
+    number: int = 0
+    # The grouped paragraphs (a fresh list per instance)
+    paragraphs: list[BibleParagraph] = field(default_factory=list)
+    # The individual verses (a fresh list per instance)
+    verses: list[BibleVerse] = field(default_factory=list)
+    # The whole text of the chapter
+    text: str = ""
+
+@dataclass
+class BibleBook:
+    """
+    One book of the Bible and its chapters.
+
+    Both fields have defaults, so `BibleBook()` still builds an empty book,
+    and fields can now also be passed to the constructor.
+    """
+
+    # The name of the book of the Bible
+    name: str = ""
+    # The chapters of the book of the Bible (a fresh list per instance)
+    chapters: list[BibleChapter] = field(default_factory=list)
+
+@dataclass
+class Bible:
+    """
+    A whole Bible translation: its name and its books.
+
+    Both fields have defaults, so `Bible()` still builds an empty Bible,
+    and fields can now also be passed to the constructor.
+    """
+
+    # The translation name
+    name: str = ""
+    # The books of the Bible (a fresh list per instance)
+    books: list[BibleBook] = field(default_factory=list)
+
+class BibleEncoder(json.JSONEncoder):
+    """
+    Custom JSON encoder for the Bible dataclasses.
+
+    Any dataclass instance is converted to a plain dict with `asdict`, which
+    also converts nested dataclasses. This covers Bible, BibleBook and
+    BibleChapter as before, and additionally BibleParagraph and BibleVerse
+    when they are serialised on their own.
+    """
+    def default(self, obj):
+        # is_dataclass is also true for dataclass *types*; asdict only accepts instances
+        if is_dataclass(obj) and not isinstance(obj, type):
+            return asdict(obj)
+        # Anything else: let the base class raise its usual TypeError
+        return super().default(obj)
+
+
+# MARK: - Extract Bible
+
+def extract_verses(book_file_name: str) -> Bible:
+    """
+    Parses an EPUB Bible into a Bible object and writes it next to the
+    current working directory as `<epub base name>.json`.
+
+    Chapters that cannot be parsed (extract_chapter_verses returned None)
+    are reported and left out instead of aborting the whole run.
+
+    Args:
+        book_file_name: str - Path of the EPUB file to process
+    Returns:
+        Bible - The parsed Bible object
+    """
+    book = epub.read_epub(book_file_name)
+
+    bible = Bible()
+    bible.name = get_title(book)
+    bible.books = []
+
+    for book_name, book_link in list_toc(book):
+        print(f"{book_name} -> {book_link}")
+        bible_book = BibleBook()
+        bible_book.name = book_name
+        bible_book.chapters = []
+
+        # `or []`: tolerate a None result when the navigation file is missing
+        chapter_links = extract_chapter_links(book_link, book) or []
+        for chapter_num, chapter_link in chapter_links:
+            chapter = extract_chapter_verses(chapter_link, book)
+            if chapter is None:
+                # The parser already printed the reason; don't crash on None
+                print(f"WARNING: Skipping {book_name} chapter {chapter_num} ({chapter_link})")
+                continue
+            paragraph_verse_count = sum(len(paragraph.verses) for paragraph in chapter.paragraphs)
+            print(f"Found {len(chapter.paragraphs)} paragraphs")
+            print(f"Found {len(chapter.verses)} verses")
+            print(f"Found {paragraph_verse_count} p.verses")
+            print(f"Found {len(chapter.text)} text")
+            # The chapter number comes from the link text of the navigation page
+            chapter.number = int(chapter_num)
+            bible_book.chapters.append(chapter)
+
+        bible.books.append(bible_book)
+
+    print(f"Found {len(bible.books)} books")
+
+    # Write bible to json file using the custom encoder
+    basename = Path(book_file_name).stem
+    with open(f"{basename}.json", "w", encoding="utf-8") as f:
+        json.dump(bible, f, cls=BibleEncoder, indent=2)
+
+    return bible
+
+
+def get_title(book: epub.EpubBook) -> str:
+    """
+    Extracts the title from the EPUB book.
+
+    Args:
+        book: epub.EpubBook - The EPUB book object
+    Returns:
+        str - The first Dublin Core title, or "" when the book declares none
+    """
+    titles = book.get_metadata('DC', 'title')
+    if not titles:
+        # No title metadata: fall back to an empty name instead of raising IndexError
+        return ""
+    # Each metadata entry is indexable; element 0 holds the title text
+    return titles[0][0]
+
+
+def list_toc(book: epub.EpubBook) -> list[tuple[str, str]]:
+    """
+    Lists the table of contents entries that point at Bible book chapter lists.
+
+    Every document item whose name starts with 'toc' is scanned for a
+    `<nav epub:type="toc">` list; only links whose href starts with
+    'biblechapternav' are returned.
+
+    Args:
+        book: epub.EpubBook - The EPUB book object
+    Returns:
+        list[tuple[str, str]] - List of tuples containing the (title, link) of each matching TOC entry
+    """
+    toc_items: list[tuple[str, str]] = []
+
+    # Get the table of contents
+    for item in book.get_items():
+        if item.get_type() != ebooklib.ITEM_DOCUMENT:
+            continue
+        if not item.get_name().startswith('toc'):
+            continue
+
+        soup = read_item(item)
+        # Find the TOC navigation list; skip documents that don't contain one
+        nav = soup.find('nav', attrs={'epub:type': 'toc'})
+        nav_list = nav.find('ol') if nav is not None else None
+        if nav_list is None:
+            continue
+
+        # Extract all links and their text
+        for link in nav_list.find_all('a'):
+            href = link.get('href')
+            if href is None:
+                # An anchor without a target cannot be followed
+                continue
+            toc_items.append((link.get_text(strip=True), href))
+
+    # Get just the links to Bible Book Chapter Lists TOCs
+    book_tocs = []
+    for name, link in toc_items:
+        if link.startswith('biblechapternav'):
+            book_tocs.append((name, link))
+            print(f"{name} -> {link}")
+
+    return book_tocs
+
+
+def extract_chapter_links(book_file_name: str, book: epub.EpubBook) -> list[tuple[str, str]]:
+    """
+    Extracts chapter links from the bible chapter navigation file.
+
+    Args:
+        book_file_name: str - Name of the navigation file to process (e.g. biblechapternav1.xhtml)
+        book: epub.EpubBook - The epub book object containing the file
+    Returns:
+        list[tuple[str, str]] - List of tuples containing the (chapter number, link) of each chapter;
+        empty when the navigation file is not part of the book
+    """
+    print(f"Extracting chapter links from {book_file_name}")
+    item = book.get_item_with_href(book_file_name)
+    if item is None:
+        print(f"Could not find item with href {book_file_name}")
+        # Honour the declared return type so callers can always iterate the result
+        return []
+
+    soup = read_item(item)
+
+    # The h2 heading carries the book name; it is only used for logging
+    book_heading = soup.find('h2', class_='w_navigation w_bibleChapter')
+    if book_heading is not None:
+        print(f"Processing book: {book_heading.get_text(strip=True)}")
+
+    chapters: list[tuple[str, str]] = []
+    for link in soup.find_all('a'):
+        # Only process links inside table cells
+        if link.parent.name != 'td':
+            continue
+        href = link.get('href')
+        chapter_num = link.get_text(strip=True)
+        chapters.append((chapter_num, href))
+        print(f"Chapter {chapter_num}: {href}")
+
+    return chapters
+
+def join_parts(parts: list[str]) -> str:
+    """
+    Concatenates text fragments after passing each through clean_text.
+
+    Where one fragment ends with a space and the next one starts with a
+    space, the whitespace at that seam is reduced to a single space.
+
+    Args:
+        parts: list[str] - The fragments, in reading order
+    Returns:
+        str - The joined text
+    """
+    joined: list[str] = []
+    for raw in parts:
+        piece = clean_text(raw)
+        if joined:
+            # Entries already in `joined` are cleaned text, so no second clean_text pass is needed
+            previous = joined[-1]
+            if previous.endswith(' ') and piece.startswith(' '):
+                joined[-1] = previous.rstrip()
+                piece = ' ' + piece.lstrip()
+        joined.append(piece)
+    return ''.join(joined)
+
+def _first_child_tag_name(element) -> str | None:
+    """Returns the tag name of the element's first child, or None if there is no child or it is text."""
+    children = getattr(element, 'contents', None)
+    if not children:
+        return None
+    return getattr(children[0], 'name', None)
+
+
+def _is_verse_number_marker(element) -> bool:
+    """Tells whether a <strong> element only wraps a superscript verse number."""
+    if element.name != 'strong':
+        return False
+    first_name = _first_child_tag_name(element)
+    if first_name == 'sup':
+        # <strong><sup>14</sup></strong>
+        return True
+    if first_name == 'span':
+        # <strong><span class="..."><sup>14</sup></span></strong>
+        # NOTE(review): the previous condition ended in a bare `and 'altsize'`, which is
+        # always true. Presumably the span's class was meant to be checked; any span with
+        # a class is still accepted so that no previously parsed markup is rejected.
+        wrapper = element.contents[0]
+        return bool(wrapper.get('class', [])) and _first_child_tag_name(wrapper) == 'sup'
+    return False
+
+
+def extract_chapter_verses(chapter_link: str, book: epub.EpubBook) -> BibleChapter | None:
+    """
+    Extracts verses from a chapter file in the EPUB book.
+
+    Only `<p class="sb">` elements are read. The returned chapter's `number`
+    is not set here.
+
+    Args:
+        chapter_link: str - Link to the chapter file to process
+        book: epub.EpubBook - The EPUB book object
+
+    Returns:
+        BibleChapter | None - The parsed chapter, or None when the file is not
+        part of the book or a paragraph contains an element this parser does
+        not recognise
+    """
+    item = book.get_item_with_href(chapter_link)
+    if item is None:
+        print(f"Could not find item with href {chapter_link}")
+        return None
+
+    soup = read_item(item)
+    print(f"Processing chapter: {chapter_link}")
+
+    # Get all p tags with class sb, e.g.
+    # <p id="p2" data-pid="2" class="p2 sb"><span id="pos48"/><span id="chapter1"/><span id="chapter1_verse1"/><span class="w_ch"><strong>1</strong> </span>In the beginning God created the heavens and the earth.</p>
+    verse_paragraph_elements = soup.find_all('p', class_='sb')
+    print(f"Found {len(verse_paragraph_elements)} verse paragraph elements")
+
+    # One entry per paragraph; each entry lists that paragraph's (chapter, verse, text) tuples
+    verse_paragraphs: list[list[tuple[int, int, str]]] = []
+
+    # A paragraph can hold several verses, each introduced by an empty marker span:
+    # <p id="p7" data-pid="7" class="p7 sb">
+    # <span id="pos1609"/>
+    # <span id="chapter1_verse14"/><strong><sup>14</sup></strong> Then God said: ...<span id="footnotesource5"/><a epub:type="noteref" href="#footnote5">*</a> ...
+    # <span id="chapter1_verse15"/><strong><sup>15</sup></strong> They will serve as luminaries ...
+    # </p>
+
+    for verse_paragraph in verse_paragraph_elements:
+        verses_in_paragraph: list[tuple[int, int, str]] = []
+        single_verse_sections: list[str] = []  # pieces of text which make up the current verse
+        # NOTE(review): position is reset for every paragraph, so text that precedes the
+        # first verse marker of a paragraph is filed under chapter 0, verse 0. Confirm
+        # whether a verse can continue across paragraphs in the source EPUB.
+        current_chapter: int = 0
+        current_verse: int = 0
+
+        for child in verse_paragraph.children:
+            tag_name = child.name
+
+            if tag_name is None and child.string is not None:
+                # Plain text node: belongs to the current verse
+                single_verse_sections.append(child.string)
+                continue
+
+            if tag_name == 'span':
+                child_id = child.get('id', '')
+                if child_id.startswith('chapter'):
+                    if '_verse' in child_id:
+                        # "chapter1_verse14": a new verse starts, so the previous one is complete
+                        if single_verse_sections:
+                            combined_verse_text = join_parts(single_verse_sections)
+                            verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text))
+                            single_verse_sections = []
+                        id_parts = child_id.split('_')
+                        current_chapter = int(id_parts[0].removeprefix('chapter'))
+                        current_verse = int(id_parts[1].removeprefix('verse'))
+                    else:
+                        # "chapter1": chapter marker. Record the chapter number; the verse
+                        # number is left alone (it used to be overwritten with empty text).
+                        chapter_digits = child_id.removeprefix('chapter')
+                        if chapter_digits.isdigit():
+                            current_chapter = int(chapter_digits)
+                    continue
+                if child_id.startswith(('pos', 'footnotesource')):
+                    # Position and footnote anchors carry no verse text
+                    continue
+                css_classes = child.get('class', [])
+                if 'w_ch' in css_classes or 'pageNum' in css_classes:
+                    # Chapter-number display and page-number spans carry no verse text
+                    continue
+
+            if tag_name == 'a' and child.get('epub:type', '') == 'noteref':
+                # Footnote reference link
+                continue
+
+            if tag_name == 'em' and child.string is not None:
+                # Emphasised text is part of the verse
+                single_verse_sections.append(child.string)
+                continue
+
+            if tag_name == 'strong':
+                if _is_verse_number_marker(child):
+                    continue
+                # A bare <strong>N</strong> repeating the current verse number
+                if child.string is not None and child.string.strip() == str(current_verse):
+                    continue
+
+            # Anything else is unexpected: give up on the chapter rather than guess
+            print(f"ERROR: Unknown child: {child}")
+            return None
+
+        # Add the last verse to the list
+        combined_verse_text = join_parts(single_verse_sections)
+        verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text))
+        # Add the paragraph to the list
+        verse_paragraphs.append(verses_in_paragraph)
+
+    verse_count = sum(len(paragraph) for paragraph in verse_paragraphs)
+    print(f"Found {len(verse_paragraphs)} paragraphs, containing {verse_count} verses")
+
+    chapter = BibleChapter()
+
+    for raw_verses in verse_paragraphs:
+        paragraph_text = join_parts([verse_text for _, _, verse_text in raw_verses])
+        verses = [BibleVerse(chapter_no, verse_no, verse_text) for chapter_no, verse_no, verse_text in raw_verses]
+        # clean_text_extra is applied to the paragraph text only, not to the verses
+        chapter.paragraphs.append(BibleParagraph(verses, clean_text_extra(paragraph_text.strip())))
+        chapter.verses.extend(verses)
+
+    chapter.text = clean_text_extra('\n'.join(paragraph.text.strip() for paragraph in chapter.paragraphs))
+    return chapter
+
+# Typographic characters and the plain ASCII character each one is replaced with
+_CLEAN_TEXT_TABLE = str.maketrans({
+    "\u00a0": " ",  # non-breaking space
+    "\u201d": '"',  # right double quotation mark
+    "\u201c": '"',  # left double quotation mark
+    "\u2019": "'",  # right single quotation mark
+    "\u2018": "'",  # left single quotation mark
+    "\u02b9": "'",  # modifier letter prime
+})
+
+
+def clean_text(text: str) -> str:
+    """
+    Replaces non-breaking spaces, curly quotes and the modifier prime with
+    their plain ASCII counterparts. Nothing is stripped or collapsed.
+    """
+    return text.translate(_CLEAN_TEXT_TABLE)
+
+def clean_text_extra(text: str) -> str:
+    """Drops every middle dot (U+00B7) from summary text; the verses themselves keep theirs."""
+    # Em dashes (U+2014) are deliberately left untouched
+    return text.translate({0x00B7: None})
+
+# MARK: - Main
+
+if __name__ == '__main__':
+    import sys
+
+    default_file = "nwt_E.epub"
+
+    # Optional positional argument: path of the EPUB to convert
+    file_name = sys.argv[1] if len(sys.argv) > 1 else None
+    if not file_name:
+        file_name = default_file
+        print(f"Using default file: {default_file}")
+
+    # The conversion itself runs inside the try, so a missing or broken file is
+    # reported by the handlers below instead of ending in a traceback
+    try:
+        # extract_book(file_name) can be called here instead to dump the raw documents
+        extract_verses(file_name)
+    except FileNotFoundError:
+        print(f"Error: Could not find file '{file_name}'")
+        sys.exit(1)
+    except Exception as e:
+        print(f"Error processing file: {str(e)}")
+        sys.exit(1)