Skip to content

Instantly share code, notes, and snippets.

@kylehowells
Created February 10, 2025 09:32
Show Gist options
  • Save kylehowells/e4eabe7d6c59b1d451f0a7ad91f18268 to your computer and use it in GitHub Desktop.
Save kylehowells/e4eabe7d6c59b1d451f0a7ad91f18268 to your computer and use it in GitHub Desktop.

Revisions

  1. kylehowells created this gist Feb 10, 2025.
    122 changes: 122 additions & 0 deletions download_nwt.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,122 @@

    # Get one language:
    # https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=jwpub&alllangs=0&langwritten=S

    # Get all languages:
    # https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=epub&alllangs=1&langwritten=E


    import requests


    def get_media_links(language_code: str, format: str, document: str) -> dict:
        """
        Query the GETPUBMEDIALINKS API for one publication in one language.
        Args:
        language_code: str - Value sent as `langwritten` (e.g. 'E', 'S')
        format: str - Value sent as `fileformat` (e.g. 'epub', 'jwpub')
        document: str - Value sent as `pub` (e.g. 'nwt')
        Returns:
        dict - The decoded JSON response (contains keys such as 'pubName' and 'files')
        Raises:
        requests.HTTPError - If the server answers with an error status
        """
        response = requests.get(
            'https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS',
            # Let requests build the query string so the values are URL-encoded.
            params={
                'booknum': 0,
                'output': 'json',
                'pub': document,
                'fileformat': format,
                'alllangs': 0,
                'langwritten': language_code,
            },
            timeout=30,
        )
        # Fail here with a clear HTTP error instead of a confusing JSON/KeyError later.
        response.raise_for_status()
        return response.json()

    # MARK: - Download

    def download(langCode: str|None=None, docID: str|None=None):
        """
        Download a publication file into the current working directory.
        Args:
        langCode: str|None - Language code to request; defaults to 'E'
        docID: str|None - Publication identifier to request; defaults to 'nwt'
        Raises:
        requests.HTTPError - If the file download answers with an error status
        """
        language_code = langCode or 'E'
        # 'jwpub' is the other format seen in the API notes; this script fetches the EPUB.
        file_format = 'epub'
        document = docID or 'nwt'
        media_links = get_media_links(language_code, file_format, document)
        print(media_links)
        publication_name = media_links['pubName']
        print("Publication Name: ", publication_name)
        # The response keys the file list by language code and upper-cased format.
        file_url = media_links['files'][language_code][file_format.upper()][0]['file']['url']
        print("File URL: ", file_url)
        file_name = file_url.split('/')[-1]
        print(f"Downloading: {file_name}")
        # Stream to disk so the whole file is never held in memory.
        with requests.get(file_url, stream=True, timeout=60) as response:
            # Check the status before creating the file so an error page is never saved.
            response.raise_for_status()
            with open(file_name, 'wb') as f:
                for chunk in response.iter_content(chunk_size=65536):
                    f.write(chunk)
        print(f"Downloaded: {file_name}")


    # MARK: - Main

    if __name__ == '__main__':
        import sys

        # Optional positional arguments: <language code> <publication id>.
        langCode = sys.argv[1] if len(sys.argv) > 1 else None
        docID = sys.argv[2] if len(sys.argv) > 2 else None

        download(langCode=langCode, docID=docID)


    # MARK: - Notes

    # Example GETPUBMEDIALINKS response (pub=nwt, fileformat=jwpub, langwritten=S):
    """
    {
    "pubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
    "parentPubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
    "booknum": 0,
    "pub": "nwt",
    "issue": "",
    "formattedDate": "",
    "fileformat": [
    "JWPUB"
    ],
    "track": null,
    "specialty": "",
    "pubImage": {
    "url": "",
    "modifiedDatetime": "",
    "checksum": null
    },
    "languages": {
    "S": {
    "name": "español",
    "direction": "ltr",
    "locale": "es",
    "script": "ROMAN"
    }
    },
    "files": {
    "S": {
    "JWPUB": [
    {
    "title": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
    "file": {
    "url": "https://cfp2.jw-cdn.org/a/03c55e4/2/o/nwt_S.jwpub",
    "stream": "https://jw.org",
    "modifiedDatetime": "2024-08-15 14:38:30",
    "checksum": "31bef50c135d9940e97ebc47fb99cc44"
    },
    "filesize": 37413926,
    "trackImage": {
    "url": "",
    "modifiedDatetime": "",
    "checksum": null
    },
    "markers": null,
    "label": "0p",
    "track": 0,
    "hasTrack": false,
    "pub": "nwt",
    "docid": 0,
    "booknum": 0,
    "mimetype": "application/octet-stream",
    "edition": "",
    "editionDescr": "Normal",
    "format": "",
    "formatDescr": "Normal",
    "specialty": "",
    "specialtyDescr": "",
    "subtitled": false,
    "frameWidth": 0,
    "frameHeight": 0,
    "frameRate": 0,
    "duration": 0,
    "bitRate": 0
    }
    ]
    }
    }
    }
    """
    459 changes: 459 additions & 0 deletions epub_to_json.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,459 @@
    import json
    import os
    import re
    from dataclasses import dataclass, asdict, field, is_dataclass
    from pathlib import Path

    import ebooklib
    from bs4 import BeautifulSoup, Tag
    from bs4.element import Tag, NavigableString
    from ebooklib import epub


    # MARK: - Helper Function

    def read_item(item: epub.EpubItem) -> BeautifulSoup:
        """
        Print the item's name, then parse its content with BeautifulSoup's 'html.parser'.
        """
        print(item.get_name())
        return BeautifulSoup(item.get_content(), 'html.parser')


    # MARK: - Extract Book into HTML Files

    def extract_book(book_path: str) -> None:
        """
        Dumps every document item of an EPUB into a directory named after the file.
        The directory is created in the current working directory and is named
        after the EPUB's stem (e.g. "nwt_E.epub" -> "nwt_E/").
        Args:
        book_path: str - Path to the EPUB file to process
        """
        separator = '=================================='
        target_root = Path(Path(book_path).stem)
        target_root.mkdir(exist_ok=True)

        for item in epub.read_epub(book_path).get_items():
            # Only document items are written out; everything else is ignored.
            if item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue

            print(separator)
            file_name = item.get_name()
            print('NAME : ', file_name)
            print('----------------------------------')

            # Item names may contain sub-directories, so create them first.
            destination = target_root / file_name
            destination.parent.mkdir(parents=True, exist_ok=True)
            destination.write_bytes(item.get_content())

        print(separator)


    # MARK: - Extract Verse Info

    @dataclass
    class BibleVerse:
        """A single verse, identified by its chapter and verse number."""

        chapter: int  # chapter number the verse belongs to
        verse: int  # verse number within the chapter
        text: str  # text of the verse

    @dataclass
    class BibleParagraph:
        """
        A group of verses that together form one paragraph.
        A verse is not necessarily a whole sentence (it can be part of one),
        so verses are grouped into full paragraphs.
        """

        verses: list[BibleVerse]  # the verses making up the paragraph
        text: str  # text of the whole paragraph

    @dataclass
    class BibleChapter:
        """
        One chapter of a book, both as grouped paragraphs and as a flat verse list.
        Every field has a default, so `BibleChapter()` still yields an empty
        chapter (number 0, no paragraphs, no verses, empty text), and fields can
        now also be passed to the constructor.
        """

        # The chapter number
        number: int = 0
        # The grouped paragraphs (default_factory gives each instance its own list)
        paragraphs: list[BibleParagraph] = field(default_factory=list)
        # The individual verses
        verses: list[BibleVerse] = field(default_factory=list)
        # The whole text of the chapter
        text: str = ""

    @dataclass
    class BibleBook:
        """
        One book of the Bible and its chapters.
        Every field has a default, so `BibleBook()` still yields an empty book,
        and fields can now also be passed to the constructor.
        """

        # The name of the book of the Bible
        name: str = ""
        # The chapters of the book (default_factory gives each instance its own list)
        chapters: list[BibleChapter] = field(default_factory=list)

    @dataclass
    class Bible:
        """
        A whole Bible translation: its name and its books.
        Every field has a default, so `Bible()` still yields an empty Bible,
        and fields can now also be passed to the constructor.
        """

        # The translation name
        name: str = ""
        # The books of the Bible (default_factory gives each instance its own list)
        books: list[BibleBook] = field(default_factory=list)

    class BibleEncoder(json.JSONEncoder):
        """
        Custom JSON encoder for the Bible dataclasses.
        Any dataclass instance (Bible, BibleBook, BibleChapter, BibleParagraph,
        BibleVerse, ...) is converted with `asdict`; everything else is left to
        the base encoder, which raises TypeError for unsupported objects.
        """

        def default(self, obj):
            # is_dataclass() is also true for dataclass *types*; only instances can be converted.
            if is_dataclass(obj) and not isinstance(obj, type):
                return asdict(obj)
            return super().default(obj)


    # MARK: - Extract Bible

    def extract_verses(book_file_name: str) -> Bible:
        """
        Parses an EPUB Bible into a Bible object and writes it out as JSON.
        The JSON file is written to the current working directory and is named
        after the EPUB's stem (e.g. "nwt_E.epub" -> "nwt_E.json").
        Chapters that cannot be parsed are reported and skipped.
        Args:
        book_file_name: str - Path of the EPUB file to process
        Returns:
        Bible - The parsed Bible object
        """
        book = epub.read_epub(book_file_name)

        bible = Bible()
        bible.name = get_title(book)
        bible.books = []

        for book_name, book_link in list_toc(book):
            print(f"{book_name} -> {book_link}")
            # The chapter list may be missing (None) when the navigation file is not found.
            chapter_links = extract_chapter_links(book_link, book) or []
            bible_book = BibleBook()
            bible_book.name = book_name
            bible_book.chapters = []

            for chapter_num, chapter_link in chapter_links:
                chapter = extract_chapter_verses(chapter_link, book)
                if chapter is None:
                    # extract_chapter_verses already printed the reason; skip this
                    # chapter instead of crashing on attribute access below.
                    print(f"Skipping chapter {chapter_num} of {book_name}: could not parse {chapter_link}")
                    continue
                print(f"Found {len(chapter.paragraphs)} paragraphs")
                print(f"Found {len(chapter.verses)} verses")
                print(f"Found {sum(len(paragraph.verses) for paragraph in chapter.paragraphs)} p.verses")
                print(f"Found {len(chapter.text)} text")
                chapter.number = int(chapter_num)
                bible_book.chapters.append(chapter)

            bible.books.append(bible_book)

        print(f"Found {len(bible.books)} books")

        # Write bible to json file using the custom encoder
        basename = Path(book_file_name).stem
        with open(f"{basename}.json", "w", encoding="utf-8") as f:
            json.dump(bible, f, cls=BibleEncoder, indent=2)

        return bible


    def get_title(book: epub.EpubBook) -> str:
        """
        Returns the first element of the first 'title' entry in the book's 'DC' metadata.
        """
        title_entries = book.get_metadata('DC', 'title')
        first_entry = title_entries[0]
        return first_entry[0]


    def list_toc(book: epub.EpubBook) -> list[tuple[str, str]]:
        """
        Lists the table-of-contents entries that point at Bible book chapter lists.
        Args:
        book: epub.EpubBook - The EPUB book object to read the TOC from
        Returns:
        list[tuple[str, str]] - (title, link) of each TOC entry whose link starts with 'biblechapternav'
        """
        toc_items = []

        # Collect every link from the TOC navigation document(s)
        for item in book.get_items():
            if item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue
            if not item.get_name().startswith('toc'):
                continue

            soup = read_item(item)
            # Find the TOC navigation list; skip documents that do not contain one
            # rather than failing with an AttributeError on None.
            nav = soup.find('nav', attrs={'epub:type': 'toc'})
            nav_list = nav.find('ol') if nav is not None else None
            if nav_list is None:
                print(f"No TOC navigation list found in {item.get_name()}")
                continue

            # Extract all links and their text
            for link in nav_list.find_all('a'):
                toc_items.append((link.get_text(strip=True), link.get('href')))

        # Get just the links to Bible Book Chapter Lists TOCs
        book_tocs = []
        for name, link in toc_items:
            # An <a> without an href yields None; ignore it.
            if link and link.startswith('biblechapternav'):
                book_tocs.append((name, link))
                print(f"{name} -> {link}")

        return book_tocs


    def extract_chapter_links(book_file_name: str, book: epub.EpubBook) -> list[tuple[str, str]]:
        """
        Extracts chapter links from the bible chapter navigation file.
        Args:
        book_file_name: str - Name of the navigation file to process (e.g. biblechapternav1.xhtml)
        book: epub.EpubBook - The epub book object containing the file
        Returns:
        list[tuple[str, str]] - (chapter number, link) of each chapter; empty when the file is not found
        """
        print(f"Extracting chapter verses from {book_file_name}")
        item = book.get_item_with_href(book_file_name)
        if item is None:
            print(f"Could not find item with href {book_file_name}")
            # Return an empty list (not None) so callers can always iterate the result.
            return []

        soup = read_item(item)

        # Report the book name from the h2 heading, when present
        book_heading = soup.find('h2', class_='w_navigation w_bibleChapter')
        if book_heading:
            print(f"Processing book: {book_heading.get_text(strip=True)}")

        chapters = []
        for link in soup.find_all('a'):
            # Only links placed directly inside table cells are chapter links
            if link.parent.name != 'td':
                continue
            href = link.get('href')
            chapter_num = link.get_text(strip=True)
            chapters.append((chapter_num, href))
            print(f"Chapter {chapter_num}: {href}")

        return chapters

    def join_parts(parts: list[str]) -> str:
        """
        Concatenates text fragments after passing each one through clean_text.
        When the previous fragment ends with a space and the next one starts with
        a space, the whitespace at that seam is collapsed into a single space.
        """
        pieces: list[str] = []
        for fragment in parts:
            piece = clean_text(fragment)
            # Stored pieces are already cleaned, so the previous one is compared as-is.
            if pieces and pieces[-1].endswith(' ') and piece.startswith(' '):
                pieces[-1] = pieces[-1].rstrip()
                piece = ' ' + piece.lstrip()
            pieces.append(piece)
        return ''.join(pieces)

    def _split_paragraph_into_verses(verse_paragraph: Tag) -> list[tuple[int | str, int | str, str]] | None:
        """
        Splits one <p class="sb"> element into (chapter, verse, text) tuples.
        Example of the markup being walked:
        <p id="p7" data-pid="7" class="p7 sb">
        <span id="pos1609"/>
        <span id="chapter1_verse14"/><strong><sup>14</sup></strong> Then God said: ...
        <span id="footnotesource5"/><a epub:type="noteref" href="#footnote5">*</a> ...
        <span id="chapter1_verse15"/><strong><sup>15</sup></strong> They will serve ...
        </p>
        Returns:
        The verses found, or None (after printing the offending node) when a
        child node is not one of the recognised kinds.
        """
        verses_in_paragraph: list[tuple[int | str, int | str, str]] = []
        single_verse_sections: list[str] = []  # text fragments of the verse being collected
        # Both stay 0 until the first verse marker of this paragraph is seen;
        # afterwards they hold the numeric strings cut out of the marker id.
        current_chapter: int | str = 0
        current_verse: int | str = 0

        for child in verse_paragraph.children:
            tag_name = child.name

            # Plain text node: part of the current verse
            if tag_name is None and child.string is not None:
                single_verse_sections.append(child.string)
                continue

            if tag_name == 'span':
                span_id = child.get('id', '')
                span_classes = child.get('class', [])

                if span_id.startswith('chapter') and '_verse' in span_id:
                    # Verse marker, e.g. id="chapter1_verse14": close the verse
                    # collected so far, then start the new one.
                    if len(single_verse_sections) > 0:
                        combined_verse_text = join_parts(single_verse_sections)
                        verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text))
                        single_verse_sections = []
                    chapter_part, verse_part = span_id.split('_')[0], span_id.split('_')[1]
                    current_chapter = chapter_part[7:]  # strip the 'chapter' prefix
                    current_verse = verse_part[5:]  # strip the 'verse' prefix
                    continue

                # Markers that carry no verse text: position anchors, the bare
                # chapter marker (e.g. id="chapter1"), footnote anchors, the large
                # chapter-number span and page numbers. The bare chapter marker
                # must not touch current_verse: it has no text, and storing that
                # empty string as the verse number breaks the later int() conversion.
                if (
                    span_id.startswith(('pos', 'chapter', 'footnotesource'))
                    or 'w_ch' in span_classes
                    or 'pageNum' in span_classes
                ):
                    continue

            if tag_name == 'strong':
                first = child.contents[0] if child.contents else None
                if isinstance(first, Tag):
                    # <strong><sup>14</sup></strong> - verse number
                    if first.name == 'sup':
                        continue
                    # <strong><span class="..."><sup>14</sup></span></strong> - verse number
                    if (
                        first.name == 'span'
                        and first.get('class', [])
                        and first.contents
                        and first.contents[0].name == 'sup'
                    ):
                        continue
                # <strong>14</strong> repeating the current verse number
                if child.string is not None and child.string.strip() == current_verse:
                    continue

            # Footnote reference link
            if tag_name == 'a' and child.get('epub:type', '') == 'noteref':
                continue

            # Emphasised text is part of the verse
            if tag_name == 'em' and child.string is not None:
                single_verse_sections.append(child.string)
                continue

            print(f"ERROR: Unknown child: {child}")
            return None

        # Add the last verse to the list
        combined_verse_text = join_parts(single_verse_sections)
        verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text))
        return verses_in_paragraph


    def extract_chapter_verses(chapter_link: str, book: epub.EpubBook) -> BibleChapter | None:
        """
        Extracts the verses of one chapter file in the EPUB book.
        Args:
        chapter_link: str - Link to the chapter file to process
        book: epub.EpubBook - The EPUB book object
        Returns:
        BibleChapter | None - The parsed chapter (its number is left at 0 for the
        caller to set), or None when the file is not found or contains a node
        that is not recognised
        """
        item = book.get_item_with_href(chapter_link)
        if item is None:
            print(f"Could not find item with href {chapter_link}")
            return None

        soup = read_item(item)
        print(f"Processing chapter: {chapter_link}")

        # Get all p tags with class sb
        verse_paragraph_elements = soup.find_all('p', class_='sb')
        print(f"Found {len(verse_paragraph_elements)} verse paragraph elements")

        # One list of (chapter, verse, text) tuples per paragraph
        verse_paragraphs: list[list[tuple[int | str, int | str, str]]] = []
        for verse_paragraph in verse_paragraph_elements:
            verses_in_paragraph = _split_paragraph_into_verses(verse_paragraph)
            if verses_in_paragraph is None:
                return None
            verse_paragraphs.append(verses_in_paragraph)

        print(f"Found {len(verse_paragraphs)} paragraphs, containing {sum(len(paragraph) for paragraph in verse_paragraphs)} verses")

        chapter = BibleChapter()

        for raw_verses in verse_paragraphs:
            paragraph_text = join_parts([verse_text for _, _, verse_text in raw_verses])
            verses = [
                BibleVerse(int(chapter_no), int(verse_no), verse_text)
                for chapter_no, verse_no, verse_text in raw_verses
            ]
            chapter.paragraphs.append(BibleParagraph(verses, clean_text_extra(paragraph_text.strip())))
            chapter.verses.extend(verses)

        chapter.text = clean_text_extra('\n'.join(paragraph.text.strip() for paragraph in chapter.paragraphs))
        return chapter

    # Single-pass replacement table used by clean_text.
    _CLEAN_TEXT_TABLE = str.maketrans({
        chr(160): ' ',   # non-breaking space -> regular space
        "\u201D": '"',   # right double quotation mark -> straight double quote
        "\u201C": '"',   # left double quotation mark -> straight double quote
        "\u2019": "'",   # right single quotation mark -> straight single quote
        "\u2018": "'",   # left single quotation mark -> straight single quote
        "\u02b9": "'",   # modifier letter prime -> straight single quote
    })


    def clean_text(text: str) -> str:
        """
        Normalises typographic characters: non-breaking spaces become regular
        spaces and curly quotes / primes become their straight ASCII equivalents.
        Whitespace is otherwise left untouched.
        """
        return text.translate(_CLEAN_TEXT_TABLE)

    def clean_text_extra(text: str) -> str:
        """
        Strips the middle dot (U+00B7) from summary text.
        This is applied to paragraph and chapter text only; the verses themselves keep the dots.
        """
        # Mapping a code point to None deletes it.
        return text.translate({0x00B7: None})

    # MARK: - Main

    if __name__ == '__main__':
        import sys

        default_file = "nwt_E.epub"

        # Optional positional argument: path of the EPUB to convert.
        file_name = sys.argv[1] if len(sys.argv) > 1 else None
        if not file_name:
            print(f"Using default file: {default_file}")
        file = file_name or default_file

        # The error handling wraps the call that can actually fail.
        try:
            # extract_book(file) is the alternative entry point: it dumps the raw documents instead.
            extract_verses(file)
        except FileNotFoundError:
            print(f"Error: Could not find file '{file}'")
            sys.exit(1)
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            sys.exit(1)