Skip to content

Instantly share code, notes, and snippets.

@skmezanul
Forked from Krazybug/calisuck.py
Created December 30, 2019 18:50
Show Gist options
  • Save skmezanul/96b22f9ac04c4a74f2c6896a7ecbf5c2 to your computer and use it in GitHub Desktop.
Save skmezanul/96b22f9ac04c4a74f2c6896a7ecbf5c2 to your computer and use it in GitHub Desktop.

Revisions

  1. @Krazybug Krazybug revised this gist Dec 30, 2019. 1 changed file with 15 additions and 14 deletions.
    29 changes: 15 additions & 14 deletions calisuck.py
    Original file line number Diff line number Diff line change
    @@ -682,20 +682,6 @@ def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_fo
    print()
    print("Reporting ...")

    print()
    table = BeautifulTable()
    table.column_headers = ["", "Total count"]
    table.append_row(["Formats", total_format_count])
    table.append_row(["Ebooks", total_ebook_count])
    print(table)

    print()
    table = BeautifulTable()
    table.column_headers = ["", "Size"]
    table.append_row(["Min", hsize(size_min)])
    table.append_row(["Max", hsize(size_max)])
    table.append_row(["Total", hsize(total_size)])
    print(table)

    print()
    print("Total ebooks updated by language:")
    @@ -729,6 +715,21 @@ def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_fo
    table.append_row([f, hsize(s)])
    print(table)

    print()
    table = BeautifulTable()
    table.column_headers = ["", "Total count"]
    table.append_row(["Formats", total_format_count])
    table.append_row(["Ebooks", total_ebook_count])
    print(table)

    print()
    table = BeautifulTable()
    table.column_headers = ["", "Size"]
    table.append_row(["Min", hsize(size_min)])
    table.append_row(["Max", hsize(size_max)])
    table.append_row(["Total", hsize(total_size)])
    print(table)

    print()
    print("Done !!!")

  2. @Krazybug Krazybug revised this gist Dec 30, 2019. 1 changed file with 88 additions and 5 deletions.
    93 changes: 88 additions & 5 deletions calisuck.py
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,44 @@
    #!/usr/bin/env python3

    '''
    calisuck: index, filter-out smartly and download ebooks from Calibre open directories
    Installation:
    You need python 3.5 installed
    Download the file as a zip and unzip-it and get into the dir
    OR
    > git clone https://gist.github.com/b7e814d7189db9ee1d6b9c1d1a1de95c.git
    > mv b7e814d7189db9ee1d6b9c1d1a1de95c calisuck
    > cd calisuck
    >
    THEN
    > python3 -m venv .
    > . bin/activate
    > pip install requests fire humanize langid iso639 beautifultable
    > python calisuck.py --help
    > python calisuck.py index-ebooks --help
    > python calisuck.py download-ebooks --help
    > python calisuck.py download-covers --help
    '''

    '''
    DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
    Version 2, December 2004
    Copyright (C) 2004 Sam Hocevar <[email protected]>
    Everyone is permitted to copy and distribute verbatim or modified
    copies of this license document, and changing it is allowed as long
    as the name is changed.
    DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
    0. You just DO WHAT THE FUCK YOU WANT TO.
    '''

    import sys
    import os
    import time
    @@ -223,7 +264,7 @@ def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, fo
    --library=<string> (default=my_books) : Id of library to index. The script index the default library by default.
    The id is string following '&library_id=' in the url
    --force_refresh (default=False) : Force a refresh of the metadata. By default all the metadata
    --force-refresh (default=False) : Force a refresh of the metadata. By default all the metadata
    already gathered are ignored
    --start=<int> (default=0)
    @@ -236,8 +277,8 @@ def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, fo

    offset= 0 if not start else start-1
    num=min(1000,inc)
    server=site
    api=server+'ajax/'
    server=site.rstrip('/')
    api=server+'/ajax/'
    library= '/'+library if library else library

    print("Server:", server)
    @@ -460,11 +501,53 @@ def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False):
    return True

    def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, map="", map_lib=""):
    '''
    Download ebooks in matching subdirs:
    The different formats of the same book are grouped in the same directory
    with an UUID name close to the metadata file (metadata.json).
    The status of the formats for a book and its global status are initially set to 'todo'.
    They move to 'done' after their download. This allows you to rerun the download and progressively collect books.
    You can use different options to filter the formats for the download
    by language, size, format and identifiers(isbn, ...).
    A report of the download is displayed at the end of the process.
    You can run this command in dry mode (--dry-run) with different settings
    to only display the report and prepare your effective download.
    Params:
    --min-size=<int> (default=0)
    --max-size=<int> (default=infinity) : Delimit the size in MB for the accepted formats
    --dry-run (default=False) : Run the command to simulate the download
    --language=<string> : Restrict the download to a list of specific languages
    (Ex: --languages='["eng","ita"]'
    --ignore-empty-language (default=False) : Ignore books with unidentified language
    --formats=<string> : Restrict the download to a list of specific formats
    (Ex: --formats='["epub", "mobi", "pdf"]'
    --ignore-formats=<string> : Ignore a list of specific formats.
    Compliant with --formats.
    (Ex: --ignored-formats='["mp3", "rar", "zip"]'
    --single-format (default=False) : Limit the download to 1 format per book with this preference order
    'azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub',
    'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar'
    , 'rtf', 'txt', 'zip', 'fb2'
    --identifiers=<string> : Restrict the download to a list of specific identifiers
    (Ex: --identifiers='["isbn","asin"]'
    --ignore-empty-identifiers (default=False) : Ignore books without identifiers (often OCR)
    '''



    # all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip']

    print()

    if single_format: my_formats = formats if formats else all_ordered_formats
    else: my_formats=formats
    print("formats=", my_formats)
    # print("formats=", my_formats)

    min_size=int(min_size)*1024*1024
    max_size=int(max_size)*1024*1024
    @@ -521,7 +604,7 @@ def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_fo
    # print("Format '{}': size expected={}".format(f, hsize(source['formats'][f]['size'])))
    pass

    print(f"--> format '{f}' for ({book['title']} / {book['authors'][0]} / {str(book['series'])})")
    # print(f"--> format '{f}' for ({book['title']} / {book['authors'][0]} / {str(book['series'])})")
    if not dry_run:
    try:
    get_file(dir, book, f, s, map, map_lib)
  3. @Krazybug Krazybug revised this gist Dec 30, 2019. 1 changed file with 159 additions and 142 deletions.
    301 changes: 159 additions & 142 deletions calisuck.py
    Original file line number Diff line number Diff line change
    @@ -9,16 +9,19 @@
    from humanize import naturalsize as hsize
    from langid.langid import LanguageIdentifier, model
    import iso639
    import pickle
    import time
    from requests.adapters import HTTPAdapter
    import urllib.parse
    import urllib3
    from beautifultable import BeautifulTable


    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)


    def load_metadata(path, uuid):
    filepath=path+'/'+uuid+'/metadata.json'
    # print (filepath)
    @@ -30,18 +33,19 @@ def load_metadata(path, uuid):
    print ("Error loading metadata for:", uuid, "from path:", path)
    return 0
    else:
    print ("Metadata not found for:", uuid, "from path:", path)
    # print ("Metadata not found for:", uuid, "from path:", path)
    return 0


    def save_metadata(path, book):
    filepath=path+'/'+book['uuid']+'/metadata.json'
    print("Saving book metadata for:", book['uuid'], "to:", filepath)
    # print("Saving book metadata for:", book['uuid'], "to:", filepath)
    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    with open(filepath+".tmp", 'w') as fd:
    json.dump(book, fd, indent=4, separators=(',', ': '))
    try:
    shutil.move(filepath+".tmp", filepath)
    print("Saved to:", filepath)
    # print("Saved to:", filepath)
    except:
    print("Unable to rename .tmp file:", filepath+".tmp")

    @@ -51,6 +55,7 @@ def get_cover_path(path, uuid):
    if os.path.isfile(filepath): return filepath
    else: return 0


    def get_file_path(path, uuid, fileformat):
    files=os.listdir(path+'/'+uuid)
    if files:
    @@ -83,7 +88,9 @@ def get_cover(path, book, map):
    print("Saved to:", filepath)


    def download_covers(dir= '.', server='', map=""):
    def download_covers(dir='my_books', server='', map=""):
    """ Download covers for each books"""

    for root, dirs, files in os.walk(dir, topdown=True):
    for d in dirs:
    # print()
    @@ -108,6 +115,7 @@ def download_covers(dir= '.', server='', map=""):
    else:
    print ("No ebook metadata found in:", root)


    def get_file_size(url):
    print("Downloading size:", url)
    r = requests.head(url, verify=False)
    @@ -116,6 +124,7 @@ def get_file_size(url):
    print("Size received="+ hsize(size))
    return int(size)


    def get_file(path, book, format, session, map, map_lib):
    uuid = book['uuid']
    url=book['source']['formats'][format]['url']
    @@ -173,7 +182,6 @@ def set_status(uuid, status, dir='.'):
    print("Status unchanged changed ", status+":", book['uuid'])
    else:
    print ("No ebook metadata found for:", uuid)



    def remove_book(uuid, path='.'):
    @@ -189,36 +197,6 @@ def remove_book(uuid, path='.'):
    print(uuid, "not found")



    def explore(site, help=False):
    server=site
    api=server+'ajax/'
    print("Server:", server)
    url=api+'library-info'
    print()
    print("Getting libraries:", server)
    print(url)
    try:
    r = requests.get(url)
    r.raise_for_status()
    except:
    print("Unable to open site:", url)
    sys.exit(1)

    libraries = r.json()["library_map"].keys()
    print("Libraries:")
    for l in libraries:
    library='/'+l
    url=api+'search'+library+'?num=0'
    try:
    r = requests.get(url)
    r.raise_for_status()
    except:
    print("Unable to open site:", url)
    continue
    print("\t{}: {} ebooks".format(l, r.json()["total_num"]))


    def update_done_status(book):
    source=book['source']
    if source['status']!='ignored':
    @@ -228,19 +206,44 @@ def update_done_status(book):
    book['source']['status']="todo"


    def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False):
    def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, force_refresh=False):
    """
    Index a remote Calibre library
    You will get in your <dir> all the metadata (title, authors, isbn, ...) for each book.
    They're stored as simple JSON files (metadata.json) so that you can easily visualize them or process them with 'jq' program.
    They are stored in subdirectories with a UUID as a name. These directories do match different books and allow you to group all
    the different formats of the same book and eventually the cover file.
    You can mix books from different sites without any (theoric) collisions
    Params:
    --site=<string> : Url of the site to index (ex: http://123.123.123.123/)
    --library=<string> (default=my_books) : Id of library to index. The script index the default library by default.
    The id is string following '&library_id=' in the url
    --force_refresh (default=False) : Force a refresh of the metadata. By default all the metadata
    already gathered are ignored
    --start=<int> (default=0)
    --stop=<int> (default=0) : Allow indexing between a range of ebooks
    --inc=<int> (default=1000) : Fix the number of ebooks for each request on the server
    """

    os.makedirs(dir, exist_ok=True)

    offset= 0 if not start else start-1
    num=1000
    num=min(1000,inc)
    server=site
    api=server+'ajax/'
    #api=server+'calibre/ajax/'
    library= '/'+library if library else library

    print("Server:", server)
    url=api+'search'+library+'?num=0'
    print()
    print("Getting ebooks count:", server)
    print(url)
    try:
    r = requests.get(url,verify=False)
    r.raise_for_status()
    @@ -251,63 +254,63 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False
    total_num=int(r.json()["total_num"])
    total_num= total_num if not stop else stop

    print()
    print("Start indexing")

    range=offset+1
    while offset < total_num:
    remaining_num = min(num, total_num - offset)
    print()
    print("Downloading ids: offset="+str(offset), "num="+str(remaining_num))
    # url=api+'search?num='+str(remaining_num)+'&offset='+str(offset)
    # print()
    # print("Downloading ids: offset="+str(offset), "num="+str(remaining_num))
    url=api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'

    print("->", url)
    # print("->", url)
    r=requests.get(url, verify=False)
    print("Ids received from:"+str(offset), "to:"+str(offset+remaining_num-1))
    # print("Ids received from:"+str(offset), "to:"+str(offset+remaining_num-1))

    print()
    print("Downloading metadata from", str(offset+1), "to", str(offset+remaining_num))
    # print()
    # print("\rDownloading metadata from", str(offset+1), "to", str(offset+remaining_num),end='')
    books_s=",".join(str(i) for i in r.json()['book_ids'])
    url=api+'books'+library+'?ids='+books_s
    print("->", url)
    # print("->", url)
    r=requests.get(url, verify=False)
    print(len(r.json()), "received")
    # print(len(r.json()), "received")

    for id in r.json().keys():
    print()
    print ('--> range={}/{}'.format(str(range),str(total_num)))
    r_book=r.json()[id]
    for id, r_book in r.json().items():
    uuid=r_book['uuid']
    if not uuid:
    print ("No uuid for ebook: ignored")
    continue
    # print ('\r--> range={}/{}'.format(str(range),str(total_num)), "uuid="+uuid, "("+r.json()[id]['title']+")", end='')
    # print (r.json()[id])
    # title= r.json()[id]['title'] if 'title' in r.json()[id] else "<untitled>"

    if r_book['authors']:
    print("uuid="+uuid, "("+r_book['title']+" -- "+r_book['authors'][0]+")")
    desc= f"uuid={uuid} ({r_book['title']} / {r_book['authors'][0]})"
    else:
    print("uuid="+uuid, "("+r_book['title']+")")
    desc= f"uuid={uuid} ({r_book['title']})"
    s=f"\r--> {range}/{total_num} - {desc}"
    s='{:140.140}'.format(s)
    print (s, end='')

    if not force_refresh:
    # print("Checking local metadata:", uuid)
    try:
    book = load_metadata(dir, uuid)
    except:
    print()
    print("Unable to get metadata from:", uuid)
    range+=1
    continue
    if book:
    print("Metadata already present for:", uuid)
    # print("Metadata already present for:", uuid)
    range+=1
    continue

    if not r.json()[id]['formats']:

    if not r_book['formats']:
    print()
    print("No format found for {}".format(r_book['uuid']))
    range+=1
    continue


    print("Analyzing for:", uuid)

    book={}
    url=api+'book/'+id
    book['title']=r_book['title']
    @@ -323,17 +326,13 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False
    languages=r_book['languages']
    if not languages:
    # if True:
    # pass
    print ("Analyzing languages")
    if book['comments']:
    text=book['comments']
    else:
    text=book['title']
    s_language, prob=identifier.classify(text)
    print (s_language, prob)
    if prob >= 0.85:
    language = iso639.to_iso639_2(s_language)
    print("language=", language)
    book['languages']=[language]
    else:
    book['languages']=[]
    @@ -362,7 +361,7 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False
    source['timestamp']=r_book['timestamp']

    format_sources={}
    formats=r.json()[id]['formats']
    formats=r_book['formats']
    for f in formats:
    s={}
    url=''
    @@ -375,6 +374,7 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False
    if 'size' in r_book['format_metadata'][f]:
    s['size']=int(r_book['format_metadata'][f]['size'])
    else:
    print()
    print("Size not found for format '{}' : {}".format(f, uuid))
    print("Trying to get size online: {}".format(s['url']))
    try:
    @@ -387,76 +387,79 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False

    source['formats']=format_sources
    book['source']=source
    print("Analyzed:", uuid)


    if not source['formats']:
    print("No format found for {}".format(r.json()[id]['uuid']))
    print("No format found for {}".format(r_book['uuid']))
    range+=1
    continue
    update_done_status(book)
    print("Saving metadata for:", uuid)
    # print("Saving metadata for:", uuid)
    try:
    save_metadata(dir, book)
    except:
    print()
    print("Unable to save book metadata", book['uuid'])
    range+=1
    offset=offset+num

    print()
    print("Done")


    def has_languages(book, languages=[], ignore_empty_language=False):

    print("Accepted languages", languages)
    # print("Accepted languages", languages)
    if not ignore_empty_language:
    print("Unknown language accepted")
    # print("Unknown language accepted")
    pass

    # rustine
    if not 'languages' in book:
    book['languages']=[]

    print("Book languages", book['languages'])
    # print("Book languages", book['languages'])

    if ignore_empty_language and not book['languages']:
    print ("'{}' ignored: language is empty".format(book['uuid']))
    # print ("'{}' ignored: language is empty".format(book['uuid']))
    return False

    if not ignore_empty_language and not book['languages']:
    print ("'{}' todo: language is empty".format(book['uuid']))
    # print ("'{}' todo: language is empty".format(book['uuid']))
    return True

    expected_languages=list(set(book['languages']) & set(languages))
    if languages and not expected_languages:
    print ("'{}' ignored: language {} not in {}".format(book['uuid'], book['languages'],languages))
    # print ("'{}' ignored: language {} not in {}".format(book['uuid'], book['languages'],languages))
    return False

    print ("'{}' todo: expected languages {}".format(book['uuid'], expected_languages))
    # print ("'{}' todo: expected languages {}".format(book['uuid'], expected_languages))
    return True

    def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False):

    print("Accepted identifiers", identifiers)
    # print("Accepted identifiers", identifiers)
    if not ignore_empty_identifiers:
    print("Unknown identifiers accepted")
    print("Book identifiers", book['identifiers'].keys())
    # print("Unknown identifiers accepted")
    pass
    # print("Book identifiers", book['identifiers'].keys())

    if ignore_empty_identifiers and not book['identifiers']:
    print ("'{}' ignored: identifier is empty".format(book['uuid']))
    # print ("'{}' ignored: identifier is empty".format(book['uuid']))
    return False

    if not ignore_empty_identifiers and not book['identifiers']:
    print ("'{}' todo: identifiers is empty".format(book['uuid']))
    # print ("'{}' todo: identifiers is empty".format(book['uuid']))
    return True

    expected_identifiers=list(set(book['identifiers'].keys()) & set(identifiers))
    if identifiers and not expected_identifiers:
    print ("'{}' ignored: identifiers {} not in {}".format(book['uuid'], book['identifiers'].keys(), identifiers))
    # print ("'{}' ignored: identifiers {} not in {}".format(book['uuid'], book['identifiers'].keys(), identifiers))
    return False

    print ("'{}' todo: expected identifiers {}".format(book['uuid'], expected_identifiers))
    # print ("'{}' todo: expected identifiers {}".format(book['uuid'], expected_identifiers))
    return True

    def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, map="", map_lib=""):
    def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, map="", map_lib=""):
    # all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip']

    if single_format: my_formats = formats if formats else all_ordered_formats
    @@ -467,8 +470,6 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore
    max_size=int(max_size)*1024*1024
    print ("Format expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))

    # sys.exit()

    total_size=0
    total_size_by_format={}
    total_ebook_count=0
    @@ -488,9 +489,6 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore
    if book:
    status=book['source']['status']
    if status=="todo":
    print()
    print()
    print("-->", uuid, "("+book['title']+" -- "+book['authors'][0]+" -- serie: "+ str(book['series'])+")")

    if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
    continue
    @@ -501,8 +499,9 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore
    source=book['source']
    download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
    if not len(download_formats):
    print ("'{}' ignored: no more format available in formats expected {}".format(uuid, download_formats))
    print()
    # print ("'{}' ignored: no more format available in formats expected {}".format(uuid, download_formats))
    # print()
    pass
    else:
    ebook_kept=False
    for f in download_formats:
    @@ -515,19 +514,20 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore
    if url:
    # # It shouldn't occur: Need to download again
    if get_file_path(dir, uuid, f):
    print ("Format '{}' already present for {}: Retrying".format(f, uuid))
    print()
    # print ("Format '{}' already present for {}: Retrying".format(f, uuid))
    # print()
    # continue

    print("Format '{}': size expected={}".format(f, hsize(source['formats'][f]['size'])))

    # print("Format '{}': size expected={}".format(f, hsize(source['formats'][f]['size'])))
    pass

    print(f"--> format '{f}' for ({book['title']} / {book['authors'][0]} / {str(book['series'])})")
    if not dry_run:
    try:
    get_file(dir, book, f, s, map, map_lib)
    book['formats'].append(f)
    book['source']['formats'][f]['status']="done"
    time.sleep(0)
    # except:
    except Exception as msg:
    print("Unable to get book:", url)
    print(msg)
    @@ -553,8 +553,9 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore
    total_count_by_format[f]+=1
    total_format_count +=1
    else:
    print ("Format '{}' ignored for {} ({}): No url)".format(f, uuid, book['title']))
    print()
    # print ("Format '{}' ignored for {} ({}): No url)".format(f, uuid, book['title']))
    # print()
    pass
    if ebook_kept:
    total_ebook_count+=1
    if not book['languages']:
    @@ -596,36 +597,68 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore
    print(f'--> {counter} books handled', end="\r")

    print()
    print("Total count of updated ebooks:", total_ebook_count)
    print("Reporting ...")

    print()
    table = BeautifulTable()
    table.column_headers = ["", "Total count"]
    table.append_row(["Formats", total_format_count])
    table.append_row(["Ebooks", total_ebook_count])
    print(table)

    print()
    table = BeautifulTable()
    table.column_headers = ["", "Size"]
    table.append_row(["Min", hsize(size_min)])
    table.append_row(["Max", hsize(size_max)])
    table.append_row(["Total", hsize(total_size)])
    print(table)

    print()
    print("Total ebooks updated by language:")
    table = BeautifulTable()
    table.column_headers = ["Language", "Ebooks count"]
    for l, c in language_count.items():
    print(" '{}': {}".format(l, c))
    table.append_row([l, c])
    print(table)

    print()
    print("Total ebooks updated by identifiers:")
    for l, c in identifiers_count.items():
    print(" '{}': {}".format(l, c))
    print("Total count of formats:", total_format_count)
    table = BeautifulTable()
    table.column_headers = ["Identifiers", "Ebooks count"]
    for i, c in identifiers_count.items():
    table.append_row([i, c])
    print(table)

    print()
    print("Total count of ebooks by format:")
    table = BeautifulTable()
    table.column_headers = ["Formats", "Ebooks count"]
    for f, c in total_count_by_format.items():
    print("\t'{}': {}".format(f, c))
    table.append_row([f, c])
    print(table)

    print()
    print("Total size:", hsize(total_size))
    print("Maximum file size:", hsize(size_max))
    print("Minimum file size:", hsize(size_min))
    print("Total size by format:")
    table = BeautifulTable()
    table.column_headers = ["Format:", "Size"]
    for f, s in total_size_by_format.items():
    print("\t'{}': {}".format(f, hsize(s)))
    table.append_row([f, hsize(s)])
    print(table)

    print()
    print("Done !!!")


    def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0):
    print("Accepted formats", accepted_formats)
    # print("Accepted formats", accepted_formats)
    source=book['source']
    print("Formats available in source: {}".format(list(source['formats'].keys())))
    # print("Formats available in source: {}".format(list(source['formats'].keys())))
    my_formats=[]
    for f,v in source['formats'].items():
    if v['status']=='todo':
    my_formats.append(f)
    print("Formats in 'todo': {}".format(my_formats))
    # print("Formats in 'todo': {}".format(my_formats))

    formats=[]
    if single_format:
    @@ -644,53 +677,37 @@ def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], singl
    else:
    formats=my_formats

    print("Formats expected: {}".format(formats))
    # print("Formats expected: {}".format(formats))

    download_formats=formats[:]
    for f in formats:
    if not 'size' in source['formats'][f] and max_size:
    print ("Format '{}' ignored for {}: Size unknown".format(f, book['uuid']))
    # print ("Format '{}' ignored for {}: Size unknown".format(f, book['uuid']))
    download_formats.remove(f)
    else:
    size = source['formats'][f]['size']
    if size < min_size or (max_size and size > max_size):
    download_formats.remove(f)
    print ("Format '{}' ignored for {}: size={} but expected between {} and {}".format(f, book['uuid'], hsize(size), hsize(min_size), hsize(max_size) if max_size else "infinity"))
    # print ("Format '{}' ignored for {}: size={} but expected between {} and {}".format(f, book['uuid'], hsize(size), hsize(min_size), hsize(max_size) if max_size else "infinity"))
    return download_formats


    def update_format_statuses(book,refresh_ignored):
    formats=book['source']['formats']
    for f, v in formats.items():
    if v['status']=='ignored' and not refresh_ignored:
    print ("Format '{}' ignored: {} ({}))".format(f, book['uuid'], book['title']))
    # print ("Format '{}' ignored: {} ({}))".format(f, book['uuid'], book['title']))
    pass
    else:
    print ("Format '{}' todo: {} ({}))".format(f, book['uuid'], book['title']))
    # print ("Format '{}' todo: {} ({}))".format(f, book['uuid'], book['title']))
    book['source']['formats'][f]['status']='todo'


    def reset_ignored(dir= '.', server=''):
    for root, dirs, files in os.walk(dir, topdown=True):
    for uuid in dirs:
    save_ebook=False
    book = load_metadata(root, uuid)
    if book:
    status=book['source']['status']
    if status=="ignored":
    print ("'{}' status 'ignored' reset to 'todo'".format(book['uuid']))
    book['source']['status']='todo'
    save_ebook=True

    formats=book['source']['formats']
    for f, v in formats.items():
    if v['status']=='ignored':
    print ("'{}' format 'ignored' reset to 'todo'".format(book['uuid']))
    book['source']['formats'][f]['status']='todo'
    save_ebook=True

    if save_ebook:
    save_metadata(dir, book)


    if __name__ == "__main__":
    fire.Fire()
    fire.Fire({
    "index_ebooks": index_ebooks,
    "download_ebooks": download_ebooks,
    "download_covers": download_covers,
    "set_status": set_status
    })
  4. @Krazybug Krazybug revised this gist Nov 11, 2019. 1 changed file with 0 additions and 5 deletions.
    5 changes: 0 additions & 5 deletions calisuck.py
    Original file line number Diff line number Diff line change
    @@ -16,11 +16,6 @@
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


    # l= book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    # >>> for i in l:
    # ... print(BeautifulSoup(i.get_content()).text)

    all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

  5. @Krazybug Krazybug revised this gist Nov 11, 2019. 1 changed file with 101 additions and 106 deletions.
    207 changes: 101 additions & 106 deletions calisuck.py
    Original file line number Diff line number Diff line change
    @@ -11,8 +11,15 @@
    import iso639
    import pickle
    import time
    from requests.adapters import HTTPAdapter
    import urllib.parse
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


    # l= book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    # >>> for i in l:
    # ... print(BeautifulSoup(i.get_content()).text)

    all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    @@ -59,11 +66,18 @@ def get_file_path(path, uuid, fileformat):
    else: return 0
    else: return 0

    def get_cover(path, book):

    def get_cover(path, book, map):
    url=book['source']['cover']
    if map:
    pu=urllib.parse.urlparse(url)
    pu=(pu[0], map, *pu[2:])
    print(pu)
    url=urllib.parse.urlunparse(pu)

    print("Downloading cover from:", url)

    r=requests.get(url, timeout=10)
    r=requests.get(url, timeout=(20, 3), verify=False)
    r.raise_for_status()

    filepath=path+'/'+book['uuid']+'/cover.jpg'
    @@ -74,20 +88,21 @@ def get_cover(path, book):
    print("Saved to:", filepath)


    def download_covers(dir= '.', server=''):
    def download_covers(dir= '.', server='', map=""):
    for root, dirs, files in os.walk(dir, topdown=True):
    for d in dirs:
    # print()
    # print("-->", d)
    book = load_metadata(root, d)
    if book:
    if book['source']['status'] != "ignored":
    # if book['source']['status'] != "ignored":
    if True:
    if not get_cover_path(root, book['uuid']):
    print()
    print("-->", d)
    print(book['uuid'])
    try:
    get_cover(root, book)
    get_cover(root, book, map)
    except:
    print ("Unable to get cover", book['uuid'])
    else:
    @@ -100,19 +115,34 @@ def download_covers(dir= '.', server=''):

    def get_file_size(url):
    print("Downloading size:", url)
    r = requests.head(url)
    r = requests.head(url, verify=False)
    r.raise_for_status()
    size=r.headers['Content-Length']
    print("Size received="+ hsize(size))
    return int(size)

    def get_file(path, book, format, session):
    def get_file(path, book, format, session, map, map_lib):
    uuid = book['uuid']
    url=book['source']['formats'][format]['url']
    if map:
    pu=urllib.parse.urlparse(url)
    pu=(pu[0], map, *pu[2:])
    print(pu)
    url=urllib.parse.urlunparse(pu)

    if map_lib:
    # pu=urllib.parse.urlparse(url)
    # print(pu)
    url_s=url.split("/")
    # print(url_s)
    url_s=url_s[:-1]+[map_lib]
    # print('/'.join(url_s))
    url='/'.join(url_s)


    print("Downloading ebook:", url)
    print("Size expected (estimation):", hsize(book['source']['formats'][format]['size']))
    r = session.get(url, timeout=5)
    r = session.get(url, timeout=(25,15), verify=False)
    # headers = {"Range": "bytes=0-1023"}
    # r = requests.get(url, headers=headers)
    r.raise_for_status()
    @@ -122,6 +152,7 @@ def get_file(path, book, format, session):
    else:
    print("Fize received")


    filename=re.findall(r'filename="(.*)"', r.headers['Content-Disposition'])
    # print(filename)
    if len(filename):
    @@ -142,7 +173,7 @@ def set_status(uuid, status, dir='.'):
    if book['source']['status'] != status:
    book['source']['status'] = status
    save_metadata(dir, book)
    print("Status changed to", status+":", book['uuid'])
    print("Status changed to", status+":", book['uuid'], "(", book['title'], ")")
    else:
    print("Status unchanged changed ", status+":", book['uuid'])
    else:
    @@ -204,7 +235,7 @@ def update_done_status(book):

    def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False):
    offset= 0 if not start else start-1
    num=500
    num=1000
    server=site
    api=server+'ajax/'
    #api=server+'calibre/ajax/'
    @@ -234,7 +265,7 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False
    url=api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'

    print("->", url)
    r=requests.get(url)
    r=requests.get(url, verify=False)
    print("Ids received from:"+str(offset), "to:"+str(offset+remaining_num-1))

    print()
    @@ -248,14 +279,18 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False
    for id in r.json().keys():
    print()
    print ('--> range={}/{}'.format(str(range),str(total_num)))
    uuid=r.json()[id]['uuid']
    r_book=r.json()[id]
    uuid=r_book['uuid']
    if not uuid:
    print ("No uuid for ebook: ignored")
    continue
    # print ('\r--> range={}/{}'.format(str(range),str(total_num)), "uuid="+uuid, "("+r.json()[id]['title']+")", end='')
    # print (r.json()[id])
    # title= r.json()[id]['title'] if 'title' in r.json()[id] else "<untitled>"
    print("uuid="+uuid, "("+r.json()[id]['title']+")")
    if r_book['authors']:
    print("uuid="+uuid, "("+r_book['title']+" -- "+r_book['authors'][0]+")")
    else:
    print("uuid="+uuid, "("+r_book['title']+")")

    if not force_refresh:
    # print("Checking local metadata:", uuid)
    @@ -271,26 +306,28 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False
    continue

    if not r.json()[id]['formats']:
    print("No format found for {}".format(r.json()[id]['uuid']))
    print("No format found for {}".format(r_book['uuid']))
    range+=1
    continue


    print("Analyzing for:", uuid)

    book={}
    url=api+'book/'+id
    book['title']=r.json()[id]['title']
    book['authors']=r.json()[id]['authors']
    book['series']=r.json()[id]['series']
    book['series']=r.json()[id]['series']
    book['series_index']=r.json()[id]['series_index']
    book['title']=r_book['title']
    book['authors']=r_book['authors']
    book['series']=r_book['series']
    book['series_index']=r_book['series_index']
    book['edition']=0
    book['uuid']=r.json()[id]['uuid']
    book['identifiers']=r.json()[id]['identifiers']
    book['comments']=r.json()[id]['comments']
    book['pubdate']=r.json()[id]['pubdate']
    book['publisher']=r.json()[id]['publisher']
    languages=r.json()[id]['languages']
    book['uuid']=r_book['uuid']
    book['identifiers']=r_book['identifiers']
    book['comments']=r_book['comments']
    book['pubdate']=r_book['pubdate']
    book['publisher']=r_book['publisher']
    languages=r_book['languages']
    if not languages:
    # if True:
    # pass
    print ("Analyzing languages")
    if book['comments']:
    @@ -310,11 +347,11 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False
    for l in languages:
    book['languages'].append(iso639.to_iso639_2(l))

    book['tags']=r.json()[id]['tags']
    book['tags']=r_book['tags']
    book['formats']=[]
    book['metadata_version']=0.1
    source={}
    source['url']=url
    source['url']=url+library
    source['id']=id
    try:
    tmpbook = load_metadata(dir, uuid)
    @@ -326,22 +363,22 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False
    source['status']="ignored"
    else:
    source['status']="todo"
    source['cover']=server+r.json()[id]['cover']
    source['timestamp']=r.json()[id]['timestamp']
    source['cover']=server+r_book['cover']
    source['timestamp']=r_book['timestamp']

    format_sources={}
    formats=r.json()[id]['formats']
    for f in formats:
    s={}
    url=''
    if f in r.json()[id]['main_format']:
    url=r.json()[id]['main_format'][f]
    if f in r_book['main_format']:
    url=r_book['main_format'][f]
    else:
    url=r.json()[id]['other_formats'][f]
    url=r_book['other_formats'][f]
    s['url']=server+url

    if 'size' in r.json()[id]['format_metadata'][f]:
    s['size']=int(r.json()[id]['format_metadata'][f]['size'])
    if 'size' in r_book['format_metadata'][f]:
    s['size']=int(r_book['format_metadata'][f]['size'])
    else:
    print("Size not found for format '{}' : {}".format(f, uuid))
    print("Trying to get size online: {}".format(s['url']))
    @@ -355,6 +392,8 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False

    source['formats']=format_sources
    book['source']=source
    print("Analyzed:", uuid)


    if not source['formats']:
    print("No format found for {}".format(r.json()[id]['uuid']))
    @@ -422,7 +461,7 @@ def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False):
    print ("'{}' todo: expected identifiers {}".format(book['uuid'], expected_identifiers))
    return True

    def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False):
    def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, map="", map_lib=""):
    # all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip']

    if single_format: my_formats = formats if formats else all_ordered_formats
    @@ -446,15 +485,17 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore
    identifiers_count={}

    s = requests.Session()


    for root, dirs, files in os.walk(dir, topdown=True):
    for uuid in dirs:
    for counter, uuid in enumerate(dirs):
    book = load_metadata(root, uuid)
    if book:
    status=book['source']['status']
    if status=="todo":
    print()
    print("-->", uuid, "("+book['title']+")")
    print()
    print("-->", uuid, "("+book['title']+" -- "+book['authors'][0]+" -- serie: "+ str(book['series'])+")")

    if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
    continue
    @@ -466,32 +507,42 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore
    download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
    if not len(download_formats):
    print ("'{}' ignored: no more format available in formats expected {}".format(uuid, download_formats))
    print()
    else:
    ebook_kept=False
    for f in download_formats:
    url = source['formats'][f]['url']
    # if map:
    # pu=urllib.parse.urlparse(url)
    # pu=(pu[0], map, *pu[2:])
    # print(pu)
    # print(urllib.parse.urlunparse(pu))
    if url:
    # # It shouldn't occur: Need to download again
    if get_file_path(dir, uuid, f):
    print ("Format '{}' already present for {}: Skipped".format(f, uuid))
    continue
    print ("Format '{}' already present for {}: Retrying".format(f, uuid))
    print()
    # continue

    print("Format '{}': size expected={}".format(f, hsize(source['formats'][f]['size'])))

    if not dry_run:
    try:
    get_file(dir, book, f, s)
    get_file(dir, book, f, s, map, map_lib)
    book['formats'].append(f)
    time.sleep(0.5)
    book['source']['formats'][f]['status']="done"
    time.sleep(0)
    # except:
    except Exception as msg:
    print("Unable to get book:", url)
    print(msg)
    time.sleep(5)
    continue
    save_metadata(dir, book)

    ebook_kept=True
    size=source['formats'][f]['size']
    total_size += size
    total_size += size
    size_max = size if size>size_max else size_max
    if not size_min:
    size_min = size
    @@ -508,6 +559,7 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore
    total_format_count +=1
    else:
    print ("Format '{}' ignored for {} ({}): No url)".format(f, uuid, book['title']))
    print()
    if ebook_kept:
    total_ebook_count+=1
    if not book['languages']:
    @@ -538,11 +590,15 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore
    if book['source']['status']=="done":
    save_metadata(dir, book)
    print("Book done:", book['uuid'])
    print()
    # total_ebook_count+=1
    else:
    print()
    print("-->", uuid, "("+book['title']+")")
    print ('{} in status "{}": skipped'.format(book['uuid'], status))
    # print()
    # print("-->", uuid, "("+book['title']+")")
    # print ('{} in status "{}": skipped'.format(book['uuid'], status))
    # print(f"--> {uuid} ({book['title']}) in status {status}: skipped", end="\r")
    # print(f"--> {uuid} ({book['title']})", end="\r")
    print(f'--> {counter} books handled', end="\r")

    print()
    print("Total count of updated ebooks:", total_ebook_count)
    @@ -617,66 +673,6 @@ def update_format_statuses(book,refresh_ignored):
    print ("Format '{}' todo: {} ({}))".format(f, book['uuid'], book['title']))
    book['source']['formats'][f]['status']='todo'

    def filter_ebooks(dir= '.', server='', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False):

    if single_format: my_formats = formats if formats else all_ordered_formats
    else: my_formats=formats
    print("formats=", my_formats)

    min_size=int(min_size)*1024*1024
    max_size=int(max_size)*1024*1024
    print ("Format expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))

    # sys.exit()

    total_ebook_count=0
    total_format_count=0

    for root, dirs, files in os.walk(dir, topdown=True):
    for uuid in dirs:
    book = load_metadata(root, uuid)
    if book:
    status=book['source']['status']
    if status=="todo":
    print()
    print("-->", uuid, "("+book['title']+")")

    if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
    book['source']['status']='ignored'
    print ("{} ignored: languages filtered".format(uuid))
    save_metadata(dir, book)
    total_ebook_count+=1
    continue

    if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
    book['source']['status']='ignored'
    print ("{} ignored: identifiers filtered".format(uuid))
    save_metadata(dir, book)
    total_ebook_count+=1
    continue

    download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)

    save_ebook=False
    source=book['source']
    formats_to_ignore=list(set(source['formats'].keys()) - set(book['formats']) - set(download_formats))
    print("formats to ignore:", formats_to_ignore)
    for f in formats_to_ignore:
    if source['formats'][f]['status']!='ignored':
    source['formats'][f]['status']='ignored'
    print ("{} format ignored: '{}'".format(uuid, f))
    total_format_count +=1
    save_ebook=True
    if save_ebook:
    save_metadata(dir, book)
    else:
    print()
    print("-->", uuid, "("+book['title']+")")
    print ('{} in status "{}": skipped'.format(book['uuid'], status))

    print()
    print("Total count of newly ignored ebooks:", total_ebook_count)
    print("Total count of newly formats to ignore:", total_format_count)

    def reset_ignored(dir= '.', server=''):
    for root, dirs, files in os.walk(dir, topdown=True):
    @@ -701,6 +697,5 @@ def reset_ignored(dir= '.', server=''):
    save_metadata(dir, book)



    if __name__ == "__main__":
    fire.Fire()
  6. @Krazybug Krazybug revised this gist Mar 16, 2019. 1 changed file with 31 additions and 127 deletions.
    158 changes: 31 additions & 127 deletions calisuck.py
    Original file line number Diff line number Diff line change
    @@ -10,6 +10,7 @@
    from langid.langid import LanguageIdentifier, model
    import iso639
    import pickle
    import time



    @@ -18,7 +19,7 @@

    def load_metadata(path, uuid):
    filepath=path+'/'+uuid+'/metadata.json'
    print (filepath)
    # print (filepath)
    if os.path.isfile(filepath):
    try:
    with open(filepath, 'r') as fd:
    @@ -62,7 +63,7 @@ def get_cover(path, book):
    url=book['source']['cover']
    print("Downloading cover from:", url)

    r=requests.get(url)
    r=requests.get(url, timeout=10)
    r.raise_for_status()

    filepath=path+'/'+book['uuid']+'/cover.jpg'
    @@ -76,125 +77,26 @@ def get_cover(path, book):
    def download_covers(dir= '.', server=''):
    for root, dirs, files in os.walk(dir, topdown=True):
    for d in dirs:
    print()
    print("-->", d)
    # print()
    # print("-->", d)
    book = load_metadata(root, d)
    if book:
    if book['source']['status'] != "ignored":
    if not get_cover_path(root, book['uuid']):
    print()
    print("-->", d)
    print(book['uuid'])
    try:
    get_cover(root, book)
    except:
    print ("Unable to get cover", book['uuid'])
    else:
    print ("Cover already present:", book['uuid'])
    else:
    print ('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
    else:
    print ("No ebook metadata found in:", root)

    def index_ebooks1(dir= '.', server=''):
    filepath=dir+'/.index'

    if os.path.isfile(filepath):
    index=pickle.load(open(filepath, 'rb'))
    else:
    index = {}
    index['uuids']=[]
    index['identifiers']={}
    index['authors']={}
    index['titles']={}

    for root, dirs, files in os.walk(dir, topdown=True):

    for d in dirs:
    print()
    print("-->", d)
    book = load_metadata(root, d)
    if book:
    if book['source']['status'] == "todo":
    print(book['uuid'])
    if not book['uuid'] in index['uuids']:
    index['uuids'].append(book['uuid'])
    if book['title'] not in index['titles']:
    index['titles'][book['title']] = [book['uuid']]
    elif not book['uuid'] in index['titles'][book['title']]:
    index['titles'][book['title']].append(book['uuid'])
    # index['titles'][book['title']] = index['titles'].get(book['title'], []) + [book['uuid']]
    for a in book["authors"]:
    if a not in index['authors']:
    index['authors'][a] = [book['uuid']]
    elif not book['uuid'] in index['authors'][a]:
    index['authors'][a].append(book['uuid'])
    # index['authors'][a] = index['authors'].get(a, []) + [book['uuid']]
    for k, i in book["identifiers"].items():
    if k not in index['identifiers']:
    index['identifiers'][k]={i:[book['uuid']]}
    elif i not in index['identifiers'][k]:
    index['identifiers'][k][i]=[book['uuid']]
    elif not book['uuid'] in index['identifiers'][k][i]:
    index['identifiers'][k][i].append(book['uuid'])
    # index['identifiers'][k][i]=index['identifiers'].get(k, {}).get(i, []) + [book['uuid']]
    pass
    # print ("Cover already present:", book['uuid'])
    else:
    print ('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
    else:
    print ("No ebook metadata found in:", root)
    print("titles indexed:", len(index['titles']))
    print("authors indexed:", len(index['authors']))
    print("identifiers indexed:", len(index['identifiers']))
    # print("identifiers:",index['identifiers'])
    pickle.dump(index, open(filepath, 'wb'))

    def index_calibre_local(dir= '.', calibre_dir=''):
    filepath=dir+'/.index'

    if os.path.isfile(filepath):
    index=pickle.load(open(filepath, 'rb'))
    else:
    index = {}
    index['identifiers']={}
    index['authors']={}
    index['titles']={}

    for root, dirs, files in os.walk(dir, topdown=True):

    for d in dirs:
    print()
    print("-->", d)
    book = load_metadata(root, d)
    if book:
    if book['source']['status'] == "todo":
    print(book['uuid'])
    if book['title'] not in index['titles']:
    index['titles'][book['title']] = [book['uuid']]
    elif not book['uuid'] in index['titles'][book['title']]:
    index['titles'][book['title']].append(book['uuid'])
    # index['titles'][book['title']] = index['titles'].get(book['title'], []) + [book['uuid']]
    for a in book["authors"]:
    if a not in index['authors']:
    index['authors'][a] = [book['uuid']]
    elif not book['uuid'] in index['authors'][a]:
    index['authors'][a].append(book['uuid'])
    # index['authors'][a] = index['authors'].get(a, []) + [book['uuid']]
    for k, i in book["identifiers"].items():
    if k not in index['identifiers']:
    index['identifiers'][k]={i:[book['uuid']]}
    elif i not in index['identifiers'][k]:
    index['identifiers'][k][i]=[book['uuid']]
    elif not book['uuid'] in index['identifiers'][k][i]:
    index['identifiers'][k][i].append(book['uuid'])
    # index['identifiers'][k][i]=index['identifiers'].get(k, {}).get(i, []) + [book['uuid']]
    else:
    print ('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
    else:
    print ("No ebook metadata found in:", root)
    print("titles indexed:", len(index['titles']))
    print("authors indexed:", len(index['authors']))
    print("identifiers indexed:", len(index['identifiers']))
    # print("identifiers:",index['identifiers'])
    pickle.dump(index, open(filepath, 'wb'))


    def get_file_size(url):
    print("Downloading size:", url)
    @@ -204,13 +106,13 @@ def get_file_size(url):
    print("Size received="+ hsize(size))
    return int(size)

    def get_file(path, book, format):
    def get_file(path, book, format, session):
    uuid = book['uuid']
    url=book['source']['formats'][format]['url']

    print("Downloading ebook:", url)
    print("Size expected (estimation):", hsize(book['source']['formats'][format]['size']))
    r = requests.get(url)
    r = session.get(url, timeout=5)
    # headers = {"Range": "bytes=0-1023"}
    # r = requests.get(url, headers=headers)
    r.raise_for_status()
    @@ -314,7 +216,7 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False
    print("Getting ebooks count:", server)
    print(url)
    try:
    r = requests.get(url)
    r = requests.get(url,verify=False)
    r.raise_for_status()
    except:
    print("Unable to open site:", url)
    @@ -340,7 +242,7 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False
    books_s=",".join(str(i) for i in r.json()['book_ids'])
    url=api+'books'+library+'?ids='+books_s
    print("->", url)
    r=requests.get(url)
    r=requests.get(url, verify=False)
    print(len(r.json()), "received")

    for id in r.json().keys():
    @@ -389,20 +291,20 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False
    book['publisher']=r.json()[id]['publisher']
    languages=r.json()[id]['languages']
    if not languages:
    pass
    # print ("Analyzing languages")
    # if book['comments']:
    # text=book['comments']
    # else:
    # text=book['title']
    # s_language, prob=identifier.classify(text)
    # print (s_language, prob)
    # if prob >= 0.85:
    # language = iso639.to_iso639_2(s_language)
    # print("language=", language)
    # book['languages']=[language]
    # else:
    # book['languages']=[]
    # pass
    print ("Analyzing languages")
    if book['comments']:
    text=book['comments']
    else:
    text=book['title']
    s_language, prob=identifier.classify(text)
    print (s_language, prob)
    if prob >= 0.85:
    language = iso639.to_iso639_2(s_language)
    print("language=", language)
    book['languages']=[language]
    else:
    book['languages']=[]
    else:
    book['languages']=[]
    for l in languages:
    @@ -543,6 +445,8 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore
    language_count={}
    identifiers_count={}

    s = requests.Session()

    for root, dirs, files in os.walk(dir, topdown=True):
    for uuid in dirs:
    book = load_metadata(root, uuid)
    @@ -575,8 +479,9 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore

    if not dry_run:
    try:
    get_file(dir, book, f)
    get_file(dir, book, f, s)
    book['formats'].append(f)
    time.sleep(0.5)
    # except:
    except Exception as msg:
    print("Unable to get book:", url)
    @@ -663,7 +568,6 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore

    def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0):
    print("Accepted formats", accepted_formats)

    source=book['source']
    print("Formats available in source: {}".format(list(source['formats'].keys())))
    my_formats=[]
  7. @Krazybug Krazybug revised this gist Mar 13, 2019. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion calisuck.py
    Original file line number Diff line number Diff line change
    @@ -763,7 +763,7 @@ def filter_ebooks(dir= '.', server='', formats=[], single_format=False, ignored_
    print ("{} format ignored: '{}'".format(uuid, f))
    total_format_count +=1
    save_ebook=True
    if save_ebook:æ
    if save_ebook:
    save_metadata(dir, book)
    else:
    print()
  8. @Krazybug Krazybug renamed this gist Mar 11, 2019. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  9. @Krazybug Krazybug created this gist Mar 11, 2019.
    802 changes: 802 additions & 0 deletions .py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,802 @@
    import sys
    import os
    import time
    import re
    import shutil
    import requests
    import json
    import fire
    from humanize import naturalsize as hsize
    from langid.langid import LanguageIdentifier, model
    import iso639
    import pickle



    all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

    def load_metadata(path, uuid):
        """Load a book's metadata.json from <path>/<uuid>/.

        Returns the parsed dict, or 0 when the file is missing or unreadable
        (callers rely on a falsy return, so 0 is kept rather than None).
        """
        filepath = path + '/' + uuid + '/metadata.json'
        if not os.path.isfile(filepath):
            print("Metadata not found for:", uuid, "from path:", path)
            return 0
        try:
            with open(filepath, 'r') as fd:
                return json.load(fd)
        # Narrowed from a bare except: only I/O and JSON-decoding failures are
        # expected here (json.JSONDecodeError subclasses ValueError); anything
        # else should surface as a bug instead of being silently swallowed.
        except (OSError, ValueError):
            print("Error loading metadata for:", uuid, "from path:", path)
            return 0

    def save_metadata(path, book):
        """Atomically write a book's metadata to <path>/<uuid>/metadata.json.

        The data is first written to a .tmp file and then renamed into place,
        so a crash mid-write never leaves a truncated metadata.json behind.
        """
        filepath = path + '/' + book['uuid'] + '/metadata.json'
        print("Saving book metadata for:", book['uuid'], "to:", filepath)
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath + ".tmp", 'w') as fd:
            json.dump(book, fd, indent=4, separators=(',', ': '))
        try:
            shutil.move(filepath + ".tmp", filepath)
            print("Saved to:", filepath)
        # Narrowed from a bare except: only filesystem errors are expected
        # from the rename; keep the original best-effort message.
        except OSError:
            print("Unable to rename .tmp file:", filepath + ".tmp")


    def get_cover_path(path, uuid):
        """Return the location of a book's cover.jpg, or 0 when absent."""
        cover_file = "{}/{}/cover.jpg".format(path, uuid)
        return cover_file if os.path.isfile(cover_file) else 0

    def get_file_path(path, uuid, fileformat):
        """Return the path of the first file under <path>/<uuid> whose
        extension is '.<fileformat>', or 0 when there is no such file.

        NOTE(review): raises OSError when <path>/<uuid> itself does not
        exist — same as the original; callers only pass known book dirs.
        """
        wanted = '.' + fileformat
        for entry in os.listdir(path + '/' + uuid):
            # Compare the real extension so e.g. 'metadata.json' can never
            # accidentally match a format that is a suffix of 'json'.
            if os.path.splitext(entry)[1] == wanted:
                return path + '/' + uuid + '/' + entry
        return 0

    def get_cover(path, book, timeout=10):
        """Download a book's cover image into <path>/<uuid>/cover.jpg.

        The image is written to a .tmp file and renamed into place so an
        interrupted download never leaves a partial cover. Raises
        requests.HTTPError on a bad status and requests.Timeout when the
        server does not answer within `timeout` seconds.
        """
        url = book['source']['cover']
        print("Downloading cover from:", url)

        # A timeout keeps one stalled server from hanging the whole crawl
        # (the original call could block forever). Default keeps old callers
        # working unchanged.
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()

        filepath = path + '/' + book['uuid'] + '/cover.jpg'
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath + ".tmp", 'wb') as fd:
            fd.write(r.content)
        shutil.move(filepath + ".tmp", filepath)
        print("Saved to:", filepath)


    def download_covers(dir= '.', server=''):
        """Walk `dir` and fetch the missing cover for every non-ignored book.

        NOTE(review): `server` is accepted for CLI symmetry with the other
        commands but unused here — each cover URL comes from the book's own
        metadata.
        """
        for root, dirs, files in os.walk(dir, topdown=True):
            for d in dirs:
                print()
                print("-->", d)
                book = load_metadata(root, d)
                if not book:
                    print("No ebook metadata found in:", root)
                    continue
                if book['source']['status'] == "ignored":
                    print('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
                    continue
                if get_cover_path(root, book['uuid']):
                    print("Cover already present:", book['uuid'])
                    continue
                print(book['uuid'])
                try:
                    get_cover(root, book)
                # Narrowed from a bare except: best-effort per book, but no
                # longer swallows KeyboardInterrupt/SystemExit.
                except Exception:
                    print("Unable to get cover", book['uuid'])

    def index_ebooks1(dir= '.', server=''):
        """Build or update the pickle index of all books in 'todo' status.

        The index (persisted as <dir>/.index) maps uuids, titles, authors and
        identifier values back to the uuids of the books carrying them.
        NOTE(review): `server` is unused; kept for CLI symmetry.
        """
        filepath = dir + '/.index'

        if os.path.isfile(filepath):
            # `with` guarantees the handle is closed even if unpickling
            # fails (the original leaked the file object of a bare open()).
            with open(filepath, 'rb') as fd:
                index = pickle.load(fd)
        else:
            index = {'uuids': [], 'identifiers': {}, 'authors': {}, 'titles': {}}

        for root, dirs, files in os.walk(dir, topdown=True):
            for d in dirs:
                print()
                print("-->", d)
                book = load_metadata(root, d)
                if not book:
                    print("No ebook metadata found in:", root)
                    continue
                if book['source']['status'] != "todo":
                    print('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
                    continue
                print(book['uuid'])
                uuid = book['uuid']
                if uuid not in index['uuids']:
                    index['uuids'].append(uuid)
                _index_value(index['titles'], book['title'], uuid)
                for a in book["authors"]:
                    _index_value(index['authors'], a, uuid)
                for k, i in book["identifiers"].items():
                    # identifiers nest one level deeper: scheme -> value -> uuids
                    _index_value(index['identifiers'].setdefault(k, {}), i, uuid)

        print("titles indexed:", len(index['titles']))
        print("authors indexed:", len(index['authors']))
        print("identifiers indexed:", len(index['identifiers']))
        with open(filepath, 'wb') as fd:
            pickle.dump(index, fd)


    def _index_value(mapping, key, uuid):
        """Append `uuid` to mapping[key] without duplicates, creating the
        bucket on first use."""
        bucket = mapping.setdefault(key, [])
        if uuid not in bucket:
            bucket.append(uuid)

    def index_calibre_local(dir= '.', calibre_dir=''):
        """Build or update the pickle index (<dir>/.index) of 'todo' books,
        mapping titles, authors and identifier values back to book uuids.

        NOTE(review): `calibre_dir` is never used in the body — presumably a
        planned comparison against a local calibre library that was never
        finished; kept for interface compatibility.
        """
        filepath = dir + '/.index'

        if os.path.isfile(filepath):
            # `with` closes the handle even on unpickling errors (the
            # original leaked the file object of a bare open()).
            with open(filepath, 'rb') as fd:
                index = pickle.load(fd)
        else:
            index = {'identifiers': {}, 'authors': {}, 'titles': {}}

        for root, dirs, files in os.walk(dir, topdown=True):
            for d in dirs:
                print()
                print("-->", d)
                book = load_metadata(root, d)
                if not book:
                    print("No ebook metadata found in:", root)
                    continue
                if book['source']['status'] != "todo":
                    print('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
                    continue
                print(book['uuid'])
                uuid = book['uuid']
                titles = index['titles'].setdefault(book['title'], [])
                if uuid not in titles:
                    titles.append(uuid)
                for a in book["authors"]:
                    authors = index['authors'].setdefault(a, [])
                    if uuid not in authors:
                        authors.append(uuid)
                for k, i in book["identifiers"].items():
                    # identifiers nest one level deeper: scheme -> value -> uuids
                    values = index['identifiers'].setdefault(k, {}).setdefault(i, [])
                    if uuid not in values:
                        values.append(uuid)

        print("titles indexed:", len(index['titles']))
        print("authors indexed:", len(index['authors']))
        print("identifiers indexed:", len(index['identifiers']))
        with open(filepath, 'wb') as fd:
            pickle.dump(index, fd)


    def get_file_size(url):
        """Issue a HEAD request and return the Content-Length as an int.

        Raises requests.HTTPError on a bad status and KeyError when the
        server sends no Content-Length header.
        """
        print("Downloading size:", url)
        response = requests.head(url)
        response.raise_for_status()
        content_length = response.headers['Content-Length']
        print("Size received="+ hsize(content_length))
        return int(content_length)

    def get_file(path, book, format):
        """Download one format of a book into <path>/<uuid>/.

        The target name comes from the server's Content-Disposition header
        when present (reduced to its basename), otherwise <uuid>.<format>.
        The payload is written to a .tmp file and renamed into place so an
        interrupted download never leaves a partial ebook behind.
        Raises requests.HTTPError on a bad status.
        """
        uuid = book['uuid']
        url = book['source']['formats'][format]['url']

        print("Downloading ebook:", url)
        print("Size expected (estimation):", hsize(book['source']['formats'][format]['size']))
        r = requests.get(url)
        r.raise_for_status()
        if 'Content-Length' in r.headers:
            print("Size received="+hsize(r.headers['Content-Length']))
        else:
            print("Size received")

        # Two fixes vs the original: .get() avoids a KeyError when the server
        # omits Content-Disposition entirely, and os.path.basename blocks
        # path-traversal names (e.g. '../../evil') supplied by the server.
        filename = re.findall(r'filename="(.*)"', r.headers.get('Content-Disposition', ''))
        if filename:
            filepath = path + '/' + uuid + '/' + os.path.basename(filename[0])
        else:
            filepath = path + '/' + uuid + '/' + uuid + "." + format

        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath + ".tmp", 'wb') as fd:
            fd.write(r.content)
        shutil.move(filepath + ".tmp", filepath)
        print("Saved to:", filepath)


    def set_status(uuid, status, dir='.'):
        """Set the source status (e.g. 'todo', 'done', 'ignored') of one local ebook.

        Loads the metadata record for *uuid* under *dir*, updates and saves it
        only when the status actually changes.
        """
        book = load_metadata(dir, uuid)
        if book:
            if book['source']['status'] != status:
                book['source']['status'] = status
                save_metadata(dir, book)
                print("Status changed to", status + ":", book['uuid'])
            else:
                # fixed garbled message: was "Status unchanged changed "
                print("Status unchanged", status + ":", book['uuid'])
        else:
            print("No ebook metadata found for:", uuid)



    def remove_book(uuid, path='.'):
        """Delete the local directory (metadata and files) of the ebook *uuid*."""
        print(os.getcwd())
        bookpath = path + '/' + uuid
        if os.path.isdir(bookpath):
            try:
                shutil.rmtree(bookpath)
                print(uuid, "removed")
            except OSError as e:
                # fixed: the bare except printed only "problem" with no detail
                print("problem:", e)
        else:
            print(uuid, "not found")



    def explore(site, help=False):
        """Print every library of a Calibre content server with its ebook count.

        site: base URL of the server (trailing slash expected, e.g. "http://host/").
        help: unused; kept for CLI compatibility.
        Exits the process when the server cannot be reached at all.
        """
        server = site
        api = server + 'ajax/'
        print("Server:", server)
        url = api + 'library-info'
        print()
        print("Getting libraries:", server)
        print(url)
        try:
            r = requests.get(url)
            r.raise_for_status()
        # fixed: bare except also caught KeyboardInterrupt/SystemExit
        except requests.exceptions.RequestException:
            print("Unable to open site:", url)
            sys.exit(1)

        libraries = r.json()["library_map"].keys()
        print("Libraries:")
        for l in libraries:
            # Query each library with num=0: only the total count is returned.
            library = '/' + l
            url = api + 'search' + library + '?num=0'
            try:
                r = requests.get(url)
                r.raise_for_status()
            except requests.exceptions.RequestException:
                print("Unable to open site:", url)
                continue
            print("\t{}: {} ebooks".format(l, r.json()["total_num"]))


    def update_done_status(book):
        """Mark *book* 'done' when every source format has been downloaded, else 'todo'.

        Books already flagged 'ignored' are left untouched.
        """
        src = book['source']
        if src['status'] == 'ignored':
            return
        wanted = set(src['formats'].keys())
        # 'done' iff every format offered by the source is among the downloaded ones.
        if wanted.issubset(set(book['formats'])):
            src['status'] = "done"
        else:
            src['status'] = "todo"


    def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False):
        """Crawl a Calibre content server and store one local metadata record per ebook.

        site: base URL of the server (trailing slash expected, e.g. "http://host/").
        library: optional library name used to scope the ajax endpoints.
        start, stop: 1-based slice of the catalogue to index (0 = no bound).
        dir: destination directory; records are written via save_metadata().
        force_refresh: re-fetch metadata even when a local record already exists.
        """
        offset= 0 if not start else start-1
        num=500  # page size for the id-listing requests
        server=site
        api=server+'ajax/'
        #api=server+'calibre/ajax/'
        library= '/'+library if library else library

        print("Server:", server)
        # A search with num=0 returns only the total number of ebooks.
        url=api+'search'+library+'?num=0'
        print()
        print("Getting ebooks count:", server)
        print(url)
        try:
            r = requests.get(url)
            r.raise_for_status()
        except:  # NOTE(review): bare except also swallows KeyboardInterrupt
            print("Unable to open site:", url)
            sys.exit(1)
        print("Total count=",r.json()["total_num"])
        total_num=int(r.json()["total_num"])
        total_num= total_num if not stop else stop

        range=offset+1  # NOTE(review): 1-based progress counter; shadows the builtin 'range'
        while offset < total_num:
            remaining_num = min(num, total_num - offset)
            print()
            print("Downloading ids: offset="+str(offset), "num="+str(remaining_num))
            # Newest first, so freshly added books are indexed early.
            url=api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'

            print("->", url)
            r=requests.get(url)
            print("Ids received from:"+str(offset), "to:"+str(offset+remaining_num-1))

            print()
            print("Downloading metadata from", str(offset+1), "to", str(offset+remaining_num))
            # Fetch the metadata of the whole id page in a single request.
            books_s=",".join(str(i) for i in r.json()['book_ids'])
            url=api+'books'+library+'?ids='+books_s
            print("->", url)
            r=requests.get(url)
            print(len(r.json()), "received")

            for id in r.json().keys():
                print()
                print ('--> range={}/{}'.format(str(range),str(total_num)))
                uuid=r.json()[id]['uuid']
                if not uuid:
                    # The uuid names the local directory, so a book without one is unusable.
                    print ("No uuid for ebook: ignored")
                    continue
                print("uuid="+uuid, "("+r.json()[id]['title']+")")

                if not force_refresh:
                    # Skip books already indexed locally.
                    try:
                        book = load_metadata(dir, uuid)
                    except:
                        print("Unable to get metadata from:", uuid)
                        range+=1
                        continue
                    if book:
                        print("Metadata already present for:", uuid)
                        range+=1
                        continue

                if not r.json()[id]['formats']:
                    print("No format found for {}".format(r.json()[id]['uuid']))
                    range+=1
                    continue

                # Build the local metadata record from the server response.
                book={}
                url=api+'book/'+id
                book['title']=r.json()[id]['title']
                book['authors']=r.json()[id]['authors']
                book['series']=r.json()[id]['series']
                book['series']=r.json()[id]['series']  # NOTE(review): duplicated assignment in the original
                book['series_index']=r.json()[id]['series_index']
                book['edition']=0
                book['uuid']=r.json()[id]['uuid']
                book['identifiers']=r.json()[id]['identifiers']
                book['comments']=r.json()[id]['comments']
                book['pubdate']=r.json()[id]['pubdate']
                book['publisher']=r.json()[id]['publisher']
                languages=r.json()[id]['languages']
                if not languages:
                    # NOTE(review): no 'languages' key is stored in this branch;
                    # has_languages() patches the missing key downstream.
                    pass
                else:
                    # Normalize to ISO-639-2 codes.
                    book['languages']=[]
                    for l in languages:
                        book['languages'].append(iso639.to_iso639_2(l))

                book['tags']=r.json()[id]['tags']
                book['formats']=[]  # formats actually downloaded (filled by download_ebooks)
                book['metadata_version']=0.1
                source={}
                source['url']=url
                source['id']=id
                try:
                    tmpbook = load_metadata(dir, uuid)
                except:
                    print("Unable to get metadata from:", uuid)
                    range+=1
                    continue
                # Preserve a locally set 'ignored' status across refreshes.
                if tmpbook and tmpbook['source']['status']=="ignored":
                    source['status']="ignored"
                else:
                    source['status']="todo"
                source['cover']=server+r.json()[id]['cover']
                source['timestamp']=r.json()[id]['timestamp']

                # Record the download url and size of every format offered by the server.
                format_sources={}
                formats=r.json()[id]['formats']
                for f in formats:
                    s={}
                    url=''
                    if f in r.json()[id]['main_format']:
                        url=r.json()[id]['main_format'][f]
                    else:
                        url=r.json()[id]['other_formats'][f]
                    s['url']=server+url

                    if 'size' in r.json()[id]['format_metadata'][f]:
                        s['size']=int(r.json()[id]['format_metadata'][f]['size'])
                    else:
                        # Fall back to a HEAD request when the server gives no size.
                        print("Size not found for format '{}' : {}".format(f, uuid))
                        print("Trying to get size online: {}".format(s['url']))
                        try:
                            s['size']=get_file_size(s['url'])
                        except:
                            print("Unable to access format '{}' : {} skipped".format(f, uuid))
                            continue
                    s['status']='todo'
                    format_sources[f]=s

                source['formats']=format_sources
                book['source']=source

                if not source['formats']:
                    print("No format found for {}".format(r.json()[id]['uuid']))
                    range+=1
                    continue
                update_done_status(book)
                print("Saving metadata for:", uuid)
                try:
                    save_metadata(dir, book)
                except:
                    print("Unable to save book metadata", book['uuid'])
                range+=1
            offset=offset+num



    def has_languages(book, languages=[], ignore_empty_language=False):
        """Return True when *book* passes the language filter.

        languages: accepted language codes; an empty list accepts every language.
        ignore_empty_language: when True, books with no language are rejected.
        Side effect: a missing 'languages' key is patched to [] on *book*.
        Note: the default list is never mutated.
        """
        print("Accepted languages", languages)
        if not ignore_empty_language:
            print("Unknown language accepted")

        # rustine: older metadata records may lack the key entirely
        if 'languages' not in book:
            book['languages'] = []

        print("Book languages", book['languages'])

        if not book['languages']:
            if ignore_empty_language:
                print ("'{}' ignored: language is empty".format(book['uuid']))
                return False
            print ("'{}' todo: language is empty".format(book['uuid']))
            return True

        matching = list(set(book['languages']) & set(languages))
        if languages and not matching:
            print ("'{}' ignored: language {} not in {}".format(book['uuid'], book['languages'],languages))
            return False

        print ("'{}' todo: expected languages {}".format(book['uuid'], matching))
        return True

    def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False):
        """Return True when *book* passes the identifier filter.

        identifiers: accepted identifier kinds (e.g. 'isbn'); empty accepts all.
        ignore_empty_identifiers: when True, books without identifiers are rejected.
        Note: the default list is never mutated.
        """
        print("Accepted identifiers", identifiers)
        if not ignore_empty_identifiers:
            print("Unknown identifiers accepted")
        print("Book identifiers", book['identifiers'].keys())

        if not book['identifiers']:
            if ignore_empty_identifiers:
                print ("'{}' ignored: identifier is empty".format(book['uuid']))
                return False
            print ("'{}' todo: identifiers is empty".format(book['uuid']))
            return True

        matching = list(set(book['identifiers'].keys()) & set(identifiers))
        if identifiers and not matching:
            print ("'{}' ignored: identifiers {} not in {}".format(book['uuid'], book['identifiers'].keys(), identifiers))
            return False

        print ("'{}' todo: expected identifiers {}".format(book['uuid'], matching))
        return True

    def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False):
        """Download the selected formats of every local ebook whose status is 'todo'.

        Walks the per-uuid metadata directories under *dir*, filters each book by
        language and identifier, filters its formats by name and size (min_size and
        max_size are given in MB), downloads the surviving formats, then prints
        aggregate statistics. *server* is accepted but unused in this function.
        dry_run: run the whole selection and statistics without fetching any file.
        NOTE(review): the mutable default arguments ([]) are never mutated here.
        """
        # all_ordered_formats=['azw', 'azw3', 'cbr', ...]  (the active definition lives at module level)
        if single_format: my_formats = formats if formats else all_ordered_formats
        else: my_formats=formats
        print("formats=", my_formats)

        # CLI sizes are expressed in MB; comparisons below are in bytes.
        min_size=int(min_size)*1024*1024
        max_size=int(max_size)*1024*1024
        print ("Format expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))

        # Aggregates for the final report.
        total_size=0
        total_size_by_format={}
        total_ebook_count=0
        total_format_count=0
        total_count_by_format={}
        size_max=0
        size_min=0
        language_count={}
        identifiers_count={}

        for root, dirs, files in os.walk(dir, topdown=True):
            for uuid in dirs:  # one sub-directory per ebook uuid
                book = load_metadata(root, uuid)
                if book:
                    status=book['source']['status']
                    if status=="todo":
                        print()
                        print("-->", uuid, "("+book['title']+")")

                        if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
                            continue

                        if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
                            continue

                        source=book['source']
                        download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
                        if not len(download_formats):
                            print ("'{}' ignored: no more format available in formats expected {}".format(uuid, download_formats))
                        else:
                            ebook_kept=False  # becomes True once at least one format is kept
                            for f in download_formats:
                                url = source['formats'][f]['url']
                                if url:
                                    if get_file_path(dir, uuid, f):
                                        # The file already exists on disk: do not fetch it again.
                                        print ("Format '{}' already present for {}: Skipped".format(f, uuid))
                                        continue

                                    print("Format '{}': size expected={}".format(f, hsize(source['formats'][f]['size'])))

                                    if not dry_run:
                                        try:
                                            get_file(dir, book, f)
                                            book['formats'].append(f)
                                        except Exception as msg:
                                            print("Unable to get book:", url)
                                            print(msg)
                                            continue
                                        # Persist after each successful download so progress survives interruptions.
                                        save_metadata(dir, book)

                                    # Statistics are updated even in dry-run mode.
                                    ebook_kept=True
                                    size=source['formats'][f]['size']
                                    total_size += size
                                    size_max = size if size>size_max else size_max
                                    if not size_min:
                                        size_min = size
                                    else:
                                        size_min = size if size<size_min else size_min

                                    if not f in total_size_by_format:
                                        total_size_by_format[f] = size
                                    else:
                                        total_size_by_format[f] +=size
                                    if not f in total_count_by_format:
                                        total_count_by_format[f] = 1
                                    else:
                                        total_count_by_format[f]+=1
                                    total_format_count +=1
                                else:
                                    print ("Format '{}' ignored for {} ({}): No url)".format(f, uuid, book['title']))
                            if ebook_kept:
                                # Per-language / per-identifier counters for the final report.
                                total_ebook_count+=1
                                if not book['languages']:
                                    if not '<unknown>' in language_count:
                                        language_count['<unknown>'] = 1
                                    else:
                                        language_count['<unknown>']+=1
                                else:
                                    for l in book['languages']:
                                        if not l in language_count:
                                            language_count[l] = 1
                                        else:
                                            language_count[l]+=1
                                if not book['identifiers']:
                                    if not '<unknown>' in identifiers_count:
                                        identifiers_count['<unknown>'] = 1
                                    else:
                                        identifiers_count['<unknown>']+=1
                                else:
                                    for l in book['identifiers'].keys():
                                        if not l in identifiers_count:
                                            identifiers_count[l] = 1
                                        else:
                                            identifiers_count[l]+=1

                        if not dry_run:
                            # Flip the book to 'done' once every source format is present.
                            update_done_status(book)
                            if book['source']['status']=="done":
                                save_metadata(dir, book)
                                print("Book done:", book['uuid'])
                    else:
                        print()
                        print("-->", uuid, "("+book['title']+")")
                        print ('{} in status "{}": skipped'.format(book['uuid'], status))

        print()
        print("Total count of updated ebooks:", total_ebook_count)
        print("Total ebooks updated by language:")
        for l, c in language_count.items():
            print(" '{}': {}".format(l, c))
        print("Total ebooks updated by identifiers:")
        for l, c in identifiers_count.items():
            print(" '{}': {}".format(l, c))
        print("Total count of formats:", total_format_count)
        print("Total count of ebooks by format:")
        for f, c in total_count_by_format.items():
            print("\t'{}': {}".format(f, c))
        print()
        print("Total size:", hsize(total_size))
        print("Maximum file size:", hsize(size_max))
        print("Minimum file size:", hsize(size_min))
        print("Total size by format:")
        for f, s in total_size_by_format.items():
            print("\t'{}': {}".format(f, hsize(s)))



    def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0):
        """Return the list of source formats of *book* that should be downloaded.

        Only formats whose source status is 'todo' are considered; they are then
        filtered by the accepted/ignored lists and by the byte-size bounds
        (max_size == 0 means no upper bound). With single_format=True,
        *accepted_formats* is an ordered preference list and the first entry
        available for the book wins.
        """
        print("Accepted formats", accepted_formats)

        source = book['source']
        print("Formats available in source: {}".format(list(source['formats'].keys())))
        my_formats = []
        for f, v in source['formats'].items():
            if v['status'] == 'todo':
                my_formats.append(f)
        print("Formats in 'todo': {}".format(my_formats))

        formats = []
        if single_format:
            if accepted_formats:
                # Keep only the most-preferred format the book actually offers.
                for f in accepted_formats:
                    if f in my_formats:
                        formats = [f]
                        break
            else:
                print("need at least 1 format for ordering")
        else:
            if accepted_formats:
                formats = list(set(accepted_formats) & set(my_formats))
            elif ignored_formats:
                formats = list(set(my_formats) - set(ignored_formats))
            else:
                formats = my_formats

        print("Formats expected: {}".format(formats))

        # Size filtering: iterate over a copy so removals are safe.
        download_formats = formats[:]
        for f in formats:
            if 'size' not in source['formats'][f]:
                # Fixed: the original fell through and read the missing 'size' key
                # (KeyError) whenever max_size was 0. An unknown size is now only
                # a problem when an upper bound must be enforced.
                if max_size:
                    print ("Format '{}' ignored for {}: Size unknown".format(f, book['uuid']))
                    download_formats.remove(f)
                continue
            size = source['formats'][f]['size']
            if size < min_size or (max_size and size > max_size):
                download_formats.remove(f)
                print ("Format '{}' ignored for {}: size={} but expected between {} and {}".format(f, book['uuid'], hsize(size), hsize(min_size), hsize(max_size) if max_size else "infinity"))
        return download_formats


    def update_format_statuses(book,refresh_ignored):
        """Reset every source format of *book* to 'todo'.

        Formats already flagged 'ignored' are preserved unless *refresh_ignored*
        is true, in which case they are reset as well.
        """
        for name, meta in book['source']['formats'].items():
            keep_ignored = meta['status'] == 'ignored' and not refresh_ignored
            if keep_ignored:
                print ("Format '{}' ignored: {} ({}))".format(name, book['uuid'], book['title']))
            else:
                print ("Format '{}' todo: {} ({}))".format(name, book['uuid'], book['title']))
                book['source']['formats'][name]['status'] = 'todo'

    def filter_ebooks(dir= '.', server='', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False):
        """Flag local ebook metadata as 'ignored' according to the given filters.

        Walks every per-uuid metadata directory under *dir* and, for books still
        in status 'todo', marks the whole book 'ignored' when its languages or
        identifiers fail the filters, and marks individual formats 'ignored' when
        they are not selected by the format/size filters (sizes in MB).
        *server* is accepted for CLI symmetry but unused here.
        """
        if single_format:
            my_formats = formats if formats else all_ordered_formats
        else:
            my_formats = formats
        print("formats=", my_formats)

        # CLI sizes are expressed in MB; internal comparisons are in bytes.
        min_size = int(min_size) * 1024 * 1024
        max_size = int(max_size) * 1024 * 1024
        print ("Format expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))

        total_ebook_count = 0
        total_format_count = 0

        for root, dirs, files in os.walk(dir, topdown=True):
            for uuid in dirs:
                book = load_metadata(root, uuid)
                if not book:
                    continue
                status = book['source']['status']
                if status != "todo":
                    print()
                    print("-->", uuid, "("+book['title']+")")
                    print ('{} in status "{}": skipped'.format(book['uuid'], status))
                    continue

                print()
                print("-->", uuid, "("+book['title']+")")

                if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
                    book['source']['status'] = 'ignored'
                    print ("{} ignored: languages filtered".format(uuid))
                    save_metadata(dir, book)
                    total_ebook_count += 1
                    continue

                if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
                    book['source']['status'] = 'ignored'
                    print ("{} ignored: identifiers filtered".format(uuid))
                    save_metadata(dir, book)
                    total_ebook_count += 1
                    continue

                download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)

                save_ebook = False
                source = book['source']
                # Ignore source formats neither already downloaded nor selected above.
                formats_to_ignore = list(set(source['formats'].keys()) - set(book['formats']) - set(download_formats))
                print("formats to ignore:", formats_to_ignore)
                for f in formats_to_ignore:
                    if source['formats'][f]['status'] != 'ignored':
                        source['formats'][f]['status'] = 'ignored'
                        print ("{} format ignored: '{}'".format(uuid, f))
                        total_format_count += 1
                        save_ebook = True
                # Fixed: a stray 'æ' after this colon made the original a syntax error.
                if save_ebook:
                    save_metadata(dir, book)

        print()
        print("Total count of newly ignored ebooks:", total_ebook_count)
        print("Total count of newly formats to ignore:", total_format_count)

    def reset_ignored(dir= '.', server=''):
        """Reset every 'ignored' book status and format status under *dir* back to 'todo'.

        *server* is accepted for CLI symmetry but unused here.
        """
        for root, dirs, files in os.walk(dir, topdown=True):
            for uuid in dirs:
                book = load_metadata(root, uuid)
                if not book:
                    continue
                dirty = False
                if book['source']['status'] == "ignored":
                    print ("'{}' status 'ignored' reset to 'todo'".format(book['uuid']))
                    book['source']['status'] = 'todo'
                    dirty = True

                for fmt, meta in book['source']['formats'].items():
                    if meta['status'] == 'ignored':
                        print ("'{}' format 'ignored' reset to 'todo'".format(book['uuid']))
                        book['source']['formats'][fmt]['status'] = 'todo'
                        dirty = True

                if dirty:
                    save_metadata(dir, book)



    if __name__ == "__main__":
        # Expose every module-level function as a CLI sub-command via python-fire.
        fire.Fire()