-
-
Save skmezanul/96b22f9ac04c4a74f2c6896a7ecbf5c2 to your computer and use it in GitHub Desktop.
Revisions
-
Krazybug revised this gist
Dec 30, 2019 . 1 changed file with 15 additions and 14 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -682,20 +682,6 @@ def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_fo print() print("Reporting ...") print() print("Total ebooks updated by language:") @@ -729,6 +715,21 @@ def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_fo table.append_row([f, hsize(s)]) print(table) print() table = BeautifulTable() table.column_headers = ["", "Total count"] table.append_row(["Formats", total_format_count]) table.append_row(["Ebooks", total_ebook_count]) print(table) print() table = BeautifulTable() table.column_headers = ["", "Size"] table.append_row(["Min", hsize(size_min)]) table.append_row(["Max", hsize(size_max)]) table.append_row(["Total", hsize(total_size)]) print(table) print() print("Done !!!") -
Krazybug revised this gist
Dec 30, 2019 . 1 changed file with 88 additions and 5 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,3 +1,44 @@ #!/usr/bin/env python3 ''' calisuck: index, filter-out smartly and download ebooks from Calibre open directories Installation: You need python 3.5 installed Download the file as a zip and unzip-it and get into the dir OR > git clone https://gist.github.com/b7e814d7189db9ee1d6b9c1d1a1de95c.git > mv b7e814d7189db9ee1d6b9c1d1a1de95c calisuck > cd calisuck > THEN > python3 -m venv . > . bin/activate > pip install requests fire humanize langid iso639 beautifultable > python calisuck.py --help > python calisuck.py index-ebooks --help > python calisuck.py download-ebooks --help > python calisuck.py download-covers --help ''' ''' DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE Version 2, December 2004 Copyright (C) 2004 Sam Hocevar <[email protected]> Everyone is permitted to copy and distribute verbatim or modified copies of this license document, and changing it is allowed as long as the name is changed. DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. You just DO WHAT THE FUCK YOU WANT TO. ''' import sys import os import time @@ -223,7 +264,7 @@ def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, fo --library=<string> (default=my_books) : Id of library to index. The script index the default library by default. The id is string following '&library_id=' in the url --force-refresh (defaul=False) : Force a refresh of the metadata. 
By default all the metdata already gathered are ignored --start=<int> (default=0) @@ -236,8 +277,8 @@ def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, fo offset= 0 if not start else start-1 num=min(1000,inc) server=site.rstrip('/') api=server+'/ajax/' library= '/'+library if library else library print("Server:", server) @@ -460,11 +501,53 @@ def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False): return True def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, map="", map_lib=""): ''' Download ebooks in matching subdirs: The different formats of the same book are groupe in the same directory with an UUID name close to the metadata file (metadata.json). The status of the formats for a book and its global status are initially set to 'todo'. They move to 'done' after their download. This allows you to rerun the download and progressively collect books. You can use different options to filter the formats for the download by language, size, format and identifiers(isbn, ...). A report of the download is displayed at the end of the process. You can run this command in dry mode (--dry-run) with different settings to only display the report and prepare your effective. Params: --min-size=<int> (default=0) --max-size=<int> (default=infinity) : Delimit the size in MB for the accepted formats --dry-run (defaul=False) : Run the command to simulate the download --language=<string> : Restrict the download to a list of specific languages (Ex: --languages='["eng","ita"]' --ignore-empty-language (defaul=False) : Ignore books with unidentfied language --formats=<string> : Restrict the download to a list of specific formats (Ex: --formats='["epub", "mobi", "pdf"]' --ignore-formats=<string> : Ignore the formats of a list of specific. Compliant with --formats. 
(Ex: --ignored-formats='["mp3", "rar", "zip"]' --single-format (defaul=False) : Limit the download to 1 format per book with this preference order 'azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar' , 'rtf', 'txt', 'zip', 'fb2' --identifiers=<string> : Restrict the download to a list of specific identifiers (Ex: --identifiers='["isbn","asin"]' --ignore-empty-identifiers (defaul=False) : Ignore books without identifiers (often OCR) ''' # all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip'] print() if single_format: my_formats = formats if formats else all_ordered_formats else: my_formats=formats # print("formats=", my_formats) min_size=int(min_size)*1024*1024 max_size=int(max_size)*1024*1024 @@ -521,7 +604,7 @@ def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_fo # print("Format '{}': size expected={}".format(f, hsize(source['formats'][f]['size']))) pass # print(f"--> format '{f}' for ({book['title']} / {book['authors'][0]} / {str(book['series'])})") if not dry_run: try: get_file(dir, book, f, s, map, map_lib) -
Krazybug revised this gist
Dec 30, 2019 . 1 changed file with 159 additions and 142 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -9,16 +9,19 @@ from humanize import naturalsize as hsize from langid.langid import LanguageIdentifier, model import iso639 import time from requests.adapters import HTTPAdapter import urllib.parse import urllib3 from beautifultable import BeautifulTable urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2'] identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) def load_metadata(path, uuid): filepath=path+'/'+uuid+'/metadata.json' # print (filepath) @@ -30,18 +33,19 @@ def load_metadata(path, uuid): print ("Error loading metadata for:", uuid, "from path:", path) return 0 else: # print ("Metadata not found for:", uuid, "from path:", path) return 0 def save_metadata(path, book): filepath=path+'/'+book['uuid']+'/metadata.json' # print("Saving book metadata for:", book['uuid'], "to:", filepath) os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True) with open(filepath+".tmp", 'w') as fd: json.dump(book, fd, indent=4, separators=(',', ': ')) try: shutil.move(filepath+".tmp", filepath) # print("Saved to:", filepath) except: print("Unable to rename .tmp file:", filepath+".tmp") @@ -51,6 +55,7 @@ def get_cover_path(path, uuid): if os.path.isfile(filepath): return filepath else: return 0 def get_file_path(path, uuid, fileformat): files=os.listdir(path+'/'+uuid) if files: @@ -83,7 +88,9 @@ def get_cover(path, book, map): print("Saved to:", filepath) def download_covers(dir='my_books', server='', map=""): """ Download covers for each books""" for 
root, dirs, files in os.walk(dir, topdown=True): for d in dirs: # print() @@ -108,6 +115,7 @@ def download_covers(dir= '.', server='', map=""): else: print ("No ebook metadata found in:", root) def get_file_size(url): print("Downloading size:", url) r = requests.head(url, verify=False) @@ -116,6 +124,7 @@ def get_file_size(url): print("Size received="+ hsize(size)) return int(size) def get_file(path, book, format, session, map, map_lib): uuid = book['uuid'] url=book['source']['formats'][format]['url'] @@ -173,7 +182,6 @@ def set_status(uuid, status, dir='.'): print("Status unchanged changed ", status+":", book['uuid']) else: print ("No ebook metadata found for:", uuid) def remove_book(uuid, path='.'): @@ -189,36 +197,6 @@ def remove_book(uuid, path='.'): print(uuid, "not found") def update_done_status(book): source=book['source'] if source['status']!='ignored': @@ -228,19 +206,44 @@ def update_done_status(book): book['source']['status']="todo" def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, force_refresh=False): """ Index a remote Calibre library You will get in your <dir> all the metadata (title, authors, isbn, ...) for each book. They're stored as simple JSON files (metadata.json) so that you can easily visualize them or process them with 'jq' program. They are stored in subdirectories with a UUID as a name. These directories do match different books and allow you to group all the different formats of the same book and eventually the cover file. You can mix books from different sites without any (theoric) collisions Params: --site=<string> : Url of the site to index (ex: http://123.123.123.123/) --library=<string> (default=my_books) : Id of library to index. The script index the default library by default. The id is string following '&library_id=' in the url --force_refresh (defaul=False) : Force a refresh of the metadata. 
By default all the metdata already gathered are ignored --start=<int> (default=0) --stop=<int> (default=0) : Allow indexing between a range of ebooks --inc=<int> (default=1000) : Fix the number of ebooks for each request one the server """ os.makedirs(dir, exist_ok=True) offset= 0 if not start else start-1 num=min(1000,inc) server=site api=server+'ajax/' library= '/'+library if library else library print("Server:", server) url=api+'search'+library+'?num=0' print() print("Getting ebooks count:", server) try: r = requests.get(url,verify=False) r.raise_for_status() @@ -251,63 +254,63 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False total_num=int(r.json()["total_num"]) total_num= total_num if not stop else stop print() print("Start indexing") range=offset+1 while offset < total_num: remaining_num = min(num, total_num - offset) # print() # print("Downloading ids: offset="+str(offset), "num="+str(remaining_num)) url=api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc' # print("->", url) r=requests.get(url, verify=False) # print("Ids received from:"+str(offset), "to:"+str(offset+remaining_num-1)) # print() # print("\rDownloading metadata from", str(offset+1), "to", str(offset+remaining_num),end='') books_s=",".join(str(i) for i in r.json()['book_ids']) url=api+'books'+library+'?ids='+books_s # print("->", url) r=requests.get(url, verify=False) # print(len(r.json()), "received") for id, r_book in r.json().items(): uuid=r_book['uuid'] if not uuid: print ("No uuid for ebook: ignored") continue if r_book['authors']: desc= f"uuid={uuid} ({r_book['title']} / {r_book['authors'][0]})" else: desc= f"uuid={uuid} ({r_book['title']})" s=f"\r--> {range}/{total_num} - {desc}" s='{:140.140}'.format(s) print (s, end='') if not force_refresh: try: book = load_metadata(dir, uuid) except: print() print("Unable to get metadata from:", uuid) range+=1 continue if book: # print("Metadata already present 
for:", uuid) range+=1 continue if not r_book['formats']: print() print("No format found for {}".format(r_book['uuid'])) range+=1 continue book={} url=api+'book/'+id book['title']=r_book['title'] @@ -323,17 +326,13 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False languages=r_book['languages'] if not languages: # if True: if book['comments']: text=book['comments'] else: text=book['title'] s_language, prob=identifier.classify(text) if prob >= 0.85: language = iso639.to_iso639_2(s_language) book['languages']=[language] else: book['languages']=[] @@ -362,7 +361,7 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False source['timestamp']=r_book['timestamp'] format_sources={} formats=r_book['formats'] for f in formats: s={} url='' @@ -375,6 +374,7 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False if 'size' in r_book['format_metadata'][f]: s['size']=int(r_book['format_metadata'][f]['size']) else: print() print("Size not found for format '{}' : {}".format(f, uuid)) print("Trying to get size online: {}".format(s['url'])) try: @@ -387,76 +387,79 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False source['formats']=format_sources book['source']=source if not source['formats']: print("No format found for {}".format(r_book['uuid'])) range+=1 continue update_done_status(book) # print("Saving metadata for:", uuid) try: save_metadata(dir, book) except: print() print("Unable to save book metadata", book['uuid']) range+=1 offset=offset+num print() print("Done") def has_languages(book, languages=[], ignore_empty_language=False): # print("Accepted languages", languages) if not ignore_empty_language: # print("Unknown language accepted") pass # rustine if not 'languages' in book: book['languages']=[] # print("Book languages", book['languages']) if ignore_empty_language and not book['languages']: # print ("'{}' ignored: language is empty".format(book['uuid'])) 
return False if not ignore_empty_language and not book['languages']: # print ("'{}' todo: language is empty".format(book['uuid'])) return True expected_languages=list(set(book['languages']) & set(languages)) if languages and not expected_languages: # print ("'{}' ignored: language {} not in {}".format(book['uuid'], book['languages'],languages)) return False # print ("'{}' todo: expected languages {}".format(book['uuid'], expected_languages)) return True def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False): # print("Accepted identifiers", identifiers) if not ignore_empty_identifiers: # print("Unknown identifiers accepted") pass # print("Book identifiers", book['identifiers'].keys()) if ignore_empty_identifiers and not book['identifiers']: # print ("'{}' ignored: identifier is empty".format(book['uuid'])) return False if not ignore_empty_identifiers and not book['identifiers']: # print ("'{}' todo: identifiers is empty".format(book['uuid'])) return True expected_identifiers=list(set(book['identifiers'].keys()) & set(identifiers)) if identifiers and not expected_identifiers: # print ("'{}' ignored: identifiers {} not in {}".format(book['uuid'], book['identifiers'].keys(), identifiers)) return False # print ("'{}' todo: expected identifiers {}".format(book['uuid'], expected_identifiers)) return True def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, map="", map_lib=""): # all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip'] if single_format: my_formats = formats if formats else all_ordered_formats @@ -467,8 +470,6 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore max_size=int(max_size)*1024*1024 print ("Format expected between {} and 
{}".format(hsize(min_size), hsize(max_size) if max_size else "infinity")) total_size=0 total_size_by_format={} total_ebook_count=0 @@ -488,9 +489,6 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore if book: status=book['source']['status'] if status=="todo": if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language): continue @@ -501,8 +499,9 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore source=book['source'] download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size) if not len(download_formats): # print ("'{}' ignored: no more format available in formats expected {}".format(uuid, download_formats)) # print() pass else: ebook_kept=False for f in download_formats: @@ -515,19 +514,20 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore if url: # # It shouldn't occur: Need to download again if get_file_path(dir, uuid, f): # print ("Format '{}' already present for {}: Retrying".format(f, uuid)) # print() # continue # print("Format '{}': size expected={}".format(f, hsize(source['formats'][f]['size']))) pass print(f"--> format '{f}' for ({book['title']} / {book['authors'][0]} / {str(book['series'])})") if not dry_run: try: get_file(dir, book, f, s, map, map_lib) book['formats'].append(f) book['source']['formats'][f]['status']="done" time.sleep(0) except Exception as msg: print("Unable to get book:", url) print(msg) @@ -553,8 +553,9 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore total_count_by_format[f]+=1 total_format_count +=1 else: # print ("Format '{}' ignored for {} ({}): No url)".format(f, uuid, book['title'])) # print() pass if ebook_kept: total_ebook_count+=1 if not book['languages']: @@ -596,36 +597,68 @@ def download_ebooks(dir= '.', server='', formats=[], 
single_format=False, ignore print(f'--> {counter} books handled', end="\r") print() print("Reporting ...") print() table = BeautifulTable() table.column_headers = ["", "Total count"] table.append_row(["Formats", total_format_count]) table.append_row(["Ebooks", total_ebook_count]) print(table) print() table = BeautifulTable() table.column_headers = ["", "Size"] table.append_row(["Min", hsize(size_min)]) table.append_row(["Max", hsize(size_max)]) table.append_row(["Total", hsize(total_size)]) print(table) print() print("Total ebooks updated by language:") table = BeautifulTable() table.column_headers = ["Language", "Ebooks count"] for l, c in language_count.items(): table.append_row([l, c]) print(table) print() print("Total ebooks updated by identifiers:") table = BeautifulTable() table.column_headers = ["Identifiers", "Ebooks count"] for i, c in identifiers_count.items(): table.append_row([i, c]) print(table) print() print("Total count of ebooks by format:") table = BeautifulTable() table.column_headers = ["Formats", "Ebooks count"] for f, c in total_count_by_format.items(): table.append_row([f, c]) print(table) print() print("Total size by format:") table = BeautifulTable() table.column_headers = ["Format:", "Size"] for f, s in total_size_by_format.items(): table.append_row([f, hsize(s)]) print(table) print() print("Done !!!") def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0): # print("Accepted formats", accepted_formats) source=book['source'] # print("Formats available in source: {}".format(list(source['formats'].keys()))) my_formats=[] for f,v in source['formats'].items(): if v['status']=='todo': my_formats.append(f) # print("Formats in 'todo': {}".format(my_formats)) formats=[] if single_format: @@ -644,53 +677,37 @@ def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], singl else: formats=my_formats # print("Formats expected: {}".format(formats)) 
download_formats=formats[:] for f in formats: if not 'size' in source['formats'][f] and max_size: # print ("Format '{}' ignored for {}: Size unknown".format(f, book['uuid'])) download_formats.remove(f) else: size = source['formats'][f]['size'] if size < min_size or (max_size and size > max_size): download_formats.remove(f) # print ("Format '{}' ignored for {}: size={} but expected between {} and {}".format(f, book['uuid'], hsize(size), hsize(min_size), hsize(max_size) if max_size else "infinity")) return download_formats def update_format_statuses(book,refresh_ignored): formats=book['source']['formats'] for f, v in formats.items(): if v['status']=='ignored' and not refresh_ignored: # print ("Format '{}' ignored: {} ({}))".format(f, book['uuid'], book['title'])) pass else: # print ("Format '{}' todo: {} ({}))".format(f, book['uuid'], book['title'])) book['source']['formats'][f]['status']='todo' if __name__ == "__main__": fire.Fire({ "index_ebooks": index_ebooks, "download_ebooks": download_ebooks, "download_covers": download_covers, "set_status": set_status }) -
Krazybug revised this gist
Nov 11, 2019 . 1 changed file with 0 additions and 5 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -16,11 +16,6 @@ import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2'] identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) -
Krazybug revised this gist
Nov 11, 2019 . 1 changed file with 101 additions and 106 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -11,8 +11,15 @@ import iso639 import pickle import time from requests.adapters import HTTPAdapter import urllib.parse import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # l= book.get_items_of_type(ebooklib.ITEM_DOCUMENT) # >>> for i in l: # ... print(BeautifulSoup(i.get_content()).text) all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2'] identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) @@ -59,11 +66,18 @@ def get_file_path(path, uuid, fileformat): else: return 0 else: return 0 def get_cover(path, book, map): url=book['source']['cover'] if map: pu=urllib.parse.urlparse(url) pu=(pu[0], map, *pu[2:]) print(pu) url=urllib.parse.urlunparse(pu) print("Downloading cover from:", url) r=requests.get(url, timeout=(20, 3), verify=False) r.raise_for_status() filepath=path+'/'+book['uuid']+'/cover.jpg' @@ -74,20 +88,21 @@ def get_cover(path, book): print("Saved to:", filepath) def download_covers(dir= '.', server='', map=""): for root, dirs, files in os.walk(dir, topdown=True): for d in dirs: # print() # print("-->", d) book = load_metadata(root, d) if book: # if book['source']['status'] != "ignored": if True: if not get_cover_path(root, book['uuid']): print() print("-->", d) print(book['uuid']) try: get_cover(root, book, map) except: print ("Unable to get cover", book['uuid']) else: @@ -100,19 +115,34 @@ def download_covers(dir= '.', server=''): def get_file_size(url): print("Downloading size:", url) r = requests.head(url, verify=False) r.raise_for_status() 
size=r.headers['Content-Length'] print("Size received="+ hsize(size)) return int(size) def get_file(path, book, format, session, map, map_lib): uuid = book['uuid'] url=book['source']['formats'][format]['url'] if map: pu=urllib.parse.urlparse(url) pu=(pu[0], map, *pu[2:]) print(pu) url=urllib.parse.urlunparse(pu) if map_lib: # pu=urllib.parse.urlparse(url) # print(pu) url_s=url.split("/") # print(url_s) url_s=url_s[:-1]+[map_lib] # print('/'.join(url_s)) url='/'.join(url_s) print("Downloading ebook:", url) print("Size expected (estimation):", hsize(book['source']['formats'][format]['size'])) r = session.get(url, timeout=(25,15), verify=False) # headers = {"Range": "bytes=0-1023"} # r = requests.get(url, headers=headers) r.raise_for_status() @@ -122,6 +152,7 @@ def get_file(path, book, format, session): else: print("Fize received") filename=re.findall(r'filename="(.*)"', r.headers['Content-Disposition']) # print(filename) if len(filename): @@ -142,7 +173,7 @@ def set_status(uuid, status, dir='.'): if book['source']['status'] != status: book['source']['status'] = status save_metadata(dir, book) print("Status changed to", status+":", book['uuid'], "(", book['title'], ")") else: print("Status unchanged changed ", status+":", book['uuid']) else: @@ -204,7 +235,7 @@ def update_done_status(book): def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False): offset= 0 if not start else start-1 num=1000 server=site api=server+'ajax/' #api=server+'calibre/ajax/' @@ -234,7 +265,7 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False url=api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc' print("->", url) r=requests.get(url, verify=False) print("Ids received from:"+str(offset), "to:"+str(offset+remaining_num-1)) print() @@ -248,14 +279,18 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False for id in r.json().keys(): print() print ('--> 
range={}/{}'.format(str(range),str(total_num))) r_book=r.json()[id] uuid=r_book['uuid'] if not uuid: print ("No uuid for ebook: ignored") continue # print ('\r--> range={}/{}'.format(str(range),str(total_num)), "uuid="+uuid, "("+r.json()[id]['title']+")", end='') # print (r.json()[id]) # title= r.json()[id]['title'] if 'title' in r.json()[id] else "<untitled>" if r_book['authors']: print("uuid="+uuid, "("+r_book['title']+" -- "+r_book['authors'][0]+")") else: print("uuid="+uuid, "("+r_book['title']+")") if not force_refresh: # print("Checking local metadata:", uuid) @@ -271,26 +306,28 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False continue if not r.json()[id]['formats']: print("No format found for {}".format(r_book['uuid'])) range+=1 continue print("Analyzing for:", uuid) book={} url=api+'book/'+id book['title']=r_book['title'] book['authors']=r_book['authors'] book['series']=r_book['series'] book['series_index']=r_book['series_index'] book['edition']=0 book['uuid']=r_book['uuid'] book['identifiers']=r_book['identifiers'] book['comments']=r_book['comments'] book['pubdate']=r_book['pubdate'] book['publisher']=r_book['publisher'] languages=r_book['languages'] if not languages: # if True: # pass print ("Analyzing languages") if book['comments']: @@ -310,11 +347,11 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False for l in languages: book['languages'].append(iso639.to_iso639_2(l)) book['tags']=r_book['tags'] book['formats']=[] book['metadata_version']=0.1 source={} source['url']=url+library source['id']=id try: tmpbook = load_metadata(dir, uuid) @@ -326,22 +363,22 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False source['status']="ignored" else: source['status']="todo" source['cover']=server+r_book['cover'] source['timestamp']=r_book['timestamp'] format_sources={} formats=r.json()[id]['formats'] for f in formats: s={} url='' if f in r_book['main_format']: 
url=r_book['main_format'][f] else: url=r_book['other_formats'][f] s['url']=server+url if 'size' in r_book['format_metadata'][f]: s['size']=int(r_book['format_metadata'][f]['size']) else: print("Size not found for format '{}' : {}".format(f, uuid)) print("Trying to get size online: {}".format(s['url'])) @@ -355,6 +392,8 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False source['formats']=format_sources book['source']=source print("Analyzed:", uuid) if not source['formats']: print("No format found for {}".format(r.json()[id]['uuid'])) @@ -422,7 +461,7 @@ def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False): print ("'{}' todo: expected identifiers {}".format(book['uuid'], expected_identifiers)) return True def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, map="", map_lib=""): # all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip'] if single_format: my_formats = formats if formats else all_ordered_formats @@ -446,15 +485,17 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore identifiers_count={} s = requests.Session() for root, dirs, files in os.walk(dir, topdown=True): for counter, uuid in enumerate(dirs): book = load_metadata(root, uuid) if book: status=book['source']['status'] if status=="todo": print() print() print("-->", uuid, "("+book['title']+" -- "+book['authors'][0]+" -- serie: "+ str(book['series'])+")") if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language): continue @@ -466,32 +507,42 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore download_formats = get_formats_to_download(book, accepted_formats=my_formats, 
single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size) if not len(download_formats): print ("'{}' ignored: no more format available in formats expected {}".format(uuid, download_formats)) print() else: ebook_kept=False for f in download_formats: url = source['formats'][f]['url'] # if map: # pu=urllib.parse.urlparse(url) # pu=(pu[0], map, *pu[2:]) # print(pu) # print(urllib.parse.urlunparse(pu)) if url: # # It shouldn't occur: Need to download again if get_file_path(dir, uuid, f): print ("Format '{}' already present for {}: Retrying".format(f, uuid)) print() # continue print("Format '{}': size expected={}".format(f, hsize(source['formats'][f]['size']))) if not dry_run: try: get_file(dir, book, f, s, map, map_lib) book['formats'].append(f) book['source']['formats'][f]['status']="done" time.sleep(0) # except: except Exception as msg: print("Unable to get book:", url) print(msg) time.sleep(5) continue save_metadata(dir, book) ebook_kept=True size=source['formats'][f]['size'] total_size += size size_max = size if size>size_max else size_max if not size_min: size_min = size @@ -508,6 +559,7 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore total_format_count +=1 else: print ("Format '{}' ignored for {} ({}): No url)".format(f, uuid, book['title'])) print() if ebook_kept: total_ebook_count+=1 if not book['languages']: @@ -538,11 +590,15 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore if book['source']['status']=="done": save_metadata(dir, book) print("Book done:", book['uuid']) print() # total_ebook_count+=1 else: # print() # print("-->", uuid, "("+book['title']+")") # print ('{} in status "{}": skipped'.format(book['uuid'], status)) # print(f"--> {uuid} ({book['title']}) in status {status}: skipped", end="\r") # print(f"--> {uuid} ({book['title']})", end="\r") print(f'--> {counter} books handled', end="\r") print() print("Total count of updated ebooks:", 
total_ebook_count) @@ -617,66 +673,6 @@ def update_format_statuses(book,refresh_ignored): print ("Format '{}' todo: {} ({}))".format(f, book['uuid'], book['title'])) book['source']['formats'][f]['status']='todo' def reset_ignored(dir= '.', server=''): for root, dirs, files in os.walk(dir, topdown=True): @@ -701,6 +697,5 @@ def reset_ignored(dir= '.', server=''): save_metadata(dir, book) if __name__ == "__main__": fire.Fire() -
Krazybug revised this gist
Mar 16, 2019 . 1 changed file with 31 additions and 127 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -10,6 +10,7 @@ from langid.langid import LanguageIdentifier, model import iso639 import pickle import time @@ -18,7 +19,7 @@ def load_metadata(path, uuid): filepath=path+'/'+uuid+'/metadata.json' # print (filepath) if os.path.isfile(filepath): try: with open(filepath, 'r') as fd: @@ -62,7 +63,7 @@ def get_cover(path, book): url=book['source']['cover'] print("Downloading cover from:", url) r=requests.get(url, timeout=10) r.raise_for_status() filepath=path+'/'+book['uuid']+'/cover.jpg' @@ -76,125 +77,26 @@ def get_cover(path, book): def download_covers(dir= '.', server=''): for root, dirs, files in os.walk(dir, topdown=True): for d in dirs: # print() # print("-->", d) book = load_metadata(root, d) if book: if book['source']['status'] != "ignored": if not get_cover_path(root, book['uuid']): print() print("-->", d) print(book['uuid']) try: get_cover(root, book) except: print ("Unable to get cover", book['uuid']) else: pass # print ("Cover already present:", book['uuid']) else: print ('book {} in status {}: ignored'.format(book['uuid'], book['source']['status'])) else: print ("No ebook metadata found in:", root) def get_file_size(url): print("Downloading size:", url) @@ -204,13 +106,13 @@ def get_file_size(url): print("Size received="+ hsize(size)) return int(size) def get_file(path, book, format, session): uuid = book['uuid'] url=book['source']['formats'][format]['url'] print("Downloading ebook:", url) print("Size expected (estimation):", hsize(book['source']['formats'][format]['size'])) r = session.get(url, timeout=5) # headers = {"Range": "bytes=0-1023"} # r = requests.get(url, headers=headers) r.raise_for_status() @@ -314,7 +216,7 @@ def 
index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False print("Getting ebooks count:", server) print(url) try: r = requests.get(url,verify=False) r.raise_for_status() except: print("Unable to open site:", url) @@ -340,7 +242,7 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False books_s=",".join(str(i) for i in r.json()['book_ids']) url=api+'books'+library+'?ids='+books_s print("->", url) r=requests.get(url, verify=False) print(len(r.json()), "received") for id in r.json().keys(): @@ -389,20 +291,20 @@ def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False book['publisher']=r.json()[id]['publisher'] languages=r.json()[id]['languages'] if not languages: # pass print ("Analyzing languages") if book['comments']: text=book['comments'] else: text=book['title'] s_language, prob=identifier.classify(text) print (s_language, prob) if prob >= 0.85: language = iso639.to_iso639_2(s_language) print("language=", language) book['languages']=[language] else: book['languages']=[] else: book['languages']=[] for l in languages: @@ -543,6 +445,8 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore language_count={} identifiers_count={} s = requests.Session() for root, dirs, files in os.walk(dir, topdown=True): for uuid in dirs: book = load_metadata(root, uuid) @@ -575,8 +479,9 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore if not dry_run: try: get_file(dir, book, f, s) book['formats'].append(f) time.sleep(0.5) # except: except Exception as msg: print("Unable to get book:", url) @@ -663,7 +568,6 @@ def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignore def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0): print("Accepted formats", accepted_formats) source=book['source'] print("Formats available in source: {}".format(list(source['formats'].keys()))) 
my_formats=[] -
Krazybug revised this gist
Mar 13, 2019 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -763,7 +763,7 @@ def filter_ebooks(dir= '.', server='', formats=[], single_format=False, ignored_ print ("{} format ignored: '{}'".format(uuid, f)) total_format_count +=1 save_ebook=True if save_ebook: save_metadata(dir, book) else: print() -
Krazybug renamed this gist
Mar 11, 2019 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
Krazybug created this gist
Mar 11, 2019 .There are no files selected for viewing
"""Index, filter and download ebooks exposed by a Calibre server's ``ajax/`` API.

Every book is stored locally as ``<dir>/<uuid>/metadata.json`` next to its
downloaded files. The public functions of this module are exposed as
command-line sub-commands through ``fire.Fire()``.
"""

import sys
import os
import time
import re
import shutil
import requests
import json
import fire
from humanize import naturalsize as hsize
from langid.langid import LanguageIdentifier, model
import iso639
import pickle

# Preference order used when a single format per book is requested.
all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']

identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Seconds to wait on a silent server before giving up; without a timeout
# `requests` blocks forever on a stalled connection.
_HTTP_TIMEOUT = 30


def load_metadata(path, uuid):
    """Return the metadata dict stored for *uuid* under *path*, or 0 if unavailable."""
    filepath = path + '/' + uuid + '/metadata.json'
    print(filepath)
    if not os.path.isfile(filepath):
        print("Metadata not found for:", uuid, "from path:", path)
        return 0
    try:
        with open(filepath, 'r') as fd:
            return json.load(fd)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError (corrupted file).
        print("Error loading metadata for:", uuid, "from path:", path)
        return 0


def save_metadata(path, book):
    """Write *book* to ``<path>/<uuid>/metadata.json`` through a temporary file."""
    filepath = path + '/' + book['uuid'] + '/metadata.json'
    tmp_path = filepath + ".tmp"
    print("Saving book metadata for:", book['uuid'], "to:", filepath)
    os.makedirs(os.path.dirname(tmp_path), exist_ok=True)
    with open(tmp_path, 'w') as fd:
        json.dump(book, fd, indent=4, separators=(',', ': '))
    try:
        # Write-then-rename so an interrupted run never leaves a truncated file.
        shutil.move(tmp_path, filepath)
        print("Saved to:", filepath)
    except OSError:
        print("Unable to rename .tmp file:", tmp_path)


def get_cover_path(path, uuid):
    """Return the path of the cover stored for *uuid*, or 0 when there is none."""
    filepath = path + '/' + uuid + '/cover.jpg'
    return filepath if os.path.isfile(filepath) else 0


def get_file_path(path, uuid, fileformat):
    """Return the first file of *uuid* whose extension is *fileformat*, or 0."""
    book_dir = path + '/' + uuid
    wanted_ext = '.' + fileformat
    for name in os.listdir(book_dir):
        if os.path.splitext(name)[1] == wanted_ext:
            return book_dir + '/' + name
    return 0


def get_cover(path, book):
    """Download the cover of *book* to ``<path>/<uuid>/cover.jpg``.

    Raises:
        requests.RequestException: on network failure or HTTP error status.
    """
    url = book['source']['cover']
    print("Downloading cover from:", url)
    r = requests.get(url, timeout=_HTTP_TIMEOUT)
    r.raise_for_status()

    filepath = path + '/' + book['uuid'] + '/cover.jpg'
    tmp_path = filepath + ".tmp"
    os.makedirs(os.path.dirname(tmp_path), exist_ok=True)
    with open(tmp_path, 'wb') as fd:
        fd.write(r.content)
    shutil.move(tmp_path, filepath)
    print("Saved to:", filepath)


def download_covers(dir= '.', server=''):
    """Download the missing cover of every non-ignored book found under *dir*."""
    for root, dirs, files in os.walk(dir, topdown=True):
        for d in dirs:
            print()
            print("-->", d)
            book = load_metadata(root, d)
            if not book:
                print("No ebook metadata found in:", root)
                continue
            if book['source']['status'] == "ignored":
                print('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
                continue
            if get_cover_path(root, book['uuid']):
                print("Cover already present:", book['uuid'])
                continue
            print(book['uuid'])
            try:
                get_cover(root, book)
            except Exception:
                # Best effort: one failing cover must not stop the whole run.
                print("Unable to get cover", book['uuid'])


def _update_local_index(dir, track_uuids):
    """Merge every 'todo' book under *dir* into the pickled ``<dir>/.index``.

    The index maps titles, authors and identifiers to lists of book uuids;
    with *track_uuids* it also keeps the flat list of indexed uuids.
    """
    filepath = dir + '/.index'
    if os.path.isfile(filepath):
        # NOTE: pickle can execute arbitrary code on load; only use an index
        # file this script created itself.
        with open(filepath, 'rb') as fd:
            index = pickle.load(fd)
    else:
        index = {}
    # Both indexing commands share the same file: make sure every section
    # exists whichever command created it.
    if track_uuids:
        index.setdefault('uuids', [])
    for section in ('identifiers', 'authors', 'titles'):
        index.setdefault(section, {})

    for root, dirs, files in os.walk(dir, topdown=True):
        for d in dirs:
            print()
            print("-->", d)
            book = load_metadata(root, d)
            if not book:
                print("No ebook metadata found in:", root)
                continue
            if book['source']['status'] != "todo":
                print('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
                continue

            uuid = book['uuid']
            print(uuid)
            targets = []
            if track_uuids:
                targets.append(index['uuids'])
            targets.append(index['titles'].setdefault(book['title'], []))
            for a in book["authors"]:
                targets.append(index['authors'].setdefault(a, []))
            for k, i in book["identifiers"].items():
                targets.append(index['identifiers'].setdefault(k, {}).setdefault(i, []))
            for uuids in targets:
                if uuid not in uuids:
                    uuids.append(uuid)

    print("titles indexed:", len(index['titles']))
    print("authors indexed:", len(index['authors']))
    print("identifiers indexed:", len(index['identifiers']))
    with open(filepath, 'wb') as fd:
        pickle.dump(index, fd)


def index_ebooks1(dir= '.', server=''):
    """Build/refresh ``<dir>/.index`` (uuids, titles, authors, identifiers) from 'todo' books."""
    _update_local_index(dir, track_uuids=True)


def index_calibre_local(dir= '.', calibre_dir=''):
    """Build/refresh ``<dir>/.index`` (titles, authors, identifiers) from 'todo' books.

    *calibre_dir* is accepted but not used.
    """
    _update_local_index(dir, track_uuids=False)


def get_file_size(url):
    """Return the ``Content-Length`` announced by a HEAD request on *url*.

    Raises:
        requests.RequestException: on network failure or HTTP error status.
        KeyError: when the server sends no ``Content-Length`` header.
    """
    print("Downloading size:", url)
    r = requests.head(url, timeout=_HTTP_TIMEOUT)
    r.raise_for_status()
    size = r.headers['Content-Length']
    print("Size received="+ hsize(size))
    return int(size)


def get_file(path, book, format):
    """Download the *format* file of *book* into ``<path>/<uuid>/``.

    The file name comes from the ``Content-Disposition`` header when the
    server provides one, otherwise it is ``<uuid>.<format>``.

    Raises:
        requests.RequestException: on network failure or HTTP error status.
    """
    uuid = book['uuid']
    source_format = book['source']['formats'][format]
    url = source_format['url']
    print("Downloading ebook:", url)
    print("Size expected (estimation):", hsize(source_format['size']))
    r = requests.get(url, timeout=_HTTP_TIMEOUT)
    r.raise_for_status()
    if 'Content-Length' in r.headers:
        print("Size received="+hsize(r.headers['Content-Length']))
    else:
        print("Fize received")

    # The header is optional and comes from an untrusted server: tolerate its
    # absence and drop any directory component so the file stays in the book dir.
    found = re.findall(r'filename="(.*)"', r.headers.get('Content-Disposition', ''))
    filename = os.path.basename(found[0]) if found else ''
    if not filename:
        filename = uuid + "." + format
    filepath = path + '/' + uuid + '/' + filename

    tmp_path = filepath + ".tmp"
    os.makedirs(os.path.dirname(tmp_path), exist_ok=True)
    with open(tmp_path, 'wb') as fd:
        fd.write(r.content)
    shutil.move(tmp_path, filepath)
    print("Saved to:", filepath)


def set_status(uuid, status, dir='.'):
    """Set the source status of book *uuid* to *status* and save it if it changed."""
    book = load_metadata(dir, uuid)
    if not book:
        print("No ebook metadata found for:", uuid)
        return
    if book['source']['status'] != status:
        book['source']['status'] = status
        save_metadata(dir, book)
        print("Status changed to", status+":", book['uuid'])
    else:
        print("Status unchanged changed ", status+":", book['uuid'])


def remove_book(uuid, path='.'):
    """Delete the whole directory of book *uuid* under *path*."""
    print(os.getcwd())
    bookpath = path + '/' + uuid
    if not os.path.isdir(bookpath):
        print(uuid, "not found")
        return
    try:
        shutil.rmtree(bookpath)
        print(uuid, "removed")
    except OSError as err:
        print("problem")
        print(err)


def explore(site, help=False):
    """List the libraries served by *site* with their ebook counts.

    *site* is used as a URL prefix: ``ajax/`` is appended to it directly.
    Exits with status 1 when the library list cannot be fetched.
    """
    server = site
    api = server + 'ajax/'
    print("Server:", server)
    url = api + 'library-info'
    print()
    print("Getting libraries:", server)
    print(url)
    try:
        r = requests.get(url, timeout=_HTTP_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException:
        print("Unable to open site:", url)
        sys.exit(1)

    libraries = r.json()["library_map"].keys()
    print("Libraries:")
    for l in libraries:
        url = api + 'search' + '/' + l + '?num=0'
        try:
            r = requests.get(url, timeout=_HTTP_TIMEOUT)
            r.raise_for_status()
        except requests.RequestException:
            print("Unable to open site:", url)
            continue
        print("\t{}: {} ebooks".format(l, r.json()["total_num"]))


def update_done_status(book):
    """Mark a non-ignored *book* 'done' when every source format is downloaded, else 'todo'."""
    source = book['source']
    if source['status'] == 'ignored':
        return
    if set(source['formats']) <= set(book['formats']):
        source['status'] = "done"
    else:
        source['status'] = "todo"


def _build_format_sources(entry, server, uuid):
    """Return ``{format: {'url', 'size', 'status'}}`` for each usable format of *entry*.

    A format whose size is neither in the metadata nor obtainable online is skipped.
    """
    format_sources = {}
    for f in entry['formats']:
        if f in entry['main_format']:
            url = entry['main_format'][f]
        else:
            url = entry['other_formats'][f]
        s = {'url': server + url}
        if 'size' in entry['format_metadata'][f]:
            s['size'] = int(entry['format_metadata'][f]['size'])
        else:
            print("Size not found for format '{}' : {}".format(f, uuid))
            print("Trying to get size online: {}".format(s['url']))
            try:
                s['size'] = get_file_size(s['url'])
            except (requests.RequestException, KeyError, ValueError):
                print("Unable to access format '{}' : {} skipped".format(f, uuid))
                continue
        s['status'] = 'todo'
        format_sources[f] = s
    return format_sources


def _index_book(entry, book_id, api, server, dir, force_refresh):
    """Create and save the local metadata of one server book described by *entry*."""
    uuid = entry['uuid']
    if not uuid:
        print("No uuid for ebook: ignored")
        return
    print("uuid="+uuid, "("+entry['title']+")")

    try:
        existing = load_metadata(dir, uuid)
    except Exception:
        print("Unable to get metadata from:", uuid)
        return
    if existing and not force_refresh:
        print("Metadata already present for:", uuid)
        return
    if not entry['formats']:
        print("No format found for {}".format(entry['uuid']))
        return

    book = {}
    book['title'] = entry['title']
    book['authors'] = entry['authors']
    book['series'] = entry['series']
    book['series_index'] = entry['series_index']
    book['edition'] = 0
    book['uuid'] = entry['uuid']
    book['identifiers'] = entry['identifiers']
    book['comments'] = entry['comments']
    book['pubdate'] = entry['pubdate']
    book['publisher'] = entry['publisher']
    # Always present, possibly empty, so later filters can rely on the key.
    book['languages'] = [iso639.to_iso639_2(l) for l in entry['languages'] or []]
    book['tags'] = entry['tags']
    book['formats'] = []
    book['metadata_version'] = 0.1

    source = {}
    source['url'] = api + 'book/' + book_id
    source['id'] = book_id
    # A refresh must not resurrect a book the user chose to ignore.
    if existing and existing['source']['status'] == "ignored":
        source['status'] = "ignored"
    else:
        source['status'] = "todo"
    source['cover'] = server + entry['cover']
    source['timestamp'] = entry['timestamp']
    source['formats'] = _build_format_sources(entry, server, uuid)
    book['source'] = source

    if not source['formats']:
        print("No format found for {}".format(entry['uuid']))
        return

    update_done_status(book)
    print("Saving metadata for:", uuid)
    try:
        save_metadata(dir, book)
    except Exception:
        print("Unable to save book metadata", book['uuid'])


def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False):
    """Fetch book metadata from *site* and store it under *dir*.

    Args:
        site: URL prefix of the server; ``ajax/`` is appended to it directly.
        library: library id to index; empty for the server default.
        start: 1-based rank of the first book to index (0 = from the beginning).
        stop: rank of the last book to index (0 = up to the server total).
        dir: local directory receiving one ``<uuid>/metadata.json`` per book.
        force_refresh: rebuild metadata even when it already exists locally.

    Exits with status 1 when the ebook count cannot be fetched.
    """
    offset = 0 if not start else start-1
    num = 500  # books requested per batch
    server = site
    api = server + 'ajax/'
    library = '/' + library if library else library

    print("Server:", server)
    url = api + 'search' + library + '?num=0'
    print()
    print("Getting ebooks count:", server)
    print(url)
    try:
        r = requests.get(url, timeout=_HTTP_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException:
        print("Unable to open site:", url)
        sys.exit(1)

    reported_total = r.json()["total_num"]
    print("Total count=", reported_total)
    total_num = stop if stop else int(reported_total)

    position = offset + 1  # rank of the current book, for progress display only
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        print()
        print("Downloading ids: offset="+str(offset), "num="+str(remaining_num))
        url = api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'
        print("->", url)
        r = requests.get(url, timeout=_HTTP_TIMEOUT)
        print("Ids received from:"+str(offset), "to:"+str(offset+remaining_num-1))

        print()
        print("Downloading metadata from", str(offset+1), "to", str(offset+remaining_num))
        books_s = ",".join(str(i) for i in r.json()['book_ids'])
        url = api+'books'+library+'?ids='+books_s
        print("->", url)
        r = requests.get(url, timeout=_HTTP_TIMEOUT)
        entries = r.json()  # parsed once per batch
        print(len(entries), "received")

        for book_id, entry in entries.items():
            print()
            print('--> range={}/{}'.format(str(position), str(total_num)))
            _index_book(entry, book_id, api, server, dir, force_refresh)
            position += 1
        offset = offset + num


def has_languages(book, languages=[], ignore_empty_language=False):
    """Tell whether *book* passes the language filter.

    A book without language passes unless *ignore_empty_language* is set;
    otherwise it passes when *languages* is empty or shares a language with it.
    Adds an empty ``'languages'`` list to *book* when the key is missing.
    """
    print("Accepted languages", languages)
    if not ignore_empty_language:
        print("Unknown language accepted")

    # Metadata written by older versions may lack the key.
    if 'languages' not in book:
        book['languages'] = []
    print("Book languages", book['languages'])

    if not book['languages']:
        if ignore_empty_language:
            print("'{}' ignored: language is empty".format(book['uuid']))
            return False
        print("'{}' todo: language is empty".format(book['uuid']))
        return True

    expected_languages = list(set(book['languages']) & set(languages))
    if languages and not expected_languages:
        print("'{}' ignored: language {} not in {}".format(book['uuid'], book['languages'], languages))
        return False
    print("'{}' todo: expected languages {}".format(book['uuid'], expected_languages))
    return True


def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False):
    """Tell whether *book* passes the identifier filter.

    A book without identifier passes unless *ignore_empty_identifiers* is set;
    otherwise it passes when *identifiers* is empty or names one of its
    identifier types.
    """
    print("Accepted identifiers", identifiers)
    if not ignore_empty_identifiers:
        print("Unknown identifiers accepted")
    print("Book identifiers", book['identifiers'].keys())

    if not book['identifiers']:
        if ignore_empty_identifiers:
            print("'{}' ignored: identifier is empty".format(book['uuid']))
            return False
        print("'{}' todo: identifiers is empty".format(book['uuid']))
        return True

    expected_identifiers = list(set(book['identifiers'].keys()) & set(identifiers))
    if identifiers and not expected_identifiers:
        print("'{}' ignored: identifiers {} not in {}".format(book['uuid'], book['identifiers'].keys(), identifiers))
        return False
    print("'{}' todo: expected identifiers {}".format(book['uuid'], expected_identifiers))
    return True


def _resolve_filters(formats, single_format, min_size, max_size):
    """Return ``(accepted_formats, min_size, max_size)`` with sizes converted from MiB to bytes."""
    if single_format:
        my_formats = formats if formats else all_ordered_formats
    else:
        my_formats = formats
    print("formats=", my_formats)

    min_size = int(min_size)*1024*1024
    max_size = int(max_size)*1024*1024
    print("Format expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))
    return my_formats, min_size, max_size


def _count(counters, key, amount=1):
    """Add *amount* to ``counters[key]``, creating the entry when needed."""
    counters[key] = counters.get(key, 0) + amount


def download_ebooks(dir= '.', server='', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False):
    """Download the files of every 'todo' book under *dir* that passes the filters.

    Args:
        dir: local directory holding the indexed books.
        server: accepted but not used.
        formats: accepted formats; with *single_format*, also the preference order.
        single_format: download only the first available accepted format.
        ignored_formats: formats to skip, used when *formats* is empty.
        languages, identifiers: accepted languages / identifier types.
        min_size, max_size: accepted file size range in MiB (0 = no bound).
        ignore_empty_language, ignore_empty_identifiers: reject books lacking
            the corresponding information.
        dry_run: report what would be downloaded without downloading.
    """
    my_formats, min_size, max_size = _resolve_filters(formats, single_format, min_size, max_size)

    total_size = 0
    total_size_by_format = {}
    total_ebook_count = 0
    total_format_count = 0
    total_count_by_format = {}
    size_max = 0
    size_min = 0
    language_count = {}
    identifiers_count = {}

    for root, dirs, files in os.walk(dir, topdown=True):
        for uuid in dirs:
            book = load_metadata(root, uuid)
            if not book:
                continue
            status = book['source']['status']
            if status != "todo":
                print()
                print("-->", uuid, "("+book['title']+")")
                print('{} in status "{}": skipped'.format(book['uuid'], status))
                continue

            print()
            print("-->", uuid, "("+book['title']+")")
            if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
                continue
            if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
                continue

            source = book['source']
            download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
            if not download_formats:
                print("'{}' ignored: no more format available in formats expected {}".format(uuid, download_formats))
                continue

            ebook_kept = False
            for f in download_formats:
                url = source['formats'][f]['url']
                if not url:
                    print("Format '{}' ignored for {} ({}): No url)".format(f, uuid, book['title']))
                    continue
                if get_file_path(dir, uuid, f):
                    print("Format '{}' already present for {}: Skipped".format(f, uuid))
                    continue

                size = source['formats'][f]['size']
                print("Format '{}': size expected={}".format(f, hsize(size)))
                if not dry_run:
                    try:
                        get_file(dir, book, f)
                    except Exception as msg:
                        # One failing download must not abort the whole run.
                        print("Unable to get book:", url)
                        print(msg)
                        continue
                    book['formats'].append(f)
                    save_metadata(dir, book)

                ebook_kept = True
                total_size += size
                size_max = max(size_max, size)
                # 0 means "no file seen yet".
                size_min = size if not size_min else min(size_min, size)
                _count(total_size_by_format, f, size)
                _count(total_count_by_format, f)
                total_format_count += 1

            if ebook_kept:
                total_ebook_count += 1
                for l in book['languages'] or ['<unknown>']:
                    _count(language_count, l)
                for l in book['identifiers'].keys() or ['<unknown>']:
                    _count(identifiers_count, l)

            if not dry_run:
                update_done_status(book)
                if book['source']['status'] == "done":
                    save_metadata(dir, book)
                    print("Book done:", book['uuid'])

    print()
    print("Total count of updated ebooks:", total_ebook_count)
    print("Total ebooks updated by language:")
    for l, c in language_count.items():
        print(" '{}': {}".format(l, c))
    print("Total ebooks updated by identifiers:")
    for l, c in identifiers_count.items():
        print(" '{}': {}".format(l, c))
    print("Total count of formats:", total_format_count)
    print("Total count of ebooks by format:")
    for f, c in total_count_by_format.items():
        print("\t'{}': {}".format(f, c))
    print()
    print("Total size:", hsize(total_size))
    print("Maximum file size:", hsize(size_max))
    print("Minimum file size:", hsize(size_min))
    print("Total size by format:")
    for f, s in total_size_by_format.items():
        print("\t'{}': {}".format(f, hsize(s)))


def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0):
    """Return the 'todo' formats of *book* that pass the format and size filters.

    With *single_format*, at most one format is returned: the first entry of
    *accepted_formats* that is still to do. Sizes are in bytes; a *max_size*
    of 0 means no upper bound.
    """
    print("Accepted formats", accepted_formats)
    source = book['source']
    print("Formats available in source: {}".format(list(source['formats'].keys())))
    my_formats = [f for f, v in source['formats'].items() if v['status'] == 'todo']
    print("Formats in 'todo': {}".format(my_formats))

    formats = []
    if single_format:
        if accepted_formats:
            for f in accepted_formats:
                if f in my_formats:
                    formats = [f]
                    break
        else:
            print("need at least 1 format for ordering")
    elif accepted_formats:
        # List filtering (not set arithmetic) keeps a deterministic order.
        formats = [f for f in my_formats if f in accepted_formats]
    elif ignored_formats:
        formats = [f for f in my_formats if f not in ignored_formats]
    else:
        formats = my_formats
    print("Formats expected: {}".format(formats))

    download_formats = []
    for f in formats:
        size = source['formats'][f].get('size')
        if size is None:
            # An unknown size cannot be checked against an upper bound.
            if max_size:
                print("Format '{}' ignored for {}: Size unknown".format(f, book['uuid']))
                continue
        elif size < min_size or (max_size and size > max_size):
            print("Format '{}' ignored for {}: size={} but expected between {} and {}".format(f, book['uuid'], hsize(size), hsize(min_size), hsize(max_size) if max_size else "infinity"))
            continue
        download_formats.append(f)
    return download_formats


def update_format_statuses(book,refresh_ignored):
    """Reset each format of *book* to 'todo', keeping 'ignored' ones unless *refresh_ignored*."""
    for f, v in book['source']['formats'].items():
        if v['status'] == 'ignored' and not refresh_ignored:
            print("Format '{}' ignored: {} ({}))".format(f, book['uuid'], book['title']))
        else:
            print("Format '{}' todo: {} ({}))".format(f, book['uuid'], book['title']))
            v['status'] = 'todo'


def filter_ebooks(dir= '.', server='', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False):
    """Mark as 'ignored' the 'todo' books and formats under *dir* rejected by the filters.

    The filter arguments have the same meaning as in ``download_ebooks``.
    """
    my_formats, min_size, max_size = _resolve_filters(formats, single_format, min_size, max_size)

    total_ebook_count = 0
    total_format_count = 0

    for root, dirs, files in os.walk(dir, topdown=True):
        for uuid in dirs:
            book = load_metadata(root, uuid)
            if not book:
                continue
            status = book['source']['status']
            if status != "todo":
                print()
                print("-->", uuid, "("+book['title']+")")
                print('{} in status "{}": skipped'.format(book['uuid'], status))
                continue

            print()
            print("-->", uuid, "("+book['title']+")")

            if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
                book['source']['status'] = 'ignored'
                print("{} ignored: languages filtered".format(uuid))
                save_metadata(dir, book)
                total_ebook_count += 1
                continue

            if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
                book['source']['status'] = 'ignored'
                print("{} ignored: identifiers filtered".format(uuid))
                save_metadata(dir, book)
                total_ebook_count += 1
                continue

            download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
            source = book['source']
            # Everything neither downloaded already nor wanted becomes 'ignored'.
            formats_to_ignore = sorted(set(source['formats'].keys()) - set(book['formats']) - set(download_formats))
            print("formats to ignore:", formats_to_ignore)
            save_ebook = False
            for f in formats_to_ignore:
                if source['formats'][f]['status'] != 'ignored':
                    source['formats'][f]['status'] = 'ignored'
                    print("{} format ignored: '{}'".format(uuid, f))
                    total_format_count += 1
                    save_ebook = True
            if save_ebook:
                save_metadata(dir, book)

    print()
    print("Total count of newly ignored ebooks:", total_ebook_count)
    print("Total count of newly formats to ignore:", total_format_count)


def reset_ignored(dir= '.', server=''):
    """Set every 'ignored' book and format found under *dir* back to 'todo'."""
    for root, dirs, files in os.walk(dir, topdown=True):
        for uuid in dirs:
            book = load_metadata(root, uuid)
            if not book:
                continue
            save_ebook = False
            if book['source']['status'] == "ignored":
                print("'{}' status 'ignored' reset to 'todo'".format(book['uuid']))
                book['source']['status'] = 'todo'
                save_ebook = True
            for f, v in book['source']['formats'].items():
                if v['status'] == 'ignored':
                    print("'{}' format 'ignored' reset to 'todo'".format(book['uuid']))
                    v['status'] = 'todo'
                    save_ebook = True
            if save_ebook:
                save_metadata(dir, book)


if __name__ == "__main__":
    fire.Fire()