"""Mirror ebook metadata, covers and files from a remote ebook server.

The server is queried through its ``ajax/`` JSON endpoints (``library-info``,
``search`` and ``books``) -- presumably a Calibre content server; confirm
against the deployment.  Every book is stored locally in ``<dir>/<uuid>/``
next to a ``metadata.json`` file describing the book and its remote source.

The public functions are exposed as command-line sub-commands by ``fire``.
"""

import json
import os
import pickle
import re
import shutil
import sys
import time

import fire
import iso639
import requests
from humanize import naturalsize as hsize
from langid.langid import LanguageIdentifier, model

# Preference order used when a single format has to be picked and the caller
# did not give an explicit list.
all_ordered_formats = ['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx',
                       'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub',
                       'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']

# NOTE: only the (currently disabled) language detection of index_ebooks()
# would use this classifier; it is kept as a module-level name for
# compatibility.
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)


def _as_list(value):
    """Return *value* as a list.

    ``None`` and the empty string become ``[]``; a single string becomes a
    one-element list (a lone command-line value would otherwise be iterated
    character by character); any other iterable is copied into a list.
    """
    if value is None:
        return []
    if isinstance(value, str):
        return [value] if value else []
    return list(value)


def _iter_book_dirs(dir):
    """Yield ``(root, name)`` for every directory found below *dir*."""
    for root, dirs, _files in os.walk(dir, topdown=True):
        for name in dirs:
            yield root, name


def _count_keys(counter, keys):
    """Add one to *counter* for each key, or to the ``''`` entry if none."""
    for key in (keys or ['']):
        counter[key] = counter.get(key, 0) + 1


def load_metadata(path, uuid):
    """Load ``<path>/<uuid>/metadata.json``.

    Returns the decoded JSON document, or ``0`` when the file is missing or
    cannot be read/decoded (callers only test the result for truthiness).
    """
    filepath = path + '/' + uuid + '/metadata.json'
    print(filepath)
    if not os.path.isfile(filepath):
        print("Metadata not found for:", uuid, "from path:", path)
        return 0
    try:
        with open(filepath, 'r') as fd:
            return json.load(fd)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print("Error loading metadata for:", uuid, "from path:", path)
        return 0


def save_metadata(path, book):
    """Write *book* to ``<path>/<uuid>/metadata.json``.

    The document is first written to a ``.tmp`` file and then moved over the
    final name so that an interrupted write does not leave a truncated file.
    """
    filepath = path + '/' + book['uuid'] + '/metadata.json'
    print("Saving book metadata for:", book['uuid'], "to:", filepath)
    os.makedirs(os.path.dirname(filepath + ".tmp"), exist_ok=True)
    with open(filepath + ".tmp", 'w') as fd:
        json.dump(book, fd, indent=4, separators=(',', ': '))
    try:
        shutil.move(filepath + ".tmp", filepath)
        print("Saved to:", filepath)
    except OSError:
        print("Unable to rename .tmp file:", filepath + ".tmp")


def get_cover_path(path, uuid):
    """Return the path of the local cover of *uuid*, or ``0`` if absent."""
    filepath = path + '/' + uuid + '/cover.jpg'
    if os.path.isfile(filepath):
        return filepath
    return 0


def get_file_path(path, uuid, fileformat):
    """Return the first file of ``<path>/<uuid>`` with extension *fileformat*.

    Returns ``0`` when no file of the directory has that extension.
    """
    wanted_ext = '.' + fileformat
    # Every file is examined before giving up, so a non-matching first entry
    # (e.g. metadata.json) does not hide an ebook that is already present.
    for name in os.listdir(path + '/' + uuid):
        _stem, ext = os.path.splitext(name)
        if ext == wanted_ext:
            return path + '/' + uuid + '/' + name
    return 0


def get_cover(path, book):
    """Download the cover of *book* to ``<path>/<uuid>/cover.jpg``.

    Raises ``requests`` exceptions on network/HTTP errors and ``OSError`` on
    file-system errors.
    """
    url = book['source']['cover']
    print("Downloading cover from:", url)
    r = requests.get(url)
    r.raise_for_status()

    filepath = path + '/' + book['uuid'] + '/cover.jpg'
    os.makedirs(os.path.dirname(filepath + ".tmp"), exist_ok=True)
    with open(filepath + ".tmp", 'wb') as fd:
        fd.write(r.content)
    shutil.move(filepath + ".tmp", filepath)
    print("Saved to:", filepath)


def download_covers(dir='.', server=''):
    """Download the missing covers of every non-ignored book below *dir*.

    *server* is accepted but unused.
    """
    for root, d in _iter_book_dirs(dir):
        print()
        print("-->", d)
        book = load_metadata(root, d)
        if not book:
            print("No ebook metadata found in:", root)
            continue
        if book['source']['status'] == "ignored":
            print('book {} in status {}: ignored'.format(
                book['uuid'], book['source']['status']))
            continue
        if get_cover_path(root, book['uuid']):
            print("Cover already present:", book['uuid'])
            continue
        print(book['uuid'])
        try:
            get_cover(root, book)
        except Exception:
            # Best effort: one failing cover must not stop the whole run.
            print("Unable to get cover", book['uuid'])


def _add_unique(bucket, key, uuid):
    """Append *uuid* to the list stored under *key*, avoiding duplicates."""
    entries = bucket.setdefault(key, [])
    if uuid not in entries:
        entries.append(uuid)


def _update_index(dir, track_uuids):
    """Create or update the pickled index ``<dir>/.index``.

    Books in status "todo" are indexed by title, author and identifier; when
    *track_uuids* is true their uuid is also recorded in ``index['uuids']``.
    """
    filepath = dir + '/.index'
    if os.path.isfile(filepath):
        # NOTE: pickle must only be used on trusted files; this one is
        # expected to have been written by this very tool.
        with open(filepath, 'rb') as fd:
            index = pickle.load(fd)
    else:
        index = {}
    # setdefault() also completes an index created by the other command.
    if track_uuids:
        index.setdefault('uuids', [])
    for section in ('identifiers', 'authors', 'titles'):
        index.setdefault(section, {})

    for root, d in _iter_book_dirs(dir):
        print()
        print("-->", d)
        book = load_metadata(root, d)
        if not book:
            print("No ebook metadata found in:", root)
            continue
        if book['source']['status'] != "todo":
            print('book {} in status {}: ignored'.format(
                book['uuid'], book['source']['status']))
            continue

        uuid = book['uuid']
        print(uuid)
        if track_uuids and uuid not in index['uuids']:
            index['uuids'].append(uuid)
        _add_unique(index['titles'], book['title'], uuid)
        for author in book["authors"]:
            _add_unique(index['authors'], author, uuid)
        for kind, value in book["identifiers"].items():
            _add_unique(index['identifiers'].setdefault(kind, {}), value, uuid)

    print("titles indexed:", len(index['titles']))
    print("authors indexed:", len(index['authors']))
    print("identifiers indexed:", len(index['identifiers']))
    with open(filepath, 'wb') as fd:
        pickle.dump(index, fd)


def index_ebooks1(dir='.', server=''):
    """Index the "todo" books below *dir* into ``<dir>/.index``.

    Records uuids, titles, authors and identifiers.  *server* is unused.
    """
    _update_index(dir, track_uuids=True)


def index_calibre_local(dir='.', calibre_dir=''):
    """Index the "todo" books below *dir* into ``<dir>/.index``.

    Records titles, authors and identifiers.  *calibre_dir* is unused.
    """
    _update_index(dir, track_uuids=False)


def get_file_size(url):
    """Return the size in bytes announced by a HEAD request on *url*.

    Raises ``requests`` exceptions on network/HTTP errors and ``KeyError``
    when the response carries no ``Content-Length`` header.
    """
    print("Downloading size:", url)
    r = requests.head(url)
    r.raise_for_status()
    size = r.headers['Content-Length']
    print("Size received=" + hsize(size))
    return int(size)


def get_file(path, book, format):
    """Download format *format* of *book* into ``<path>/<uuid>/``.

    The file name announced by the server is used when available, otherwise
    ``<uuid>.<format>``.  Raises ``requests`` exceptions on network/HTTP
    errors and ``OSError`` on file-system errors.
    """
    uuid = book['uuid']
    format_source = book['source']['formats'][format]
    url = format_source['url']
    print("Downloading ebook:", url)
    print("Size expected (estimation):", hsize(format_source.get('size', 0)))
    r = requests.get(url)
    r.raise_for_status()
    if 'Content-Length' in r.headers:
        print("Size received=" + hsize(r.headers['Content-Length']))
    else:
        print("Fize received")

    # The header is optional, and its content comes from the server: keep
    # only the last path component so the file cannot escape the book folder.
    announced = re.findall(r'filename="(.*)"',
                           r.headers.get('Content-Disposition', ''))
    filename = os.path.basename(announced[0]) if announced else ''
    if filename:
        filepath = path + '/' + uuid + '/' + filename
    else:
        filepath = path + '/' + uuid + '/' + uuid + "." + format

    os.makedirs(os.path.dirname(filepath + ".tmp"), exist_ok=True)
    with open(filepath + ".tmp", 'wb') as fd:
        fd.write(r.content)
    shutil.move(filepath + ".tmp", filepath)
    print("Saved to:", filepath)


def set_status(uuid, status, dir='.'):
    """Set the source status of book *uuid* to *status* and save it."""
    book = load_metadata(dir, uuid)
    if not book:
        print("No ebook metadata found for:", uuid)
        return
    if book['source']['status'] != status:
        book['source']['status'] = status
        save_metadata(dir, book)
        print("Status changed to", status + ":", book['uuid'])
    else:
        print("Status unchanged changed ", status + ":", book['uuid'])


def remove_book(uuid, path='.'):
    """Delete the local directory of book *uuid* found under *path*."""
    print(os.getcwd())
    bookpath = path + '/' + uuid
    if not os.path.isdir(bookpath):
        print(uuid, "not found")
        return
    try:
        shutil.rmtree(bookpath)
        print(uuid, "removed")
    except OSError:
        print("problem")


def explore(site, help=False):
    """Print the libraries served by *site* and their ebook counts.

    *site* is used as a prefix of ``ajax/`` and is therefore expected to end
    with ``/``.  *help* is unused.  Exits with status 1 when the list of
    libraries cannot be fetched.
    """
    server = site
    api = server + 'ajax/'
    print("Server:", server)
    url = api + 'library-info'
    print()
    print("Getting libraries:", server)
    print(url)
    try:
        r = requests.get(url)
        r.raise_for_status()
    except requests.RequestException:
        print("Unable to open site:", url)
        sys.exit(1)

    print("Libraries:")
    for name in r.json()["library_map"].keys():
        url = api + 'search' + '/' + name + '?num=0'
        try:
            r = requests.get(url)
            r.raise_for_status()
        except requests.RequestException:
            print("Unable to open site:", url)
            continue
        print("\t{}: {} ebooks".format(name, r.json()["total_num"]))


def update_done_status(book):
    """Set the status of a non-ignored *book* to "done" or "todo".

    A book is "done" when every format listed by its source is also listed
    in ``book['formats']``.
    """
    source = book['source']
    if source['status'] == 'ignored':
        return
    if set(source['formats'].keys()) <= set(book['formats']):
        source['status'] = "done"
    else:
        source['status'] = "todo"


def _collect_format_sources(server, record, uuid):
    """Return ``{format: {'url', 'size', 'status'}}`` for a server record.

    A format whose size is neither announced in the record nor obtainable
    through a HEAD request is skipped.
    """
    main_format = record['main_format'] or {}
    format_sources = {}
    for f in record['formats']:
        if f in main_format:
            relative_url = main_format[f]
        else:
            relative_url = record['other_formats'][f]
        entry = {'url': server + relative_url}

        format_metadata = record['format_metadata'][f]
        if 'size' in format_metadata:
            entry['size'] = int(format_metadata['size'])
        else:
            print("Size not found for format '{}' : {}".format(f, uuid))
            print("Trying to get size online: {}".format(entry['url']))
            try:
                entry['size'] = get_file_size(entry['url'])
            except (requests.RequestException, KeyError, ValueError):
                print("Unable to access format '{}' : {} skipped".format(f, uuid))
                continue
        entry['status'] = 'todo'
        format_sources[f] = entry
    return format_sources


def _index_remote_book(api, server, dir, book_id, record, force_refresh):
    """Create or refresh the local metadata of one book of the server."""
    uuid = record['uuid']
    if not uuid:
        print("No uuid for ebook: ignored")
        return
    print("uuid=" + uuid, "(" + record['title'] + ")")

    existing = None
    if not force_refresh:
        try:
            existing = load_metadata(dir, uuid)
        except Exception:
            print("Unable to get metadata from:", uuid)
            return
        if existing:
            print("Metadata already present for:", uuid)
            return

    if not record['formats']:
        print("No format found for {}".format(record['uuid']))
        return

    book = {}
    book['title'] = record['title']
    book['authors'] = record['authors']
    book['series'] = record['series']
    book['series_index'] = record['series_index']
    book['edition'] = 0
    book['uuid'] = record['uuid']
    book['identifiers'] = record['identifiers']
    book['comments'] = record['comments']
    book['pubdate'] = record['pubdate']
    book['publisher'] = record['publisher']
    # Always present, possibly empty, so that readers need no special case.
    book['languages'] = [iso639.to_iso639_2(l)
                         for l in (record['languages'] or [])]
    book['tags'] = record['tags']
    book['formats'] = []
    book['metadata_version'] = 0.1

    if force_refresh:
        # A refresh must not resurrect a book the user decided to ignore.
        try:
            existing = load_metadata(dir, uuid)
        except Exception:
            print("Unable to get metadata from:", uuid)
            return

    source = {}
    source['url'] = api + 'book/' + book_id
    source['id'] = book_id
    if existing and existing['source']['status'] == "ignored":
        source['status'] = "ignored"
    else:
        source['status'] = "todo"
    source['cover'] = server + record['cover']
    source['timestamp'] = record['timestamp']
    source['formats'] = _collect_format_sources(server, record, uuid)
    book['source'] = source

    if not source['formats']:
        print("No format found for {}".format(record['uuid']))
        return

    update_done_status(book)
    print("Saving metadata for:", uuid)
    try:
        save_metadata(dir, book)
    except Exception:
        print("Unable to save book metadata", book['uuid'])


def index_ebooks(site, library="", start=0, stop=0, dir=".", force_refresh=False):
    """Fetch the metadata of the ebooks of *site* into *dir*.

    Args:
        site: base URL of the server, used as a prefix of ``ajax/``.
        library: optional library name; the default library when empty.
        start: 1-based rank of the first ebook to fetch (0: from the first).
        stop: rank of the last ebook to fetch (0: up to the total count).
        dir: local directory receiving ``<uuid>/metadata.json``.
        force_refresh: rebuild the metadata even when already present
            (an existing "ignored" status is preserved).

    Exits with status 1 when the ebook count cannot be fetched.
    """
    offset = 0 if not start else start - 1
    num = 500  # number of ebooks requested per page
    server = site
    api = server + 'ajax/'
    library = '/' + library if library else library
    print("Server:", server)
    url = api + 'search' + library + '?num=0'
    print()
    print("Getting ebooks count:", server)
    print(url)
    try:
        r = requests.get(url)
        r.raise_for_status()
    except requests.RequestException:
        print("Unable to open site:", url)
        sys.exit(1)
    reported_total = r.json()["total_num"]
    print("Total count=", reported_total)
    total_num = int(reported_total)
    total_num = total_num if not stop else stop

    position = offset + 1  # rank of the current ebook, for display only
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        print()
        print("Downloading ids: offset=" + str(offset), "num=" + str(remaining_num))
        url = (api + 'search' + library + '?num=' + str(remaining_num)
               + '&offset=' + str(offset) + '&sort=timestamp&sort_order=desc')
        print("->", url)
        r = requests.get(url)
        print("Ids received from:" + str(offset), "to:" + str(offset + remaining_num - 1))
        print()
        print("Downloading metadata from", str(offset + 1), "to", str(offset + remaining_num))
        books_s = ",".join(str(i) for i in r.json()['book_ids'])
        url = api + 'books' + library + '?ids=' + books_s
        print("->", url)
        r = requests.get(url)
        # Decode the (large) response once instead of once per field access.
        records = r.json()
        print(len(records), "received")
        for book_id, record in records.items():
            print()
            print('--> range={}/{}'.format(str(position), str(total_num)))
            _index_remote_book(api, server, dir, book_id, record, force_refresh)
            position += 1
        offset = offset + num


def has_languages(book, languages=None, ignore_empty_language=False):
    """Tell whether *book* passes the language filter.

    A book without language is accepted unless *ignore_empty_language* is
    true; otherwise it is accepted when *languages* is empty or shares at
    least one entry with the languages of the book.  A missing
    ``book['languages']`` key is created as an empty list.
    """
    languages = _as_list(languages)
    print("Accepted languages", languages)
    if not ignore_empty_language:
        print("Unknown language accepted")

    # Patch for metadata files written without a 'languages' key.
    if 'languages' not in book:
        book['languages'] = []
    print("Book languages", book['languages'])

    if not book['languages']:
        if ignore_empty_language:
            print("'{}' ignored: language is empty".format(book['uuid']))
            return False
        print("'{}' todo: language is empty".format(book['uuid']))
        return True

    expected_languages = list(set(book['languages']) & set(languages))
    if languages and not expected_languages:
        print("'{}' ignored: language {} not in {}".format(
            book['uuid'], book['languages'], languages))
        return False
    print("'{}' todo: expected languages {}".format(book['uuid'], expected_languages))
    return True


def has_identifiers(book, identifiers=None, ignore_empty_identifiers=False):
    """Tell whether *book* passes the identifier filter.

    A book without identifier is accepted unless *ignore_empty_identifiers*
    is true; otherwise it is accepted when *identifiers* is empty or shares
    at least one entry with the identifier kinds of the book.
    """
    identifiers = _as_list(identifiers)
    print("Accepted identifiers", identifiers)
    if not ignore_empty_identifiers:
        print("Unknown identifiers accepted")
    print("Book identifiers", book['identifiers'].keys())

    if not book['identifiers']:
        if ignore_empty_identifiers:
            print("'{}' ignored: identifier is empty".format(book['uuid']))
            return False
        print("'{}' todo: identifiers is empty".format(book['uuid']))
        return True

    expected_identifiers = list(set(book['identifiers'].keys()) & set(identifiers))
    if identifiers and not expected_identifiers:
        print("'{}' ignored: identifiers {} not in {}".format(
            book['uuid'], book['identifiers'].keys(), identifiers))
        return False
    print("'{}' todo: expected identifiers {}".format(book['uuid'], expected_identifiers))
    return True


def download_ebooks(dir='.', server='', formats=None, single_format=False,
                    ignored_formats=None, languages=None, identifiers=None,
                    min_size=0, max_size=0, ignore_empty_language=False,
                    ignore_empty_identifiers=False, dry_run=False):
    """Download the files of the "todo" books below *dir* and print totals.

    Args:
        dir: local directory holding the ``<uuid>/metadata.json`` files.
        server: unused.
        formats: accepted formats (all formats when empty).
        single_format: download one format per book only, the first of
            *formats* (or of ``all_ordered_formats``) that is available.
        ignored_formats: formats to skip when *formats* is empty.
        languages, identifiers: filters, see has_languages() and
            has_identifiers().
        min_size, max_size: accepted file size range in MiB (0: no bound).
        ignore_empty_language, ignore_empty_identifiers: reject the books
            without language / identifier.
        dry_run: compute the statistics without downloading anything.
    """
    formats = _as_list(formats)
    ignored_formats = _as_list(ignored_formats)
    languages = _as_list(languages)
    identifiers = _as_list(identifiers)

    if single_format:
        my_formats = formats if formats else all_ordered_formats
    else:
        my_formats = formats
    print("formats=", my_formats)

    min_size = int(min_size) * 1024 * 1024
    max_size = int(max_size) * 1024 * 1024
    print("Format expected between {} and {}".format(
        hsize(min_size), hsize(max_size) if max_size else "infinity"))

    total_size = 0
    total_size_by_format = {}
    total_ebook_count = 0
    total_format_count = 0
    total_count_by_format = {}
    size_max = 0
    size_min = 0
    language_count = {}
    identifiers_count = {}

    for root, uuid in _iter_book_dirs(dir):
        book = load_metadata(root, uuid)
        if not book:
            continue
        status = book['source']['status']
        print()
        print("-->", uuid, "(" + book['title'] + ")")
        if status != "todo":
            print('{} in status "{}": skipped'.format(book['uuid'], status))
            continue

        if not has_languages(book, languages=languages,
                             ignore_empty_language=ignore_empty_language):
            continue
        if not has_identifiers(book, identifiers=identifiers,
                               ignore_empty_identifiers=ignore_empty_identifiers):
            continue

        source = book['source']
        download_formats = get_formats_to_download(
            book, accepted_formats=my_formats, single_format=single_format,
            ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
        if not download_formats:
            print("'{}' ignored: no more format available in formats expected {}".format(
                uuid, download_formats))
            continue

        ebook_kept = False
        for f in download_formats:
            url = source['formats'][f]['url']
            if not url:
                continue
            if get_file_path(dir, uuid, f):
                print("Format '{}' already present for {}: Skipped".format(f, uuid))
                continue

            size = source['formats'][f].get('size', 0)
            print("Format '{}': size expected={}".format(f, hsize(size)))
            if not dry_run:
                try:
                    get_file(dir, book, f)
                    book['formats'].append(f)
                except Exception as msg:
                    print("Unable to get book:", url)
                    print(msg)
                    continue
                save_metadata(dir, book)

            ebook_kept = True
            total_size += size
            size_max = size if size > size_max else size_max
            if not size_min:
                size_min = size
            else:
                size_min = size if size < size_min else size_min
            # NOTE(review): the source text between the previous statement
            # and the language counters was lost; the per-format totals and
            # the 'if ebook_kept' guard below are reconstructed from the
            # counters this function initialises and prints -- confirm.
            total_size_by_format[f] = total_size_by_format.get(f, 0) + size
            total_format_count += 1
            total_count_by_format[f] = total_count_by_format.get(f, 0) + 1

        if ebook_kept:
            total_ebook_count += 1
            # NOTE(review): books without language / identifier are counted
            # under the '' key as found in the source; the original label
            # may have been lost together with the text above.
            _count_keys(language_count, book['languages'])
            _count_keys(identifiers_count, list(book['identifiers']))

        if not dry_run:
            update_done_status(book)
            if book['source']['status'] == "done":
                save_metadata(dir, book)
                print("Book done:", book['uuid'])

    print()
    print("Total count of updated ebooks:", total_ebook_count)
    print("Total ebooks updated by language:")
    for l, c in language_count.items():
        print(" '{}': {}".format(l, c))
    print("Total ebooks updated by identifiers:")
    for l, c in identifiers_count.items():
        print(" '{}': {}".format(l, c))
    print("Total count of formats:", total_format_count)
    print("Total count of ebooks by format:")
    for f, c in total_count_by_format.items():
        print("\t'{}': {}".format(f, c))
    print()
    print("Total size:", hsize(total_size))
    print("Maximum file size:", hsize(size_max))
    print("Minimum file size:", hsize(size_min))
    print("Total size by format:")
    for f, s in total_size_by_format.items():
        print("\t'{}': {}".format(f, hsize(s)))


def get_formats_to_download(book, accepted_formats=None, ignored_formats=None,
                            single_format=False, min_size=0, max_size=0):
    """Return the formats of *book* that remain to be downloaded.

    Only the source formats in status "todo" are considered.  With
    *single_format*, the first entry of *accepted_formats* that is available
    is selected; otherwise the formats are restricted to *accepted_formats*
    when given, else to everything but *ignored_formats*.  The formats whose
    size (in bytes) lies outside [*min_size*, *max_size*] are then dropped
    (``max_size`` 0 means no upper bound), as are the formats of unknown
    size when a bound is set.
    """
    accepted_formats = _as_list(accepted_formats)
    ignored_formats = _as_list(ignored_formats)
    print("Accepted formats", accepted_formats)
    source = book['source']
    print("Formats available in source: {}".format(list(source['formats'].keys())))
    my_formats = [f for f, v in source['formats'].items() if v['status'] == 'todo']
    print("Formats in 'todo': {}".format(my_formats))

    formats = []
    if single_format:
        if accepted_formats:
            for f in accepted_formats:
                if f in my_formats:
                    formats = [f]
                    break
        else:
            print("need at least 1 format for ordering")
    elif accepted_formats:
        # Keep the caller's order (and drop duplicates) so that the download
        # order is deterministic.
        formats = [f for f in dict.fromkeys(accepted_formats) if f in my_formats]
    elif ignored_formats:
        formats = [f for f in my_formats if f not in ignored_formats]
    else:
        formats = my_formats
    print("Formats expected: {}".format(formats))

    download_formats = []
    for f in formats:
        size = source['formats'][f].get('size')
        if size is None:
            if max_size or min_size:
                print("Format '{}' ignored for {}: Size unknown".format(f, book['uuid']))
            else:
                # No bound to check against: an unknown size is acceptable.
                download_formats.append(f)
            continue
        if size < min_size or (max_size and size > max_size):
            print("Format '{}' ignored for {}: size={} but expected between {} and {}".format(
                f, book['uuid'], hsize(size), hsize(min_size),
                hsize(max_size) if max_size else "infinity"))
            continue
        download_formats.append(f)
    return download_formats


def update_format_statuses(book, refresh_ignored):
    """Reset the source formats of *book* to status "todo".

    The formats in status "ignored" are left untouched unless
    *refresh_ignored* is true.
    """
    for f, v in book['source']['formats'].items():
        if v['status'] == 'ignored' and not refresh_ignored:
            print("Format '{}' ignored: {} ({}))".format(f, book['uuid'], book['title']))
        else:
            print("Format '{}' todo: {} ({}))".format(f, book['uuid'], book['title']))
            v['status'] = 'todo'


def filter_ebooks(dir='.', server='', formats=None, single_format=False,
                  ignored_formats=None, languages=None, identifiers=None,
                  min_size=0, max_size=0, ignore_empty_language=False,
                  ignore_empty_identifiers=False):
    """Mark as "ignored" the "todo" books and formats rejected by the filters.

    The filters have the same meaning as in download_ebooks().  A book
    rejected by the language or identifier filter is ignored as a whole;
    otherwise each of its formats that is neither already downloaded nor
    selected for download is ignored.  *server* is unused.
    """
    formats = _as_list(formats)
    ignored_formats = _as_list(ignored_formats)
    languages = _as_list(languages)
    identifiers = _as_list(identifiers)

    if single_format:
        my_formats = formats if formats else all_ordered_formats
    else:
        my_formats = formats
    print("formats=", my_formats)

    min_size = int(min_size) * 1024 * 1024
    max_size = int(max_size) * 1024 * 1024
    print("Format expected between {} and {}".format(
        hsize(min_size), hsize(max_size) if max_size else "infinity"))

    total_ebook_count = 0
    total_format_count = 0

    for root, uuid in _iter_book_dirs(dir):
        book = load_metadata(root, uuid)
        if not book:
            continue
        status = book['source']['status']
        print()
        print("-->", uuid, "(" + book['title'] + ")")
        if status != "todo":
            print('{} in status "{}": skipped'.format(book['uuid'], status))
            continue

        if not has_languages(book, languages=languages,
                             ignore_empty_language=ignore_empty_language):
            book['source']['status'] = 'ignored'
            print("{} ignored: languages filtered".format(uuid))
            save_metadata(dir, book)
            total_ebook_count += 1
            continue
        if not has_identifiers(book, identifiers=identifiers,
                               ignore_empty_identifiers=ignore_empty_identifiers):
            book['source']['status'] = 'ignored'
            print("{} ignored: identifiers filtered".format(uuid))
            save_metadata(dir, book)
            total_ebook_count += 1
            continue

        download_formats = get_formats_to_download(
            book, accepted_formats=my_formats, single_format=single_format,
            ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)

        source = book['source']
        formats_to_ignore = sorted(set(source['formats'].keys())
                                   - set(book['formats'])
                                   - set(download_formats))
        print("formats to ignore:", formats_to_ignore)
        save_ebook = False
        for f in formats_to_ignore:
            if source['formats'][f]['status'] != 'ignored':
                source['formats'][f]['status'] = 'ignored'
                print("{} format ignored: '{}'".format(uuid, f))
                total_format_count += 1
                save_ebook = True
        if save_ebook:
            save_metadata(dir, book)

    print()
    print("Total count of newly ignored ebooks:", total_ebook_count)
    print("Total count of newly formats to ignore:", total_format_count)


def reset_ignored(dir='.', server=''):
    """Reset to "todo" every "ignored" book and format found below *dir*.

    *server* is unused.
    """
    for root, uuid in _iter_book_dirs(dir):
        book = load_metadata(root, uuid)
        if not book:
            continue
        save_ebook = False
        if book['source']['status'] == "ignored":
            print("'{}' status 'ignored' reset to 'todo'".format(book['uuid']))
            book['source']['status'] = 'todo'
            save_ebook = True
        for f, v in book['source']['formats'].items():
            if v['status'] == 'ignored':
                print("'{}' format 'ignored' reset to 'todo'".format(book['uuid']))
                v['status'] = 'todo'
                save_ebook = True
        if save_ebook:
            save_metadata(dir, book)


if __name__ == "__main__":
    fire.Fire()