import sys
import os
import time
import re
import shutil
import requests
import json
import fire
from humanize import naturalsize as hsize
from langid.langid import LanguageIdentifier, model
import iso639
import time
from requests.adapters import HTTPAdapter
import urllib.parse
import urllib3
from beautifultable import BeautifulTable
# Remote Calibre servers are contacted with verify=False everywhere below,
# so silence the per-request TLS warnings once at import time.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Preference order used by --single_format to pick the "best" available format.
all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']
# langid detector with normalized probabilities; used to guess a book's
# language when the remote metadata does not provide one.
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
def load_metadata(path, uuid):
    """Load the metadata.json stored for one book.

    Args:
        path: Directory holding one sub-directory per book uuid.
        uuid: Book uuid (name of the sub-directory).

    Returns:
        The parsed metadata dict, or 0 when the file is absent or unreadable
        (callers only test the result for truthiness).
    """
    filepath = path + '/' + uuid + '/metadata.json'
    if not os.path.isfile(filepath):
        return 0
    try:
        with open(filepath, 'r') as fd:
            return json.load(fd)
    # Was a bare 'except:'; narrow it so Ctrl-C / SystemExit still propagate.
    # ValueError covers json.JSONDecodeError.
    except (OSError, ValueError):
        print("Error loading metadata for:", uuid, "from path:", path)
        return 0
def save_metadata(path, book):
    """Persist a book's metadata dict as <path>/<uuid>/metadata.json.

    The JSON is written to a .tmp file first and then renamed, so a crash
    mid-write never leaves a truncated metadata.json behind.
    """
    filepath = path + '/' + book['uuid'] + '/metadata.json'
    # dirname(filepath) == dirname(filepath + ".tmp"); create the book dir.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath + ".tmp", 'w') as fd:
        json.dump(book, fd, indent=4, separators=(',', ': '))
    try:
        shutil.move(filepath + ".tmp", filepath)
    # Was a bare 'except:'; only filesystem errors are expected here.
    except OSError:
        print("Unable to rename .tmp file:", filepath + ".tmp")
def get_cover_path(path, uuid):
    """Return the book's cover.jpg path, or 0 when no cover has been saved."""
    cover = path + '/' + uuid + '/cover.jpg'
    return cover if os.path.isfile(cover) else 0
def get_file_path(path, uuid, fileformat):
    """Return the path of the first file with extension '.<fileformat>' in the
    book's directory, or 0 when the directory is empty or nothing matches.

    The mangled original had an ambiguous trailing 'else: return 0' which,
    read as part of the loop body, would wrongly bail out on the first
    non-matching file; the scan over all files is made explicit here.
    """
    for f in os.listdir(path + '/' + uuid):
        _, ext = os.path.splitext(f)
        if ext == '.' + fileformat:
            return path + '/' + uuid + '/' + f
    # Directory empty or no file with the wanted extension.
    return 0
def get_cover(path, book, map):
    """Download a book's cover image and store it as <path>/<uuid>/cover.jpg.

    When 'map' is non-empty the network-location part of the source URL is
    replaced by it (useful when the indexed server moved). The image is
    written to a .tmp file and renamed so a partial download never leaves a
    half-written cover behind.
    """
    url = book['source']['cover']
    if map:
        parts = urllib.parse.urlparse(url)
        parts = (parts[0], map, *parts[2:])
        print(parts)
        url = urllib.parse.urlunparse(parts)
    print("Downloading cover from:", url)
    response = requests.get(url, timeout=(20, 3), verify=False)
    response.raise_for_status()
    target = path + '/' + book['uuid'] + '/cover.jpg'
    os.makedirs(os.path.dirname(target + ".tmp"), exist_ok=True)
    with open(target + ".tmp", 'wb') as fd:
        fd.write(response.content)
    shutil.move(target + ".tmp", target)
    print("Saved to:", target)
def download_covers(dir='my_books', server='', map=""):
    """ Download covers for each books"""
    # Walk the metadata tree: each sub-directory is named after a book uuid.
    # NOTE: the original had an always-true 'if True:' where a status filter
    # used to be, making its 'ignored' branch unreachable; the dead scaffold
    # is removed here — covers are fetched regardless of book status.
    for root, dirs, files in os.walk(dir, topdown=True):
        for d in dirs:
            book = load_metadata(root, d)
            if not book:
                print("No ebook metadata found in:", root)
                continue
            if get_cover_path(root, book['uuid']):
                # Cover already present: nothing to do.
                continue
            print()
            print("-->", d)
            print(book['uuid'])
            try:
                get_cover(root, book, map)
            # Was a bare 'except:'; keep the best-effort behavior but let
            # KeyboardInterrupt/SystemExit propagate.
            except Exception:
                print("Unable to get cover", book['uuid'])
def get_file_size(url):
    """HEAD the url and return its Content-Length as an int.

    Raises requests.HTTPError on an error status and KeyError when the
    server does not send a Content-Length header.
    """
    print("Downloading size:", url)
    response = requests.head(url, verify=False)
    response.raise_for_status()
    length = response.headers['Content-Length']
    print("Size received=" + hsize(length))
    return int(length)
def get_file(path, book, format, session, map, map_lib):
    """Download one ebook format into the book's uuid directory.

    Args:
        path: Root directory of the local library.
        book: Metadata dict (must contain 'uuid' and the source format urls).
        format: Format key to download (e.g. 'epub').
        session: requests.Session used for the download.
        map: Optional replacement for the URL network location (server moved).
        map_lib: Optional replacement for the library id (last URL component).

    Raises:
        requests.HTTPError: When the server answers with an error status.
    """
    uuid = book['uuid']
    url = book['source']['formats'][format]['url']
    if map:
        # Re-home the URL on another host, keeping scheme/path/query intact.
        pu = urllib.parse.urlparse(url)
        pu = (pu[0], map, *pu[2:])
        print(pu)
        url = urllib.parse.urlunparse(pu)
    if map_lib:
        # The library id is the last path component of the download URL.
        url_s = url.split("/")
        url_s = url_s[:-1] + [map_lib]
        url = '/'.join(url_s)
    print("Downloading ebook:", url)
    print("Size expected (estimation):", hsize(book['source']['formats'][format]['size']))
    r = session.get(url, timeout=(25, 15), verify=False)
    r.raise_for_status()
    if 'Content-Length' in r.headers:
        print("Size received=" + hsize(r.headers['Content-Length']))
    else:
        # Fixed typo: was "Fize received".
        print("Size received")
    # Prefer the server-provided filename; .get() avoids a KeyError when the
    # Content-Disposition header is missing, falling back to "<uuid>.<format>".
    filename = re.findall(r'filename="(.*)"', r.headers.get('Content-Disposition', ''))
    if len(filename):
        filepath = path + '/' + uuid + '/' + filename[0]
    else:
        filepath = path + '/' + uuid + '/' + uuid + "." + format
    # Write to .tmp then rename so a failed download never leaves a partial file.
    os.makedirs(os.path.dirname(filepath + ".tmp"), exist_ok=True)
    with open(filepath + ".tmp", 'wb') as fd:
        fd.write(r.content)
    shutil.move(filepath + ".tmp", filepath)
    print("Saved to:", filepath)
def set_status(uuid, status, dir='.'):
    """Set the source status of one book (e.g. 'todo'/'done'/'ignored') and persist it."""
    book = load_metadata(dir, uuid)
    if book:
        if book['source']['status'] != status:
            book['source']['status'] = status
            save_metadata(dir, book)
            print("Status changed to", status + ":", book['uuid'], "(", book['title'], ")")
        else:
            # Fixed garbled message: was "Status unchanged changed ".
            print("Status unchanged", status + ":", book['uuid'])
    else:
        print("No ebook metadata found for:", uuid)
def remove_book(uuid, path='.'):
    """Delete the whole directory of a book (metadata, cover and formats)."""
    print(os.getcwd())
    bookpath = path + '/' + uuid
    if os.path.isdir(bookpath):
        try:
            shutil.rmtree(bookpath)
            print(uuid, "removed")
        # Was a bare 'except:' printing just "problem"; report the real error.
        except OSError as err:
            print("Unable to remove", bookpath, ":", err)
    else:
        print(uuid, "not found")
def update_done_status(book):
    """Flip the book's source status to 'done' when every source format has
    already been downloaded, 'todo' otherwise. 'ignored' books are left alone."""
    source = book['source']
    if source['status'] == 'ignored':
        return
    wanted = set(source['formats'].keys())
    have = set(book['formats'])
    # All wanted formats present locally <=> wanted is a subset of have.
    source['status'] = "done" if wanted <= have else "todo"
def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, force_refresh=False):
    """
    Index a remote Calibre library.
    You will get all the metadata (title, authors, isbn, ...) for each book.
    They're stored as simple JSON files (metadata.json) so that you can easily visualize them or process them with 'jq' program.
    They are stored in subdirectories with a UUID as a name. These directories do match different books and allow you to group all
    the different formats of the same book and eventually the cover file.
    You can mix books from different sites without any (theoric) collisions
    Params:
    --site= : Url of the site to index (ex: http://123.123.123.123/)
    --library= (default=my_books) : Id of library to index. The script indexes the default library by default.
        The id is the string following '&library_id=' in the url
    --force_refresh (default=False) : Force a refresh of the metadata. By default all the metadata
        already gathered are ignored
    --start= (default=0)
    --stop= (default=0) : Allow indexing between a range of ebooks
    --inc= (default=1000) : Fix the number of ebooks for each request on the server
    """
    os.makedirs(dir, exist_ok=True)
    # --start is 1-based; offset is the 0-based position in the server listing.
    offset= 0 if not start else start-1
    # Page size is capped at 1000 ids per request.
    num=min(1000,inc)
    server=site
    api=server+'ajax/'
    # A non-default library becomes a path component of the ajax endpoints.
    library= '/'+library if library else library
    print("Server:", server)
    url=api+'search'+library+'?num=0'
    print()
    print("Getting ebooks count:", server)
    try:
        r = requests.get(url,verify=False)
        r.raise_for_status()
    except:
        print("Unable to open site:", url)
        sys.exit(1)
    print("Total count=",r.json()["total_num"])
    total_num=int(r.json()["total_num"])
    total_num= total_num if not stop else stop
    print()
    print("Start indexing")
    # Progress counter (1-based). NOTE(review): this shadows the builtin 'range'.
    range=offset+1
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        # Fetch one page of book ids, newest first.
        url=api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'
        r=requests.get(url, verify=False)
        # Fetch the metadata of the whole page of ids in a single call.
        books_s=",".join(str(i) for i in r.json()['book_ids'])
        url=api+'books'+library+'?ids='+books_s
        r=requests.get(url, verify=False)
        for id, r_book in r.json().items():
            uuid=r_book['uuid']
            if not uuid:
                print ("No uuid for ebook: ignored")
                continue
            # One-line progress message, padded/truncated to 140 columns.
            if r_book['authors']:
                desc= f"uuid={uuid} ({r_book['title']} / {r_book['authors'][0]})"
            else:
                desc= f"uuid={uuid} ({r_book['title']})"
            s=f"\r--> {range}/{total_num} - {desc}"
            s='{:140.140}'.format(s)
            print (s, end='')
            if not force_refresh:
                # Skip books whose metadata is already on disk.
                try:
                    book = load_metadata(dir, uuid)
                except:
                    print()
                    print("Unable to get metadata from:", uuid)
                    range+=1
                    continue
                if book:
                    range+=1
                    continue
            if not r_book['formats']:
                print()
                print("No format found for {}".format(r_book['uuid']))
                range+=1
                continue
            # Build the local metadata record from the server's answer.
            book={}
            url=api+'book/'+id
            book['title']=r_book['title']
            book['authors']=r_book['authors']
            book['series']=r_book['series']
            book['series_index']=r_book['series_index']
            book['edition']=0
            book['uuid']=r_book['uuid']
            book['identifiers']=r_book['identifiers']
            book['comments']=r_book['comments']
            book['pubdate']=r_book['pubdate']
            book['publisher']=r_book['publisher']
            languages=r_book['languages']
            if not languages:
                # No language in the source metadata: guess it from the
                # comments (or the title) with langid, and keep the guess
                # only when the classifier is confident enough.
                if book['comments']:
                    text=book['comments']
                else:
                    text=book['title']
                s_language, prob=identifier.classify(text)
                if prob >= 0.85:
                    language = iso639.to_iso639_2(s_language)
                    book['languages']=[language]
                else:
                    book['languages']=[]
            else:
                book['languages']=[]
                for l in languages:
                    # Normalize every language to an ISO 639-2 code.
                    book['languages'].append(iso639.to_iso639_2(l))
            book['tags']=r_book['tags']
            book['formats']=[]
            book['metadata_version']=0.1
            source={}
            source['url']=url+library
            source['id']=id
            # Preserve an 'ignored' status set during a previous indexing run.
            try:
                tmpbook = load_metadata(dir, uuid)
            except:
                print("Unable to get metadata from:", uuid)
                range+=1
                continue
            if tmpbook and tmpbook['source']['status']=="ignored":
                source['status']="ignored"
            else:
                source['status']="todo"
            source['cover']=server+r_book['cover']
            source['timestamp']=r_book['timestamp']
            format_sources={}
            formats=r_book['formats']
            for f in formats:
                s={}
                url=''
                if f in r_book['main_format']:
                    url=r_book['main_format'][f]
                else:
                    url=r_book['other_formats'][f]
                s['url']=server+url
                if 'size' in r_book['format_metadata'][f]:
                    s['size']=int(r_book['format_metadata'][f]['size'])
                else:
                    # Size missing from the metadata: ask the server directly.
                    print()
                    print("Size not found for format '{}' : {}".format(f, uuid))
                    print("Trying to get size online: {}".format(s['url']))
                    try:
                        s['size']=get_file_size(s['url'])
                    except:
                        print("Unable to access format '{}' : {} skipped".format(f, uuid))
                        continue
                s['status']='todo'
                format_sources[f]=s
            source['formats']=format_sources
            book['source']=source
            if not source['formats']:
                print("No format found for {}".format(r_book['uuid']))
                range+=1
                continue
            update_done_status(book)
            try:
                save_metadata(dir, book)
            except:
                print()
                print("Unable to save book metadata", book['uuid'])
            range+=1
        offset=offset+num
    print()
    print("Done")
def has_languages(book, languages=[], ignore_empty_language=False):
    """Tell whether a book passes the language filter.

    A book with no language metadata is accepted unless
    ignore_empty_language is set; otherwise, when 'languages' is non-empty,
    at least one of the book's languages must appear in it.
    """
    # rustine: old records may not have the key at all.
    book.setdefault('languages', [])
    if not book['languages']:
        return not ignore_empty_language
    if languages and not set(book['languages']) & set(languages):
        return False
    return True
def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False):
    """Tell whether a book passes the identifier filter.

    A book with no identifiers is accepted unless ignore_empty_identifiers
    is set; otherwise, when 'identifiers' is non-empty, at least one of the
    book's identifier keys (isbn, asin, ...) must appear in it.
    """
    if not book['identifiers']:
        return not ignore_empty_identifiers
    if identifiers and not set(book['identifiers'].keys()) & set(identifiers):
        return False
    return True
def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, map="", map_lib=""):
    """Download the ebook files of every indexed book in status 'todo'.

    Walks the local metadata tree, filters books by language/identifier,
    selects the formats to fetch, downloads them, and prints summary tables.

    Params:
        dir: Root directory of the local library (one sub-directory per uuid).
        formats: Only download these formats (empty = no format restriction).
        single_format: Download at most one format per book, picked by the
            global preference order when 'formats' is empty.
        ignored_formats: Formats to skip (only used when 'formats' is empty).
        languages / identifiers: Only handle books matching these filters.
        min_size / max_size: Size bounds in MB (0 = unbounded).
        ignore_empty_language / ignore_empty_identifiers: Skip books with no
            language / identifier metadata instead of accepting them.
        dry_run: Report what would be downloaded without fetching anything.
        map / map_lib: Optional URL host / library-id remapping (see get_file).

    NOTE(review): the statistics code in the middle of this function was
    corrupted in the source (several statements fused into one line); the
    per-format and per-book counters were reconstructed from the variables
    the final report reads — confirm against the project history.
    """
    if single_format:
        my_formats = formats if formats else all_ordered_formats
    else:
        my_formats = formats
    print("formats=", my_formats)
    # Size bounds are given in MB on the command line.
    min_size = int(min_size) * 1024 * 1024
    max_size = int(max_size) * 1024 * 1024
    print ("Format expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))
    # Running statistics for the final report.
    total_size = 0
    total_size_by_format = {}
    total_ebook_count = 0
    total_format_count = 0
    total_count_by_format = {}
    size_max = 0
    size_min = 0
    language_count = {}
    identifiers_count = {}
    session = requests.Session()
    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if not book:
                continue
            status = book['source']['status']
            if status != "todo":
                # Already done or ignored: just show rolling progress.
                print(f'--> {counter} books handled', end="\r")
                continue
            if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
                continue
            if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
                continue
            source = book['source']
            download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
            if not download_formats:
                # No format left to download for this book.
                continue
            ebook_kept = False
            for f in download_formats:
                url = source['formats'][f]['url']
                if not url:
                    continue
                print(f"--> format '{f}' for ({book['title']} / {book['authors'][0]} / {str(book['series'])})")
                if not dry_run:
                    try:
                        get_file(dir, book, f, session, map, map_lib)
                        book['formats'].append(f)
                        book['source']['formats'][f]['status'] = "done"
                        time.sleep(0)
                    except Exception as msg:
                        print("Unable to get book:", url)
                        print(msg)
                        # Back off a little before trying the next format.
                        time.sleep(5)
                        continue
                    save_metadata(dir, book)
                ebook_kept = True
                # Size statistics for this format.
                size = source['formats'][f]['size']
                total_size += size
                size_max = max(size, size_max)
                size_min = size if not size_min else min(size, size_min)
                # Reconstructed per-format counters (see docstring NOTE).
                total_format_count += 1
                total_count_by_format[f] = total_count_by_format.get(f, 0) + 1
                total_size_by_format[f] = total_size_by_format.get(f, 0) + size
            if ebook_kept:
                total_ebook_count += 1
                # Per-language / per-identifier counters ('' = unknown).
                if not book['languages']:
                    language_count[''] = language_count.get('', 0) + 1
                else:
                    for l in book['languages']:
                        language_count[l] = language_count.get(l, 0) + 1
                if not book['identifiers']:
                    identifiers_count[''] = identifiers_count.get('', 0) + 1
                else:
                    for i in book['identifiers'].keys():
                        identifiers_count[i] = identifiers_count.get(i, 0) + 1
                if not dry_run:
                    update_done_status(book)
                    if book['source']['status'] == "done":
                        save_metadata(dir, book)
                        print("Book done:", book['uuid'])
                print()
    print()
    print("Reporting ...")
    print()
    table = BeautifulTable()
    table.column_headers = ["", "Total count"]
    table.append_row(["Formats", total_format_count])
    table.append_row(["Ebooks", total_ebook_count])
    print(table)
    print()
    table = BeautifulTable()
    table.column_headers = ["", "Size"]
    table.append_row(["Min", hsize(size_min)])
    table.append_row(["Max", hsize(size_max)])
    table.append_row(["Total", hsize(total_size)])
    print(table)
    print()
    print("Total ebooks updated by language:")
    table = BeautifulTable()
    table.column_headers = ["Language", "Ebooks count"]
    for l, c in language_count.items():
        table.append_row([l, c])
    print(table)
    print()
    print("Total ebooks updated by identifiers:")
    table = BeautifulTable()
    table.column_headers = ["Identifiers", "Ebooks count"]
    for i, c in identifiers_count.items():
        table.append_row([i, c])
    print(table)
    print()
    print("Total count of ebooks by format:")
    table = BeautifulTable()
    table.column_headers = ["Formats", "Ebooks count"]
    for f, c in total_count_by_format.items():
        table.append_row([f, c])
    print(table)
    print()
    print("Total size by format:")
    table = BeautifulTable()
    table.column_headers = ["Format:", "Size"]
    for f, s in total_size_by_format.items():
        table.append_row([f, hsize(s)])
    print(table)
    print()
    print("Done !!!")
def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0):
    """Select which of the book's source formats should be downloaded.

    Starts from the formats still in status 'todo', applies the
    accepted/ignored format filters (or picks a single one by preference
    order when single_format is set), then drops formats whose size falls
    outside [min_size, max_size].
    """
    source = book['source']
    # Formats the source still has to deliver.
    todo = [fmt for fmt, meta in source['formats'].items() if meta['status'] == 'todo']
    selected = []
    if single_format:
        if accepted_formats:
            # First accepted format available wins.
            for fmt in accepted_formats:
                if fmt in todo:
                    selected = [fmt]
                    break
        else:
            print("need at least 1 format for ordering")
    elif accepted_formats:
        selected = list(set(accepted_formats) & set(todo))
    elif ignored_formats:
        selected = list(set(todo) - set(ignored_formats))
    else:
        selected = todo
    # Filter on size, working on a copy so removal is safe while iterating.
    kept = selected[:]
    for fmt in selected:
        meta = source['formats'][fmt]
        if 'size' not in meta and max_size:
            kept.remove(fmt)
        else:
            size = meta['size']
            if size < min_size or (max_size and size > max_size):
                kept.remove(fmt)
    return kept
def update_format_statuses(book, refresh_ignored):
    """Reset every source format back to 'todo', leaving 'ignored' formats
    untouched unless refresh_ignored is set."""
    for fmt, meta in book['source']['formats'].items():
        if meta['status'] == 'ignored' and not refresh_ignored:
            continue
        book['source']['formats'][fmt]['status'] = 'todo'
if __name__ == "__main__":
    # Expose the main operations as sub-commands via python-fire,
    # e.g.: python script.py index_ebooks --site=http://host/
    fire.Fire({
        "index_ebooks": index_ebooks,
        "download_ebooks": download_ebooks,
        "download_covers": download_covers,
        "set_status": set_status
    })