#!/usr/bin/env python3 """ This is a tool for recovering lost machine units in current model Usage: {0} model-name dest-dir """ # changes summary: # added logic to prevent overwrite /var/lib/juju to avoid removing non-machine units # added logic for detecting path of mongo binary (e.g. if present in $PATH or is in /usr/lib/juju/mongo*/bin) # added logic to determine primary mongo node to run queries against PRIMARY mongo # added logic to also restore systemd unit files # removed need to specify controller ip, as this is set when mongo PRIMARY detected import json import os import shlex import shutil import subprocess import sys import tempfile MONGOPASS_CMD = "juju ssh ubuntu@%s \"sudo grep ^apipassword: /var/lib/juju/agents/machine*/agent.conf\" | awk -e '{print $2}'" MONGO_CMD = "" MONGO_TEMPLATE = "juju ssh -m controller ubuntu@%s \"sudo %s --port 37017 --sslAllowInvalidCertificates --ssl --authenticationDatabase admin -u machine-%s -p %s juju < /home/ubuntu/%s 2>/dev/null\" | tail -n 2 | head -n 1" def run(cmd, output=True, shell=True): print(cmd) if output: return subprocess.check_output(cmd, shell=shell).decode().strip() return subprocess.call(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=shell) def update_machine_password(controller, model, machine_number, passwordhash): file_content = """use juju db.machines.update({"model-uuid": "%s", "machineid": "%s"}, {$set:{"passwordhash": "%s"}}) """ % (model, machine_number, passwordhash) with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file: tmp_file.write(file_content) run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller)) password = run(MONGOPASS_CMD % controller) run(MONGO_CMD % (controller, password, os.path.basename(tmp_file.name))) def get_model_uuid(controller, model): file_content = """use juju db.models.find({"name": "%s"}, {"modeluuid": 1}) """ % (model) with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file: tmp_file.write(file_content) run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller)) password = run(MONGOPASS_CMD % controller) uuid_json = run(MONGO_CMD % (controller, password, os.path.basename( tmp_file.name))) uuid = json.loads(uuid_json)['_id'] return uuid def determine_primary_mongo(): global MONGO_CMD file_content = """use juju rs.isMaster()['primary'] """ with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file: tmp_file.write(file_content) first_controller_num = run('juju machines -m controller |grep started|head -n1|cut -d " " -f1') mongo_cmd_tuple = gen_mongo_cmd(first_controller_num) temp_mongo_cmd = mongo_cmd_tuple[0] % os.path.basename(tmp_file.name) run("juju scp -m controller {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, first_controller_num)) primary_ip = run(temp_mongo_cmd).split(':')[0] # now that primary established, we'll just use old logic for MONGO_CMD variable/pass # but hardcoding the machine number primary_controller_num = run('juju machines -m controller |grep {}|head -n1|cut -d " " -f1'.format(primary_ip)) MONGO_CMD = "juju ssh ubuntu@%s \"sudo {} --port 37017 --sslAllowInvalidCertificates " \ "--ssl --authenticationDatabase admin -u machine-{} -p %s juju < " \ "/home/ubuntu/%s 2>/dev/null\" | tail -n 2 | head -n 1".format(mongo_cmd_tuple[1],primary_controller_num) return primary_ip def gen_mongo_cmd(controller_num): binary_path = run('''juju ssh -m controller %s "ps aux|grep mongo"|grep -v grep|awk '{print $11}'|head -n1 | rev | cut -c 2- | rev''' % controller_num) #binary_path = "mongo" controller_pass = run("juju ssh -m controller %s 'sudo grep ^apipassword: /var/lib/juju/agents/machine*/agent.conf' | awk -e '{print $2}'" % controller_num ) return (MONGO_TEMPLATE % (controller_num, binary_path, controller_num, controller_pass, '%s'), binary_path) def get_donor_password(controller, donor, model_uuid): file_content = """use juju db.machines.find({"model-uuid": "%s", "machineid": "%s"}, {"passwordhash": 1}) """ % (model_uuid, donor) with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file: tmp_file.write(file_content) run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller)) password = run(MONGOPASS_CMD % controller) attributes_json = run(MONGO_CMD % (controller, password, os.path.basename( tmp_file.name))) attributes = json.loads(attributes_json) passwordhash = attributes['passwordhash'] return passwordhash def get_machine_nonce(controller, machine, model_uuid): file_content = """use juju db.machines.find({"model-uuid": "%s", "machineid": "%s"}, {"nonce": 1}) """ % (model_uuid, machine) with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file: tmp_file.write(file_content) print('get machine nonce reached...') run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller)) password = run(MONGOPASS_CMD % controller) attributes_json = run(MONGO_CMD % (controller, password, os.path.basename( tmp_file.name))) attributes = json.loads(attributes_json) nonce = attributes['nonce'] return nonce def recover_machine(machine_number, juju_tar, donor, passwordhash, controller, model, systemd_tar): print("Recovering machine {}".format(machine_number)) nonce = get_machine_nonce(controller, machine_number, model) check_juju_dir_or_create(machine_number) run("juju scp {} {}:/home/ubuntu/juju.tar".format( juju_tar, machine_number)) run("juju ssh {} 'sudo tar -xvf /home/ubuntu/juju.tar -C /var/lib --skip-old-files --keep-directory-symlink --dereference'".format(machine_number)) if 'lxd' in machine_number: machine_string = machine_number.replace('/', '-') run("juju ssh {} 'sudo mv -f /var/lib/juju/agents/machine-{} /var/lib/juju/agents/machine-{}'".format( machine_number, donor, machine_string)) run("juju ssh {} 'sudo mv -f /var/lib/juju/tools/machine-{} /var/lib/juju/tools/machine-{}'".format( machine_number, donor, machine_string)) run("juju ssh {} 'echo {} | sudo tee /var/lib/juju/nonce.txt'".format(machine_number, nonce)) else: run("juju ssh {} 'sudo mv -f /var/lib/juju/agents/machine-{} /var/lib/juju/agents/machine-{}'".format( machine_number, donor, machine_number)) run("juju ssh {} 'sudo mv -f /var/lib/juju/tools/machine-{} /var/lib/juju/tools/machine-{}'".format( machine_number, donor, machine_number)) run("juju ssh {} 'echo {} | sudo tee /var/lib/juju/nonce.txt'".format(machine_number, nonce)) # Update files if 'lxd' in machine_number: machine_string = machine_number.replace('/', '-') agent_file = "/var/lib/juju/agents/machine-{}/agent.conf".format(machine_string) run("juju ssh {} 'sudo sed -i \"s|tag: machine-{}|tag: machine-{}|g\" {}'".format( machine_number, donor, machine_string, agent_file)) run("juju ssh {} 'sudo sed -i \"s|jujud-machine-{}|jujud-machine-{}|g\" {}'".format( machine_number, donor, machine_string, agent_file)) run("juju ssh {} 'sudo sed -i \"s/nonce: .*/nonce: {}/g\" {}'".format( machine_number, nonce, agent_file)) else: agent_file = "/var/lib/juju/agents/machine-{}/agent.conf".format(machine_number) run("juju ssh {} 'sudo sed -i \"s/tag: machine-{}/tag: machine-{}/g\" {}'".format( machine_number, donor, machine_number, agent_file)) run("juju ssh {} 'sudo sed -i \"s/jujud-machine-{}/jujud-machine-{}/g\" {}'".format( machine_number, donor, machine_number, agent_file)) run("juju ssh {} 'sudo sed -i \"s/nonce: .*/nonce: {}/g\" {}'".format( machine_number, nonce, agent_file)) # restore unit files if 'lxd' in machine_number: run("juju scp {} {}:/home/ubuntu/systemd.tar".format( systemd_tar, machine_number)) machine_string = machine_number.replace('/', '-') run("juju ssh {} 'sudo rm -rf /lib/systemd/system/jujud-machine-{}'".format(machine_number,machine_string)) run("juju ssh {} 'sudo tar -xvf /home/ubuntu/systemd.tar -C / --skip-old-files'".format(machine_number)) run("juju ssh {} 'sudo cp -nrp /lib/systemd/system/jujud-machine-{} /lib/systemd/system/jujud-machine-{}'".format(machine_number,donor,machine_string)) run("juju ssh {} 'sudo cp -p /lib/systemd/system/jujud-machine-{}/jujud-machine-{}.service /lib/systemd/system/jujud-machine-{}/jujud-machine-{}.service'".format( machine_number,donor, donor, machine_string, machine_string)) exec_start_file = "/lib/systemd/system/jujud-machine-{}/exec-start.sh".format(machine_string) jujud_unit_file = "/lib/systemd/system/jujud-machine-{}/jujud-machine-{}.service".format(machine_string, machine_string) run("juju ssh {} 'sudo sed -i \"s|machine-{}|machine-{}|g\" {}'".format( machine_number, donor, machine_string, exec_start_file)) run("juju ssh {} 'sudo sed -i \"s|--machine-id {}|--machine-id {}|g\" {}'".format( machine_number, donor, machine_number, exec_start_file)) run("juju ssh {} 'sudo sed -i \"s|machine-{}|machine-{}|g\" {}'".format( machine_number, donor, machine_string, jujud_unit_file)) else: run("juju scp {} {}:/home/ubuntu/systemd.tar".format( systemd_tar, machine_number)) run("juju ssh {} 'sudo rm -rf /lib/systemd/system/jujud-machine-{}'".format(machine_number,machine_number)) run("juju ssh {} 'sudo tar -xvf /home/ubuntu/systemd.tar -C / --skip-old-files'".format(machine_number)) run("juju ssh {} 'sudo cp -nrp /lib/systemd/system/jujud-machine-{} /lib/systemd/system/jujud-machine-{}'".format(machine_number,donor,machine_number)) run("juju ssh {} 'sudo cp -p /lib/systemd/system/jujud-machine-{}/jujud-machine-{}.service /lib/systemd/system/jujud-machine-{}/jujud-machine-{}.service'".format( machine_number,donor, donor, machine_number, machine_number)) exec_start_file = "/lib/systemd/system/jujud-machine-{}/exec-start.sh".format(machine_number) jujud_unit_file = "/lib/systemd/system/jujud-machine-{}/jujud-machine-{}.service".format(machine_number, machine_number) run("juju ssh {} 'sudo sed -i \"s/machine-{}/machine-{}/g\" {}'".format( machine_number, donor, machine_number, exec_start_file)) run("juju ssh {} 'sudo sed -i \"s/--machine-id {}/--machine-id {}/g\" {}'".format( machine_number, donor, machine_number, exec_start_file)) run("juju ssh {} 'sudo sed -i \"s/machine-{}/machine-{}/g\" {}'".format( machine_number, donor, machine_number, jujud_unit_file)) # Update mongo update_machine_password(controller, model, machine_number, passwordhash) # Restart services link_to = determine_juju_version(machine_number) run("juju ssh {} 'for u in $(sudo ls /var/lib/juju/agents/|sort); do sudo ln -sf /var/lib/juju/tools/{} /var/lib/juju/tools/$u; done'".format(machine_number,link_to)) run("juju ssh {} sudo systemctl daemon-reload".format(machine_number)) if 'lxd' in machine_number: machine_string = machine_number.replace('/', '-') run("juju ssh {} sudo systemctl enable /lib/systemd/system/jujud-machine-{}/jujud-machine-{}.service".format(machine_number, machine_string, machine_string)) run("juju ssh {} sudo systemctl restart jujud-machine-{}".format(machine_number, machine_string)) else: run("juju ssh {} sudo systemctl enable /lib/systemd/system/jujud-machine-{}/jujud-machine-{}.service".format(machine_number, machine_number, machine_number)) run("juju ssh {} sudo systemctl restart jujud-machine-{}".format(machine_number, machine_number)) def check_juju_dir_or_create(machine_num): command = "juju ssh ubuntu@{} 'sudo test -d /var/lib/juju'".format(machine_num) exit_code = run(command, output=False) if exit_code != 0: create_command = "juju ssh {} 'sudo mkdir /var/lib/juju'".format(machine_num) run(create_command) # check for existence of /var/lib/juju/agents/machine-$ # remove it to avoid mv directory not-empty when recovering else: if 'lxd' in machine_num: machine_string = machine_num.replace('/', '-') check_for_machine_conf = "juju ssh ubuntu@{} 'sudo test -d /var/lib/juju/agents/machine-{}'".format(machine_num,machine_string) else: check_for_machine_conf = "juju ssh ubuntu@{} 'sudo test -d /var/lib/juju/agents/machine-{}'".format(machine_num,machine_num) if run(check_for_machine_conf, output=False) == 0: if 'lxd' in machine_num: mv_existing_command = "juju ssh ubuntu@{} 'sudo rm -rf /var/lib/juju/agents/machine-{}'".format(machine_num,machine_num.replace('/', '-')) run(mv_existing_command, output=False) else: mv_existing_command = "juju ssh ubuntu@{} 'sudo rm -rf /var/lib/juju/agents/machine-{}'".format(machine_num,machine_num) run(mv_existing_command, output=False) def get_agent_from_donor(donor, destdir): print ("Getting agent from machine {}".format(donor)) # Get dir from donor and remove everything except machine agent run("juju ssh {} 'sudo chmod -R a+r /var/lib/juju'".format(donor)) run("juju ssh {} 'sudo tar -C /var/lib/ -cvf /tmp/juju.tar juju'".format( donor)) run("juju scp {}:/tmp/juju.tar {}".format(donor, destdir)) run("tar -xvf {}/juju.tar -C {}".format(destdir, destdir)) run("rm -rf {}/juju/agents/unit*".format(destdir)) run("rm -rf {}/juju/meter-status.yaml".format(destdir)) run("rm -rf {}/juju/locks/*".format(destdir)) run("rm -rf {}/juju/tools/unit*".format(destdir)) run("rm -rf {}/juju/metricspool".format(destdir)) run("rm -rf {}/juju/nonce.txt".format(destdir)) run("rm -rf {}/juju.tar".format(destdir)) run("tar -cvf {}/juju.tar -C {} juju".format(destdir, destdir)) run("rm -rf {}/juju".format(destdir)) get_donor_systemd_units(donor,destdir) def get_donor_systemd_units(donor, destdir): run("juju ssh {} 'sudo tar -cvf /tmp/systemd.tar /lib/systemd/system/jujud-machine-{}'".format(donor,donor)) run("juju scp {}:/tmp/systemd.tar {}".format(donor, destdir)) def determine_juju_version(machine_number): return run("juju ssh {} 'ls /var/lib/juju/tools |egrep ^2.|sort -n|head -n1'".format(machine_number), output=True) def parse_machines_to_recover(status_json): machines = [] units = [] for app_name, app_data in status_json['applications'].items(): for unit_name,unit_data in app_data['units'].items(): if unit_data['juju-status']['current'] == "lost": units.append(unit_name) machine = unit_data['machine'] if machine not in machines: machines.append(machine) for machine_number, machine_data in status_json['machines'].items(): if machine_data['juju-status']['current'] == "down": if machine_number not in machines: machines.append(machine_number) return machines, units def parse_donor(status_json): candidates = [] for machine_number, machine_data in status_json['machines'].items(): if machine_data['juju-status']['current'] == "started": candidates.append(machine_number) for app_name, app_data in status_json['applications'].items(): for unit_name,unit_data in app_data['units'].items(): if unit_data['juju-status']['current'] == "lost": machine = unit_data['machine'] if machine in candidates: candidates.remove(machine) if len(candidates) > 0: return candidates[0] else: raise Exception("No donor candidates found") def disable_units(units): for unit in units: service = unit.replace('/','-') output = run("juju ssh {} 'sudo systemctl -a | grep {}' || true".format(unit, service)) if output: if 'active' in output: run("juju ssh {} 'sudo systemctl stop jujud-unit-{}.service'".format(unit, service)) run("juju ssh {} 'sudo systemctl disable jujud-unit-{}.service'".format(unit, service)) def main(): model = sys.argv[1] destdir = sys.argv[2] print("Determining the Primary MongoDB unit") controller = determine_primary_mongo() if os.path.exists(destdir): shutil.rmtree(destdir) os.mkdir(destdir) run("juju switch {}".format(model)) model_uuid = get_model_uuid(controller, model) print(model_uuid) # Query first healthy unit to select as donor print("Attempting to find a healthy machine donor") status_string_json = run("juju status --format json") status_json = json.loads(status_string_json) donor = parse_donor(status_json) passwordhash = get_donor_password(controller, donor, model_uuid) print(passwordhash) get_agent_from_donor(donor, destdir) # slight modification to ensure it's 'Running' print("Getting list of machines to recover") status_string_json = run("juju status --format json") status_json = json.loads(status_string_json) machines, units = parse_machines_to_recover(status_json) disable_units(units) print("Machines to be recovered: {}".format(machines)) for machine in machines: # added systemd_tar systemd_tar = destdir + "/systemd.tar" recover_machine(machine, "{}/juju.tar".format(destdir), donor, passwordhash, controller, model_uuid, systemd_tar) def test(): model = sys.argv[1] controller = sys.argv[2] destdir = sys.argv[3] if os.path.exists(destdir): shutil.rmtree(destdir) os.mkdir(destdir) run("juju switch {}".format(model)) print(determine_primary_mongo()) if __name__ == "__main__": if len(sys.argv) == 1 : print(__doc__.format(sys.argv[0])) sys.exit(-1) main()