Created
June 10, 2019 21:19
Revisions
ryran created this gist Jun 10, 2019.
#!/bin/bash
#
# This extremely rough nonsense is an attempt to automate the disaster recovery
# expired certs documentation published at
# https://docs.openshift.com/container-platform/4.1/disaster_recovery/scenario-3-expired-certs.html
# ... Which was last reviewed on 2019/06/10
#
# Please contact [email protected] with suggestions or corrections

# CUSTOMIZE THESE:
MASTER=master0                # SSH hostname of master to use for bootstrapping everything
ALTMASTERS="master1 master2"  # SSH hostnames of additional masters (comment if none)
WORKERS="worker0 worker1"     # SSH hostnames of workers
RELEASE=4.1.0
RELEASE_IMAGE=quay.io/openshift-release-dev/ocp-release:${RELEASE}

c_YELLOW='\033[1;33m'
c_BLUE='\033[1;34m'
c_MAGENTA='\033[1;35m'
c_CYAN='\033[1;36m'
c_GREEN='\033[1;32m'
c_RED='\033[1;31m'
c_reset='\033[0;0m'
c_err=${c_RED}
c_msg=${c_YELLOW}

print() {
    local c_host host=$1
    shift
    [[ ${host} == local ]] && c_host=${c_BLUE} || c_host=${c_MAGENTA}
    printf "${c_host}[${host}] \t${c_msg}${@} ...${c_reset}\n"
}

errquit() {
    printf "${c_err}ERROR: ${@}${c_reset}\n"
    exit 1
}

print ${MASTER} "Inspecting ${RELEASE_IMAGE}"
KAO_IMAGE=$( ssh ${MASTER} oc adm release info --registry-config=/var/lib/kubelet/config.json ${RELEASE_IMAGE} --image-for=cluster-kube-apiserver-operator )

# Make sure master can pull the cluster-kube-apiserver-operator image
print ${MASTER} "Pulling ${KAO_IMAGE} w/podman"
ssh ${MASTER} sudo podman pull --authfile=/var/lib/kubelet/config.json ${KAO_IMAGE} || errquit "Unable to pull image '${KAO_IMAGE}' on ${MASTER}"

# Create recovery API server and grab stdout/stderr
print ${MASTER} "Kicking off 'recovery-apiserver create' w/podman"
podrun_output=$( ssh ${MASTER} sudo podman run --net=host -v /etc/kubernetes/:/etc/kubernetes/:Z --entrypoint=/usr/bin/cluster-kube-apiserver-operator ${KAO_IMAGE} recovery-apiserver create 2>&1 )

# Grab kubeconfig filename path from output
kubeconfig_file=$( grep -o "export KUBECONFIG=.*" <<<"${podrun_output}" )
if ! [[ $kubeconfig_file ]]; then
    echo --
    echo "For debugging, here's all the output from the podman run cmd:"
    echo "${podrun_output}"
    echo --
    errquit "Unable to get new kubeconfig credentials from recovery apiserver"
fi

destroy_command="ssh ${MASTER} sudo podman run --net=host -v /etc/kubernetes/:/etc/kubernetes/:Z --entrypoint=/usr/bin/cluster-kube-apiserver-operator ${KAO_IMAGE} recovery-apiserver destroy"

cleanup() {
    print local "If there were no problems OR if you want to run this script a second time, you should destroy the recovery-apiserver with the following command:"
    printf "${c_GREEN}${destroy_command}${c_reset}\n"
}
trap cleanup EXIT INT

# Strip "export KUBECONFIG=" so we have only the filename
kubeconfig_file=${kubeconfig_file#*=}

# Create script which comes straight from docs
cat >restore_kubeconfig.sh <<\EOF
#!/bin/bash
set -eou pipefail

# context
intapi=$(oc get infrastructures.config.openshift.io cluster -o "jsonpath={.status.apiServerURL}")
context="$(oc config current-context)"
# cluster
cluster="$(oc config view -o "jsonpath={.contexts[?(@.name==\"$context\")].context.cluster}")"
server="$(oc config view -o "jsonpath={.clusters[?(@.name==\"$cluster\")].cluster.server}")"
# token
ca_crt_data="$(oc get secret -n openshift-machine-config-operator node-bootstrapper-token -o "jsonpath={.data.ca\.crt}" | base64 --decode)"
namespace="$(oc get secret -n openshift-machine-config-operator node-bootstrapper-token -o "jsonpath={.data.namespace}" | base64 --decode)"
token="$(oc get secret -n openshift-machine-config-operator node-bootstrapper-token -o "jsonpath={.data.token}" | base64 --decode)"

export KUBECONFIG="$(mktemp)"
kubectl config set-credentials "kubelet" --token="$token" >/dev/null
ca_crt="$(mktemp)"; echo "$ca_crt_data" > $ca_crt
kubectl config set-cluster $cluster --server="$intapi" --certificate-authority="$ca_crt" --embed-certs >/dev/null
kubectl config set-context kubelet --cluster="$cluster" --user="kubelet" >/dev/null
kubectl config use-context kubelet >/dev/null
cat "$KUBECONFIG"
EOF

cat >tmpscript.sh <<-EOF
#!/bin/bash

# Bail as soon as anything fails
set -e

print() {
    printf "${c_MAGENTA}[${MASTER} tmpscript.sh] \t${c_msg}\${@} ...${c_reset}\n"
}

errquit() {
    printf "${c_err}ERROR: \${@}${c_reset}\n"
    exit 1
}

# Since we're gonna do oc cmds as core, we need to make this readable
print "Making ${kubeconfig_file} world-readable and exporting it as KUBECONFIG"
sudo chmod +r ${kubeconfig_file}
export KUBECONFIG=${kubeconfig_file}

# Wait for recovery API server to come up (shouldn't take long at all)
print "Waiting a bit for recovery apiserver"
sleep 20
print "DEBUG: checking auth/nodes"
oc whoami || :
oc get nodes || :
oc get namespace kube-system || :
timeout=5m
print "Waiting \${timeout} for 'oc get namespace kube-system' to succeed"
timeout \${timeout} bash -c 'until oc get namespace kube-system &>/dev/null; do echo Waiting for recovery apiserver to come up...; sleep 2; done'

# Run the regenerate-certificates command, fixing certs in the API, overwriting old certs on the local drive, and restarting static pods to pick them up
cmd="sudo podman run --net=host -v /etc/kubernetes/:/etc/kubernetes/:Z --entrypoint=/usr/bin/cluster-kube-apiserver-operator ${KAO_IMAGE} regenerate-certificates"
print "Executing cmd: \${cmd}"
\${cmd}

# Force new rollouts for control plane
# ("it will reinstall itself on the other nodes because the kubelet is connected to API servers using an internal load balancer")
print "Patching kubeapiserver to force redeployment"
oc patch kubeapiserver cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge
print "Patching kubecontrollermanager to force redeployment"
oc patch kubecontrollermanager cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge
print "Patching kubescheduler to force redeployment"
oc patch kubescheduler cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge

# Create a bootstrap kubeconfig
print "Executing restore_kubeconfig.sh from https://docs.openshift.com/container-platform/4.1/disaster_recovery/scenario-3-expired-certs.html"
bash restore_kubeconfig.sh >kubeconfig
print "Copying ./kubeconfig to /etc/kubernetes/kubeconfig"
sudo cp kubeconfig /etc/kubernetes/kubeconfig

# Get the CA certificate used to validate connections from the API server
print "Grabbing CA cert (cm kube-apiserver-to-kubelet-client-ca in ns openshift-kube-apiserver-operator)"
oc get configmap kube-apiserver-to-kubelet-client-ca -n openshift-kube-apiserver-operator --template='{{ index .data "ca-bundle.crt" }}' >ca.crt
print "Copying ./ca.crt to /etc/kubernetes/ca.crt"
sudo cp ca.crt /etc/kubernetes/ca.crt

# Cleanup
rm -f restore_kubeconfig.sh kubeconfig ca.crt

# Recover the kubelet service (delete stale kubelet data)
print "Stopping kubelet.service and clearing out /var/lib/kubelet/{pki,kubeconfig}"
sudo systemctl stop kubelet
sudo rm -rf /var/lib/kubelet/pki /var/lib/kubelet/kubeconfig
sudo systemctl start kubelet
EOF

# Copy to master
print local "Copying tmpscript.sh & restore_kubeconfig.sh to ${MASTER}"
scp tmpscript.sh restore_kubeconfig.sh ${MASTER}:

# Do all our work
print local "Kicking off tmpscript.sh on ${MASTER}"
ssh ${MASTER} bash tmpscript.sh || errquit Aborting due to failure initiating recovery on ${MASTER}

# Grab new kubeconfig & ca.crt
print local "Grabbing /etc/kubernetes/{kubeconfig,ca.crt} from ${MASTER}"
scp ${MASTER}:/etc/kubernetes/kubeconfig . || errquit Aborting due to error grabbing /etc/kubernetes/kubeconfig from ${MASTER}
scp ${MASTER}:/etc/kubernetes/ca.crt . || errquit Aborting due to error grabbing /etc/kubernetes/ca.crt from ${MASTER}

# If we have more than 1 master...
if [[ ${ALTMASTERS} ]]; then
    for altmaster in ${ALTMASTERS}; do
        # Push bootstrap kubeconfig & new ca.crt
        print local "Pushing ./{kubeconfig,ca.crt} to ${altmaster}"
        scp kubeconfig ca.crt ${altmaster}: || errquit Aborting due to error pushing files to ${altmaster}
        print ${altmaster} "Stopping kubelet.service and clearing out /var/lib/kubelet/{pki,kubeconfig}"
        ssh ${altmaster} <<-EOF
# Put new kubeconfig/ca.crt in place
sudo cp kubeconfig ca.crt /etc/kubernetes/
rm -f kubeconfig ca.crt
# Recover the kubelet service (delete stale kubelet data)
sudo systemctl stop kubelet
sudo rm -rf /var/lib/kubelet/pki /var/lib/kubelet/kubeconfig
sudo systemctl start kubelet
EOF
    done
fi

for worker in ${WORKERS}; do
    print local "Pushing ./ca.crt to ${worker}"
    scp ca.crt ${worker}: || errquit Aborting due to error pushing files to ${worker}
    print ${worker} "Stopping kubelet.service and clearing out /var/lib/kubelet/{pki,kubeconfig}"
    ssh ${worker} <<-EOF
# Put new ca.crt in place
sudo cp ca.crt /etc/kubernetes/
rm -f ca.crt
# Recover the kubelet service (delete stale kubelet data)
sudo systemctl stop kubelet
sudo rm -rf /var/lib/kubelet/pki /var/lib/kubelet/kubeconfig
sudo systemctl start kubelet
EOF
done

print ${MASTER} "Approving pending CSRs"
ssh ${MASTER} <<-EOF
export KUBECONFIG=${kubeconfig_file}
# Approve the pending node-bootstrapper CSRs
oc get csr --no-headers | awk '\$4=="Pending"'
oc get csr --no-headers | awk '\$4=="Pending" {system("oc adm certificate approve "\$1)}'
# Destroy the recovery API server and cleanup containers
#${destroy_command}
#sudo podman rm -a
# (Handled by bash trap right now)
EOF
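
After the script completes, kubelets that were just restarted typically keep submitting new CSRs for several minutes, so the single approval pass in the final ssh block above is often not enough. The snippet below is a minimal follow-up sketch, not part of the gist itself: it assumes an admin-capable KUBECONFIG is already exported in the current shell (for example the recovery kubeconfig used in that final ssh block, run on the master), and it simply repeats the gist's own awk column-4 "Pending" filter before printing node and cluster-operator status. The 30x10-second cadence is an arbitrary choice.

#!/bin/bash
# Hedged post-recovery helper (assumption: an admin-capable KUBECONFIG is
# already exported in this shell, e.g. the recovery kubeconfig on the master).
: "${KUBECONFIG:?export an admin KUBECONFIG before running this}"

# Re-run the same column-4 "Pending" filter the gist uses, approving anything
# that shows up while nodes finish re-bootstrapping.
for _ in {1..30}; do
    pending=$(oc get csr --no-headers 2>/dev/null | awk '$4=="Pending" {print $1}')
    if [[ ${pending} ]]; then
        echo "Approving CSRs: ${pending}"
        # Intentionally unquoted so multiple CSR names expand to separate args
        oc adm certificate approve ${pending}
    fi
    sleep 10
done

# Nodes should report Ready and clusteroperators should settle once the
# forced redeployments from tmpscript.sh finish rolling out.
oc get nodes
oc get clusteroperators

If the CSR column layout differs in your oc version, adjust the awk field accordingly.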