Skip to content

Instantly share code, notes, and snippets.

@mattmattox
Created January 4, 2024 22:03
Show Gist options
  • Save mattmattox/c2bd37b5f5c49ef97f648acf6587861c to your computer and use it in GitHub Desktop.
Save mattmattox/c2bd37b5f5c49ef97f648acf6587861c to your computer and use it in GitHub Desktop.

Revisions

  1. mattmattox created this gist Jan 4, 2024.
    63 changes: 63 additions & 0 deletions patch-k8s-cluster.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,63 @@
    #!/bin/bash

    # Print a short usage summary on stdout.
    usage() {
      printf 'Usage: %s -c <cluster>\n' "${0##*/}"
      printf '  -c <cluster>  name of the cluster to patch (required)\n'
      printf '  -h            show this help and exit\n'
    }

    # Parse the command line and store the value of -c in the global CLUSTER.
    #
    # Arguments: the script's own arguments ("$@")
    # Globals:   CLUSTER (written)
    # Exits:     0 after -h; 1 on an unknown option or when -c has no value
    parse_args() {
      # OPTIND is made local so the function can be called more than once.
      local opt OPTIND=1
      # The leading ':' selects silent error reporting; without it getopts never
      # yields ':' for a missing argument and that arm could not be reached.
      while getopts ":c:h" opt; do
        case "${opt}" in
          c)
            CLUSTER="${OPTARG}"
            ;;
          h)
            usage
            exit 0
            ;;
          :)
            echo "Option -${OPTARG} requires an argument." >&2
            exit 1
            ;;
          *)
            echo "Unknown option -${OPTARG}." >&2
            usage >&2
            exit 1
            ;;
        esac
      done
    }

    parse_args "$@"

    # A cluster name is mandatory: it is the file name of the kubeconfig below.
    if [[ -z "${CLUSTER}" ]]; then
      echo "Please specify a cluster name." >&2
      exit 1
    fi

    # One path, used two ways: exported for kubectl calls that rely on the
    # environment, and kept in ${kubeconfig} for calls that pass --kubeconfig.
    kubeconfig=~/.kube/mattox/${CLUSTER}
    export KUBECONFIG="${kubeconfig}"

    # Stop instead of silently continuing from whatever directory we started in.
    cd ~/scripts/rolling-patching/ || {
      echo "Cannot change directory to ~/scripts/rolling-patching/" >&2
      exit 1
    }

    # Block until `uptime` can be run over SSH as root on a host.
    #
    # Arguments: $1 - host to probe (optional; defaults to the global ${server})
    # Outputs:   progress messages on stdout
    # Returns:   0 once the SSH command succeeds. There is no retry limit, so
    #            the function does not return while the host stays unreachable.
    check_ssh() {
      local host="${1:-${server}}"

      echo "Checking ${host}"
      until ssh -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no "root@${host}" 'uptime' > /dev/null; do
        echo "Trying again..."
        sleep 1
      done
    }

    echo "Starting patching..."
    # `get nodes -o name` prints "node/<name>"; awk keeps the part after the slash.
    # The command substitution is deliberately unquoted: one loop word per node.
    for server in $(kubectl --insecure-skip-tls-verify get nodes -o name | awk -F '/' '{print $2}'); do
      # Nodes that do not answer a single ping are left alone.
      if ! ping -c 1 "${server}"; then
        echo "Skipping..."
        continue
      fi
      echo "Server is pingable..."

      echo "Draining node..."
      # Same TLS flag as the other kubectl calls in this loop. A node whose drain
      # failed must not be upgraded and rebooted with workloads still on it.
      if ! kubectl --insecure-skip-tls-verify --kubeconfig "${kubeconfig}" drain --delete-emptydir-data --ignore-daemonsets "${server}"; then
        echo "Drain of ${server} failed; not upgrading it (the node may still be cordoned)." >&2
        continue
      fi

      echo "Running apt update and upgrade"
      ~/scripts/prep-a-server "${server}"
      echo "Sleeping for 60 seconds..."
      sleep 60
      # check_ssh polls the host named by the loop variable ${server}.
      check_ssh

      echo "Running do-release-upgrade"
      # sed needs -i here: without it the substituted text is only printed and
      # /etc/update-manager/release-upgrades stays unchanged.
      ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no "root@${server}" 'sed -i "s/Prompt=.*/Prompt=normal/g" /etc/update-manager/release-upgrades; export DEBIAN_FRONTEND=noninteractive; do-release-upgrade -f DistUpgradeViewNonInteractive; reboot'
      echo "Sleeping for 60 seconds..."
      sleep 60
      check_ssh

      echo "Uncordon node..."
      kubectl --insecure-skip-tls-verify --kubeconfig "${kubeconfig}" uncordon "${server}"
    done
    198 changes: 198 additions & 0 deletions prep-a-server.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,198 @@
    #!/bin/bash

    # Poll a host until an SSH login as root succeeds or the attempts run out.
    #
    # Arguments:
    #   $1 - host name or IP (required)
    #   $2 - SSH port (default 22)
    #   $3 - maximum number of attempts (default 900)
    #   $4 - seconds to sleep between attempts, also used as the ssh
    #        ConnectTimeout (default 1)
    # Outputs: progress messages on stdout
    # Returns: 0 as soon as a login succeeds; 1 on timeout or when no host is given
    function wait_for_ssh {
      local host=$1
      local port=${2:-22}
      local retries=${3:-900}
      local interval=${4:-1}
      local timeout=$(( retries * interval ))
      local attempt

      if [[ -z "$host" ]]; then
        echo "wait_for_ssh: missing host argument." >&2
        return 1
      fi

      for (( attempt = 0; attempt < retries; attempt++ )); do
        echo "Trying to connect to root@$host..."
        # -p forwards the port argument, which was previously accepted but unused.
        if ssh -q -p "$port" -o ConnectTimeout="$interval" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ~/.ssh/id_rsa "root@$host" exit >/dev/null 2>&1; then
          echo "SSH is now available for root@$host."
          return 0
        fi

        sleep "$interval"
      done

      echo "Timed out waiting for SSH to become available on $host after ${timeout}s."
      return 1
    }

    # Replace the resolver configuration on a remote host with a static one.
    #
    # The remote commands disable and stop systemd-resolved, remove
    # /etc/resolv.conf and write a new file with a search domain and two
    # nameservers.
    #
    # Arguments:
    #   $1 - host name or IP
    #   $2 - SSH port (default 22)
    #   $3 - SSH user (default root)
    # Outputs: a progress message on stdout, plus whatever ssh prints
    # Returns: the exit status of the ssh command
    function fix_dns {
      local host=$1
      local port=${2:-22}
      local user=${3:-root}
      # One remote command per element; they are fed to the remote bash on stdin.
      local -a remote_cmds=(
        'systemctl disable systemd-resolved.service'
        'systemctl stop systemd-resolved'
        # -f removes the file or symlink in one step and stays quiet if it is gone.
        'rm -f /etc/resolv.conf'
        'echo "search support.tools" > /etc/resolv.conf'
        'echo "nameserver 1.1.1.1" >> /etc/resolv.conf'
        'echo "nameserver 1.0.0.1" >> /etc/resolv.conf'
      )

      echo "Configuring DNS settings..."
      ssh -p "$port" -o "StrictHostKeyChecking=no" -o "UserKnownHostsFile=/dev/null" "$user@$host" /bin/bash \
        < <(printf '%s\n' "${remote_cmds[@]}")
    }

    # Make sure key-based SSH login works for a user, falling back to copying the
    # ubuntu user's authorized_keys into root's account.
    #
    # Arguments:
    #   $1 - host name or IP
    #   $2 - user to test first
    #   $3 - private key file used for every SSH connection made here
    # Outputs: progress messages on stdout, failures on stderr
    # Returns: 0 when the first login works or the key copy succeeds;
    #          1 when neither login works or the copy fails
    function check_and_copy_ssh_key {
      local host=$1
      local user=$2
      local keyfile=$3
      # Shared by the probes and the copy so all of them use the same identity.
      local -a ssh_opts=(-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i "$keyfile")

      echo "Checking SSH availability for $user@$host..."
      if ssh -q -o ConnectTimeout=10 "${ssh_opts[@]}" "$user@$host" exit >/dev/null 2>&1; then
        echo "SSH is available for $user@$host."
        return 0
      fi

      echo "SSH authentication failed for $user@$host. Trying with ubuntu user..."
      if ! ssh -q -o ConnectTimeout=10 "${ssh_opts[@]}" "ubuntu@$host" exit >/dev/null 2>&1; then
        echo "SSH authentication failed for both root and ubuntu users on $host."
        return 1
      fi

      echo "SSH authentication succeeded with ubuntu user. Copying SSH key to root..."
      # cp and chown run in one session; chown only runs if the copy worked.
      if ! ssh "${ssh_opts[@]}" "ubuntu@$host" "sudo cp /home/ubuntu/.ssh/authorized_keys /root/.ssh/ && sudo chown root:root /root/.ssh/authorized_keys"; then
        echo "Failed to copy SSH key to root on $host." >&2
        return 1
      fi
      echo "SSH key copied to root user."
    }

    # The node address is the only mandatory argument.
    if [[ -z "$1" ]]; then
      echo "Missing node IP." >&2
      exit 1
    fi

    # Any non-empty second argument sets the skip flag.
    # NOTE(review): ${skip} is not read anywhere else in this script, so for now
    # this only prints the message.
    if [[ -n "$2" ]]; then
      echo "Skipping do-release-upgrade"
      skip=1
    fi

    echo "Preparing node $1"

    # NOTE(review): ${firstrun} is set here and cleared below, but nothing in this
    # script reads it.
    firstrun=true

    echo "Waiting for SSH..."
    # Every later step runs over SSH, so stop here if the node never answers.
    if ! wait_for_ssh "$1"; then
      echo "Giving up on $1: SSH never became available." >&2
      exit 1
    fi

    firstrun=false

    echo "Resetting SSH host keys if needed..."
    # The heredoc delimiter is quoted so that $(md5sum ...) and awk's $1 reach the
    # remote bash untouched. With an unquoted delimiter the local shell expands
    # them first, and the comparison is made against this machine's host key.
    # NOTE(review): the checksum presumably identifies a host key shared by a
    # cloned image - confirm where it comes from.
    # NOTE(review): when the remote side reboots, the steps that follow do not
    # wait for the node to come back.
    ssh -o "StrictHostKeyChecking=no" -o "UserKnownHostsFile=/dev/null" root@"$1" /bin/bash << 'EOF'
    if [[ "$(md5sum /etc/ssh/ssh_host_rsa_key.pub | awk '{print $1}')" == "a94dbf9ac63ed41c4acee41dc920998a" ]]; then
      echo "Needed to recreate SSH host keys"
      rm /etc/ssh/ssh_host_*
      ssh-keygen -A
      reboot
    fi
    EOF

    # Static resolver configuration first, then make sure root can log in with
    # the local key.
    fix_dns "$1"

    check_and_copy_ssh_key "$1" root ~/.ssh/id_rsa

    echo "Copying over timesyncd.conf..."
    # A failed copy is reported but does not stop the remaining steps.
    scp -o "StrictHostKeyChecking=no" -o "UserKnownHostsFile=/dev/null" ~/scripts/timesyncd.conf root@"$1":/etc/systemd/timesyncd.conf \
      || echo "Could not copy timesyncd.conf to $1." >&2

    echo "Updating Server..."
    # Quoted delimiter: the commands below are sent to the remote bash verbatim,
    # with no expansion by the local shell.
    ssh -o "StrictHostKeyChecking=no" -o "UserKnownHostsFile=/dev/null" root@"$1" /bin/bash << 'EOF'
    # -f: a missing proxy snippet is not an error.
    rm -f /etc/apt/apt.conf.d/00aptproxy
    sed -i -e 's/Prompt=.*/Prompt=normal/g' /etc/update-manager/release-upgrades
    export DEBIAN_FRONTEND=noninteractive
    apt-get update
    apt-get -o Dpkg::Options::="--force-confold" -o Dpkg::Options::=--force-confdef upgrade -q -y --allow-downgrades --allow-remove-essential --allow-change-held-packages
    apt-get -o Dpkg::Options::="--force-confold" -o Dpkg::Options::=--force-confdef dist-upgrade -q -y --allow-downgrades --allow-remove-essential --allow-change-held-packages
    EOF

    echo "Syncing time..."
    # Quoted delimiter: nothing in the remote script is expanded locally.
    ssh -o "StrictHostKeyChecking=no" -o "UserKnownHostsFile=/dev/null" root@"$1" /bin/bash << 'EOF'
    # apt-get is the script-friendly front end; noninteractive keeps package
    # configuration from prompting in a session that has no terminal.
    export DEBIAN_FRONTEND=noninteractive
    apt-get install -y ntpdate
    timedatectl set-ntp on
    timedatectl set-timezone America/Chicago
    EOF

    echo "Installing standard packages..."
    # Quoted delimiter: the remote script, including its line continuations, is
    # passed through unchanged.
    ssh -o "StrictHostKeyChecking=no" -o "UserKnownHostsFile=/dev/null" root@"$1" /bin/bash << 'EOF'
    # apt-get is the script-friendly front end; noninteractive keeps package
    # configuration from prompting in a session that has no terminal.
    export DEBIAN_FRONTEND=noninteractive
    apt-get install -y \
      apt-transport-https \
      ca-certificates \
      curl \
      wget \
      software-properties-common \
      git \
      htop \
      iotop \
      iftop \
      nload \
      sysstat \
      nmon \
      nfs-common \
      open-iscsi \
      net-tools \
      tcpdump \
      dnsutils \
      ceph-common
    EOF

    echo "Setting up sysctl..."
    # One "key=value" per element; the list is streamed to the node and becomes
    # the full content of /etc/sysctl.d/20-inotify.conf (the file is overwritten).
    # net.ipv4.tcp_tw_recycle is no longer written: the setting was removed from
    # the kernel in Linux 4.12, so loading it fails on current kernels.
    sysctl_settings=(
      "fs.inotify.max_user_instances=8192"
      "fs.inotify.max_user_watches=524288"
      "fs.inotify.max_queued_events=524288"
      "net.ipv4.tcp_max_syn_backlog=2048"
      "net.ipv4.tcp_syncookies=1"
      "net.ipv4.tcp_tw_reuse=1"
      "net.ipv4.tcp_fin_timeout=30"
      "net.ipv4.tcp_keepalive_time=1200"
      "net.ipv4.ip_local_port_range=1024 65535"
      "net.ipv4.tcp_max_tw_buckets=1440000"
      "net.ipv4.tcp_max_orphans=3276800"
      "net.ipv4.tcp_mem=786432 1048576 26777216"
      "net.ipv4.tcp_rmem=4096 87380 4194304"
      "net.ipv4.tcp_wmem=4096 65536 4194304"
      "net.core.somaxconn=4096"
      "net.core.netdev_max_backlog=2500"
      "net.core.rmem_max=4194304"
      "net.core.wmem_max=4194304"
      "net.core.rmem_default=262144"
      "net.core.wmem_default=262144"
      "net.ipv4.tcp_slow_start_after_idle=0"
      "net.ipv4.tcp_timestamps=0"
      "net.ipv4.tcp_synack_retries=2"
      "net.ipv4.tcp_syn_retries=2"
    )
    printf '%s\n' "${sysctl_settings[@]}" \
      | ssh -o "StrictHostKeyChecking=no" -o "UserKnownHostsFile=/dev/null" root@"$1" 'cat > /etc/sysctl.d/20-inotify.conf'

    echo "Disabling ip checksum offloading..."
    # Quoted delimiter: ${nic} must be expanded by the remote bash, not locally.
    # NOTE(review): nothing here persists these ethtool settings and the script
    # reboots the node right afterwards - confirm they are re-applied elsewhere.
    ssh -o "StrictHostKeyChecking=no" -o "UserKnownHostsFile=/dev/null" root@"$1" /bin/bash << 'EOF'
    for nic in eth0 ens160 ens192 ens224 flannel.0 flannel.1; do
      # Skip interfaces this node does not have instead of letting ethtool fail.
      [[ -e "/sys/class/net/${nic}" ]] || continue
      ethtool -K "${nic}" tx-checksum-ip-generic off
      ethtool -K "${nic}" tx off rx off
    done
    EOF

    echo "Rebooting..."
    ssh -o "StrictHostKeyChecking=no" -o "UserKnownHostsFile=/dev/null" root@"$1" 'reboot'

    # Give the node time to go down before polling for it to come back.
    echo "Sleeping..."
    sleep 60

    echo "Waiting for SSH..."
    # Report a node that never returns instead of running fix_dns against it.
    if ! wait_for_ssh "$1"; then
      echo "Node $1 did not come back after the reboot." >&2
      exit 1
    fi

    # Re-apply the resolver configuration after the reboot.
    fix_dns "$1"