Skip to content

Instantly share code, notes, and snippets.

@flipphillips
Created October 30, 2025 16:47
Show Gist options
  • Select an option

  • Save flipphillips/a1b291d8cf65409388f0d4c36f7df5ec to your computer and use it in GitHub Desktop.

Select an option

Save flipphillips/a1b291d8cf65409388f0d4c36f7df5ec to your computer and use it in GitHub Desktop.

Revisions

  1. flipphillips created this gist Oct 30, 2025.
    258 changes: 258 additions & 0 deletions fix-rocky-kernel.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,258 @@
    #!/bin/bash
    # Kernel update verification and recovery script
    # Adds checks for free space on /boot and an optional automatic cleanup.
    # Enhancements:
    # - When run with --auto, will attempt additional heuristics to free space in /boot:
    # 1) remove old kernel packages (existing behavior)
    # 2) remove orphaned files in /boot that don't match any installed kernel
    # 3) as a last resort, archive the oldest non-active boot files to /var/tmp/boot-archive-<timestamp>.tar.xz
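    # Note: Archiving only frees space in /boot when /var/tmp lives on a different filesystem (e.g. /boot is its own partition). If /boot and /var/tmp are both on the rootfs, the archive is written to the same filesystem and little or no space is reclaimed.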

    set -euo pipefail

    BOOT_DIR="/boot"
    # Defaults, overridable from the command line:
    #   --auto|-a   non-interactive removal of the oldest kernels
    #   --keep N    how many installed kernels to keep (non-negative integer)
    AUTO_REMOVE=false
    KEEP_COUNT=2

    # Parse command-line options into AUTO_REMOVE / KEEP_COUNT.
    # Arguments: the script's positional parameters
    # Globals:   AUTO_REMOVE, KEEP_COUNT (written)
    # Exits:     0 after printing usage for --help/-h;
    #            2 when --keep has no value or a non-numeric value
    parse_args() {
      # Loop on the argument count so an empty-string argument cannot end
      # parsing early.
      while (( $# > 0 )); do
        case "$1" in
          --auto|-a)
            AUTO_REMOVE=true
            shift
            ;;
          --keep)
            # Validate before `shift 2`: shifting past the end fails and would
            # abort silently under `set -e`; a non-numeric value would break
            # the later integer comparison against the installed-kernel count.
            if (( $# < 2 )) || ! [[ "$2" =~ ^[0-9]+$ ]]; then
              echo "ERROR: --keep requires a non-negative integer argument" >&2
              echo "Usage: $0 [--auto|-a] [--keep N]" >&2
              exit 2
            fi
            # 10# forces base 10 so values such as "08" are not read as octal.
            KEEP_COUNT=$(( 10#$2 ))
            shift 2
            ;;
          --help|-h)
            echo "Usage: $0 [--auto|-a] [--keep N]"
            exit 0
            ;;
          *)
            # Unknown arguments are still skipped, but no longer silently.
            echo "WARNING: ignoring unrecognized argument: $1" >&2
            shift
            ;;
        esac
      done
    }
    parse_args "$@"

    # Target kernel: the first entry of `rpm -q kernel --last`, with the leading
    # "kernel-" package prefix removed.
    # awk consumes all of rpm's output (rather than `head -1` closing the pipe
    # early), so rpm cannot be killed by SIGPIPE and trip pipefail.
    if ! LATEST_KERNEL=$(rpm -q kernel --last \
      | awk 'NR == 1 { sub(/^kernel-/, "", $1); print $1 }'); then
      echo "ERROR: Unable to query installed kernel packages with rpm." >&2
      exit 1
    fi
    # An empty version would make every later path bogus
    # ("/lib/modules/", "initramfs-.img"), so stop here instead.
    if [ -z "$LATEST_KERNEL" ]; then
      echo "ERROR: No installed kernel package found." >&2
      exit 1
    fi
    MODULE_DIR="/lib/modules/$LATEST_KERNEL"
    INITRAMFS="$BOOT_DIR/initramfs-$LATEST_KERNEL.img"

    echo "Checking kernel: $LATEST_KERNEL"

    # Print the space available on $BOOT_DIR, in 1K blocks, with no padding.
    # Globals: BOOT_DIR (read)
    # Outputs: the number from the last line of df's "avail" column
    get_boot_avail_kb() {
      # df prints a header line first; keep only the final line and remove the
      # alignment spaces around the number.
      df --block-size=1K --output=avail "$BOOT_DIR" \
        | sed -e '$!d' -e 's/ //g'
    }

    # Estimate the room a new initramfs needs: the size of the largest existing
    # initramfs image in $BOOT_DIR (rounded up to whole KB) plus a fixed margin.
    max_existing_init_kb=0
    for img in "$BOOT_DIR"/initramfs-*.img; do
      # An unmatched glob stays literal, so anything that is not a regular file
      # is skipped.
      [ -f "$img" ] || continue
      img_bytes=$(stat -c%s "$img")
      img_kb=$(( (img_bytes + 1023) / 1024 ))
      if (( img_kb > max_existing_init_kb )); then
        max_existing_init_kb=$img_kb
      fi
    done

    # Nothing to measure: fall back to a 200MB estimate.
    if (( max_existing_init_kb == 0 )); then
      max_existing_init_kb=$(( 200 * 1024 ))
    fi

    # Safety buffer of 50MB on top of the estimate.
    NEEDED_KB=$(( max_existing_init_kb + 50 * 1024 ))

    BOOT_AVAIL_KB=$(get_boot_avail_kb)

    echo "Available on $BOOT_DIR: $BOOT_AVAIL_KB KB; estimated needed: $NEEDED_KB KB"

    # Kernel modules check: when the module directory for the target kernel is
    # absent, reinstall the kernel-core and kernel-modules packages for it.
    if [[ ! -d "$MODULE_DIR" ]]; then
      printf '%s\n' \
        "ERROR: Missing kernel modules for $LATEST_KERNEL" \
        "Reinstalling kernel packages..."
      kernel_pkgs=("kernel-core-$LATEST_KERNEL" "kernel-modules-$LATEST_KERNEL")
      dnf reinstall "${kernel_pkgs[@]}" -y
    fi

    # Helper: print one installed kernel version per line, in the order given by
    # `rpm -q kernel --last`, with the leading 'kernel-' removed from each name.
    # Always returns 0: a failing query is deliberately tolerated.
    installed_kernels() {
      rpm -q kernel --last \
        | awk '{ sub(/^kernel-/, "", $1); print $1 }' \
        || true
    }

    # Helper: print (one path per line) the kernel artifacts in $BOOT_DIR whose
    # version matches no installed kernel.
    # Globals: BOOT_DIR (read)
    # Calls:   installed_kernels, uname
    # Outputs: full paths of orphaned regular files on stdout
    # Returns: 0 normally; 1 (printing nothing) when the installed-kernel list is
    #          empty, because every file would otherwise look orphaned.
    find_orphaned_boot_files() {
      # Patterns: vmlinuz-*, initramfs-*.img, System.map-*, config-*, abi-*
      local -a patterns=("vmlinuz-" "initramfs-" "System.map-" "config-" "abi-")
      local -a installed=()
      local f base p ver k keep running

      mapfile -t installed < <(installed_kernels)
      if (( ${#installed[@]} == 0 )); then
        return 1
      fi
      running=$(uname -r)

      for f in "$BOOT_DIR"/*; do
        # Regular files only: the caller deletes results with `rm -f`.
        [[ -f "$f" ]] || continue
        base=${f##*/}
        for p in "${patterns[@]}"; do
          [[ "$base" == "$p"* ]] || continue
          # Extract the version. Only the known trailing decorations are
          # stripped; kernel versions contain dots themselves, so cutting at
          # the first dot (as before) reduced "5.14.0-..." to "5" and made
          # every file look orphaned.
          ver=${base#"$p"}
          ver=${ver%.img}
          ver=${ver%kdump}

          keep=false
          if [[ "$ver" == 0-rescue-* || "$ver" == "$running" ]]; then
            # Rescue images are not tied to a kernel package, and files of the
            # running kernel must never be offered for deletion.
            keep=true
          else
            for k in "${installed[@]}"; do
              if [[ "$k" == "$ver" ]]; then
                keep=true
                break
              fi
            done
          fi

          if [[ "$keep" == false ]]; then
            printf '%s\n' "$f"
          fi
          # A file name can start with only one of the prefixes.
          break
        done
      done
      return 0
    }

    # If the target initramfs is missing, make sure $BOOT_DIR has room for it and
    # then rebuild it with dracut.
    # Reads:  INITRAMFS, LATEST_KERNEL, BOOT_DIR, BOOT_AVAIL_KB, NEEDED_KB,
    #         AUTO_REMOVE, KEEP_COUNT
    # Calls:  get_boot_avail_kb, installed_kernels, find_orphaned_boot_files
    # Exits 1 when space is short and either --auto was not given or the
    # automatic cleanup could not free enough.
    # With --auto the cleanup escalates in three stages, stopping as soon as
    # enough space is free: remove old kernel packages, delete orphaned boot
    # files, archive the oldest remaining boot files to /var/tmp.
    if [ ! -f "$INITRAMFS" ]; then
      echo "WARNING: Missing initramfs for $LATEST_KERNEL"
      echo "Preparing to rebuild initramfs..."

      if [ "$BOOT_AVAIL_KB" -lt "$NEEDED_KB" ]; then
        echo "Not enough free space on $BOOT_DIR to build initramfs."
        echo "Free: $BOOT_AVAIL_KB KB, required: $NEEDED_KB KB."

        # List candidate files/kernels to remove.
        # `|| true`: the listing is informational only; head closing the pipe
        # early must not abort the script through pipefail.
        echo "Current files in $BOOT_DIR (largest first):"
        ls -lhS "$BOOT_DIR" | head -n 30 || true

        # Show installed kernels
        echo
        echo "Installed kernels (newest first):"
        installed_kernels

        if [ "$AUTO_REMOVE" = true ]; then
          echo "Auto-remove enabled: attempting to remove oldest kernels until enough space is available."
          running_kernel=$(uname -r)
          # get kernels in oldest-first order
          mapfile -t kernels < <(installed_kernels | tac)
          removed_any=false

          # Stage 1: remove old kernel packages.
          for ver in "${kernels[@]}"; do
            # Never remove the running kernel or the target kernel.
            if [[ "$ver" == "$running_kernel" ]] || [[ "$ver" == "$LATEST_KERNEL" ]]; then
              continue
            fi
            # Re-count on every pass: earlier removals change the total.
            total_installed=$(rpm -q kernel | wc -l)
            if [ "$total_installed" -le "$KEEP_COUNT" ]; then
              echo "Reached keep limit ($KEEP_COUNT); stopping removals."
              break
            fi

            echo "Attempting to remove kernel packages for $ver"
            # Try removing kernel-core and kernel-modules for that version; fall back to kernel-<ver>
            if dnf remove -y "kernel-core-$ver" "kernel-modules-$ver" >/dev/null 2>&1; then
              removed_any=true
            elif dnf remove -y "kernel-$ver" >/dev/null 2>&1; then
              removed_any=true
            else
              echo "Failed to remove kernel packages for $ver; skipping."
            fi

            BOOT_AVAIL_KB=$(get_boot_avail_kb)
            echo "Available after attempt: $BOOT_AVAIL_KB KB"
            if [ "$BOOT_AVAIL_KB" -ge "$NEEDED_KB" ]; then
              echo "Enough space freed."
              break
            fi
          done

          # Stage 2: if still not enough space, remove orphaned files in /boot.
          if [ "$BOOT_AVAIL_KB" -lt "$NEEDED_KB" ]; then
            echo "Attempting to remove orphaned /boot files that don't match installed kernels..."
            mapfile -t orphans < <(find_orphaned_boot_files)
            if [ "${#orphans[@]}" -gt 0 ]; then
              for f in "${orphans[@]}"; do
                base=$(basename "$f")
                # Same guard as the archive stage below: whatever the orphan
                # finder reports, never delete artifacts of the running or
                # target kernel.
                if [[ "$base" == *"$running_kernel"* ]] || [[ "$base" == *"$LATEST_KERNEL"* ]]; then
                  echo "Skipping $f: belongs to the running or target kernel."
                  continue
                fi
                echo "Removing orphaned file: $f"
                if rm -f -- "$f"; then
                  echo "Removed $f"
                else
                  echo "Failed to remove $f; skipping."
                fi
                BOOT_AVAIL_KB=$(get_boot_avail_kb)
                if [ "$BOOT_AVAIL_KB" -ge "$NEEDED_KB" ]; then
                  echo "Enough space freed by removing orphans."
                  break
                fi
              done
            else
              echo "No orphaned boot files detected."
            fi
          fi

          # Stage 3 (last resort): archive oldest non-active boot files to /var/tmp.
          if [ "$BOOT_AVAIL_KB" -lt "$NEEDED_KB" ]; then
            echo "Not enough space after package removals and orphan cleanup. Preparing archival fallback."
            ARCHIVE_DIR=/var/tmp
            ts=$(date +%Y%m%d%H%M%S)
            ARCHIVE="$ARCHIVE_DIR/boot-archive-$ts.tar.xz"
            # Choose candidates: files in /boot sorted by mtime (oldest first) excluding running/kernel target artifacts
            echo "Building list of archival candidates..."
            # Tab-separated "<mtime>\t<path>" with `cut -f2-` keeps paths that
            # contain spaces intact.
            mapfile -t candidates < <(find "$BOOT_DIR" -maxdepth 1 -type f -printf '%T@\t%p\n' | sort -n | cut -f2-)
            to_archive=()
            for f in "${candidates[@]}"; do
              base=$(basename "$f")
              # skip files for running kernel or target kernel
              if [[ "$base" == *"$running_kernel"* ]] || [[ "$base" == *"$LATEST_KERNEL"* ]]; then
                continue
              fi
              to_archive+=("$f")
              # heuristic cap: collect at most 10 files
              if [ "${#to_archive[@]}" -ge 10 ]; then
                break
              fi
            done

            if [ "${#to_archive[@]}" -eq 0 ]; then
              echo "No safe archival candidates found in $BOOT_DIR. Cannot proceed automatically."
            else
              echo "Archiving ${#to_archive[@]} files to $ARCHIVE to free space."
              # tar runs with -C "$BOOT_DIR", so it is handed basenames.
              basenames=()
              for f in "${to_archive[@]}"; do
                basenames+=("$(basename "$f")")
              done
              # Originals are deleted only after the archive was written.
              if tar -cJf "$ARCHIVE" -C "$BOOT_DIR" -- "${basenames[@]}"; then
                echo "Archive created at $ARCHIVE. Removing archived files from $BOOT_DIR."
                for f in "${to_archive[@]}"; do
                  rm -f -- "$f" || true
                done
                BOOT_AVAIL_KB=$(get_boot_avail_kb)
                echo "Available after archival: $BOOT_AVAIL_KB KB"
              else
                echo "Failed to create archive $ARCHIVE. Leaving files intact."
              fi
            fi
          fi

          if [ "$BOOT_AVAIL_KB" -lt "$NEEDED_KB" ]; then
            echo "Unable to free enough space automatically. Please free space in $BOOT_DIR and retry."
            exit 1
          fi

          if [ "$removed_any" = true ]; then
            echo "Cleaning up package metadata..."
            # NOTE(review): `dnf autoremove` removes packages dnf regards as
            # no-longer-needed dependencies, which is broader than the message
            # suggests - confirm this is intended.
            dnf -y autoremove || true
          fi
        else
          echo "To automatically remove old kernels and try again, re-run with --auto or -a"
          echo "Aborting rebuild to avoid filling $BOOT_DIR."
          exit 1
        fi
      fi

      echo "Rebuilding initramfs..."
      dracut -f "$INITRAMFS" "$LATEST_KERNEL"
    fi

    # Helper: succeed when the module list mentions "nvidia".
    # Arguments: $1 - module list file to scan (default: /proc/modules)
    # Returns:   0 if a line contains "nvidia", non-zero otherwise (including
    #            when the file cannot be read)
    # Reading the file directly replaces `lsmod | grep -q`: under pipefail,
    # grep -q exiting on the first match can kill lsmod with SIGPIPE and turn a
    # positive match into a failed pipeline.
    nvidia_module_loaded() {
      grep -q nvidia -- "${1:-/proc/modules}" 2>/dev/null
    }

    # Verify NVIDIA modules (if applicable)
    if nvidia_module_loaded; then
      if ! modinfo -k "$LATEST_KERNEL" nvidia &>/dev/null; then
        echo "WARNING: NVIDIA modules missing for $LATEST_KERNEL"
        echo "Rebuilding NVIDIA driver..."
        # No rebuild is actually run here; add the command that matches your
        # installation method:
        # For .run installer: /path/to/NVIDIA*.run --silent
        # For akmod: akmods --force
      fi
    fi

    # Helper: print the grub.cfg path that grub2-mkconfig should write.
    # Arguments: $1 - directory whose presence marks an EFI boot
    #                 (default: /sys/firmware/efi)
    #            $2 - symlink naming the EFI grub config
    #                 (default: /etc/grub2-efi.cfg)
    #            $3 - path used when EFI is not detected or the symlink cannot
    #                 be used (default: /boot/grub2/grub.cfg)
    # Outputs:   the chosen path on stdout
    # NOTE(review): assumes the distro-managed /etc/grub2-efi.cfg symlink points
    # at the config the EFI loader actually reads - confirm on the target
    # release.
    grub_cfg_target() {
      local efi_marker=${1:-/sys/firmware/efi}
      local efi_link=${2:-/etc/grub2-efi.cfg}
      local fallback=${3:-/boot/grub2/grub.cfg}
      local resolved=""

      if [[ -d "$efi_marker" && -L "$efi_link" ]]; then
        resolved=$(readlink -f -- "$efi_link") || resolved=""
        # Use the resolved path only if its directory exists (e.g. the EFI
        # system partition is mounted); otherwise grub2-mkconfig would fail.
        if [[ -n "$resolved" && -d "${resolved%/*}" ]]; then
          printf '%s\n' "$resolved"
          return 0
        fi
      fi
      printf '%s\n' "$fallback"
    }

    # Update GRUB: regenerate the config at the path chosen above, instead of
    # always writing /boot/grub2/grub.cfg.
    echo "Updating GRUB configuration..."
    grub2-mkconfig -o "$(grub_cfg_target)"