#!/bin/sh
#
# Look for known-to-hang processes owned by the pbulk user that have been
# running for 2 hours or longer.  These are not caught by the "ulimit -t"
# set by pbulk because they are not using any CPU time.
#
# Usage: run from cron and redirect the output to a log.
#   -n   dry run: report what would be done without signalling anything.
#

if [ "$1" = "-n" ]; then
  dry_run=true
else
  dry_run=false
fi

# Resolve the OS name once; it is needed in several places below.
os_name=$(uname -s)

#
# Per-OS settings:
#   ps_args_pbulk    ps arguments that list etime, pid and args for every
#                    process owned by pbulk (no header line).
#   ps_args_log      ps arguments used when logging a single process.
#   process_restart  "true" to SIGSTOP/SIGCONT a process instead of killing it.
#
case "${os_name}" in
NetBSD)
  ps_args_pbulk="-o etime= -o pid= -o args= -U pbulk -x"
  ps_args_log="-ww -o user,pid,lstart,etime,args"
  process_restart=true
  ;;
*)
  ps_args_pbulk="-o etime= -o pid= -o args= -U pbulk"
  ps_args_log="-fo user,pid,etime,args"
  process_restart=false
  ;;
esac

#
# Kill a process, or stop and restart it, depending on process_restart.
# In dry-run mode only print what would have been done.
#
# Arguments: $1 - PID to act on
# Globals:   dry_run, process_restart (read); pid (written)
#
kill_or_restart()
{
  pid=$1

  if ${dry_run}; then
    if ${process_restart}; then
      echo "Would stop/start PID ${pid}"
    else
      echo "Would kill PID ${pid}"
    fi
    return
  fi

  #
  # On some OS it's enough to stop and restart processes to get them
  # running again (notably NetBSD with its broken libpthread).
  #
  if ${process_restart}; then
    kill -STOP "${pid}"
    sleep 1
    kill -CONT "${pid}"
  else
    kill -9 "${pid}"
  fi
}

#
# Write a timestamp and the details of a process to stdout (for the log),
# then hand the process to kill_or_restart.
#
# Arguments: $1 - PID to log and act on
# Globals:   os_name, ps_args_log (read); pid (written)
#
log_and_kill()
{
  pid=$1

  # Output date and running command for the log.
  date '+%Y-%m-%d-%H:%M:%S'
  case "${os_name}" in
  Darwin)
    # Also record the current working directory of the process.
    /usr/sbin/lsof -d cwd -a -p "${pid}"
    ;;
  esac
  # ps_args_log holds several arguments and must be word-split.
  # shellcheck disable=SC2086
  ps ${ps_args_log} -p "${pid}"
  kill_or_restart "${pid}"
}

#
# fseventsd on macOS often ends up spinning during bulk builds.  Just kill
# it once it's hit a certain amount of user time.
#
# Globals: fspid (written)
#
restart_system_processes_macos()
{
  for fspid in $(launchctl list | awk '/com.apple.fseventsd/ {print $1}'); do
    case "${fspid}" in
    *[!0-9]*)
      # Not a PID (e.g. "-" when the job is not running); nothing to do.
      continue
      ;;
    esac
    # Act once the utime field has two or more digits before a colon.
    case "$(ps -o utime= -p "${fspid}" 2>/dev/null)" in
    *[0-9][0-9]:*)
      log_and_kill "${fspid}"
      ;;
    esac
  done
}

#
# Decide whether a process should be killed/restarted.
#
# Arguments: $1 - elapsed time as printed by "ps -o etime="
#            $2 - command line as printed by "ps -o args="
# Returns:   0 if the process should be acted on, 1 otherwise
#
is_hung_process()
{
  case "$1" in
  *-*:*:*)
    #
    # If anything has been running for over a day just kill it, it's
    # highly unlikely to be making forward progress.  Except for known
    # false positives.
    #
    case "$2" in
    /usr/libexec/lsd*|/usr/sbin/distnoted*)
      return 1
      ;;
    esac
    return 0
    ;;
  0[2-9]:*:*|\
  [2-9]:*:*|\
  [1-9][0-9]:*:*)
    #
    # ETIME is 2 hours or longer.  Account for OS differences: most
    # print a leading 0 in the hours field, NetBSD does not.
    #
    # Only match either known fail processes or anything running from
    # within the work directory, skipping known false positives such
    # as Rust.
    #
    case "$2" in
    *lang/rust*|*ghc9[46]*)
      # Likely a legitimate long-running process.
      return 1
      ;;
    # NOTE(review): "mplayer" only matches a command line that is
    # exactly "mplayer" with no arguments -- confirm that is intended.
    /Users/pbulk/*|\
    /home/pbulk/*|\
    *Configure*|\
    *cmake_autogen*|\
    mplayer|\
    *py-scipy*|\
    *" ./configure "*|\
    ./*|../*)
      return 0
      ;;
    esac
    ;;
  esac
  return 1
}

#
# Scan every pbulk process and deal with the ones that look hung.
# ps_args_pbulk holds several arguments and must be word-split; "read -r"
# keeps any backslashes in the command line intact for pattern matching.
#
# shellcheck disable=SC2086
ps ${ps_args_pbulk} | while read -r etime pid cmd; do
  if is_hung_process "${etime}" "${cmd}"; then
    # Keep the commands run here from consuming the ps output on stdin.
    log_and_kill "${pid}" </dev/null
  fi
done

#
# Perform any OS-specific system cleanup.
#
case "${os_name}" in
Darwin)
  restart_system_processes_macos
  ;;
esac