"""Poll ``squeue`` for one user's jobs and send a notification on changes.

Usage: pass the username to monitor as the first command-line argument and
the URL that receives the HTTP POST notifications as the second.
"""
import random
import signal
import subprocess
import sys
import time
from datetime import datetime

import requests

# arbitrarily chosen prime numbers
SLEEP_MIN = 127
SLEEP_MAX = 157

# Seconds to wait on the notification endpoint.  Without a timeout a stalled
# connection would block the polling loop indefinitely.
REQUEST_TIMEOUT = 30


def _parse_squeue_output(raw):
    """Turn raw ``squeue`` text into a set of non-empty status lines."""
    # Surrounding double quotes are stripped as well as spaces so that output
    # produced with a quoted format string is still parsed the same way.
    stripped = (line.strip('" ') for line in raw.split('\n'))
    return {line for line in stripped if line}


def get_slurm_status_status(username):
    """Get the status of the slurm jobs for a given user.

    Runs ``squeue -u <username> -h -o "%i %N %T"`` (no header; job id, node
    list and job state per line) and returns the distinct output lines.

    Args:
        username: User whose jobs are listed.

    Returns:
        A set of strings, one per job line reported by ``squeue``.

    Raises:
        subprocess.CalledProcessError: If ``squeue`` exits with a non-zero
            status.
        OSError: If the ``squeue`` executable cannot be started.
    """
    # The command is passed as a list (no shell), so the format string must
    # not carry shell-style quotes of its own.
    raw = subprocess.check_output(
        ['squeue', '-u', username, '-h', '-o', '%i %N %T']
    ).decode('utf-8')
    return _parse_squeue_output(raw)


def send_notification(post_url, message, timeout=REQUEST_TIMEOUT):
    """Send notification via HTTP POST.

    Args:
        post_url: URL the message is posted to.
        message: Body of the POST request.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: If the request itself fails,
            including when ``timeout`` expires.
    """
    response = requests.post(
        post_url,
        data=message,
        headers={
            "Title": "Slurm Status Update",
            "Priority": "max",
            "Tags": "computer",
        },
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception(
            f"Failed HTTP POST with {response.status_code}")


def monitor_slurm_status(username, post_url):
    """Monitor the slurm status and send notifications for changes.

    Polls ``squeue`` forever.  Lines that appear or disappear between two
    polls are printed; a notification is sent only when at least one newly
    appeared line contains "RUNNING".  Because the first poll is compared
    against an empty set, jobs already running at start-up are reported too.

    Args:
        username: User whose jobs are monitored.
        post_url: URL that receives the notifications.
    """
    previous_status = set()

    while True:
        current_status = get_slurm_status_status(username)

        # Set differences give the lines that appeared / disappeared.
        added_status = current_status - previous_status
        removed_status = previous_status - current_status

        if added_status:
            print(f"{datetime.now()} Added: {added_status}")
            # send a notification only if any of the added lines contain
            # "RUNNING"
            if any("RUNNING" in line for line in added_status):
                # Sorted so the message text does not depend on set
                # iteration order.
                message = "; ".join(sorted(added_status))
                send_notification(post_url, message)
                print(f"{datetime.now()} Notification sent: {message}")

        if removed_status:
            print(f"{datetime.now()} Removed: {removed_status}")

        # Remember this poll's lines for the next comparison.
        previous_status = current_status

        # Sleep for a random interval between SLEEP_MIN and SLEEP_MAX
        # seconds (inclusive).
        time.sleep(random.randint(SLEEP_MIN, SLEEP_MAX))


def _handle_sigint(sig, frame):
    """Exit cleanly when the user interrupts the monitor with Ctrl-C."""
    print("Exiting...")
    sys.exit(0)


def main():
    """Read the command-line arguments and start monitoring."""
    if len(sys.argv) < 3:
        print(f"usage: {sys.argv[0]} USERNAME POST_URL", file=sys.stderr)
        sys.exit(2)

    # first argument is the username to monitor
    username = sys.argv[1]
    # second argument is the URL to send the notification
    post_url = sys.argv[2]

    # trap SIGINT
    signal.signal(signal.SIGINT, _handle_sigint)

    monitor_slurm_status(username, post_url)


if __name__ == '__main__':
    main()