Skip to content

Instantly share code, notes, and snippets.

@aadityabhatia
Last active February 27, 2024 14:05
Show Gist options
  • Select an option

  • Save aadityabhatia/50574836c727a1add565c7908e22cb98 to your computer and use it in GitHub Desktop.

Select an option

Save aadityabhatia/50574836c727a1add565c7908e22cb98 to your computer and use it in GitHub Desktop.

Revisions

  1. aadityabhatia revised this gist Feb 27, 2024. 1 changed file with 8 additions and 1 deletion.
    9 changes: 8 additions & 1 deletion squeue-monitor.py
    Original file line number Diff line number Diff line change
    @@ -18,7 +18,14 @@ def get_slurm_status_status(username):

    # parse output and return a set of status lines
    output = output.strip('" \n').split('\n')
    return set([node.strip('" ') for node in output])

    outputSet = set()
    for line in output:
    line = line.strip('" ')
    if line:
    outputSet.add(line)

    return outputSet


    def send_notification(post_url, message):
  2. aadityabhatia revised this gist Feb 25, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion squeue-monitor.py
    Original file line number Diff line number Diff line change
    @@ -18,7 +18,7 @@ def get_slurm_status_status(username):

    # parse output and return a set of status lines
    output = output.strip('" \n').split('\n')
    return set([node.strip() for node in output])
    return set([node.strip('" ') for node in output])


    def send_notification(post_url, message):
  3. aadityabhatia revised this gist Feb 24, 2024. 1 changed file with 3 additions and 4 deletions.
    7 changes: 3 additions & 4 deletions squeue-monitor.py
    Original file line number Diff line number Diff line change
    @@ -30,7 +30,7 @@ def send_notification(post_url, message):
    })
    if response.status_code != 200:
    raise Exception(
    f"Failed to send notification: HTTP {response.status_code}")
    f"Failed HTTP POST with {response.status_code}")


    def monitor_slurm_status(username, post_url):
    @@ -48,13 +48,12 @@ def monitor_slurm_status(username, post_url):
    # Send a notification if there are any changes
    if added_status:
    print(f"{datetime.now()} Added: {added_status}")
    message = "\n".join(added_status)
    message = "; ".join(added_status)

    # send a notification only if any of the added lines contain "RUNNING"
    if any("RUNNING" in line for line in added_status):
    send_notification(post_url, message)
    print(datetime.now(), "Notification sent:",
    message.replace("\n", "; "))
    print(f"{datetime.now()} Notification sent: {message}")

    if removed_status:
    print(f"{datetime.now()} Removed: {removed_status}")
  4. aadityabhatia revised this gist Feb 24, 2024. 1 changed file with 26 additions and 7 deletions.
    33 changes: 26 additions & 7 deletions squeue-monitor.py
    Original file line number Diff line number Diff line change
    @@ -4,9 +4,15 @@
    from datetime import datetime
    import time
    import random
    import signal

    # arbitrarily chosen prime numbers
    SLEEP_MIN = 127
    SLEEP_MAX = 157


    def get_slurm_status_status(username):
    """Get the status of the slurm nodes for a given user."""
    output = subprocess.check_output(
    ['squeue', '-u', username, '-h', '-o', '"%i %N %T"']).decode('utf-8')

    @@ -15,9 +21,9 @@ def get_slurm_status_status(username):
    return set([node.strip() for node in output])


    def send_notification(post_url, data):
    def send_notification(post_url, message):
    """Send notification via HTTP POST."""
    response = requests.post(post_url, data=data, headers={
    response = requests.post(post_url, data=message, headers={
    "Title": "Slurm Status Update",
    "Priority": "max",
    "Tags": "computer"
    @@ -26,10 +32,10 @@ def send_notification(post_url, data):
    raise Exception(
    f"Failed to send notification: HTTP {response.status_code}")

    print(datetime.now(), "Notification sent:", data)


    def monitor_slurm_status(username, post_url):
    """Monitor the slurm status and send notifications for changes."""

    previous_status = set()

    while True:
    @@ -42,10 +48,13 @@ def monitor_slurm_status(username, post_url):
    # Send a notification if there are any changes
    if added_status:
    print(f"{datetime.now()} Added: {added_status}")
    message = "squeue update:\n" + "\n".join(added_status)
    message = "\n".join(added_status)

    # send a notification only if any of the added lines contain "RUNNING"
    if any("RUNNING" in line for line in added_status):
    send_notification(post_url, message)
    print(datetime.now(), "Notification sent:",
    message.replace("\n", "; "))

    if removed_status:
    print(f"{datetime.now()} Removed: {removed_status}")
    @@ -54,11 +63,21 @@ def monitor_slurm_status(username, post_url):
    previous_status = current_status

    # sleep for a random interval between 127 and 157 seconds
    time.sleep(random.randint(127, 157))
    time.sleep(random.randint(SLEEP_MIN, SLEEP_MAX))


    if __name__ == '__main__':
    # read username from first argument
    # first argument is the username to monitor
    username = sys.argv[1]

    # second argument is the URL to send the notification
    post_url = sys.argv[2]

    # trap SIGINT
    def signal_handler(sig, frame):
    print("Exiting...")
    sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    monitor_slurm_status(username, post_url)
  5. aadityabhatia created this gist Feb 24, 2024.
    64 changes: 64 additions & 0 deletions squeue-monitor.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,64 @@
    import subprocess
    import requests
    import sys
    from datetime import datetime
    import time
    import random


    def get_slurm_status_status(username):
    output = subprocess.check_output(
    ['squeue', '-u', username, '-h', '-o', '"%i %N %T"']).decode('utf-8')

    # parse output and return a set of status lines
    output = output.strip('" \n').split('\n')
    return set([node.strip() for node in output])


    def send_notification(post_url, data):
    """Send notification via HTTP POST."""
    response = requests.post(post_url, data=data, headers={
    "Title": "Slurm Status Update",
    "Priority": "max",
    "Tags": "computer"
    })
    if response.status_code != 200:
    raise Exception(
    f"Failed to send notification: HTTP {response.status_code}")

    print(datetime.now(), "Notification sent:", data)


    def monitor_slurm_status(username, post_url):
    previous_status = set()

    while True:
    current_status = get_slurm_status_status(username)

    # Check for changes in the nodes
    added_status = current_status - previous_status
    removed_status = previous_status - current_status

    # Send a notification if there are any changes
    if added_status:
    print(f"{datetime.now()} Added: {added_status}")
    message = "squeue update:\n" + "\n".join(added_status)
    # send a notification only if any of the added lines contain "RUNNING"
    if any("RUNNING" in line for line in added_status):
    send_notification(post_url, message)

    if removed_status:
    print(f"{datetime.now()} Removed: {removed_status}")

    # Update the previous nodes
    previous_status = current_status

    # sleep for a random interval betwen 127 and 157 seconds
    time.sleep(random.randint(127, 157))


    if __name__ == '__main__':
    # read username from first argument
    username = sys.argv[1]
    post_url = sys.argv[2]
    monitor_slurm_status(username, post_url)