#!/usr/bin/env python
"""Report GPU utilization for every Slurm job running on this node.

For each job that ``squeue`` lists on the local host, the script asks
``sstat`` for the job's process ids, reads ``CUDA_VISIBLE_DEVICES`` from each
process environment under ``/proc`` and prints the average processor and
memory utilization that ``nvidia-smi`` reports for those GPU indices.
"""
import os
import re
import socket
import subprocess
import sys

CUDA_ENV_PREFIX = "CUDA_VISIBLE_DEVICES="


def _stdout_of(argv):
    """Run *argv* (a list, no shell) and return its stdout decoded as UTF-8."""
    result = subprocess.run(argv, stdout=subprocess.PIPE)
    return result.stdout.decode("utf-8")


def _first_int(text):
    """Return the first run of digits in *text* as an int, or 0 if none."""
    match = re.search(r"\d+", text)
    return int(match.group()) if match else 0


def parse_pids(output):
    """Extract the process ids from ``sstat -p --format=PID`` output.

    Every run of digits is taken as one pid, which copes with the ``,`` and
    ``|`` separators, with several output lines and with empty output.

    Returns:
        A list of pid strings, in order of appearance (possibly empty).
    """
    return re.findall(r"\d+", output)


def pids_of_jid(jid):
    """Return the pids (as strings) that ``sstat`` reports for job *jid*."""
    output = _stdout_of(
        ["sstat", "-p", "--format=PID", "-j", jid, "--noheader"]
    )
    return parse_pids(output)


def parse_visible_devices(environ_text):
    """Return the integer GPU indices named by CUDA_VISIBLE_DEVICES.

    Args:
        environ_text: contents of a ``/proc/<pid>/environ`` file, i.e.
            ``NAME=value`` entries separated by NUL characters.

    Returns:
        A list of ints taken from the first CUDA_VISIBLE_DEVICES entry.
        Entries that are not plain non-negative integers (an empty value,
        GPU UUIDs, ``-1``) are skipped because they cannot be used as an
        index into the nvidia-smi listing. Returns ``[]`` when the variable
        is absent.
    """
    for entry in environ_text.split("\0"):
        if not entry.startswith(CUDA_ENV_PREFIX):
            continue
        devices = []
        for token in entry[len(CUDA_ENV_PREFIX):].split(","):
            token = token.strip()
            if token.isdecimal():
                devices.append(int(token))
        return devices
    return []


def devices_of_pid(pid):
    """Return the GPU indices visible to process *pid*.

    Reads ``/proc/<pid>/environ``. A process that no longer exists yields an
    empty list instead of an exception, since jobs may finish while the
    report is being produced.
    """
    try:
        with open(f"/proc/{pid}/environ", errors="replace") as f:
            environ_text = f.read()
    except (FileNotFoundError, ProcessLookupError):
        return []
    return parse_visible_devices(environ_text)


def devices_of_jid(jid, pids=None):
    """Return the set of GPU indices visible to any process of job *jid*.

    Args:
        jid: Slurm job id.
        pids: optional iterable of pids belonging to the job. When omitted
            the pids are looked up with :func:`pids_of_jid`.

    Returns:
        The union of the CUDA_VISIBLE_DEVICES indices of all the pids.
    """
    if pids is None:
        pids = pids_of_jid(jid)
    cuda_visible_devices = set()
    for pid in pids:
        cuda_visible_devices.update(devices_of_pid(pid))
    return cuda_visible_devices


def parse_jobs(output):
    """Parse ``squeue --format=%A,%u --noheader`` output.

    Returns:
        A list of ``[job_id, user]`` pairs; blank lines are ignored.
    """
    jobs = []
    for line in output.splitlines():
        line = line.strip()
        if not line:
            continue
        jid, _, user = line.partition(",")
        jobs.append([jid, user])
    return jobs


def get_jobs():
    """Return ``[job_id, user]`` for every job squeue lists on this host."""
    hostname = socket.gethostname()
    output = _stdout_of(
        ["squeue", "--format=%A,%u", "--noheader", "-w", hostname]
    )
    return parse_jobs(output)


def parse_gpu_stats(output):
    """Parse nvidia-smi CSV lines of utilization, memory used, memory total.

    Returns:
        One ``[processor_percent, memory_percent]`` entry per non-blank
        line, in input order. A field without digits (such as ``[N/A]``)
        counts as 0, and a memory total of 0 gives a memory percentage of
        0.0, so that every listed GPU keeps its position in the result.
    """
    utilization = []
    for line in output.splitlines():
        if not line.strip():
            continue
        fields = line.split(",")
        proc = _first_int(fields[0])
        mem_used = _first_int(fields[1])
        mem_total = _first_int(fields[2])
        mem_percent = 100 * mem_used / mem_total if mem_total else 0.0
        utilization.append([proc, mem_percent])
    return utilization


def gpu_utilization():
    """Return ``[processor_percent, memory_percent]`` for each local GPU."""
    output = _stdout_of(
        [
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used,memory.total",
            "--format=csv,noheader",
        ]
    )
    return parse_gpu_stats(output)


def format_job_usage(jid, user, devices, gpu_stats):
    """Build the report line for one job.

    Args:
        jid: job id.
        user: owner of the job.
        devices: collection of GPU indices visible to the job.
        gpu_stats: list as returned by :func:`gpu_utilization`.

    Returns:
        The report line. Averages are taken over the devices that have an
        entry in *gpu_stats*; when there are none a line saying so is
        returned instead of dividing by zero.
    """
    known = [device for device in devices if 0 <= device < len(gpu_stats)]
    if not known:
        return "{} ({}) -> no GPU statistics available".format(jid, user)
    processor = sum(gpu_stats[device][0] for device in known) / len(known)
    memory = int(sum(gpu_stats[device][1] for device in known) / len(known))
    return "{} ({}) -> {} (proc={}%, memused={}%)".format(
        jid, user, devices, processor, memory
    )


def main():
    """Print one GPU usage line per Slurm job running on this host."""
    if "SUDO_UID" not in os.environ:
        print("This program requires super user.")
        sys.exit(1)

    gpu_stats = gpu_utilization()
    for jid, user in get_jobs():
        cuda_visible_devices = devices_of_jid(jid)
        print(format_job_usage(jid, user, cuda_visible_devices, gpu_stats))


if __name__ == "__main__":
    main()