Created
April 1, 2021 15:56
-
-
Save treydock/a3de2fdd2d7df6da5e332fd9a2d48dc5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| var app_ident = require('../app_ident.js'); | |
| var map_helpers = require('../map_helpers.js'); | |
| module.exports = function(config) { | |
| var appident = app_ident(config.applicationDefn); | |
// Look up a dot-separated setting (e.g. 'mounts.projects') underneath
// config.hardware.
//
// setting     - dot-separated property path, relative to config.hardware.
// default_val - value returned when any segment of the path is absent.
//
// Returns the value found at the end of the path, or default_val when the
// path cannot be followed all the way (including when config.hardware itself
// is not set).
var getHardwareConfig = function (setting, default_val) {
    var val = config.hardware;
    var props = setting.split('.');
    for (let i = 0; i < props.length; i++) {
        // `== null` matches both undefined and null. typeof null is 'object',
        // so a typeof check alone lets null through and the property lookup
        // then throws. Calling hasOwnProperty via Object.prototype also copes
        // with objects that do not inherit it.
        if (val == null || !Object.prototype.hasOwnProperty.call(val, props[i])) {
            return default_val;
        }
        val = val[props[i]];
    }
    return val;
};
// Identify the application a job ran from its process dump.
//
// job - job summary object; only job.procDump.constrained and
//       job.procDump.unconstrained are read.
//
// Returns whatever appident() yields for the constrained list or, failing
// that, the unconstrained list. When neither is recognised, the first listed
// process is returned as { executable, name: 'uncategorized' }. Returns null
// when the job has no procDump.constrained, and the falsy appident() result
// when both lists are empty.
var getProcInfo = function (job) {
    if (!job.procDump || !job.procDump.constrained) {
        return null;
    }

    var constrained = job.procDump.constrained;
    // Only `constrained` is guaranteed by the guard above; `unconstrained`
    // may be absent, so it is checked before every use.
    var unconstrained = job.procDump.unconstrained;

    var app = appident(constrained);
    if (!app && unconstrained) {
        app = appident(unconstrained);
    }
    if (app) {
        return app;
    }

    if (constrained.length > 0) {
        return {
            executable: constrained[0],
            name: 'uncategorized'
        };
    }
    if (unconstrained && unconstrained.length > 0) {
        return {
            executable: unconstrained[0],
            name: 'uncategorized'
        };
    }
    return app;
};
| return { | |
| id: config.resource_id, | |
| name: config.resource, | |
| long_name: config.resource, | |
| gpfs: config.hardware.gpfs, | |
| nodes: 0, | |
| ppn: 0, | |
| start_date: "1970-01-01", | |
| // The summary documents use compression where the various statistics | |
| // for a metric are omitted if they have default values. The avg | |
| // metric is always provided so that it is possible to determine | |
| // whether the statistic is missing due to it being a default value. | |
| "getcov": function(job, metricname) { | |
| if (Array.isArray(metricname)) { | |
| for (var i = 0; i < metricname.length; i++) { | |
| var res = this.getcov.call(this, job, metricname[i]); | |
| if (res.error === 0) { | |
| return res; | |
| } | |
| } | |
| return { | |
| value: null, | |
| error: 2 | |
| }; | |
| } | |
| var cov = this.ref(job, metricname + ".cov"); | |
| if (cov.error === 0) { | |
| return cov; | |
| } | |
| var avg = this.ref(job, metricname + ".avg"); | |
| if (avg.error === 0) { | |
| // Avg is present but cov absent, therefore cov is default value of 0.0 | |
| return { | |
| value: 0.0, | |
| error: 0 | |
| }; | |
| } | |
| return { | |
| value: null, | |
| error: cov.error | |
| }; | |
| }, | |
| "getmax": function(job, metricname) { | |
| if (Array.isArray(metricname)) { | |
| for (var i = 0; i < metricname.length; i++) { | |
| var res = this.getmax(job, metricname[i]); | |
| if (res.error === 0) { | |
| return res; | |
| } | |
| } | |
| return { | |
| value: null, | |
| error: 2 | |
| }; | |
| } | |
| var maxval = this.ref(job, metricname + ".max"); | |
| if (maxval.error === 0) { | |
| return maxval; | |
| } | |
| var avg = this.ref(job, metricname + ".avg"); | |
| if (avg.error === 0) { | |
| // Avg is present but max absent, therefore max is same as avg | |
| return avg; | |
| } | |
| return { | |
| value: null, | |
| error: maxval.error | |
| }; | |
| }, | |
| "devices": { | |
| "block_sda": { | |
| "name": "/sda", | |
| "bytes_per_sector": 512 | |
| }, | |
| "netdrv_isilon": { | |
| "name": "ifs" | |
| }, | |
| "netdrv_panasas": { | |
| "name": "panfs" | |
| }, | |
| "net_eth0": { | |
| "name": "em1" | |
| }, | |
| "net_ib0": { | |
| "name": "ib0" | |
| } | |
| }, | |
| "attributes": { | |
| "local_job_id": { | |
| ref: "acct.id" | |
| }, | |
| "name": { | |
| ref: "acct.jobname" | |
| }, | |
| "resource_name": { | |
| formula: function() { | |
| return {value: this.name, error: 0}; | |
| } | |
| }, | |
| "resource_id": { | |
| formula: function() { | |
| return {value: this.id, error: 0}; | |
| } | |
| }, | |
| "organization_id": { | |
| value: 1 | |
| }, | |
| "account": { | |
| ref: "acct.account" | |
| }, | |
| "username": { | |
| ref: "acct.user" | |
| }, | |
| "cwd": { | |
| error: 2 | |
| }, | |
| executable: { | |
| formula: function (job) { | |
| var app = getProcInfo(job); | |
| if (app) { | |
| return { | |
| value: app.executable, | |
| error: 0 | |
| }; | |
| } | |
| return { | |
| value: null, | |
| error: this.metricErrors.codes.metricMissingUnknownReason.value | |
| }; | |
| } | |
| }, | |
| application: { | |
| formula: function (job) { | |
| var app = getProcInfo(job); | |
| if (app) { | |
| return { | |
| value: app.name, | |
| error: 0 | |
| }; | |
| } | |
| return { | |
| value: null, | |
| error: this.metricErrors.codes.metricMissingUnknownReason.value | |
| }; | |
| } | |
| }, | |
| "exit_status": { | |
| formula: function(job) { | |
| var exit = this.ref(job, "acct.exit_status"); | |
| if (exit.error === 0 && exit.value) { | |
| exit.value = exit.value.split(" ")[0]; | |
| } | |
| return exit; | |
| } | |
| }, | |
| "datasource": { | |
| value: "prometheus" | |
| }, | |
| "granted_pe": { | |
| ref: "acct.ncpus" | |
| }, | |
| "queue": { | |
| ref: "acct.partition" | |
| }, | |
| "requested_nodes": { | |
| ref: "acct.nodes" | |
| }, | |
| "hosts": { | |
| ref: "acct.host_list", | |
| required: true | |
| }, | |
| "nodes": { | |
| ref: "acct.nodes", | |
| required: true | |
| }, | |
| "shared": { | |
| formula: function(job) { | |
| if (job.hasOwnProperty("shared")) { | |
| return { | |
| value: job.shared ? 1 : 0, | |
| error: 0 | |
| }; | |
| } else { | |
| return { | |
| value: 0, | |
| error: 0 | |
| }; | |
| } | |
| } | |
| }, | |
| "cores": { | |
| ref: "acct.ncpus", | |
| required: true | |
| }, | |
| "cores_avail": { | |
| formula: function(job) { | |
| if (job.summarization.complete && job.hasOwnProperty("cpu") && job.cpu.hasOwnProperty("nodecpus") && ! job.cpu.nodecpus.hasOwnProperty("error")) { | |
| return this.ref(job, "cpu.nodecpus.all.cnt"); | |
| } else { | |
| return { | |
| value: 0, | |
| error: this.metricErrors.codes.missingCollectionFailed.value | |
| }; | |
| } | |
| } | |
| }, | |
| "submit_time_ts": { | |
| ref: "acct.submit", | |
| required: true | |
| }, | |
| "eligible_time_ts": { | |
| ref: "acct.eligible" | |
| }, | |
| "start_time_ts": { | |
| ref: "acct.start_time", | |
| required: true | |
| }, | |
| "end_time_ts": { | |
| ref: "acct.end_time", | |
| required: true | |
| }, | |
| "wall_time": { | |
| formula: function(job) { | |
| var end_time = this.ref(job, this.attributes.end_time_ts.ref); | |
| var start_time = this.ref(job, this.attributes.start_time_ts.ref); | |
| var combined_error = end_time.error | start_time.error; | |
| if (end_time.value === undefined || start_time.value === undefined) { | |
| return { | |
| value: null, | |
| error: combined_error | |
| }; | |
| } | |
| return { | |
| value: end_time.value - start_time.value, | |
| error: combined_error | |
| }; | |
| }, | |
| required: true | |
| }, | |
| "requested_wall_time": { | |
| formula: function(job) { | |
| var timelimit = this.ref(job, "acct.timelimit"); | |
| if (timelimit.error !== 0 || timelimit.value === null) { | |
| return { | |
| value: null, | |
| error: 2 | |
| }; | |
| } | |
| if (typeof timelimit.value === "number") { | |
| return { | |
| value: timelimit.value, | |
| error: 0 | |
| }; | |
| } | |
| var result = timelimit.value.match(/^(?:([0-9]+)-)?([0-9]{2}):([0-9]{2}):([0-9]{2})$/); | |
| if (result) { | |
| if (result[1]) { | |
| return { | |
| value: (24 * 3600 * result[1]) + (3600 * result[2]) + (60 * result[3]) + (1 * result[4]), | |
| error: 0 | |
| }; | |
| } else { | |
| return { | |
| value: (3600 * result[2]) + (60 * result[3]) + (1 * result[4]), | |
| error: 0 | |
| }; | |
| } | |
| } else { | |
| return { | |
| value: null, | |
| error: 2 | |
| }; | |
| } | |
| } | |
| }, | |
| "wait_time": { | |
| formula: function(job) { | |
| var start_time = this.ref(job, this.attributes.start_time_ts.ref); | |
| var submit_time = this.ref(job, this.attributes.submit_time_ts.ref); | |
| var combined_error = start_time.error | submit_time.error; | |
| if (start_time.value === undefined || submit_time.value === undefined) { | |
| return { | |
| value: null, | |
| error: combined_error | |
| }; | |
| } | |
| return { | |
| value: start_time.value - submit_time.value, | |
| error: combined_error | |
| }; | |
| }, | |
| required: true | |
| }, | |
| "cpu_time": { | |
| formula: function(job) { | |
| var wall_time = this.attributes.wall_time.formula.call(this, job); | |
| var num_cores = this.ref(job, this.attributes.cores.ref); | |
| var combined_error = wall_time.error | num_cores.error; | |
| if (wall_time.value === undefined || num_cores.value === undefined) { | |
| return { | |
| value: null, | |
| error: combined_error | |
| }; | |
| } | |
| return { | |
| value: wall_time.value * num_cores.value, | |
| error: combined_error | |
| }; | |
| }, | |
| required: true | |
| }, | |
| "node_time": { | |
| formula: function(job) { | |
| var wall_time = this.attributes.wall_time.formula.call(this, job); | |
| var num_nodes = this.ref(job, this.attributes.nodes.ref); | |
| var combined_error = wall_time.error | num_nodes.error; | |
| if (wall_time.value === undefined || num_nodes.value === undefined) { | |
| return { | |
| value: null, | |
| error: combined_error | |
| }; | |
| } | |
| return { | |
| value: wall_time.value * num_nodes.value, | |
| error: combined_error | |
| }; | |
| }, | |
| required: true | |
| }, | |
| "cpu_idle": { | |
| ref: ["cpu.cgroup.idle.avg", "cpu.jobcpus.idle.avg", "cpu.nodecpus.idle.avg"] | |
| }, | |
| "cpu_system": { | |
| ref: ["cpu.cgroup.system.avg", "cpu.jobcpus.system.avg", "cpu.nodecpus.system.avg"] | |
| }, | |
| "cpu_user": { | |
| ref: ["cpu.cgroup.user.avg", "cpu.jobcpus.user.avg", "cpu.nodecpus.user.avg"] | |
| }, | |
| "error": { | |
| error: 2 | |
| }, | |
| "flops": { | |
| ref: "cpuperf.flops.avg" | |
| }, | |
| "flops_cov": { | |
| formula: function(job) { | |
| return this.getcov.call(this, job, "cpuperf.flops"); | |
| } | |
| }, | |
| "cpiref": { | |
| ref: "cpuperf.cpiref.avg" | |
| }, | |
| "cpiref_cov": { | |
| formula: function(job) { | |
| return this.getcov.call(this, job, "cpuperf.cpiref"); | |
| } | |
| }, | |
| catastrophe: { | |
| formula: function (job) { | |
| var result = { | |
| value: null, | |
| error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value | |
| }; | |
| if (job.catastrophe) { | |
| if (job.catastrophe.error) { | |
| switch (job.catastrophe.error) { | |
| case 1: | |
| result.error = this.metricErrors.codes.metricDisabledByUser.value; | |
| break; | |
| case 2: | |
| result.error = this.metricErrors.codes.metricInsufficientData.value; | |
| break; | |
| case 6: | |
| result.error = this.metricErrors.codes.metricCounterRollover.value; | |
| break; | |
| default: | |
| result.error = this.metricErrors.codes.metricMissingUnknownReason.value; | |
| break; | |
| } | |
| } else if (Number.isNaN(job.catastrophe.value)) { | |
| result.error = this.metricErrors.codes.metricSummarizationError.value; | |
| } else { | |
| result.value = job.catastrophe.value; | |
| result.error = 0; | |
| } | |
| } | |
| return result; | |
| } | |
| }, | |
| "cpldref": { | |
| ref: "cpuperf.cpldref.avg" | |
| }, | |
| "cpldref_cov": { | |
| formula: function(job) { | |
| return this.getcov.call(this, job, "cpuperf.cpldref"); | |
| } | |
| }, | |
| "mem_transferred": { | |
| ref: "uncperf.membw.avg" | |
| }, | |
| "mem_transferred_cov": { | |
| formula: function(job) { | |
| return this.getcov.call(this, job, "uncperf.membw"); | |
| } | |
| }, | |
| "cpu_user_imbalance": { | |
| formula: function(job) { | |
| var cpu_count = this.ref(job, ["cpu.jobcpus.user.cnt", "cpu.nodecpus.user.cnt"]); | |
| var cpu_user_min = this.ref(job, ["cpu.cgroup.user.min", "cpu.jobcpus.user.min", "cpu.nodecpus.user.min"]); | |
| var cpu_user_max = this.ref(job, ["cpu.cgroup.user.max", "cpu.jobcpus.user.max", "cpu.nodecpus.user.max"]); | |
| var error = cpu_user_min.error | cpu_user_max.error | cpu_count.error; | |
| if (error === 0) { | |
| if (cpu_count.value <= 1) { | |
| return { | |
| value: 0.0, | |
| error: error | |
| }; | |
| } else { | |
| return { | |
| value: 100.0 * (cpu_user_max.value - cpu_user_min.value) / cpu_user_max.value, | |
| error: error | |
| }; | |
| } | |
| } else { | |
| return { | |
| value: null, | |
| error: error | |
| }; | |
| } | |
| } | |
| }, | |
| "cpu_user_cv": { | |
| formula: function(job) { | |
| return this.getcov.call(this, job, ["cpu.cgroup.user", "cpu.jobcpus.user", "cpu.nodecpus.user"]); | |
| } | |
| }, | |
| node_cpu_idle: { | |
| ref: 'cpu.nodecpus.idle.avg' | |
| }, | |
| energy: { | |
| ref: 'ipmi.energy.avg' | |
| }, | |
| max_power: { | |
| formula: function (job) { | |
| return this.getmax(job, 'ipmi.power.max'); | |
| } | |
| }, | |
| "memory_used": { | |
| formula: function(job) { | |
| var mem = this.ref(job, "memory.used_minus_cache.avg"); | |
| if (mem.error === 0) { | |
| return { | |
| value: mem.value * 1024.0, | |
| error: 0 | |
| }; | |
| } | |
| return { | |
| value: null, | |
| error: mem.error | |
| }; | |
| } | |
| }, | |
| "memory_used_cov": { | |
| formula: function(job) { | |
| return this.getcov.call(this, job, "memory.used_minus_cache"); | |
| } | |
| }, | |
| "max_memory": { | |
| formula: function(job) { | |
| return this.getmax(job, 'process_memory.usageratio.max'); | |
| } | |
| }, | |
| "mem_used_including_os_caches": { | |
| formula: function(job) { | |
| var mem = this.ref(job, "memory.used.avg"); | |
| if (mem.error === 0) { | |
| return { | |
| value: mem.value * 1024.0, | |
| error: 0 | |
| }; | |
| } | |
| return { | |
| value: null, | |
| error: mem.error | |
| }; | |
| } | |
| }, | |
| "mem_used_including_os_caches_cov": { | |
| formula: function(job) { | |
| return this.getcov.call(this, job, "memory.used"); | |
| } | |
| }, | |
| "ib_rx_bytes": map_helpers.device('infiniband', 'all', 'switch-out-bytes'), | |
| block_sda_wr_ios: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'write'), | |
| block_sda_wr_bytes: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'write_bytes'), | |
| block_sda_wr_bytes_cov: map_helpers.device_cov('block', getHardwareConfig('block', 'sda'), 'write_bytes'), | |
| block_sda_rd_ios: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'read'), | |
| block_sda_rd_bytes: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'read_bytes'), | |
| block_sda_rd_bytes_cov: map_helpers.device_cov('block', getHardwareConfig('block', 'sda'), 'read_bytes'), | |
| netdrv_gpfs_rx: map_helpers.device('gpfs', 'all', 'read_bytes'), | |
| netdrv_gpfs_rx_cov: map_helpers.device_cov('gpfs', 'all', 'read_bytes'), | |
| netdrv_gpfs_rx_msgs: map_helpers.device('gpfs', 'all', 'reads'), | |
| netdrv_gpfs_tx: map_helpers.device('gpfs', 'all', 'write_bytes'), | |
| netdrv_gpfs_tx_cov: map_helpers.device_cov('gpfs', 'all', 'write_bytes'), | |
| netdrv_gpfs_tx_msgs: map_helpers.device('gpfs', 'all', 'writes'), | |
| "netdrv_isilon_rx": { | |
| error: 2 | |
| }, | |
| "netdrv_isilon_rx_cov": { | |
| error: 2 | |
| }, | |
| "netdrv_isilon_rx_msgs": { | |
| error: 2 | |
| }, | |
| "netdrv_isilon_tx": { | |
| error: 2 | |
| }, | |
| "netdrv_isilon_tx_cov": { | |
| error: 2 | |
| }, | |
| "netdrv_isilon_tx_msgs": { | |
| error: 2 | |
| }, | |
| "netdrv_panasas_rx": { | |
| error: 2 | |
| }, | |
| "netdrv_panasas_rx_cov": { | |
| error: 2 | |
| }, | |
| "netdrv_panasas_rx_msgs": { | |
| error: 2 | |
| }, | |
| "netdrv_panasas_tx": { | |
| error: 2 | |
| }, | |
| "netdrv_panasas_tx_cov": { | |
| error: 2 | |
| }, | |
| "netdrv_panasas_tx_msgs": { | |
| error: 2 | |
| }, | |
| netdir_home_read: { | |
| formula: function (job) { | |
| if (!job.nfs) { | |
| return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value }; | |
| } | |
| var read = 0.0; | |
| for (let mount in job.nfs) { | |
| if (job.nfs.hasOwnProperty(mount)) { | |
| if (job.nfs[mount].read && job.nfs[mount].read.avg) { | |
| read += job.nfs[mount].read.avg | |
| } | |
| } | |
| } | |
| return { value: read, error: 0 }; | |
| } | |
| }, | |
| netdir_home_write: { | |
| formula: function (job) { | |
| if (!job.nfs) { | |
| return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value }; | |
| } | |
| var write = 0.0; | |
| for (let mount in job.nfs) { | |
| if (job.nfs.hasOwnProperty(mount)) { | |
| if (job.nfs[mount].write && job.nfs[mount].write.avg) { | |
| write += job.nfs[mount].write.avg | |
| } | |
| } | |
| } | |
| return { value: write, error: 0 }; | |
| } | |
| }, | |
| netdir_projects_read: map_helpers.sum( | |
| ['nfs', getHardwareConfig('mounts.projects', '/projects')], | |
| ['read'] | |
| ), | |
| netdir_projects_write: map_helpers.sum( | |
| ['nfs', getHardwareConfig('mounts.projects', '/projects')], | |
| ['write'] | |
| ), | |
| netdir_util_read: map_helpers.sum( | |
| ['nfs', getHardwareConfig('mounts.util', '/util')], | |
| ['read'] | |
| ), | |
| netdir_util_write: map_helpers.sum( | |
| ['nfs', getHardwareConfig('mounts.util', '/util')], | |
| ['write'] | |
| ), | |
| net_eth0_rx: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'in-bytes', ['lo', 'ib0', 'ib1']), | |
| net_eth0_rx_cov: map_helpers.device_cov('network', getHardwareConfig('network', 'em1'), 'in-bytes', ['lo', 'ib0', 'ib1']), | |
| net_eth0_rx_packets: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'in-packets', ['lo', 'ib0', 'ib1']), | |
| net_eth0_tx: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'out-bytes', ['lo', 'ib0', 'ib1']), | |
| net_eth0_tx_cov: map_helpers.device_cov('network', getHardwareConfig('network', 'em1'), 'out-bytes', ['lo', 'ib0', 'ib1']), | |
| net_eth0_tx_packets: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'out-packets', ['lo', 'ib0', 'ib1']), | |
| "net_ib0_rx": { | |
| ref: "network.ib0.in-bytes.avg" | |
| }, | |
| "net_ib0_rx_packets": { | |
| ref: "network.ib0.in-packets.avg" | |
| }, | |
| "net_ib0_tx": { | |
| ref: "network.ib0.out-bytes.avg" | |
| }, | |
| "net_ib0_tx_packets": { | |
| ref: "network.ib0.out-packets.avg" | |
| }, | |
| gpu_energy: { | |
| formula: function (job) { | |
| if (!job.gpupower) { | |
| return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value }; | |
| } | |
| var energy = 0.0; | |
| var device_count = 0; | |
| for (let gpu in job.gpupower) { | |
| if (job.gpupower.hasOwnProperty(gpu)) { | |
| if (job.gpupower[gpu].energy && job.gpupower[gpu].energy.avg) { | |
| energy += job.gpupower[gpu].energy.avg; | |
| device_count += 1; | |
| } | |
| } | |
| } | |
| if (device_count === 0) { | |
| return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value }; | |
| } | |
| return { value: energy, error: 0 }; | |
| } | |
| }, | |
| gpu_max_power: { | |
| formula: function (job) { | |
| if (!job.gpupower) { | |
| return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value }; | |
| } | |
| var max_power = 0.0; | |
| var device_count = 0; | |
| for (let gpu in job.gpupower) { | |
| if (job.gpupower.hasOwnProperty(gpu)) { | |
| if (job.gpupower[gpu].power && job.gpupower[gpu].power.max) { | |
| if (job.gpupower[gpu].power.max.max) { | |
| max_power = Math.max(max_power, job.gpupower[gpu].power.max.max); | |
| device_count += 1; | |
| } else if (job.gpupower[gpu].power.max.avg) { | |
| max_power = Math.max(max_power, job.gpupower[gpu].power.max.avg); | |
| device_count += 1; | |
| } | |
| } | |
| } | |
| } | |
| if (device_count === 0) { | |
| return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value }; | |
| } | |
| return { value: max_power, error: 0 }; | |
| } | |
| }, | |
| "gpu0_nv_mem_used": { | |
| formula: function(job) { | |
| if (!job.gpu) { | |
| return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value }; | |
| } | |
| var job_gpus = this.ref(job, "acct.gpus"); | |
| if (job_gpus.value === undefined || job_gpus.value === 0) { | |
| return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value }; | |
| } | |
| var util = 0.0; | |
| for (let gpu in job.gpu) { | |
| if (job.gpu.hasOwnProperty(gpu)) { | |
| if (job.gpu[gpu].memused && job.gpu[gpu].memused.avg) { | |
| util += job.gpu[gpu].memused.avg; | |
| } | |
| } | |
| } | |
| return { value: util / job_gpus.value, error: 0 }; | |
| } | |
| }, | |
| "gpu0_nv_utilization": { | |
| formula: function(job) { | |
| if (!job.gpu) { | |
| return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value }; | |
| } | |
| var job_gpus = this.ref(job, "acct.gpus"); | |
| if (job_gpus.value === undefined || job_gpus.value === 0) { | |
| return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value }; | |
| } | |
| var util = 0.0; | |
| for (let gpu in job.gpu) { | |
| if (job.gpu.hasOwnProperty(gpu)) { | |
| if (job.gpu[gpu].util && job.gpu[gpu].util.avg) { | |
| util += job.gpu[gpu].util.avg; | |
| } | |
| } | |
| } | |
| return { value: util / job_gpus.value / 100.0, error: 0 }; | |
| } | |
| } | |
| } | |
| }; | |
| }; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment