Skip to content

Instantly share code, notes, and snippets.

@treydock
Created April 1, 2021 15:56
Show Gist options
  • Select an option

  • Save treydock/a3de2fdd2d7df6da5e332fd9a2d48dc5 to your computer and use it in GitHub Desktop.

Select an option

Save treydock/a3de2fdd2d7df6da5e332fd9a2d48dc5 to your computer and use it in GitHub Desktop.

Revisions

  1. treydock created this gist Apr 1, 2021.
    788 changes: 788 additions & 0 deletions prometheus-osc.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,788 @@
    var app_ident = require('../app_ident.js');
    var map_helpers = require('../map_helpers.js');

    module.exports = function(config) {
    var appident = app_ident(config.applicationDefn);

    /**
     * Look up a dotted path (e.g. "mounts.projects") under config.hardware.
     *
     * @param {string} setting - Dot-separated property path.
     * @param {*} default_val - Returned when any path segment is missing.
     * @returns {*} The configured value, or default_val.
     */
    var getHardwareConfig = function (setting, default_val) {
        var val = config.hardware;
        var props = setting.split('.');
        for (let i = 0; i < props.length; i++) {
            // null must be rejected as well as undefined: typeof null is
            // 'object', so a typeof-only check would try to dereference it.
            if (val === undefined || val === null ||
                    !Object.prototype.hasOwnProperty.call(val, props[i])) {
                return default_val;
            }
            val = val[props[i]];
        }
        return val;
    };

    /**
     * Identify the application behind a job from its process dump.
     *
     * The constrained process list is passed to appident first, then the
     * unconstrained list. If neither is recognised, the first listed process
     * is reported under the name 'uncategorized'.
     *
     * @param {Object} job - Job summary; job.procDump is optional.
     * @returns {?Object} Whatever appident returned, an
     *     {executable, name: 'uncategorized'} fallback, or a falsy value when
     *     nothing could be determined.
     */
    var getProcInfo = function (job) {
        var procDump = job.procDump;
        if (!procDump || !procDump.constrained) {
            return null;
        }

        var constrained = procDump.constrained;
        // The unconstrained list may be absent; it must not be dereferenced
        // unconditionally.
        var unconstrained = procDump.unconstrained;

        var app = appident(constrained);
        if (!app && unconstrained) {
            app = appident(unconstrained);
        }
        if (app) {
            return app;
        }

        if (constrained.length > 0) {
            return {
                executable: constrained[0],
                name: 'uncategorized'
            };
        }
        if (unconstrained && unconstrained.length > 0) {
            return {
                executable: unconstrained[0],
                name: 'uncategorized'
            };
        }

        return app;
    };

    return {
    id: config.resource_id,
    name: config.resource,
    long_name: config.resource,
    gpfs: config.hardware.gpfs,
    nodes: 0,
    ppn: 0,
    start_date: "1970-01-01",
    // The summary documents use compression where the various statistics
    // for a metric are omitted if they have default values. The avg
    // metric is always provided so that it is possible to determine
    // whether the statistic is missing due to it being a default value.
    "getcov": function(job, metricname) {
    if (Array.isArray(metricname)) {
    for (var i = 0; i < metricname.length; i++) {
    var res = this.getcov.call(this, job, metricname[i]);
    if (res.error === 0) {
    return res;
    }
    }
    return {
    value: null,
    error: 2
    };
    }
    var cov = this.ref(job, metricname + ".cov");
    if (cov.error === 0) {
    return cov;
    }
    var avg = this.ref(job, metricname + ".avg");
    if (avg.error === 0) {
    // Avg is present but cov absent, therefore cov is default value of 0.0
    return {
    value: 0.0,
    error: 0
    };
    }
    return {
    value: null,
    error: cov.error
    };
    },
    "getmax": function(job, metricname) {
    if (Array.isArray(metricname)) {
    for (var i = 0; i < metricname.length; i++) {
    var res = this.getmax(job, metricname[i]);
    if (res.error === 0) {
    return res;
    }
    }
    return {
    value: null,
    error: 2
    };
    }
    var maxval = this.ref(job, metricname + ".max");
    if (maxval.error === 0) {
    return maxval;
    }
    var avg = this.ref(job, metricname + ".avg");
    if (avg.error === 0) {
    // Avg is present but max absent, therefore max is same as avg
    return avg;
    }
    return {
    value: null,
    error: maxval.error
    };
    },

    "devices": {
    "block_sda": {
    "name": "/sda",
    "bytes_per_sector": 512
    },
    "netdrv_isilon": {
    "name": "ifs"
    },
    "netdrv_panasas": {
    "name": "panfs"
    },
    "net_eth0": {
    "name": "em1"
    },
    "net_ib0": {
    "name": "ib0"
    }
    },

    "attributes": {
    "local_job_id": {
    ref: "acct.id"
    },
    "name": {
    ref: "acct.jobname"
    },
    "resource_name": {
    formula: function() {
    return {value: this.name, error: 0};
    }
    },
    "resource_id": {
    formula: function() {
    return {value: this.id, error: 0};
    }
    },
    "organization_id": {
    value: 1
    },
    "account": {
    ref: "acct.account"
    },
    "username": {
    ref: "acct.user"
    },
    "cwd": {
    error: 2
    },
    executable: {
    formula: function (job) {
    var app = getProcInfo(job);
    if (app) {
    return {
    value: app.executable,
    error: 0
    };
    }
    return {
    value: null,
    error: this.metricErrors.codes.metricMissingUnknownReason.value
    };
    }
    },
    application: {
    formula: function (job) {
    var app = getProcInfo(job);
    if (app) {
    return {
    value: app.name,
    error: 0
    };
    }
    return {
    value: null,
    error: this.metricErrors.codes.metricMissingUnknownReason.value
    };
    }
    },
    "exit_status": {
    formula: function(job) {
    var exit = this.ref(job, "acct.exit_status");
    if (exit.error === 0 && exit.value) {
    exit.value = exit.value.split(" ")[0];
    }
    return exit;
    }
    },
    "datasource": {
    value: "prometheus"
    },
    "granted_pe": {
    ref: "acct.ncpus"
    },
    "queue": {
    ref: "acct.partition"
    },
    "requested_nodes": {
    ref: "acct.nodes"
    },
    "hosts": {
    ref: "acct.host_list",
    required: true
    },
    "nodes": {
    ref: "acct.nodes",
    required: true
    },
    "shared": {
    formula: function(job) {
    if (job.hasOwnProperty("shared")) {
    return {
    value: job.shared ? 1 : 0,
    error: 0
    };
    } else {
    return {
    value: 0,
    error: 0
    };
    }
    }
    },
    "cores": {
    ref: "acct.ncpus",
    required: true
    },
    "cores_avail": {
    formula: function(job) {
    if (job.summarization.complete && job.hasOwnProperty("cpu") && job.cpu.hasOwnProperty("nodecpus") && ! job.cpu.nodecpus.hasOwnProperty("error")) {
    return this.ref(job, "cpu.nodecpus.all.cnt");
    } else {
    return {
    value: 0,
    error: this.metricErrors.codes.missingCollectionFailed.value
    };
    }
    }
    },
    "submit_time_ts": {
    ref: "acct.submit",
    required: true
    },
    "eligible_time_ts": {
    ref: "acct.eligible"
    },
    "start_time_ts": {
    ref: "acct.start_time",
    required: true
    },
    "end_time_ts": {
    ref: "acct.end_time",
    required: true
    },
    "wall_time": {
    formula: function(job) {
    var end_time = this.ref(job, this.attributes.end_time_ts.ref);
    var start_time = this.ref(job, this.attributes.start_time_ts.ref);

    var combined_error = end_time.error | start_time.error;

    if (end_time.value === undefined || start_time.value === undefined) {
    return {
    value: null,
    error: combined_error
    };
    }

    return {
    value: end_time.value - start_time.value,
    error: combined_error
    };
    },
    required: true
    },
    "requested_wall_time": {
    formula: function(job) {

    var timelimit = this.ref(job, "acct.timelimit");

    if (timelimit.error !== 0 || timelimit.value === null) {
    return {
    value: null,
    error: 2
    };
    }

    if (typeof timelimit.value === "number") {
    return {
    value: timelimit.value,
    error: 0
    };
    }

    var result = timelimit.value.match(/^(?:([0-9]+)-)?([0-9]{2}):([0-9]{2}):([0-9]{2})$/);
    if (result) {
    if (result[1]) {
    return {
    value: (24 * 3600 * result[1]) + (3600 * result[2]) + (60 * result[3]) + (1 * result[4]),
    error: 0
    };
    } else {
    return {
    value: (3600 * result[2]) + (60 * result[3]) + (1 * result[4]),
    error: 0
    };
    }
    } else {
    return {
    value: null,
    error: 2
    };
    }
    }
    },
    "wait_time": {
    formula: function(job) {
    var start_time = this.ref(job, this.attributes.start_time_ts.ref);
    var submit_time = this.ref(job, this.attributes.submit_time_ts.ref);

    var combined_error = start_time.error | submit_time.error;

    if (start_time.value === undefined || submit_time.value === undefined) {
    return {
    value: null,
    error: combined_error
    };
    }

    return {
    value: start_time.value - submit_time.value,
    error: combined_error
    };
    },
    required: true
    },
    "cpu_time": {
    formula: function(job) {
    var wall_time = this.attributes.wall_time.formula.call(this, job);
    var num_cores = this.ref(job, this.attributes.cores.ref);

    var combined_error = wall_time.error | num_cores.error;

    if (wall_time.value === undefined || num_cores.value === undefined) {
    return {
    value: null,
    error: combined_error
    };
    }

    return {
    value: wall_time.value * num_cores.value,
    error: combined_error
    };
    },
    required: true
    },
    "node_time": {
    formula: function(job) {
    var wall_time = this.attributes.wall_time.formula.call(this, job);
    var num_nodes = this.ref(job, this.attributes.nodes.ref);

    var combined_error = wall_time.error | num_nodes.error;

    if (wall_time.value === undefined || num_nodes.value === undefined) {
    return {
    value: null,
    error: combined_error
    };
    }

    return {
    value: wall_time.value * num_nodes.value,
    error: combined_error
    };
    },
    required: true
    },
    "cpu_idle": {
    ref: ["cpu.cgroup.idle.avg", "cpu.jobcpus.idle.avg", "cpu.nodecpus.idle.avg"]
    },
    "cpu_system": {
    ref: ["cpu.cgroup.system.avg", "cpu.jobcpus.system.avg", "cpu.nodecpus.system.avg"]
    },
    "cpu_user": {
    ref: ["cpu.cgroup.user.avg", "cpu.jobcpus.user.avg", "cpu.nodecpus.user.avg"]
    },
    "error": {
    error: 2
    },
    "flops": {
    ref: "cpuperf.flops.avg"
    },
    "flops_cov": {
    formula: function(job) {
    return this.getcov.call(this, job, "cpuperf.flops");
    }
    },
    "cpiref": {
    ref: "cpuperf.cpiref.avg"
    },
    "cpiref_cov": {
    formula: function(job) {
    return this.getcov.call(this, job, "cpuperf.cpiref");
    }
    },
    catastrophe: {
    formula: function (job) {
    var result = {
    value: null,
    error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value
    };

    if (job.catastrophe) {
    if (job.catastrophe.error) {
    switch (job.catastrophe.error) {
    case 1:
    result.error = this.metricErrors.codes.metricDisabledByUser.value;
    break;
    case 2:
    result.error = this.metricErrors.codes.metricInsufficientData.value;
    break;
    case 6:
    result.error = this.metricErrors.codes.metricCounterRollover.value;
    break;
    default:
    result.error = this.metricErrors.codes.metricMissingUnknownReason.value;
    break;
    }
    } else if (Number.isNaN(job.catastrophe.value)) {
    result.error = this.metricErrors.codes.metricSummarizationError.value;
    } else {
    result.value = job.catastrophe.value;
    result.error = 0;
    }
    }

    return result;
    }
    },
    "cpldref": {
    ref: "cpuperf.cpldref.avg"
    },
    "cpldref_cov": {
    formula: function(job) {
    return this.getcov.call(this, job, "cpuperf.cpldref");
    }
    },
    "mem_transferred": {
    ref: "uncperf.membw.avg"
    },
    "mem_transferred_cov": {
    formula: function(job) {
    return this.getcov.call(this, job, "uncperf.membw");
    }
    },
    "cpu_user_imbalance": {
    formula: function(job) {
    var cpu_count = this.ref(job, ["cpu.jobcpus.user.cnt", "cpu.nodecpus.user.cnt"]);
    var cpu_user_min = this.ref(job, ["cpu.cgroup.user.min", "cpu.jobcpus.user.min", "cpu.nodecpus.user.min"]);
    var cpu_user_max = this.ref(job, ["cpu.cgroup.user.max", "cpu.jobcpus.user.max", "cpu.nodecpus.user.max"]);
    var error = cpu_user_min.error | cpu_user_max.error | cpu_count.error;
    if (error === 0) {
    if (cpu_count.value <= 1) {
    return {
    value: 0.0,
    error: error
    };
    } else {
    return {
    value: 100.0 * (cpu_user_max.value - cpu_user_min.value) / cpu_user_max.value,
    error: error
    };
    }
    } else {
    return {
    value: null,
    error: error
    };
    }
    }
    },
    "cpu_user_cv": {
    formula: function(job) {
    return this.getcov.call(this, job, ["cpu.cgroup.user", "cpu.jobcpus.user", "cpu.nodecpus.user"]);
    }
    },
    node_cpu_idle: {
    ref: 'cpu.nodecpus.idle.avg'
    },
    energy: {
    ref: 'ipmi.energy.avg'
    },
    max_power: {
    formula: function (job) {
    return this.getmax(job, 'ipmi.power.max');
    }
    },
    "memory_used": {
    formula: function(job) {
    var mem = this.ref(job, "memory.used_minus_cache.avg");
    if (mem.error === 0) {
    return {
    value: mem.value * 1024.0,
    error: 0
    };
    }
    return {
    value: null,
    error: mem.error
    };
    }
    },
    "memory_used_cov": {
    formula: function(job) {
    return this.getcov.call(this, job, "memory.used_minus_cache");
    }
    },
    "max_memory": {
    formula: function(job) {
    return this.getmax(job, 'process_memory.usageratio.max');
    }
    },
    "mem_used_including_os_caches": {
    formula: function(job) {
    var mem = this.ref(job, "memory.used.avg");
    if (mem.error === 0) {
    return {
    value: mem.value * 1024.0,
    error: 0
    };
    }
    return {
    value: null,
    error: mem.error
    };
    }
    },
    "mem_used_including_os_caches_cov": {
    formula: function(job) {
    return this.getcov.call(this, job, "memory.used");
    }
    },
    "ib_rx_bytes": map_helpers.device('infiniband', 'all', 'switch-out-bytes'),

    block_sda_wr_ios: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'write'),
    block_sda_wr_bytes: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'write_bytes'),
    block_sda_wr_bytes_cov: map_helpers.device_cov('block', getHardwareConfig('block', 'sda'), 'write_bytes'),

    block_sda_rd_ios: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'read'),
    block_sda_rd_bytes: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'read_bytes'),
    block_sda_rd_bytes_cov: map_helpers.device_cov('block', getHardwareConfig('block', 'sda'), 'read_bytes'),

    netdrv_gpfs_rx: map_helpers.device('gpfs', 'all', 'read_bytes'),
    netdrv_gpfs_rx_cov: map_helpers.device_cov('gpfs', 'all', 'read_bytes'),
    netdrv_gpfs_rx_msgs: map_helpers.device('gpfs', 'all', 'reads'),

    netdrv_gpfs_tx: map_helpers.device('gpfs', 'all', 'write_bytes'),
    netdrv_gpfs_tx_cov: map_helpers.device_cov('gpfs', 'all', 'write_bytes'),
    netdrv_gpfs_tx_msgs: map_helpers.device('gpfs', 'all', 'writes'),

    "netdrv_isilon_rx": {
    error: 2
    },
    "netdrv_isilon_rx_cov": {
    error: 2
    },
    "netdrv_isilon_rx_msgs": {
    error: 2
    },
    "netdrv_isilon_tx": {
    error: 2
    },
    "netdrv_isilon_tx_cov": {
    error: 2
    },
    "netdrv_isilon_tx_msgs": {
    error: 2
    },
    "netdrv_panasas_rx": {
    error: 2
    },
    "netdrv_panasas_rx_cov": {
    error: 2
    },
    "netdrv_panasas_rx_msgs": {
    error: 2
    },
    "netdrv_panasas_tx": {
    error: 2
    },
    "netdrv_panasas_tx_cov": {
    error: 2
    },
    "netdrv_panasas_tx_msgs": {
    error: 2
    },
    netdir_home_read: {
    formula: function (job) {
    if (!job.nfs) {
    return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value };
    }

    var read = 0.0;

    for (let mount in job.nfs) {
    if (job.nfs.hasOwnProperty(mount)) {
    if (job.nfs[mount].read && job.nfs[mount].read.avg) {
    read += job.nfs[mount].read.avg
    }
    }
    }

    return { value: read, error: 0 };
    }
    },
    netdir_home_write: {
    formula: function (job) {
    if (!job.nfs) {
    return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value };
    }

    var write = 0.0;

    for (let mount in job.nfs) {
    if (job.nfs.hasOwnProperty(mount)) {
    if (job.nfs[mount].write && job.nfs[mount].write.avg) {
    write += job.nfs[mount].write.avg
    }
    }
    }

    return { value: write, error: 0 };
    }
    },
    netdir_projects_read: map_helpers.sum(
    ['nfs', getHardwareConfig('mounts.projects', '/projects')],
    ['read']
    ),
    netdir_projects_write: map_helpers.sum(
    ['nfs', getHardwareConfig('mounts.projects', '/projects')],
    ['write']
    ),
    netdir_util_read: map_helpers.sum(
    ['nfs', getHardwareConfig('mounts.util', '/util')],
    ['read']
    ),
    netdir_util_write: map_helpers.sum(
    ['nfs', getHardwareConfig('mounts.util', '/util')],
    ['write']
    ),

    net_eth0_rx: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'in-bytes', ['lo', 'ib0', 'ib1']),
    net_eth0_rx_cov: map_helpers.device_cov('network', getHardwareConfig('network', 'em1'), 'in-bytes', ['lo', 'ib0', 'ib1']),
    net_eth0_rx_packets: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'in-packets', ['lo', 'ib0', 'ib1']),

    net_eth0_tx: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'out-bytes', ['lo', 'ib0', 'ib1']),
    net_eth0_tx_cov: map_helpers.device_cov('network', getHardwareConfig('network', 'em1'), 'out-bytes', ['lo', 'ib0', 'ib1']),
    net_eth0_tx_packets: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'out-packets', ['lo', 'ib0', 'ib1']),

    "net_ib0_rx": {
    ref: "network.ib0.in-bytes.avg"
    },
    "net_ib0_rx_packets": {
    ref: "network.ib0.in-packets.avg"
    },
    "net_ib0_tx": {
    ref: "network.ib0.out-bytes.avg"
    },
    "net_ib0_tx_packets": {
    ref: "network.ib0.out-packets.avg"
    },
    gpu_energy: {
    formula: function (job) {
    if (!job.gpupower) {
    return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value };
    }

    var energy = 0.0;
    var device_count = 0;

    for (let gpu in job.gpupower) {
    if (job.gpupower.hasOwnProperty(gpu)) {
    if (job.gpupower[gpu].energy && job.gpupower[gpu].energy.avg) {
    energy += job.gpupower[gpu].energy.avg;
    device_count += 1;
    }
    }
    }

    if (device_count === 0) {
    return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value };
    }

    return { value: energy, error: 0 };
    }
    },
    gpu_max_power: {
    formula: function (job) {
    if (!job.gpupower) {
    return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value };
    }

    var max_power = 0.0;
    var device_count = 0;

    for (let gpu in job.gpupower) {
    if (job.gpupower.hasOwnProperty(gpu)) {
    if (job.gpupower[gpu].power && job.gpupower[gpu].power.max) {
    if (job.gpupower[gpu].power.max.max) {
    max_power = Math.max(max_power, job.gpupower[gpu].power.max.max);
    device_count += 1;
    } else if (job.gpupower[gpu].power.max.avg) {
    max_power = Math.max(max_power, job.gpupower[gpu].power.max.avg);
    device_count += 1;
    }
    }
    }
    }

    if (device_count === 0) {
    return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value };
    }

    return { value: max_power, error: 0 };
    }
    },
    "gpu0_nv_mem_used": {
    formula: function(job) {
    if (!job.gpu) {
    return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value };
    }
    var job_gpus = this.ref(job, "acct.gpus");
    if (job_gpus.value === undefined || job_gpus.value === 0) {
    return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value };
    }
    var util = 0.0;
    for (let gpu in job.gpu) {
    if (job.gpu.hasOwnProperty(gpu)) {
    if (job.gpu[gpu].memused && job.gpu[gpu].memused.avg) {
    util += job.gpu[gpu].memused.avg;
    }
    }
    }
    return { value: util / job_gpus.value, error: 0 };
    }
    },
    "gpu0_nv_utilization": {
    formula: function(job) {
    if (!job.gpu) {
    return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value };
    }
    var job_gpus = this.ref(job, "acct.gpus");
    if (job_gpus.value === undefined || job_gpus.value === 0) {
    return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value };
    }
    var util = 0.0;
    for (let gpu in job.gpu) {
    if (job.gpu.hasOwnProperty(gpu)) {
    if (job.gpu[gpu].util && job.gpu[gpu].util.avg) {
    util += job.gpu[gpu].util.avg;
    }
    }
    }
    return { value: util / job_gpus.value / 100.0, error: 0 };
    }
    }
    }
    };
    };