Skip to content

Instantly share code, notes, and snippets.

@treydock
Last active October 24, 2025 06:42
Show Gist options
  • Save treydock/b964c5599fd057b0aa6a to your computer and use it in GitHub Desktop.
Save treydock/b964c5599fd057b0aa6a to your computer and use it in GitHub Desktop.

Revisions

  1. treydock revised this gist Jan 14, 2015. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions job_submit.lua
    Original file line number Diff line number Diff line change
    @@ -160,7 +160,7 @@ function slurm_job_submit ( job_desc, part_list, submit_uid )
    end
    end

    return 0
    return 0
    end

    function slurm_job_modify ( job_desc, job_rec, part_list, modify_uid )
    @@ -193,7 +193,7 @@ function slurm_job_modify ( job_desc, job_rec, part_list, modify_uid )
    end
    end

    return 0
    return 0
    end

    --########################################################################--
  2. treydock revised this gist Jan 14, 2015. 1 changed file with 30 additions and 30 deletions.
    60 changes: 30 additions & 30 deletions job_submit.lua
    Original file line number Diff line number Diff line change
    @@ -78,28 +78,28 @@ end

    function _build_part_table ( part_list )
    -- Create a partition table from SLURM structure
    local part_rec = {}
    local part_rec = {}

    for i in ipairs(part_list) do
    part_rec[i] = { part_rec_ptr=part_list[i] }
    setmetatable (part_rec[i], part_rec_meta)
    end
    return part_rec
    for i in ipairs(part_list) do
    part_rec[i] = { part_rec_ptr=part_list[i] }
    setmetatable (part_rec[i], part_rec_meta)
    end
    return part_rec
    end

    --========================================================================--

    function default_partition(part_rec)
    -- Return the name of the default partition
    -- part_rec : list of partitions
    local i = 1

    while part_rec[i] do
    if part_rec[i].flag_default == 1 then
    return part_rec[i].name
    end
    i = i + 1
    end
    -- Return the name of the default partition
    -- part_rec : list of partitions
    local i = 1

    while part_rec[i] do
    if part_rec[i].flag_default == 1 then
    return part_rec[i].name
    end
    i = i + 1
    end
    end

    --========================================================================--
    @@ -146,8 +146,8 @@ end
    --########################################################################--

    function slurm_job_submit ( job_desc, part_list, submit_uid )
    setmetatable (job_desc, job_req_meta)
    local part_rec = _build_part_table(part_list)
    setmetatable (job_desc, job_req_meta)
    local part_rec = _build_part_table(part_list)
    local account = job_desc.account or job_desc.default_account
    local partition = job_desc.partition or default_partition(part_rec)

    @@ -212,22 +212,22 @@ log_err = slurm.error
    log_user = slurm.log_user

    job_rec_meta = {
    __index = function (table, key)
    return _get_job_rec_field(table.job_rec_ptr, key)
    end
    __index = function (table, key)
    return _get_job_rec_field(table.job_rec_ptr, key)
    end
    }
    job_req_meta = {
    __index = function (table, key)
    return _get_job_req_field(table.job_desc_ptr, key)
    end,
    __newindex = function (table, key, value)
    return _set_job_req_field(table.job_desc_ptr, key, value or "")
    end
    __index = function (table, key)
    return _get_job_req_field(table.job_desc_ptr, key)
    end,
    __newindex = function (table, key, value)
    return _set_job_req_field(table.job_desc_ptr, key, value or "")
    end
    }
    part_rec_meta = {
    __index = function (table, key)
    return _get_part_rec_field(table.part_rec_ptr, key)
    end
    __index = function (table, key)
    return _get_part_rec_field(table.part_rec_ptr, key)
    end
    }

    log_info("initialized")
  3. treydock revised this gist Jan 14, 2015. 1 changed file with 46 additions and 5 deletions.
    51 changes: 46 additions & 5 deletions job_submit.lua
    Original file line number Diff line number Diff line change
    @@ -104,12 +104,28 @@ end

    --========================================================================--

    function get_partition(part_rec, name)
      -- Look up a partition record by name.
      -- part_rec : sequence of partition records, scanned from index 1 until
      --            the first missing entry
      -- name     : value compared against each record's `name` field
      -- Returns the first matching record; returns nothing when none matches.
      local idx = 1
      local candidate = part_rec[idx]

      while candidate do
        if candidate.name == name then
          return candidate
        end
        idx = idx + 1
        candidate = part_rec[idx]
      end
    end

    --========================================================================--

    function get_partition_qos(partition)
    local qos = nil
    local part = partition or 'NONE'
    local partT = PARTITION_TO_QOS[part] or PARTITION_TO_QOS["default"]

    log_info("slurm_job_submit#get_partition_qos: partition: %s", part)
    log_debug("slurm_job_submit#get_partition_qos: partition: %s", part)

    if partT == nil then
    return qos
    @@ -118,7 +134,7 @@ function get_partition_qos(partition)
    end

    log_debug("slurm_job_submit#get_partition_qos: partT: %s", dump(partT))
    log_info("slurm_job_submit#get_partition_qos: partition: %s matched to qos: %s", part, qos)
    log_debug("slurm_job_submit#get_partition_qos: partition: %s matched to qos: %s", part, qos)

    return qos
    end
    @@ -148,9 +164,34 @@ function slurm_job_submit ( job_desc, part_list, submit_uid )
    end

    function slurm_job_modify ( job_desc, job_rec, part_list, modify_uid )
      -- Job-modify hook: when a job is moved to a different partition, fill in
      -- a QOS and clamp the time limit for the new partition.
      -- job_desc   : requested modifications (wrapped with job_req_meta)
      -- job_rec    : the existing job (wrapped with job_rec_meta)
      -- part_list  : partition list handed in by the caller
      -- modify_uid : uid of the user requesting the change (used in log lines)
      -- Always returns 0.
      setmetatable (job_desc, job_req_meta)
      setmetatable (job_rec, job_rec_meta)
      local part_rec = _build_part_table(part_list)
      local current_partition = job_rec.partition
      local new_partition = job_desc.partition or current_partition

      -- Nothing to adjust unless the partition actually changes.
      if current_partition == new_partition then
        return 0
      end

      -- If qos was not specified, derive it from the new partition.
      if job_desc.qos == nil then
        local qos = get_partition_qos(new_partition)

        if qos ~= nil then
          log_info("slurm_job_modify: for job %u from uid %d, qos value: %s", job_rec.job_id, modify_uid, qos)
          job_desc.qos = qos
        end
      end

      -- If time was not specified.
      -- Instead of nil SLURM returns 4294967294.
      if (job_desc.time_limit == nil or job_desc.time_limit == 4294967294) then
        -- get_partition returns nothing for an unknown partition name, so the
        -- record and both limits are checked before they are compared.
        local new_part_rec = get_partition(part_rec, new_partition)
        local max_time = new_part_rec and new_part_rec.max_time
        local current_limit = job_rec.time_limit

        if max_time ~= nil and current_limit ~= nil and current_limit > max_time then
          log_info("slurm_job_modify: for job %u from uid %d, time_limit value: %s", job_rec.job_id, modify_uid, max_time)
          job_desc.time_limit = max_time
        end
      end

      return 0
    end
  4. treydock revised this gist Oct 6, 2014. 1 changed file with 26 additions and 0 deletions.
    26 changes: 26 additions & 0 deletions test.lua
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,26 @@
    -- Scenario table: each entry holds a single pair whose key is a default
    -- account name and whose value is the partition requested in that scenario.
    tests = {
      { ["hepx"] = "hepx" },
      { ["hepx"] = "serial" },
      { ["hepx"] = "background" },
      { ["idhmc"] = "serial" },
      { ["idhmc"] = "background" },
      { ["iamcs"] = "serial" },
      { ["iamcs"] = "mpi-core8" },
      { ["iamcs"] = "mpi-core32" },
      { ["iamcs"] = "mpi-core32-4g" },
      { ["iamcs"] = "background" },
      { ["foo"] = "serial" },
      { ["foo"] = "mpi-core8" },
      { ["foo"] = "background" },
      { ["foo"] = "bar" },
    }

    -- `printf` is not part of the Lua standard library. Reuse a global one if
    -- the surrounding harness defines it, otherwise fall back to
    -- print + string.format so the script runs on its own.
    local printf = printf or function(fmt, ...)
      print(string.format(fmt, ...))
    end

    for _, test in ipairs(tests) do
      for account, partition in pairs(test) do
        printf("TEST: default_account: %s , partition: %s", account, partition)

        -- An unmatched partition yields nil; show it as an empty string.
        local qos = get_partition_qos(partition) or ""
        printf("RESULT: qos: %s", qos)

        printf("-------------------------------------------------------------")
      end
    end
  5. treydock revised this gist Sep 23, 2014. 1 changed file with 31 additions and 82 deletions.
    113 changes: 31 additions & 82 deletions job_submit.lua
    Original file line number Diff line number Diff line change
    @@ -12,32 +12,27 @@ Some code and ideas pulled from https://github.com/edf-hpc/slurm-llnl-misc-plugi
    --
    --########################################################################--

    PART_ROUTES = {
    hepx = { part = "serial", qos = "hepx" },
    grid = { part = "background", qos = "grid" },
    idhmc = { part = "serial", qos = "idhmc" },
    PARTITION_TO_QOS = {
    ["hepx"] = { qos = "hepx" },
    ["idhmc"] = { qos = "idhmc" },
    ["serial"] = { qos = "general" },
    ["serial-long"] = { qos = "long" },
    ["mpi-core8"] = { qos = "mpi" },
    ["mpi-core32"] = { qos = "mpi" },
    ["mpi-core32-4g"] = { qos = "mpi" },
    ["background"] = { qos = "background" },
    ["background-4g"] = { qos = "background" },
    ["grid"] = { qos = "grid" },
    ["interactive"] = { qos = "interactive" },
    ["default"] = { qos = "general" }
    }

    QOS_MAP = {
    hepx = {
    partT = {
    serial = { cores = 12, qos = "hepx" },
    },
    },
    idhmc = {
    partT = {
    serial = { cores = 3, qos = "idhmc" },
    },
    },
    default = {
    partT = {
    serial = { qos = "general" },
    mpi_core8 = { qos = "mpi" },
    mpi_core32 = { qos = "mpi" },
    background = { qos = "background" },
    },
    },
    -- UNUSED
    --[[
    PARTITION_ROUTES = {
    ['something'] = 'something-else',
    }
    ]]

    --########################################################################--
    --
    @@ -47,19 +42,6 @@ QOS_MAP = {

    --========================================================================--

    function os.capture(cmd, raw)
    local f = assert(io.popen(cmd, 'r'))
    local s = assert(f:read('*a'))
    f:close()
    if raw then return s end
    s = string.gsub(s, '^%s+', '')
    s = string.gsub(s, '%s+$', '')
    s = string.gsub(s, '[\n\r]+', ' ')
    return s
    end

    --========================================================================--

    function dump(o)
    if type(o) == 'table' then
    local s = '{ '
    @@ -75,10 +57,11 @@ end

    --========================================================================--

    -- UNUSED
    --[[
    function reroute_job(job_desc, routeT)
    local partition = routeT["part"]
    local qos = routeT["qos"]
    log_info("slurm_job_submit#reroute_job: Setting partition to %s", partition)
    job_desc.partition = partition
    @@ -89,6 +72,7 @@ function reroute_job(job_desc, routeT)
    end
    end
    ]]

    --========================================================================--

    @@ -105,20 +89,6 @@ end

    --========================================================================--

    function get_allocated_cpus(qos, partition)
    local cmd = "squeue --qos=" .. qos .. " --states=R --partition=" .. partition .. " --noheader --format='%C' | paste -sd+ | bc"

    output = os.capture(cmd)

    if output == '' then
    output = 0
    end

    return tonumber(output)
    end

    --========================================================================--

    function default_partition(part_rec)
    -- Return the name of the default partition
    -- part_rec : list of partitions
    @@ -134,35 +104,21 @@ end

    --========================================================================--

    function get_qos(account, partition, job_desc)
    function get_partition_qos(partition)
    local qos = nil
    local part = partition or 'NONE'
    local partT = PARTITION_TO_QOS[part] or PARTITION_TO_QOS["default"]

    log_info("slurm_job_submit#get_qos: account: %s, partition: %s", account, partition)

    accountT = QOS_MAP[account] or QOS_MAP["default"]

    log_debug("slurm_job_submit#get_qos: accountT: %s", dump(accountT))

    partT = accountT["partT"][partition] or QOS_MAP["default"]["partT"][partition]
    log_info("slurm_job_submit#get_partition_qos: partition: %s", part)

    if partT == nil then
    return qos
    else
    qos = partT["qos"]
    end

    log_debug("slurm_job_submit#get_qos: partT: %s", dump(partT))

    qos = partT["qos"]

    if partT["cores"] ~= nil then
    cpus = partT["cores"]
    used_cpus = get_allocated_cpus(qos, partition)
    if used_cpus >= cpus then
    log_info("slurm_job_submit#get_qos: %s used_cpus >= %s cpus, reassigning to default account", cpus, used_cpus)
    return get_qos("default", partition)
    end
    end

    log_info("slurm_job_submit: account: %s, partition: %s, matched to qos: %s", account, partition, qos)
    log_debug("slurm_job_submit#get_partition_qos: partT: %s", dump(partT))
    log_info("slurm_job_submit#get_partition_qos: partition: %s matched to qos: %s", part, qos)

    return qos
    end
    @@ -175,19 +131,12 @@ end

    function slurm_job_submit ( job_desc, part_list, submit_uid )
    setmetatable (job_desc, job_req_meta)
    local part_rec = _build_part_table (part_list)
    local part_rec = _build_part_table(part_list)
    local account = job_desc.account or job_desc.default_account
    local partition = job_desc.partition or default_partition(part_rec)
    local routeT = PART_ROUTES[partition]

    --[[
    if routeT ~= nil then
    reroute_job(job_desc, routeT)
    end
    ]]

    if job_desc.qos == nil then
    local qos = get_qos(account, partition, job_desc)
    local qos = get_partition_qos(partition)

    if qos ~= nil then
    log_info("slurm_job_submit: job from uid %d, setting qos value: %s", submit_uid, qos)
  6. treydock created this gist Aug 1, 2014.
    245 changes: 245 additions & 0 deletions job_submit.lua
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,245 @@
    --[[
    SLURM job submit filter for QOS
    Some code and ideas pulled from https://github.com/edf-hpc/slurm-llnl-misc-plugins/blob/master/job_submit.lua
    --]]

    --########################################################################--
    --
    -- Define constant
    --
    --########################################################################--

    -- Routing table keyed by partition name. Each entry names the partition
    -- (`part`) and QOS (`qos`) a job would be rerouted to by reroute_job.
    PART_ROUTES = {
      ["hepx"]  = { part = "serial",     qos = "hepx"  },
      ["grid"]  = { part = "background", qos = "grid"  },
      ["idhmc"] = { part = "serial",     qos = "idhmc" },
    }

    -- Account -> partition -> QOS lookup used by get_qos.
    --   QOS_MAP[account].partT[partition].qos   : QOS to assign
    --   QOS_MAP[account].partT[partition].cores : optional core cap; get_qos
    --       compares it with the cores reported by get_allocated_cpus
    -- The "default" entry is the fallback for accounts/partitions with no
    -- entry of their own.
    QOS_MAP = {
      hepx = {
        partT = {
          serial = { cores = 12, qos = "hepx" },
        },
      },
      idhmc = {
        partT = {
          serial = { cores = 3, qos = "idhmc" },
        },
      },
      default = {
        partT = {
          serial = { qos = "general" },
          -- Keys are looked up by the literal partition string, so the
          -- hyphenated partition names must be spelled with brackets.
          -- NOTE(review): partition names are assumed to be hyphenated
          -- ("mpi-core8", "mpi-core32") - confirm against slurm.conf.
          ["mpi-core8"] = { qos = "mpi" },
          ["mpi-core32"] = { qos = "mpi" },
          -- Original underscore spellings kept as aliases for compatibility.
          mpi_core8 = { qos = "mpi" },
          mpi_core32 = { qos = "mpi" },
          background = { qos = "background" },
        },
      },
    }

    --########################################################################--
    --
    -- Define functions
    --
    --########################################################################--

    --========================================================================--

    function os.capture(cmd, raw)
      -- Run `cmd` through io.popen and return everything it wrote.
      -- cmd : command string handed to io.popen
      -- raw : when truthy, return the output untouched; otherwise strip
      --       leading/trailing whitespace and collapse each run of newline /
      --       carriage-return characters into a single space
      -- Raises (via assert) if the pipe cannot be opened or read.
      local pipe = assert(io.popen(cmd, 'r'))
      local text = assert(pipe:read('*a'))
      pipe:close()

      if not raw then
        text = text:gsub('^%s+', ''):gsub('%s+$', ''):gsub('[\n\r]+', ' ')
      end
      return text
    end

    --========================================================================--

    function dump(o)
      -- Render a value as a string for debug logging.
      -- o : any value. Non-tables are returned via tostring(); tables are
      --     rendered recursively as `{ [key] = value,... } `.
      -- Entry order follows pairs() and is therefore unspecified.
      -- Cyclic tables are not detected and will recurse without end.
      if type(o) ~= 'table' then
        return tostring(o)
      end

      local parts = { '{ ' }
      for k, v in pairs(o) do
        -- tostring() lets boolean/table/function keys be rendered instead of
        -- raising a concatenation error; non-numeric keys stay quoted.
        local key = tostring(k)
        if type(k) ~= 'number' then
          key = '"' .. key .. '"'
        end
        parts[#parts + 1] = '[' .. key .. '] = ' .. dump(v) .. ','
      end
      parts[#parts + 1] = '} '

      -- Join once instead of growing a string on every iteration.
      return table.concat(parts)
    end

    --========================================================================--

    function reroute_job(job_desc, routeT)
      -- Move a job to the partition named by a route entry.
      -- job_desc : job request; its `partition` is always overwritten and its
      --            `qos` is set only when it is currently nil
      -- routeT   : route entry with `part` and `qos` fields
      local target_part, target_qos = routeT["part"], routeT["qos"]

      log_info("slurm_job_submit#reroute_job: Setting partition to %s", target_part)
      job_desc.partition = target_part

      -- A QOS that is already set is left alone.
      if job_desc.qos ~= nil then
        return
      end

      log_info("slurm_job_submit#reroute_job: Setting QOS to %s", target_qos)
      job_desc.qos = target_qos
    end

    --========================================================================--

    function _build_part_table ( part_list )
      -- Wrap every element of part_list in a small table
      -- { part_rec_ptr = <element> } carrying part_rec_meta as its metatable,
      -- and return the wrappers as a new sequence in the same order.
      local wrapped = {}

      for idx, ptr in ipairs(part_list) do
        wrapped[idx] = setmetatable({ part_rec_ptr = ptr }, part_rec_meta)
      end

      return wrapped
    end

    --========================================================================--

    function get_allocated_cpus(qos, partition)
      -- Sum the CPU counts squeue reports for running jobs in the given QOS
      -- and partition.
      -- qos       : QOS name passed to `squeue --qos`
      -- partition : partition name passed to `squeue --partition`
      -- Returns a number; 0 when the pipeline prints nothing or something
      -- that is not numeric.

      -- Both values end up on a shell command line and the partition can come
      -- from a user's job request, so each is wrapped in single quotes with
      -- embedded single quotes escaped as '\''.
      local function shell_quote(value)
        local escaped = string.gsub(value, "'", "'\\''")
        return "'" .. escaped .. "'"
      end

      local cmd = "squeue --qos=" .. shell_quote(qos)
        .. " --states=R --partition=" .. shell_quote(partition)
        .. " --noheader --format='%C' | paste -sd+ | bc"

      -- `local` keeps the result out of the global namespace.
      local output = os.capture(cmd)

      -- tonumber() yields nil for empty or malformed output; fall back to 0 so
      -- callers can always compare the result numerically.
      return tonumber(output) or 0
    end

    --========================================================================--

    function default_partition(part_rec)
      -- Find the default partition.
      -- part_rec : sequence of partition records, scanned from index 1 until
      --            the first missing entry
      -- Returns the `name` of the first record whose `flag_default` equals 1;
      -- returns nothing when no record qualifies.
      local idx = 1
      local entry = part_rec[idx]

      while entry do
        if entry.flag_default == 1 then
          return entry.name
        end
        idx = idx + 1
        entry = part_rec[idx]
      end
    end

    --========================================================================--

    function get_qos(account, partition, job_desc)
      -- Pick the QOS for an account/partition pair from QOS_MAP.
      -- account   : account name; unknown (or nil) accounts use the "default" entry
      -- partition : partition name used as the key into the entry's `partT`
      -- job_desc  : accepted for interface compatibility; not used here
      -- Returns the QOS name, or nil when neither the account entry nor the
      -- default entry lists the partition.
      --
      -- When the matched entry has a `cores` cap and the cores already
      -- allocated (get_allocated_cpus) reach it, the lookup is redone against
      -- the "default" entry.

      -- tostring() keeps the log calls safe when a value is nil.
      -- NOTE(review): assumes log_info formats its arguments with
      -- string.format, which rejects nil for %s on Lua 5.1 - confirm.
      log_info("slurm_job_submit#get_qos: account: %s, partition: %s", tostring(account), tostring(partition))

      -- Every intermediate is local; the original leaked accountT, partT,
      -- cpus and used_cpus into the global namespace.
      local defaultT = QOS_MAP["default"]
      local accountT = QOS_MAP[account] or defaultT

      log_debug("slurm_job_submit#get_qos: accountT: %s", dump(accountT))

      local partT = accountT["partT"][partition] or defaultT["partT"][partition]

      if partT == nil then
        return nil
      end

      log_debug("slurm_job_submit#get_qos: partT: %s", dump(partT))

      local qos = partT["qos"]
      local cpus = partT["cores"]

      if cpus ~= nil then
        local used_cpus = get_allocated_cpus(qos, partition)
        if used_cpus >= cpus then
          -- Arguments now follow the order the format string names them in.
          log_info("slurm_job_submit#get_qos: %s used_cpus >= %s cpus, reassigning to default account", tostring(used_cpus), tostring(cpus))
          -- Only fall back when this lookup was not already served by the
          -- default entry; otherwise the tail call would repeat forever.
          if accountT ~= defaultT then
            return get_qos("default", partition)
          end
        end
      end

      log_info("slurm_job_submit: account: %s, partition: %s, matched to qos: %s", tostring(account), tostring(partition), tostring(qos))

      return qos
    end

    --########################################################################--
    --
    -- SLURM job_submit/lua interface:
    --
    --########################################################################--

    function slurm_job_submit ( job_desc, part_list, submit_uid )
      -- Job-submit hook: give the job a QOS when the request did not name one.
      -- job_desc   : job request (wrapped with job_req_meta)
      -- part_list  : partition list handed in by the caller
      -- submit_uid : uid of the submitting user (used in the log line)
      -- Always returns 0.
      setmetatable (job_desc, job_req_meta)
      local partitions = _build_part_table (part_list)
      local account = job_desc.account or job_desc.default_account
      local partition = job_desc.partition or default_partition(partitions)
      local routeT = PART_ROUTES[partition]

      -- Rerouting is currently disabled.
      --[[
      if routeT ~= nil then
      reroute_job(job_desc, routeT)
      end
      ]]

      -- An explicitly requested QOS is left untouched.
      if job_desc.qos ~= nil then
        return 0
      end

      local qos = get_qos(account, partition, job_desc)
      if qos ~= nil then
        log_info("slurm_job_submit: job from uid %d, setting qos value: %s", submit_uid, qos)
        job_desc.qos = qos
      end

      return 0
    end

    function slurm_job_modify ( job_desc, job_rec, part_list, modify_uid )
      -- Job-modify hook. Attaches the accessor metatables to the request and
      -- to the existing job record, builds the partition table, and accepts
      -- the modification unchanged.
      -- Always returns 0.
      setmetatable(job_desc, job_req_meta)
      setmetatable(job_rec, job_rec_meta)

      -- Built but not consulted by this hook.
      local partitions = _build_part_table(part_list)

      return 0
    end

    --########################################################################--
    --
    -- Initialization code:
    --
    -- Define functions for logging and accessing slurmctld structures
    --
    --########################################################################--


    -- Short global aliases for the logging functions on the `slurm` table.
    -- Note that log_err maps to slurm.error, not to a slurm.log_* function.
    log_info, log_verbose, log_debug = slurm.log_info, slurm.log_verbose, slurm.log_debug
    log_err, log_user = slurm.error, slurm.log_user

    -- Metatable for job records: a read of a field missing from the wrapper
    -- is forwarded to _get_job_rec_field together with the wrapper's
    -- `job_rec_ptr`.
    job_rec_meta = {}

    function job_rec_meta.__index(rec, field)
      return _get_job_rec_field(rec.job_rec_ptr, field)
    end
    -- Metatable for job requests: reads of fields missing from the wrapper go
    -- to _get_job_req_field and writes of new fields go to _set_job_req_field,
    -- both together with the wrapper's `job_desc_ptr`.
    job_req_meta = {}

    function job_req_meta.__index(req, field)
      return _get_job_req_field(req.job_desc_ptr, field)
    end

    function job_req_meta.__newindex(req, field, value)
      -- A nil or false value is passed on as an empty string.
      return _set_job_req_field(req.job_desc_ptr, field, value or "")
    end
    -- Metatable for partition records: a read of a field missing from the
    -- wrapper is forwarded to _get_part_rec_field together with the wrapper's
    -- `part_rec_ptr`.
    part_rec_meta = {}

    function part_rec_meta.__index(part, field)
      return _get_part_rec_field(part.part_rec_ptr, field)
    end

    -- Top-level statement: logs once each time this chunk is executed.
    log_info("initialized")

    -- The chunk's return value is slurm.SUCCESS.
    return slurm.SUCCESS