Add SLURM SINFO parser
This commit is contained in:
parent
e78d20787a
commit
f00b406962
4 changed files with 240 additions and 0 deletions
|
@ -53,6 +53,7 @@ select = [
|
||||||
ignore = [
|
ignore = [
|
||||||
"COM812", # disable ruff format warning
|
"COM812", # disable ruff format warning
|
||||||
"E501", # disable ruff line-too-long error
|
"E501", # disable ruff line-too-long error
|
||||||
|
"S603", # disable subprocess-without-shell-equals-true (see PyCQA/bandit#333)
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.ruff.format]
|
[tool.ruff.format]
|
||||||
|
|
87
src/glicid_spawner/slurm.py
Normal file
87
src/glicid_spawner/slurm.py
Normal file
|
@ -0,0 +1,87 @@
|
||||||
|
"""SLURM module."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import shlex
|
||||||
|
import subprocess
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from itertools import groupby
|
||||||
|
from operator import attrgetter
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SlurmCpu:
    """SLURM CPU resource.

    Counts are coerced to ``int`` so raw string fields parsed from
    ``sinfo`` output can be passed directly.
    """

    allocated: int
    idle: int
    total: int

    def __post_init__(self):
        # Normalize every count to a plain integer.
        for attr in ('allocated', 'idle', 'total'):
            setattr(self, attr, int(getattr(self, attr)))
||||||
|
@dataclass
class SlurmGpu:
    """SLURM GPU resource.

    Falsy when the node carries no GPU (``nb == 0``).
    """

    name: str = 'None'
    nb: int = 0

    def __post_init__(self):
        # Raw sinfo fields are strings: normalize the model name
        # (capitalized) and coerce the count to an integer.
        self.name = str(self.name).capitalize()
        self.nb = int(self.nb)

    def __bool__(self):
        return self.nb > 0
|
@dataclass
class SlurmNode:
    """SLURM node resource.

    Parsed from one fixed-width ``sinfo`` line with the columns
    ``Cluster,PartitionName,NodeHost,StateLong,CPUsState,Memory,Gres``.
    """

    cluster: str  # federated cluster name, or None when sinfo reports 'N/A'
    partition: str
    hostname: str
    state: str  # lowercase long state, e.g. 'idle', 'mixed', 'allocated'
    cpu: SlurmCpu
    mem: int  # memory in GB
    gpu: SlurmGpu  # falsy default when no GPU is present

    def __init__(self, cluster, partition, hostname, state, cpus_state, memory_mb, gres):  # noqa: PLR0913
        """Parse raw (possibly padded) sinfo column values."""
        self.cluster = None if 'N/A' in cluster else cluster.strip()
        self.partition = partition.strip()
        self.hostname = hostname.strip()
        self.state = state.strip().lower()
        # CPUsState column is 'allocated/idle/other/total'; 'other' is dropped.
        self.cpu = SlurmCpu(*re.findall(r'(\d+)/(\d+)/\d+/(\d+)', cpus_state)[0])
        self.mem = int(memory_mb) // 1000  # in GB
        # GRES usually reads 'gpu:<model>:<count>[...]'. Fall back to a default
        # (no-GPU) resource when the pattern does not match — e.g. the valid
        # model-less form 'gpu:<count>' — instead of raising IndexError.
        gpu_match = re.search(r'gpu:(\w+):(\d+)', gres)
        self.gpu = SlurmGpu(*gpu_match.groups()) if gpu_match else SlurmGpu()
|
def _sinfo_run() -> str:
    """Run the SLURM ``sinfo`` command and return its decoded output."""
    cmd = [
        'sinfo',
        '--federation',
        '--noheader',
        '--responding',
        '--Format=Cluster,PartitionName,NodeHost,StateLong,CPUsState,Memory,Gres',
    ]
    return subprocess.check_output(cmd).decode('utf-8')
|
def _sinfo_reader(result) -> list:
    """Parse SINFO output: each line is split into fixed-width 20-char columns."""
    column = re.compile('.{20}')
    return [SlurmNode(*column.findall(line)) for line in result.splitlines()]
|
def sinfo(with_states=('idle', 'mixed')) -> dict:
    """SLURM SINFO resources available with a given state(s).

    Returns a mapping ``{cluster: {partition: [nodes...]}}`` keeping only
    partitions that have at least one node in ``with_states``.
    Relies on sinfo output already being grouped by cluster/partition.
    """
    nodes = _sinfo_reader(_sinfo_run())

    clusters = {}
    for cluster, cluster_nodes in groupby(nodes, key=attrgetter('cluster')):
        partitions = {}
        for partition, partition_nodes in groupby(cluster_nodes, key=attrgetter('partition')):
            available = [node for node in partition_nodes if node.state in with_states]
            if available:
                partitions[partition] = available
        clusters[cluster] = partitions
    return clusters
|
17
tests/data/sinfo.txt
Normal file
17
tests/data/sinfo.txt
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
N/A Devel nazare001 idle 0/20/0/20 128000 (null)
|
||||||
|
N/A GPU-short budbud001 mixed 20/20/0/40 184000 gpu:t4:2,mps:t4:2000
|
||||||
|
N/A A40-short budbud002 allocated 40/0/0/40 184000 gpu:a40:2,mps:a40:20
|
||||||
|
N/A AMD-short cloudbreak001 drained 0/0/32/32 128000 (null)
|
||||||
|
N/A lowp budbud003 down~ 0/0/40/40 128000 gpu:p100:2
|
||||||
|
N/A lowp budbud004 drained~ 0/0/20/20 128000 gpu:k80:4
|
||||||
|
N/A lowp budbud005 idle~ 0/20/0/20 192000 gpu:p100:1
|
||||||
|
nautilus standard cnode001 completing 0/96/0/96 384000 (null)
|
||||||
|
nautilus bigmem cnode002 planned 0/96/0/96 768000 (null)
|
||||||
|
nautilus gpu gnode1 mixed 4/92/0/96 768000 gpu:A100:1(S:0-1)
|
||||||
|
nautilus gpu gnode2 idle 0/96/0/96 256000 gpu:A100:2(S:0-1)
|
||||||
|
nautilus gpu gnode3 allocated 96/0/0/96 128000 gpu:A100:4(S:0-1)
|
||||||
|
nautilus all visu1 idle 0/96/0/96 768000 (null)
|
||||||
|
waves standard cribbar001 idle 0/40/0/40 128000 (null)
|
||||||
|
waves gpu budbud006 allocated 64/0/0/64 256000 gpu:a100:2,mps:a100:
|
||||||
|
waves all cribbar001 mixed 20/20/0/40 128000 (null)
|
||||||
|
waves devel vmworker001 inval 0/0/8/8 16000 (null)
|
135
tests/test_slurm.py
Normal file
135
tests/test_slurm.py
Normal file
|
@ -0,0 +1,135 @@
|
||||||
|
"""Test SLURM module."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from glicid_spawner.slurm import (
|
||||||
|
SlurmCpu,
|
||||||
|
SlurmGpu,
|
||||||
|
SlurmNode,
|
||||||
|
_sinfo_reader,
|
||||||
|
_sinfo_run,
|
||||||
|
sinfo,
|
||||||
|
subprocess,
|
||||||
|
)
|
||||||
|
|
||||||
|
DATA = Path(__file__).parent / 'data'
|
||||||
|
|
||||||
|
|
||||||
|
def test_slurm_dataclasses():
    """Test SLURM dataclasses formatter."""
    cpu = SlurmCpu(1, '2', 4.0)

    # All counts coerced to int by __post_init__.
    assert (cpu.allocated, cpu.idle, cpu.total) == (1, 2, 4)
    for value in (cpu.allocated, cpu.idle, cpu.total):
        assert isinstance(value, int)

    gpu = SlurmGpu('fOo', '1')

    assert gpu  # __bool__
    assert (gpu.name, gpu.nb) == ('Foo', 1)
    assert isinstance(gpu.name, str)
    assert isinstance(gpu.nb, int)

    # Default values
    gpu = SlurmGpu()

    assert not gpu  # __bool__
    assert (gpu.name, gpu.nb) == ('None', 0)
|
def test_slurm_sinfo_run(monkeypatch):
    """Test SLURM SINFO run command."""
    # Echo the command line back instead of invoking sinfo.
    monkeypatch.setattr(subprocess, 'check_output', lambda cmd: ' '.join(cmd).encode())

    expected = (
        'sinfo --federation --noheader --responding '
        '--Format=Cluster,PartitionName,NodeHost,StateLong,CPUsState,Memory,Gres'
    )
    assert _sinfo_run() == expected
|
def test_slurm_sinfo_reader():
    """Test SLURM SINFO reader."""
    nodes = _sinfo_reader((DATA / 'sinfo.txt').read_text())

    assert all(isinstance(node, SlurmNode) for node in nodes)

    first = nodes[0]

    assert first.cluster is None
    assert first.partition == 'Devel'
    assert first.hostname == 'nazare001'
    assert first.state == 'idle'
    assert first.cpu.allocated == 0
    assert first.cpu.idle == first.cpu.total == 20
    assert first.mem == 128
    assert not first.gpu

    assert [node.cluster for node in nodes] == 7 * [None] + 6 * ['nautilus'] + 4 * ['waves']

    assert sum(node.state in ('idle', 'mixed') for node in nodes) == 7

    # CPU allocation must be coherent with the node state.
    checks = {
        'idle': lambda cpu: cpu.allocated == 0 and cpu.idle > 0,
        'mixed': lambda cpu: cpu.allocated > 0 and cpu.idle > 0,
        'allocated': lambda cpu: cpu.allocated > 0 and cpu.idle == 0,
    }
    for node in nodes:
        if node.state in checks:
            assert checks[node.state](node.cpu)

    assert sum(node.mem for node in nodes) == 4_672

    gpus = [node.gpu for node in nodes if node.gpu]

    assert [gpu.name for gpu in gpus] == ['T4', 'A40', 'P100', 'K80', 'P100'] + 4 * ['A100']
    assert [gpu.nb for gpu in gpus] == [2, 2, 2, 4, 1, 1, 2, 4, 2]
|
def test_slurm_sinfo_resources(monkeypatch):
    """Test SLURM SINFO resources."""
    # Serve the fixture instead of running sinfo.
    monkeypatch.setattr(subprocess, 'check_output', lambda _: (DATA / 'sinfo.txt').read_bytes())

    clusters = sinfo()

    assert isinstance(clusters, dict)
    assert len(clusters) == 3
    assert list(clusters) == [None, 'nautilus', 'waves']

    assert [len(partitions) for partitions in clusters.values()] == [2, 2, 2]

    nautilus = clusters['nautilus']

    assert isinstance(nautilus, dict)
    assert len(nautilus) == 2
    assert list(nautilus) == ['gpu', 'all']

    gpus = nautilus['gpu']

    assert len(gpus) == 2
    assert [partition.hostname for partition in gpus] == ['gnode1', 'gnode2']
    assert [partition.cpu.allocated for partition in gpus] == [4, 0]
    assert [partition.cpu.idle for partition in gpus] == [92, 96]
    assert [partition.mem for partition in gpus] == [768, 256]
    assert [partition.gpu.name for partition in gpus] == ['A100', 'A100']
    assert [partition.gpu.nb for partition in gpus] == [1, 2]

    # Get only `idle` nodes. NOTE: previously `('idle')` — a plain string, so
    # the state check was a substring test; `('idle',)` is the intended tuple.
    assert [len(partitions) for partitions in sinfo(with_states=('idle',)).values()] == [1, 2, 1]
|
Loading…
Add table
Add a link
Reference in a new issue