diff --git a/pyproject.toml b/pyproject.toml
index de30b32..c593d8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,7 @@ select = [
 ignore = [
     "COM812",  # disable ruff format warning
     "E501",    # disable ruff line-too-long error
+    "S603",    # disable subprocess-without-shell-equals-true (see PyCQA/bandit#333)
 ]
 
 [tool.ruff.format]
diff --git a/src/glicid_spawner/slurm.py b/src/glicid_spawner/slurm.py
new file mode 100644
index 0000000..49e8d70
--- /dev/null
+++ b/src/glicid_spawner/slurm.py
@@ -0,0 +1,87 @@
+"""SLURM module."""
+
+import re
+import shlex
+import subprocess
+from dataclasses import dataclass, field
+from itertools import groupby
+from operator import attrgetter
+
+
+@dataclass
+class SlurmCpu:
+    """SLURM CPU resource."""
+
+    allocated: int
+    idle: int
+    total: int
+
+    def __post_init__(self):
+        self.allocated = int(self.allocated)
+        self.idle = int(self.idle)
+        self.total = int(self.total)
+
+
+@dataclass
+class SlurmGpu:
+    """SLURM GPU resource."""
+
+    name: str = field(default='None')
+    nb: int = field(default=0)
+
+    def __post_init__(self):
+        self.name = str(self.name).capitalize()
+        self.nb = int(self.nb)
+
+    def __bool__(self):
+        return self.nb > 0
+
+
+@dataclass
+class SlurmNode:
+    """SLURM node resource."""
+
+    cluster: str
+    partition: str
+    hostname: str
+    state: str
+    cpu: SlurmCpu
+    mem: int
+    gpu: SlurmGpu
+
+    def __init__(self, cluster, partition, hostname, state, cpus_state, memory_mb, gres):  # noqa: PLR0913
+        self.cluster = None if 'N/A' in cluster else cluster.strip()
+        self.partition = partition.strip()
+        self.hostname = hostname.strip()
+        self.state = state.strip().lower()
+        self.cpu = SlurmCpu(*re.findall(r'(\d+)/(\d+)/\d+/(\d+)', cpus_state)[0])
+        self.mem = int(memory_mb) // 1000  # in GB
+        self.gpu = SlurmGpu(*re.findall(r'gpu:(\w+):(\d+)', gres)[0] if 'gpu:' in gres else [])
+
+
+def _sinfo_run() -> str:
+    """SLURM SINFO run command."""
+    flags = '--federation --noheader --responding'
+    fmt = 'Cluster,PartitionName,NodeHost,StateLong,CPUsState,Memory,Gres'
+    cmd = shlex.split(f'sinfo {flags} --Format={fmt}')
+
+    return subprocess.check_output(cmd).decode('utf-8')
+
+
+def _sinfo_reader(result: str) -> list:
+    """SLURM SINFO reader (fixed-width 20-character columns)."""
+    return [SlurmNode(*re.findall('.{20}', node)) for node in result.splitlines()]
+
+
+def sinfo(with_states=('idle', 'mixed')) -> dict:
+    """SLURM SINFO resources available in the given state(s)."""
+    resources = _sinfo_reader(_sinfo_run())
+
+    return {
+        cluster: {
+            partition: available
+            for partition, nodes in groupby(partitions, key=attrgetter('partition'))
+            if (available := [node for node in nodes if node.state in with_states])
+        }
+        for cluster, partitions in groupby(resources, key=attrgetter('cluster'))
+    }
diff --git a/tests/data/sinfo.txt b/tests/data/sinfo.txt
new file mode 100644
index 0000000..bf859e3
--- /dev/null
+++ b/tests/data/sinfo.txt
@@ -0,0 +1,17 @@
+N/A                 Devel               nazare001           idle                0/20/0/20           128000              (null)              
+N/A                 GPU-short           budbud001           mixed               20/20/0/40          184000              gpu:t4:2,mps:t4:2000
+N/A                 A40-short           budbud002           allocated           40/0/0/40           184000              gpu:a40:2,mps:a40:20
+N/A                 AMD-short           cloudbreak001       drained             0/0/32/32           128000              (null)              
+N/A                 lowp                budbud003           down~               0/0/40/40           128000              gpu:p100:2          
+N/A                 lowp                budbud004           drained~            0/0/20/20           128000              gpu:k80:4           
+N/A                 lowp                budbud005           idle~               0/20/0/20           192000              gpu:p100:1          
+nautilus            standard            cnode001            completing          0/96/0/96           384000              (null)              
+nautilus            bigmem              cnode002            planned             0/96/0/96           768000              (null)              
+nautilus            gpu                 gnode1              mixed               4/92/0/96           768000              gpu:A100:1(S:0-1)   
+nautilus            gpu                 gnode2              idle                0/96/0/96           256000              gpu:A100:2(S:0-1)   
+nautilus            gpu                 gnode3              allocated           96/0/0/96           128000              gpu:A100:4(S:0-1)   
+nautilus            all                 visu1               idle                0/96/0/96           768000              (null)              
+waves               standard            cribbar001          idle                0/40/0/40           128000              (null)              
+waves               gpu                 budbud006           allocated           64/0/0/64           256000              gpu:a100:2,mps:a100:
+waves               all                 cribbar001          mixed               20/20/0/40          128000              (null)              
+waves               devel               vmworker001         inval               0/0/8/8             16000               (null)              
diff --git a/tests/test_slurm.py b/tests/test_slurm.py
new file mode 100644
index 0000000..b979b78
--- /dev/null
+++ b/tests/test_slurm.py
@@ -0,0 +1,135 @@
+"""Test SLURM module."""
+
+from pathlib import Path
+
+from glicid_spawner.slurm import (
+    SlurmCpu,
+    SlurmGpu,
+    SlurmNode,
+    _sinfo_reader,
+    _sinfo_run,
+    sinfo,
+    subprocess,
+)
+
+DATA = Path(__file__).parent / 'data'
+
+
+def test_slurm_dataclasses():
+    """Test SLURM dataclasses formatter."""
+    cpu = SlurmCpu(1, '2', 4.0)
+
+    assert cpu.allocated == 1
+    assert cpu.idle == 2
+    assert cpu.total == 4
+
+    assert isinstance(cpu.allocated, int)
+    assert isinstance(cpu.idle, int)
+    assert isinstance(cpu.total, int)
+
+    gpu = SlurmGpu('fOo', '1')
+
+    assert gpu  # __bool__
+    assert gpu.name == 'Foo'
+    assert gpu.nb == 1
+
+    assert isinstance(gpu.name, str)
+    assert isinstance(gpu.nb, int)
+
+    # Default values
+    gpu = SlurmGpu()
+
+    assert not gpu  # __bool__
+    assert gpu.name == 'None'
+    assert gpu.nb == 0
+
+
+def test_slurm_sinfo_run(monkeypatch):
+    """Test SLURM SINFO run command."""
+    monkeypatch.setattr(subprocess, 'check_output', lambda cmd: ' '.join(cmd).encode())
+
+    assert _sinfo_run() == (
+        'sinfo '
+        '--federation '
+        '--noheader '
+        '--responding '
+        '--Format=Cluster,PartitionName,NodeHost,StateLong,CPUsState,Memory,Gres'
+    )
+
+
+def test_slurm_sinfo_reader():
+    """Test SLURM SINFO reader."""
+    nodes = _sinfo_reader((DATA / 'sinfo.txt').read_text())
+
+    for node in nodes:
+        assert isinstance(node, SlurmNode)
+
+    node = nodes[0]
+
+    assert node.cluster is None
+    assert node.partition == 'Devel'
+    assert node.hostname == 'nazare001'
+    assert node.state == 'idle'
+    assert node.cpu.allocated == 0
+    assert node.cpu.idle == node.cpu.total == 20
+    assert node.mem == 128
+    assert not node.gpu
+
+    assert [node.cluster for node in nodes] == 7 * [None] + 6 * ['nautilus'] + 4 * ['waves']
+
+    assert len([node for node in nodes if node.state in ('idle', 'mixed')]) == 7
+
+    for node in nodes:
+        if node.state == 'idle':
+            assert node.cpu.allocated == 0
+            assert node.cpu.idle > 0
+        elif node.state == 'mixed':
+            assert node.cpu.allocated > 0
+            assert node.cpu.idle > 0
+        elif node.state == 'allocated':
+            assert node.cpu.allocated > 0
+            assert node.cpu.idle == 0
+
+    assert sum(node.mem for node in nodes) == 4_672
+
+    assert [node.gpu.name for node in nodes if node.gpu] == [
+        'T4',
+        'A40',
+        'P100',
+        'K80',
+        'P100',
+    ] + 4 * ['A100']
+
+    assert [node.gpu.nb for node in nodes if node.gpu] == [2, 2, 2, 4, 1, 1, 2, 4, 2]
+
+
+def test_slurm_sinfo_resources(monkeypatch):
+    """Test SLURM SINFO resources."""
+    monkeypatch.setattr(subprocess, 'check_output', lambda _: (DATA / 'sinfo.txt').read_bytes())
+
+    clusters = sinfo()
+
+    assert isinstance(clusters, dict)
+    assert len(clusters) == 3
+    assert list(clusters) == [None, 'nautilus', 'waves']
+
+    assert [len(partitions) for partitions in clusters.values()] == [2, 2, 2]
+
+    nautilus = clusters['nautilus']
+
+    assert isinstance(nautilus, dict)
+    assert len(nautilus) == 2
+    assert list(nautilus) == ['gpu', 'all']
+
+    gpu_nodes = nautilus['gpu']
+
+    assert len(gpu_nodes) == 2
+    assert [node.hostname for node in gpu_nodes] == ['gnode1', 'gnode2']
+    assert [node.cpu.allocated for node in gpu_nodes] == [4, 0]
+    assert [node.cpu.idle for node in gpu_nodes] == [92, 96]
+    assert [node.mem for node in gpu_nodes] == [768, 256]
+    assert [node.gpu.name for node in gpu_nodes] == ['A100', 'A100']
+    assert [node.gpu.nb for node in gpu_nodes] == [1, 2]
+
+    # Get only `idle` nodes
+    assert [len(partitions) for partitions in sinfo(with_states=('idle',)).values()] == [1, 2, 1]
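
Usage sketch (reviewer note, not part of the changeset): `sinfo()` returns a
`{cluster: {partition: [SlurmNode, ...]}}` mapping, so a consumer could walk it to
render the free resources. The `print_resources` helper below is illustrative only
and assumes the package is installed and the `sinfo` binary responds on the host:

    from glicid_spawner.slurm import sinfo

    def print_resources():
        """Print free CPU/GPU resources for every idle or mixed node."""
        for cluster, partitions in sinfo().items():
            for partition, nodes in partitions.items():
                for node in nodes:
                    # SlurmGpu.__bool__ is False when the node has no GPU
                    gpu = f' gpu={node.gpu.name}x{node.gpu.nb}' if node.gpu else ''
                    print(
                        f'{cluster or "N/A"}/{partition} {node.hostname}: '
                        f'{node.cpu.idle}/{node.cpu.total} CPUs idle, '
                        f'{node.mem} GB{gpu}'
                    )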