spawner/src/glicid_spawner/slurm.py

159 lines
3.9 KiB
Python
Raw Normal View History

2024-02-08 16:20:59 +01:00
"""SLURM module."""
import re
import shlex
import subprocess
from dataclasses import dataclass, field
from itertools import groupby
from operator import attrgetter
2024-02-14 17:09:12 +01:00
from pathlib import Path
2024-02-08 16:20:59 +01:00
@dataclass
class SlurmCpu:
    """SLURM CPU counters of a node.

    Stores the allocated, idle and total CPU counts.  Every value is
    coerced with ``int()`` right after initialisation, so numeric strings
    are accepted as well.
    """

    allocated: int
    idle: int
    total: int

    def __post_init__(self):
        """Cast each counter to ``int`` (in declaration order)."""
        for counter in ('allocated', 'idle', 'total'):
            setattr(self, counter, int(getattr(self, counter)))
@dataclass
class SlurmGpu:
    """SLURM GPU description.

    ``name`` defaults to the string ``'None'`` and ``nb`` to ``0``; an
    instance is truthy only when it holds at least one GPU.
    """

    name: str = 'None'
    nb: int = 0

    def __post_init__(self):
        """Normalise the fields: capitalised name, integer count."""
        label = str(self.name)
        self.name = label.capitalize()
        self.nb = int(self.nb)

    def __bool__(self):
        """Tell whether at least one GPU is present."""
        return self.nb > 0

    def __str__(self):
        """Return the (capitalised) GPU name."""
        return self.name
2024-02-08 16:20:59 +01:00
@dataclass
class SlurmNode:
    """SLURM node built from the raw text fields of one ``sinfo`` line.

    The constructor receives the raw (string) fields and normalises them:

    * ``cluster``, ``partition`` and ``hostname`` are stripped;
    * ``state`` is stripped and lower-cased;
    * ``cpus_state`` must contain four ``/``-separated integers; the
      first, second and fourth ones feed ``SlurmCpu`` (allocated, idle,
      total), the third one is ignored;
    * ``memory_mb`` is converted with ``int(memory_mb) // 1000``;
    * ``gres`` is parsed as ``gpu:<name>:<count>`` when it contains
      ``'gpu:'``, otherwise a default ``SlurmGpu`` is used.

    Raises:
        IndexError: if ``cpus_state`` does not match the expected pattern,
            or if ``gres`` contains ``'gpu:'`` without matching
            ``gpu:<name>:<count>``.
    """

    cluster: str
    partition: str
    hostname: str
    state: str
    cpu: SlurmCpu
    mem: int
    gpu: SlurmGpu

    def __init__(self, cluster, partition, hostname, state, cpus_state, memory_mb, gres):  # noqa: PLR0913
        self.cluster = cluster.strip()
        self.partition = partition.strip()
        self.hostname = hostname.strip()
        self.state = state.strip().lower()

        # Keep groups 1, 2 and 4 of the four slash-separated counters.
        cpu_counts = re.findall(r'(\d+)/(\d+)/\d+/(\d+)', cpus_state)[0]
        self.cpu = SlurmCpu(*cpu_counts)

        self.mem = int(memory_mb) // 1000  # in GB

        # Without a GPU entry, fall back on the SlurmGpu defaults.
        if 'gpu:' in gres:
            gpu_fields = re.findall(r'gpu:(\w+):(\d+)', gres)[0]
        else:
            gpu_fields = []
        self.gpu = SlurmGpu(*gpu_fields)

    def __str__(self):
        """Return the node hostname."""
        return self.hostname
@dataclass
class SlurmPartition:
    """SLURM partition: a named collection of nodes.

    Iterating over a partition yields its nodes.  Each node is expected
    to expose ``gpu.name``, ``cpu.idle`` and ``mem``.
    """

    name: str
    nodes: list

    def __str__(self):
        """Return the partition name."""
        return self.name

    def __iter__(self):
        """Iterate over the nodes of the partition."""
        return iter(self.nodes)

    @property
    def gpus(self) -> str:
        """List of GPUs available.

        Distinct GPU names of the nodes joined with ``':'``.  The names
        are sorted so that the result is deterministic (joining a bare
        ``set`` would give an order that changes between interpreter
        runs).  An empty partition yields an empty string.
        """
        return ':'.join(sorted({node.gpu.name for node in self.nodes}))

    @property
    def max_idle_cpu(self) -> int:
        """Maximum of idle CPU available (``0`` for an empty partition)."""
        return max((node.cpu.idle for node in self.nodes), default=0)

    @property
    def max_mem(self) -> int:
        """Maximum of memory available (``0`` for an empty partition)."""
        return max((node.mem for node in self.nodes), default=0)
@dataclass
class SlurmCluster:
    """SLURM cluster: a named collection of partitions.

    Iterating over a cluster yields its partitions.  Equality is based on
    the string form only, so a cluster compares equal to its own name and
    to any object whose ``str()`` is that name.
    """

    name: str
    partitions: list

    def __str__(self):
        """Return the cluster name."""
        return self.name

    def __iter__(self):
        """Iterate over the partitions of the cluster."""
        return iter(self.partitions)

    def __eq__(self, other):
        """Compare on the string representation of both operands."""
        return str(self) == str(other)

    def __hash__(self):
        """Hash consistently with ``__eq__``.

        Defining ``__eq__`` alone leaves the class unhashable.  Hashing the
        string form keeps ``hash(cluster) == hash(cluster.name)``, so a
        cluster and its name are interchangeable as dict/set keys.
        """
        return hash(str(self))
2024-02-08 16:20:59 +01:00
def sinfo_run(username: str = None) -> str:
    """SLURM SINFO run command.

    Runs ``sinfo`` and returns its standard output decoded as UTF-8.

    Args:
        username: when truthy, the command is executed through
            ``su - <username> -c <command>``; otherwise ``sinfo`` is
            invoked directly.

    Returns:
        The raw ``sinfo`` output.

    Raises:
        subprocess.CalledProcessError: if the command exits with a
            non-zero status.
    """
    flags = '--federation --noheader --responding'
    fmt = 'Cluster,PartitionName,NodeHost,StateLong,CPUsState,Memory,Gres'
    cmd = f'sinfo {flags} --Format={fmt}'

    if username:
        # Build the argument vector explicitly instead of formatting a
        # string and re-splitting it: the whole sinfo command must reach
        # ``su -c`` as ONE argument *without* surrounding quote
        # characters (``shlex.split(..., posix=False)`` keeps them), and
        # the username must stay a single argument whatever it contains.
        argv = ['su', '-', username, '-c', cmd]
    else:
        argv = shlex.split(cmd)

    return subprocess.check_output(argv).decode('utf-8')
2024-02-08 16:20:59 +01:00
def sinfo_reader(result: str) -> list:
    """SLURM SINFO reader.

    Parses the raw ``sinfo`` output, one node per line.  Each line is cut
    into consecutive 20-character fields (a trailing remainder shorter
    than 20 characters is ignored) which are passed positionally to
    ``SlurmNode``.

    Blank or whitespace-only lines are skipped instead of being turned
    into a ``SlurmNode`` call with missing arguments.

    Args:
        result: raw text as produced by ``sinfo``.

    Returns:
        The list of parsed ``SlurmNode`` objects (empty for empty input).
    """
    return [
        SlurmNode(*re.findall('.{20}', line))
        for line in result.splitlines()
        if line.strip()
    ]
def sinfo_filter(resources: list, with_states=('idle', 'mixed')) -> dict:
    """SLURM SINFO filtered resources available with a given state(s).

    Grouped by cluster and partition names.

    Args:
        resources: nodes exposing ``cluster``, ``partition`` and ``state``
            attributes.
        with_states: container of the states to keep (tested with ``in``).

    Returns:
        ``{cluster: {partition: [nodes]}}`` holding only the matching
        nodes.  Clusters and partitions without any matching node are
        omitted; keys and nodes keep their order of first appearance.
    """
    # Accumulate explicitly rather than with ``itertools.groupby``:
    # groupby only merges *consecutive* items, so on input that is not
    # already sorted a later run of the same cluster/partition would
    # silently overwrite the earlier one.
    available = {}
    for node in resources:
        if node.state in with_states:
            partitions = available.setdefault(node.cluster, {})
            partitions.setdefault(node.partition, []).append(node)
    return available
2024-02-14 17:09:12 +01:00
def sinfo_from_file(fname, with_states=('idle', 'mixed')) -> dict:
    """SLURM SINFO resources available from a given file.

    Args:
        fname: path of a file holding raw ``sinfo`` output.
        with_states: node states to keep, forwarded to ``sinfo_filter``.

    Returns:
        The filtered resources as returned by ``sinfo_filter``.
    """
    # Read as UTF-8 explicitly, like the live output decoded in
    # ``sinfo_run``, instead of relying on the locale default encoding.
    content = Path(fname).read_text(encoding='utf-8')
    return sinfo_filter(sinfo_reader(content), with_states=with_states)
def sinfo(username: str = None, with_states=('idle', 'mixed')) -> dict:
    """SLURM SINFO resources available for a given user.

    Chains the three steps: run ``sinfo`` (``sinfo_run``), parse its
    output (``sinfo_reader``) and keep the nodes whose state is in
    ``with_states`` (``sinfo_filter``).
    """
    raw_output = sinfo_run(username=username)
    nodes = sinfo_reader(raw_output)
    return sinfo_filter(nodes, with_states=with_states)