"""SLURM module."""

import re
import shlex
import subprocess
from dataclasses import dataclass, field
from itertools import groupby
from operator import attrgetter
from pathlib import Path
from typing import Optional

# Columns requested from ``sinfo``; the order matches ``SlurmNode.__init__``.
_SINFO_FIELDS = (
    'Cluster',
    'PartitionName',
    'NodeHost',
    'StateLong',
    'CPUsState',
    'Memory',
    'Gres',
)

# Width (in characters) of every column in the ``sinfo --Format`` output
# consumed by ``sinfo_reader``.
_FIELD_WIDTH = 20
_COLUMN = re.compile('.{%d}' % _FIELD_WIDTH)

# ``CPUsState`` looks like ``allocated/idle/other/total``; "other" is skipped.
_CPUS_STATE = re.compile(r'(\d+)/(\d+)/\d+/(\d+)')

# ``gpu:<type>:<count>`` or the untyped ``gpu:<count>``; the type may contain
# characters such as ``-`` or ``.`` and stops at ``:``, ``,`` or ``(``.
_GRES_GPU = re.compile(r'gpu:(?:([^:,(]+):)?(\d+)')


@dataclass
class SlurmCpu:
    """SLURM CPU counters of a node (values are coerced to ``int``)."""

    allocated: int
    idle: int
    total: int

    def __post_init__(self):
        # The counters usually arrive as regex-captured strings.
        self.allocated = int(self.allocated)
        self.idle = int(self.idle)
        self.total = int(self.total)


@dataclass
class SlurmGpu:
    """SLURM GPU model and count of a node.

    Without arguments it describes "no GPU": name ``'None'`` and ``nb == 0``,
    which makes the instance falsy.
    """

    name: str = field(default='None')
    nb: int = field(default=0)

    def __post_init__(self):
        self.name = str(self.name).capitalize()
        self.nb = int(self.nb)

    def __bool__(self):
        return self.nb > 0

    def __str__(self):
        return self.name


@dataclass
class SlurmNode:
    """SLURM node built from the raw text columns of one ``sinfo`` line."""

    cluster: str
    partition: str
    hostname: str
    state: str
    cpu: SlurmCpu
    mem: int
    gpu: SlurmGpu

    def __init__(self, cluster, partition, hostname, state, cpus_state, memory_mb, gres):  # noqa: PLR0913
        """Parse the raw (possibly space padded) ``sinfo`` columns.

        Raises ``IndexError`` when ``cpus_state`` does not contain an
        ``allocated/idle/other/total`` quadruplet and ``ValueError`` when
        ``memory_mb`` is not an integer.
        """
        self.cluster = cluster.strip()
        self.partition = partition.strip()
        self.hostname = hostname.strip()
        self.state = state.strip().lower()
        self.cpu = SlurmCpu(*_CPUS_STATE.findall(cpus_state)[0])
        self.mem = int(memory_mb) // 1000  # in GB
        self.gpu = self._parse_gpu(gres)

    @staticmethod
    def _parse_gpu(gres) -> SlurmGpu:
        """Extract the first GPU entry of a GRES string.

        Untyped entries (``gpu:4``) are reported with the generic name
        ``'gpu'`` instead of failing; a GRES without a usable GPU entry gives
        the default "no GPU" object.
        """
        found = _GRES_GPU.search(gres)
        if found is None:
            return SlurmGpu()
        name, count = found.groups()
        return SlurmGpu(name or 'gpu', count)

    def __str__(self):
        return self.hostname


@dataclass
class SlurmPartition:
    """SLURM partition: a named collection of nodes.

    Iterating yields the nodes; equality compares the partition name with
    ``str(other)``, so a partition can be compared with a plain string.
    """

    name: str
    nodes: list

    def __str__(self):
        return self.name

    def __iter__(self):
        return iter(self.nodes)

    def __len__(self):
        return len(self.nodes)

    def __eq__(self, other):
        return str(self) == str(other)

    @property
    def gpus(self) -> str:
        """Distinct GPU names of the nodes, ``:`` separated.

        Names are sorted so the result is stable between runs (joining a bare
        ``set`` of strings is not). Nodes without GPU contribute ``'None'``.
        """
        return ':'.join(sorted({node.gpu.name for node in self.nodes}))

    @property
    def max_idle_cpu(self) -> int:
        """Maximum of idle CPU available on a single node (0 if no node)."""
        return max((node.cpu.idle for node in self.nodes), default=0)

    @property
    def max_mem(self) -> int:
        """Maximum of memory available on a single node (0 if no node)."""
        return max((node.mem for node in self.nodes), default=0)


@dataclass
class SlurmCluster:
    """SLURM cluster: a named collection of partitions.

    Iterating yields the partitions; equality compares the cluster name with
    ``str(other)``.
    """

    name: str
    partitions: list

    def __str__(self):
        return self.name

    def __iter__(self):
        return iter(self.partitions)

    def __len__(self):
        return len(self.partitions)

    def __eq__(self, other):
        return str(self) == str(other)


def sinfo_run(username: Optional[str] = None) -> str:
    """SLURM SINFO run command.

    Runs ``sinfo`` and returns its decoded output. When ``username`` is given
    the command is executed through ``su - <username> -c <command>``.

    Raises ``subprocess.CalledProcessError`` if the command exits with a
    non-zero status (``subprocess.check_output`` behaviour).
    """
    fmt = ','.join(_SINFO_FIELDS)
    argv = ['sinfo', '--federation', '--noheader', '--responding', f'--Format={fmt}']

    if username:
        # Build the argument vector directly: the user name is never parsed by
        # a shell on our side and ``su -c`` receives the command as a single,
        # properly quoted argument (non-POSIX ``shlex.split`` would keep the
        # literal double quotes in that argument).
        argv = ['su', '-', username, '-c', shlex.join(argv)]

    return subprocess.check_output(argv).decode('utf-8')


def sinfo_reader(result: str) -> list:
    """SLURM SINFO reader.

    Splits every non-blank line into fixed-width columns and builds one
    ``SlurmNode`` per line. Lines whose trailing padding was stripped are
    padded back to the full width so the last column is not lost.
    """
    line_width = _FIELD_WIDTH * len(_SINFO_FIELDS)
    return [
        SlurmNode(*_COLUMN.findall(line.ljust(line_width)))
        for line in result.splitlines()
        if line.strip()
    ]


def sinfo_filter(resources: list, with_states=('idle', 'mixed')) -> dict:
    """SLURM SINFO filtered resources available with a given state(s).

    Grouped by cluster and partition names:
    ``{cluster: {partition: [nodes, ...]}}``. Clusters and partitions without
    any matching node are omitted.

    ``groupby`` only batches *consecutive* items, so the runs are merged:
    a cluster/partition that shows up again later in ``resources`` extends
    the existing entry instead of replacing it. Keys keep their first-seen
    order.
    """
    grouped = {}
    by_cluster_partition = attrgetter('cluster', 'partition')

    for (cluster, partition), nodes in groupby(resources, key=by_cluster_partition):
        if available := [node for node in nodes if node.state in with_states]:
            grouped.setdefault(cluster, {}).setdefault(partition, []).extend(available)

    return grouped


def sinfo_from_file(fname, with_states=('idle', 'mixed')) -> dict:
    """SLURM SINFO resources available from a given file.

    The file is read as UTF-8, the same encoding used to decode the live
    ``sinfo`` output.
    """
    content = Path(fname).read_text(encoding='utf-8')
    return sinfo_filter(sinfo_reader(content), with_states=with_states)


def sinfo(username: Optional[str] = None, with_states=('idle', 'mixed')) -> dict:
    """SLURM SINFO resources available for a given user."""
    return sinfo_filter(sinfo_reader(sinfo_run(username=username)), with_states=with_states)