CachedCMakePackage: Improve finding mpiexec for non-slurm machines (#49033)

* Check for LSF, Flux, and Slurm when determining MPI exec

* Make scheduler/MPI exec helper functions methods of CachedCMakeBuilder

* Remove axom workaround for running MPI on machines with flux
Tara Drwenski 2025-03-31 10:11:09 -06:00 committed by GitHub
parent 46f7737626
commit 0f64f1baec
2 changed files with 64 additions and 38 deletions

lib/spack/spack/build_systems/cached_cmake.py

@@ -2,9 +2,10 @@
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 import collections.abc
+import enum
 import os
 import re
-from typing import Tuple
+from typing import Optional, Tuple
 
 import llnl.util.filesystem as fs
 import llnl.util.tty as tty
@@ -13,6 +14,7 @@
 import spack.spec
 import spack.util.prefix
 from spack.directives import depends_on
+from spack.util.executable import which_string
 
 from .cmake import CMakeBuilder, CMakePackage
@@ -178,6 +180,64 @@ def initconfig_compiler_entries(self):
 
         return entries
 
+    class Scheduler(enum.Enum):
+        LSF = enum.auto()
+        SLURM = enum.auto()
+        FLUX = enum.auto()
+
+    def get_scheduler(self) -> Optional[Scheduler]:
+        spec = self.pkg.spec
+
+        # Check for Spectrum-mpi, which always uses LSF, or an LSF MPI variant
+        if spec.satisfies("^spectrum-mpi") or spec["mpi"].satisfies("schedulers=lsf"):
+            return self.Scheduler.LSF
+
+        # Check for Slurm MPI variants
+        slurm_checks = ["+slurm", "schedulers=slurm", "process_managers=slurm"]
+        if any(spec["mpi"].satisfies(variant) for variant in slurm_checks):
+            return self.Scheduler.SLURM
+
+        # TODO improve this when MPI implementations support flux
+        # Do this check last to avoid using a flux wrapper present next to Slurm/LSF schedulers
+        if which_string("flux") is not None:
+            return self.Scheduler.FLUX
+
+        return None
+
+    def get_mpi_exec(self) -> Optional[str]:
+        spec = self.pkg.spec
+        scheduler = self.get_scheduler()
+
+        if scheduler == self.Scheduler.LSF:
+            return which_string("lrun")
+
+        elif scheduler == self.Scheduler.SLURM:
+            if spec["mpi"].external:
+                return which_string("srun")
+            else:
+                return os.path.join(spec["slurm"].prefix.bin, "srun")
+
+        elif scheduler == self.Scheduler.FLUX:
+            flux = which_string("flux")
+            return f"{flux};run" if flux else None
+
+        elif hasattr(spec["mpi"].package, "mpiexec"):
+            return spec["mpi"].package.mpiexec
+
+        else:
+            mpiexec = os.path.join(spec["mpi"].prefix.bin, "mpirun")
+            if not os.path.exists(mpiexec):
+                mpiexec = os.path.join(spec["mpi"].prefix.bin, "mpiexec")
+            return mpiexec
+
+    def get_mpi_exec_num_proc(self) -> str:
+        scheduler = self.get_scheduler()
+
+        if scheduler in [self.Scheduler.FLUX, self.Scheduler.LSF, self.Scheduler.SLURM]:
+            return "-n"
+        else:
+            return "-np"
+
     def initconfig_mpi_entries(self):
         spec = self.pkg.spec
 
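Note on the ordering above: get_scheduler consults the spec for LSF and Slurm before probing PATH for flux, so a flux wrapper installed alongside a real LSF or Slurm scheduler never wins. A minimal standalone sketch of that precedence, with shutil.which standing in for Spack's which_string and plain booleans standing in for the real spec queries (the detect_scheduler name is illustrative, not part of this commit):

import enum
import shutil
from typing import Optional

class Scheduler(enum.Enum):
    LSF = enum.auto()
    SLURM = enum.auto()
    FLUX = enum.auto()

def detect_scheduler(uses_lsf: bool, uses_slurm: bool) -> Optional[Scheduler]:
    # Spec-driven checks first, PATH probing last, mirroring get_scheduler().
    if uses_lsf:
        return Scheduler.LSF
    if uses_slurm:
        return Scheduler.SLURM
    if shutil.which("flux") is not None:
        return Scheduler.FLUX
    return None

# On a Slurm machine that also ships a flux wrapper, Slurm still wins:
assert detect_scheduler(uses_lsf=False, uses_slurm=True) is Scheduler.SLURM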
@@ -197,27 +257,10 @@ def initconfig_mpi_entries(self):
         if hasattr(spec["mpi"], "mpifc"):
             entries.append(cmake_cache_path("MPI_Fortran_COMPILER", spec["mpi"].mpifc))
 
-        # Check for slurm
-        using_slurm = False
-        slurm_checks = ["+slurm", "schedulers=slurm", "process_managers=slurm"]
-        if any(spec["mpi"].satisfies(variant) for variant in slurm_checks):
-            using_slurm = True
-
         # Determine MPIEXEC
-        if using_slurm:
-            if spec["mpi"].external:
-                # Heuristic until we have dependents on externals
-                mpiexec = "/usr/bin/srun"
-            else:
-                mpiexec = os.path.join(spec["slurm"].prefix.bin, "srun")
-        elif hasattr(spec["mpi"].package, "mpiexec"):
-            mpiexec = spec["mpi"].package.mpiexec
-        else:
-            mpiexec = os.path.join(spec["mpi"].prefix.bin, "mpirun")
-            if not os.path.exists(mpiexec):
-                mpiexec = os.path.join(spec["mpi"].prefix.bin, "mpiexec")
+        mpiexec = self.get_mpi_exec()
 
-        if not os.path.exists(mpiexec):
+        if mpiexec is None or not os.path.exists(mpiexec.split(";")[0]):
             msg = "Unable to determine MPIEXEC, %s tests may fail" % self.pkg.name
             entries.append("# {0}\n".format(msg))
             tty.warn(msg)
@@ -230,10 +273,7 @@ def initconfig_mpi_entries(self):
             entries.append(cmake_cache_path("MPIEXEC", mpiexec))
 
         # Determine MPIEXEC_NUMPROC_FLAG
-        if using_slurm:
-            entries.append(cmake_cache_string("MPIEXEC_NUMPROC_FLAG", "-n"))
-        else:
-            entries.append(cmake_cache_string("MPIEXEC_NUMPROC_FLAG", "-np"))
+        entries.append(cmake_cache_string("MPIEXEC_NUMPROC_FLAG", self.get_mpi_exec_num_proc()))
 
         return entries
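The f"{flux};run" return value and the mpiexec.split(";")[0] existence check are two halves of one trick: CMake treats semicolons as list separators, so a cached launcher of /usr/bin/flux;run expands to the two-word command `flux run`, while only the first element is a real file that can be stat'ed. A rough sketch of the host-config lines this produces on a Flux machine; the cmake_cache_* functions below are simplified stand-ins for Spack's helpers, and the flux path is made up:

def cmake_cache_path(name: str, value: str) -> str:
    # Simplified stand-in for Spack's cmake_cache_path helper.
    return f'set({name} "{value}" CACHE PATH "")\n'

def cmake_cache_string(name: str, value: str) -> str:
    # Simplified stand-in for Spack's cmake_cache_string helper.
    return f'set({name} "{value}" CACHE STRING "")\n'

mpiexec = "/usr/bin/flux;run"  # what get_mpi_exec() returns under Flux
print(cmake_cache_path("MPIEXEC", mpiexec), end="")
print(cmake_cache_string("MPIEXEC_NUMPROC_FLAG", "-n"), end="")
# Output:
# set(MPIEXEC "/usr/bin/flux;run" CACHE PATH "")
# set(MPIEXEC_NUMPROC_FLAG "-n" CACHE STRING "")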

var/spack/repos/builtin/packages/axom/package.py

@@ -8,7 +8,6 @@
 from os.path import join as pjoin
 
 from spack.package import *
-from spack.util.executable import which_string
 
 
 def get_spec_path(spec, package_name, path_replacements={}, use_bin=False):
@@ -452,19 +451,6 @@ def initconfig_mpi_entries(self):
             entries.append(cmake_cache_option("ENABLE_MPI", True))
             if spec["mpi"].name == "spectrum-mpi":
                 entries.append(cmake_cache_string("BLT_MPI_COMMAND_APPEND", "mpibind"))
-
-            # Replace /usr/bin/srun path with srun flux wrapper path on TOSS 4
-            # TODO: Remove this logic by adding `using_flux` case in
-            # spack/lib/spack/spack/build_systems/cached_cmake.py:196 and remove hard-coded
-            # path to srun in same file.
-            if "toss_4" in self._get_sys_type(spec):
-                srun_wrapper = which_string("srun")
-                mpi_exec_index = [
-                    index for index, entry in enumerate(entries) if "MPIEXEC_EXECUTABLE" in entry
-                ]
-                if mpi_exec_index:
-                    del entries[mpi_exec_index[0]]
-                entries.append(cmake_cache_path("MPIEXEC_EXECUTABLE", srun_wrapper))
         else:
             entries.append(cmake_cache_option("ENABLE_MPI", False))
 
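With the hard-coded TOSS 4 block gone, axom simply inherits the Flux-aware launcher from the base class: on a machine where flux is on PATH, get_mpi_exec() already returns the flux;run pair, so no entry surgery is needed. A hedged sketch of how any other CachedCMakeBuilder subclass might branch on the new helpers (the MyLibBuilder class is hypothetical, not part of this commit):

from spack.package import *


class MyLibBuilder(CachedCMakeBuilder):  # hypothetical subclass
    def initconfig_mpi_entries(self):
        entries = super().initconfig_mpi_entries()
        # The scheduler helpers are now plain builder methods, so
        # per-machine workarounds like the deleted block above are moot.
        if self.get_scheduler() == self.Scheduler.FLUX:
            entries.append("# MPI tests will launch via `flux run`\n")
        return entries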