From ba86a0eb480c9d5bf78391cfabe60acf23371e40 Mon Sep 17 00:00:00 2001
From: Tara Drwenski
Date: Wed, 19 Mar 2025 14:16:41 -0600
Subject: [PATCH 1/5] Check for LSF, Flux, and Slurm when determining MPI exec

---
 lib/spack/spack/build_systems/cached_cmake.py | 90 +++++++++++++------
 1 file changed, 64 insertions(+), 26 deletions(-)

diff --git a/lib/spack/spack/build_systems/cached_cmake.py b/lib/spack/spack/build_systems/cached_cmake.py
index c9965bbc144..f9973275aa9 100644
--- a/lib/spack/spack/build_systems/cached_cmake.py
+++ b/lib/spack/spack/build_systems/cached_cmake.py
@@ -2,17 +2,19 @@
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 import collections.abc
+import enum
 import os
 import re
-from typing import Tuple
+from typing import Optional, Tuple

 import llnl.util.filesystem as fs
 import llnl.util.tty as tty

 import spack.phase_callbacks
-import spack.spec
 import spack.util.prefix
 from spack.directives import depends_on
+from spack.spec import Spec
+from spack.util.executable import which_string

 from .cmake import CMakeBuilder, CMakePackage

@@ -48,6 +50,62 @@ def cmake_cache_filepath(name, value, comment=""):
     return 'set({0} "{1}" CACHE FILEPATH "{2}")\n'.format(name, value, comment)


+class Scheduler(enum.Enum):
+    LSF = enum.auto()
+    SLURM = enum.auto()
+    FLUX = enum.auto()
+
+
+def get_scheduler(spec: Spec) -> Optional[Scheduler]:
+    if spec.satisfies("^spectrum-mpi") or spec["mpi"].satisfies("schedulers=lsf"):
+        return Scheduler.LSF
+
+    slurm_checks = ["+slurm", "schedulers=slurm", "process_managers=slurm"]
+    if any(spec["mpi"].satisfies(variant) for variant in slurm_checks):
+        return Scheduler.SLURM
+
+    # TODO improve this when MPI implementations support flux
+    if which_string("flux") is not None:
+        return Scheduler.FLUX
+
+    return None
+
+
+def get_mpi_exec(spec: Spec) -> Optional[str]:
+    scheduler = get_scheduler(spec)
+
+    if scheduler == Scheduler.LSF:
+        return which_string("lrun")
+
+    elif scheduler == Scheduler.SLURM:
+        if spec["mpi"].external:
+            return which_string("srun")
+        else:
+            return os.path.join(spec["slurm"].prefix.bin, "srun")
+
+    elif scheduler == Scheduler.FLUX:
+        flux = which_string("flux")
+        return f"{flux};run" if flux else None
+
+    elif hasattr(spec["mpi"].package, "mpiexec"):
+        return spec["mpi"].package.mpiexec
+
+    else:
+        mpiexec = os.path.join(spec["mpi"].prefix.bin, "mpirun")
+        if not os.path.exists(mpiexec):
+            mpiexec = os.path.join(spec["mpi"].prefix.bin, "mpiexec")
+        return mpiexec
+
+
+def get_mpi_exec_num_proc(spec: Spec) -> str:
+    scheduler = get_scheduler(spec)
+
+    if scheduler in [Scheduler.FLUX, Scheduler.LSF, Scheduler.SLURM]:
+        return "-n"
+    else:
+        return "-np"
+
+
 class CachedCMakeBuilder(CMakeBuilder):
     #: Phases of a Cached CMake package
     #: Note: the initconfig phase is used for developer builds as a final phase to stop on
@@ -199,27 +257,10 @@ def initconfig_mpi_entries(self):
         if hasattr(spec["mpi"], "mpifc"):
             entries.append(cmake_cache_path("MPI_Fortran_COMPILER", spec["mpi"].mpifc))

-        # Check for slurm
-        using_slurm = False
-        slurm_checks = ["+slurm", "schedulers=slurm", "process_managers=slurm"]
-        if any(spec["mpi"].satisfies(variant) for variant in slurm_checks):
-            using_slurm = True
-
         # Determine MPIEXEC
-        if using_slurm:
-            if spec["mpi"].external:
-                # Heuristic until we have dependents on externals
-                mpiexec = "/usr/bin/srun"
-            else:
-                mpiexec = os.path.join(spec["slurm"].prefix.bin, "srun")
-        elif hasattr(spec["mpi"].package, "mpiexec"):
-            mpiexec = spec["mpi"].package.mpiexec
-        else:
-            mpiexec = os.path.join(spec["mpi"].prefix.bin, "mpirun")
-            if not os.path.exists(mpiexec):
-                mpiexec = os.path.join(spec["mpi"].prefix.bin, "mpiexec")
+        mpiexec = get_mpi_exec(spec)

-        if not os.path.exists(mpiexec):
+        if mpiexec is None or not os.path.exists(mpiexec.split(";")[0]):
             msg = "Unable to determine MPIEXEC, %s tests may fail" % self.pkg.name
             entries.append("# {0}\n".format(msg))
             tty.warn(msg)
@@ -232,10 +273,7 @@ def initconfig_mpi_entries(self):
             entries.append(cmake_cache_path("MPIEXEC", mpiexec))

         # Determine MPIEXEC_NUMPROC_FLAG
-        if using_slurm:
-            entries.append(cmake_cache_string("MPIEXEC_NUMPROC_FLAG", "-n"))
-        else:
-            entries.append(cmake_cache_string("MPIEXEC_NUMPROC_FLAG", "-np"))
+        entries.append(cmake_cache_string("MPIEXEC_NUMPROC_FLAG", get_mpi_exec_num_proc(spec)))

         return entries

@@ -341,7 +379,7 @@ def initconfig_package_entries(self):
         return []

     def initconfig(
-        self, pkg: "CachedCMakePackage", spec: spack.spec.Spec, prefix: spack.util.prefix.Prefix
+        self, pkg: "CachedCMakePackage", spec: Spec, prefix: spack.util.prefix.Prefix
     ) -> None:
         cache_entries = (
             self.std_initconfig_entries()
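
The semicolon in the flux branch's return value, f"{flux};run", leans on CMake's list semantics: a cache variable holding "a;b" expands to two command-line words, so a single MPIEXEC entry can carry the two-word launcher `flux run`. That is also why the existence check above only tests the portion before the first ";". A minimal sketch of the resulting initconfig line, assuming cmake_cache_path mirrors the cmake_cache_filepath helper visible in the hunk context (with CACHE PATH in place of CACHE FILEPATH):

    # Sketch only: cmake_cache_path is assumed to follow the same pattern as
    # the cmake_cache_filepath helper shown in the diff context above.
    def cmake_cache_path(name, value, comment=""):
        return 'set({0} "{1}" CACHE PATH "{2}")\n'.format(name, value, comment)

    # CMake reads "a;b" as the two-element list [a, b], so this single cache
    # variable expands to the two-word launcher command `flux run`.
    print(cmake_cache_path("MPIEXEC", "/usr/bin/flux;run"), end="")
    # Output: set(MPIEXEC "/usr/bin/flux;run" CACHE PATH "")
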
"mpirun") - if not os.path.exists(mpiexec): - mpiexec = os.path.join(spec["mpi"].prefix.bin, "mpiexec") + mpiexec = get_mpi_exec(spec) - if not os.path.exists(mpiexec): + if mpiexec is None or not os.path.exists(mpiexec.split(";")[0]): msg = "Unable to determine MPIEXEC, %s tests may fail" % self.pkg.name entries.append("# {0}\n".format(msg)) tty.warn(msg) @@ -232,10 +273,7 @@ def initconfig_mpi_entries(self): entries.append(cmake_cache_path("MPIEXEC", mpiexec)) # Determine MPIEXEC_NUMPROC_FLAG - if using_slurm: - entries.append(cmake_cache_string("MPIEXEC_NUMPROC_FLAG", "-n")) - else: - entries.append(cmake_cache_string("MPIEXEC_NUMPROC_FLAG", "-np")) + entries.append(cmake_cache_string("MPIEXEC_NUMPROC_FLAG", get_mpi_exec_num_proc(spec))) return entries @@ -341,7 +379,7 @@ def initconfig_package_entries(self): return [] def initconfig( - self, pkg: "CachedCMakePackage", spec: spack.spec.Spec, prefix: spack.util.prefix.Prefix + self, pkg: "CachedCMakePackage", spec: Spec, prefix: spack.util.prefix.Prefix ) -> None: cache_entries = ( self.std_initconfig_entries() From aaadce699515307d940a94ac961c7dddbac58065 Mon Sep 17 00:00:00 2001 From: Tara Drwenski Date: Mon, 24 Mar 2025 15:38:02 -0600 Subject: [PATCH 2/5] Make scheduler/MPI exec helper functions methods of CachedCMakeBuilder --- lib/spack/spack/build_systems/cached_cmake.py | 114 +++++++++--------- 1 file changed, 56 insertions(+), 58 deletions(-) diff --git a/lib/spack/spack/build_systems/cached_cmake.py b/lib/spack/spack/build_systems/cached_cmake.py index f9973275aa9..b203954ec63 100644 --- a/lib/spack/spack/build_systems/cached_cmake.py +++ b/lib/spack/spack/build_systems/cached_cmake.py @@ -50,62 +50,6 @@ def cmake_cache_filepath(name, value, comment=""): return 'set({0} "{1}" CACHE FILEPATH "{2}")\n'.format(name, value, comment) -class Scheduler(enum.Enum): - LSF = enum.auto() - SLURM = enum.auto() - FLUX = enum.auto() - - -def get_scheduler(spec: Spec) -> Optional[Scheduler]: - if spec.satisfies("^spectrum-mpi") or spec["mpi"].satisfies("schedulers=lsf"): - return Scheduler.LSF - - slurm_checks = ["+slurm", "schedulers=slurm", "process_managers=slurm"] - if any(spec["mpi"].satisfies(variant) for variant in slurm_checks): - return Scheduler.SLURM - - # TODO improve this when MPI implementations support flux - if which_string("flux") is not None: - return Scheduler.FLUX - - return None - - -def get_mpi_exec(spec: Spec) -> Optional[str]: - scheduler = get_scheduler(spec) - - if scheduler == Scheduler.LSF: - return which_string("lrun") - - elif scheduler == Scheduler.SLURM: - if spec["mpi"].external: - return which_string("srun") - else: - return os.path.join(spec["slurm"].prefix.bin, "srun") - - elif scheduler == Scheduler.FLUX: - flux = which_string("flux") - return f"{flux};run" if flux else None - - elif hasattr(spec["mpi"].package, "mpiexec"): - return spec["mpi"].package.mpiexec - - else: - mpiexec = os.path.join(spec["mpi"].prefix.bin, "mpirun") - if not os.path.exists(mpiexec): - mpiexec = os.path.join(spec["mpi"].prefix.bin, "mpiexec") - return mpiexec - - -def get_mpi_exec_num_proc(spec: Spec) -> str: - scheduler = get_scheduler(spec) - - if scheduler in [Scheduler.FLUX, Scheduler.LSF, Scheduler.SLURM]: - return "-n" - else: - return "-np" - - class CachedCMakeBuilder(CMakeBuilder): #: Phases of a Cached CMake package #: Note: the initconfig phase is used for developer builds as a final phase to stop on @@ -238,6 +182,60 @@ def initconfig_compiler_entries(self): return entries + class Scheduler(enum.Enum): + LSF = 
From 51930949c40119d93128de25e05e1b435633fab3 Mon Sep 17 00:00:00 2001
From: Tara Drwenski
Date: Mon, 24 Mar 2025 15:59:53 -0600
Subject: [PATCH 3/5] Revert change to spack.spec import

---
 lib/spack/spack/build_systems/cached_cmake.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/spack/spack/build_systems/cached_cmake.py b/lib/spack/spack/build_systems/cached_cmake.py
index b203954ec63..552f5433294 100644
--- a/lib/spack/spack/build_systems/cached_cmake.py
+++ b/lib/spack/spack/build_systems/cached_cmake.py
@@ -11,9 +11,9 @@
 import llnl.util.tty as tty

 import spack.phase_callbacks
+import spack.spec
 import spack.util.prefix
 from spack.directives import depends_on
-from spack.spec import Spec
 from spack.util.executable import which_string

 from .cmake import CMakeBuilder, CMakePackage
@@ -377,7 +377,7 @@ def initconfig_package_entries(self):
         return []

     def initconfig(
-        self, pkg: "CachedCMakePackage", spec: Spec, prefix: spack.util.prefix.Prefix
+        self, pkg: "CachedCMakePackage", spec: spack.spec.Spec, prefix: spack.util.prefix.Prefix
     ) -> None:
         cache_entries = (
             self.std_initconfig_entries()

From dcf19395faa90331709785126aeaaf0a17a412c3 Mon Sep 17 00:00:00 2001
From: Tara Drwenski
Date: Mon, 24 Mar 2025 16:08:50 -0600
Subject: [PATCH 4/5] Add additional comments about logic to choose a
 scheduler to run MPI

---
 lib/spack/spack/build_systems/cached_cmake.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/spack/spack/build_systems/cached_cmake.py b/lib/spack/spack/build_systems/cached_cmake.py
index 552f5433294..7d522691aa6 100644
--- a/lib/spack/spack/build_systems/cached_cmake.py
+++ b/lib/spack/spack/build_systems/cached_cmake.py
@@ -189,14 +189,18 @@ class Scheduler(enum.Enum):

     def get_scheduler(self) -> Optional[Scheduler]:
         spec = self.pkg.spec
+
+        # Check for spectrum-mpi, which always uses LSF, or an MPI with an LSF scheduler variant
         if spec.satisfies("^spectrum-mpi") or spec["mpi"].satisfies("schedulers=lsf"):
             return self.Scheduler.LSF

+        # Check for Slurm MPI variants
         slurm_checks = ["+slurm", "schedulers=slurm", "process_managers=slurm"]
         if any(spec["mpi"].satisfies(variant) for variant in slurm_checks):
             return self.Scheduler.SLURM

         # TODO improve this when MPI implementations support flux
+        # Do this check last to avoid using a flux wrapper present next to Slurm/LSF schedulers
         if which_string("flux") is not None:
             return self.Scheduler.FLUX

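
The ordering these comments document (spectrum-mpi/LSF first, Slurm variants next, a flux executable only as a fallback) is easy to check in isolation. A stand-in sketch, with plain data replacing the concretized spec and the PATH lookup (pick_scheduler and its arguments are hypothetical names):

    # Reproduces only the precedence logic of get_scheduler, so it can be
    # exercised without a real Spec or a scheduler installed.
    def pick_scheduler(mpi_facts, flux_on_path):
        if "^spectrum-mpi" in mpi_facts or "schedulers=lsf" in mpi_facts:
            return "LSF"
        slurm_checks = ("+slurm", "schedulers=slurm", "process_managers=slurm")
        if any(check in mpi_facts for check in slurm_checks):
            return "SLURM"
        if flux_on_path:
            return "FLUX"
        return None

    # A flux wrapper on PATH must not shadow a real LSF or Slurm scheduler.
    assert pick_scheduler({"schedulers=lsf"}, flux_on_path=True) == "LSF"
    assert pick_scheduler({"process_managers=slurm"}, flux_on_path=True) == "SLURM"
    assert pick_scheduler(set(), flux_on_path=True) == "FLUX"
    assert pick_scheduler(set(), flux_on_path=False) is None
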
From 269b3aa6bfe5039564302774d072270c7fa8b424 Mon Sep 17 00:00:00 2001
From: Tara Drwenski
Date: Mon, 24 Mar 2025 16:35:05 -0600
Subject: [PATCH 5/5] Remove axom workaround for running MPI on machines with
 flux

---
 var/spack/repos/builtin/packages/axom/package.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/var/spack/repos/builtin/packages/axom/package.py b/var/spack/repos/builtin/packages/axom/package.py
index 7c67a3176b6..c9ee774813a 100644
--- a/var/spack/repos/builtin/packages/axom/package.py
+++ b/var/spack/repos/builtin/packages/axom/package.py
@@ -8,7 +8,6 @@
 from os.path import join as pjoin

 from spack.package import *
-from spack.util.executable import which_string


 def get_spec_path(spec, package_name, path_replacements={}, use_bin=False):
@@ -452,19 +451,6 @@ def initconfig_mpi_entries(self):
             entries.append(cmake_cache_option("ENABLE_MPI", True))
             if spec["mpi"].name == "spectrum-mpi":
                 entries.append(cmake_cache_string("BLT_MPI_COMMAND_APPEND", "mpibind"))
-
-            # Replace /usr/bin/srun path with srun flux wrapper path on TOSS 4
-            # TODO: Remove this logic by adding `using_flux` case in
-            # spack/lib/spack/spack/build_systems/cached_cmake.py:196 and remove hard-coded
-            # path to srun in same file.
-            if "toss_4" in self._get_sys_type(spec):
-                srun_wrapper = which_string("srun")
-                mpi_exec_index = [
-                    index for index, entry in enumerate(entries) if "MPIEXEC_EXECUTABLE" in entry
-                ]
-                if mpi_exec_index:
-                    del entries[mpi_exec_index[0]]
-                entries.append(cmake_cache_path("MPIEXEC_EXECUTABLE", srun_wrapper))
         else:
             entries.append(cmake_cache_option("ENABLE_MPI", False))
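
With the workaround gone, axom inherits launcher selection from CachedCMakeBuilder, so a TOSS 4 machine with flux on PATH now gets its MPIEXEC from the base class's flux branch instead of a package-level rewrite of the entries list. Illustratively, the MPI section of the generated initconfig would then contain entries along these lines (paths are host-dependent, and the CACHE STRING form is assumed to parallel the FILEPATH helper from patch 1):

    # Illustrative output only; actual paths depend on the host.
    entries = [
        'set(MPIEXEC "/usr/bin/flux;run" CACHE PATH "")\n',
        'set(MPIEXEC_NUMPROC_FLAG "-n" CACHE STRING "")\n',
    ]
    print("".join(entries), end="")
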