Bugfix HIP and aluminum rocm build (#33344)

* Fixed two bugs in the HIP package recipe.  The first is that the
HIP_PATH was being set to the actual spec, and not the spec prefix.

The second bug is that HIP is expected to be in /opt/rocm-x.y.z/hip
but its libraries can exist at both /opt/rocm-x.y.z/hip/lib and
/opt/rocm-x.y.z/lib.  This means that the external detection logic may
find it in either, and it turns out that some modules only expose one
of those two locations.  Logic is added to ensure that the internal
HIP_PATH and associated ROCM_PATH are correctly set in both cases.

* Added support for Aluminum to use the libfabric plugin with either
RCCL or NCCL.
This commit is contained in:
Brian Van Essen 2022-10-17 13:07:27 -07:00 committed by GitHub
parent 9b87b4c8cd
commit 47bfc60845
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 7 deletions

View File

@ -52,9 +52,16 @@ class Aluminum(CMakePackage, CudaPackage, ROCmPackage):
)
variant("rccl", default=False, description="Builds with support for RCCL communication lib")
variant(
"ofi_rccl_plugin",
default=False,
description="Builds with support for OFI libfabric enhanced RCCL communication lib",
"ofi_libfabric_plugin",
default=True,
when="+rccl platform=cray",
description="Builds with support for OFI libfabric enhanced RCCL/NCCL communication lib",
)
variant(
"ofi_libfabric_plugin",
default=True,
when="+nccl platform=cray",
description="Builds with support for OFI libfabric enhanced RCCL/NCCL communication lib",
)
depends_on("cmake@3.21.0:", type="build", when="@1.0.1:")
@ -68,12 +75,12 @@ class Aluminum(CMakePackage, CudaPackage, ROCmPackage):
depends_on("hipcub", when="@:0.1,0.6.0: +rocm")
depends_on("rccl", when="+rccl")
depends_on("aws-ofi-rccl", when="+ofi_rccl_plugin platform=cray")
depends_on("aws-ofi-rccl", when="+rccl +ofi_libfabric_plugin platform=cray")
depends_on("aws-ofi-nccl", when="+nccl +ofi_libfabric_plugin platform=cray")
conflicts("~cuda", when="+cuda_rma", msg="CUDA RMA support requires CUDA")
conflicts("+cuda", when="+rocm", msg="CUDA and ROCm support are mutually exclusive")
conflicts("+nccl", when="+rccl", msg="NCCL and RCCL support are mutually exclusive")
conflicts("~rccl", when="+ofi_rccl_plugin", msg="libfabric enhancements require RCCL support")
generator = "Ninja"
depends_on("ninja", type="build")

View File

@ -288,12 +288,22 @@ def get_paths(self):
if self.spec.external:
# For external packages we only assume the `hip` prefix is known,
# because spack does not set prefixes of dependencies of externals.
hip_libs_at_top = os.path.basename(self.spec.prefix) != "hip"
# We assume self.spec.prefix is /opt/rocm-x.y.z for rocm-5.2.0 and newer
# and /opt/rocm-x.y.z/hip for older versions
if self.spec.satisfies("@5.2.0:"):
rocm_prefix = Prefix(self.spec.prefix)
else:
rocm_prefix = Prefix(os.path.dirname(self.spec.prefix))
# We assume self.spec.prefix is /opt/rocm-x.y.z/hip and rocm has a
# default installation with everything installed under
# /opt/rocm-x.y.z
# Note that since the key hip library can also exist at the top of the
# /opt/rocm-x.y.z/lib tree, it is possible that the package is detected
# without the correct prefix. Work around it.
if hip_libs_at_top:
rocm_prefix = Prefix(self.spec.prefix)
else:
rocm_prefix = Prefix(os.path.dirname(self.spec.prefix))
if not os.path.isdir(rocm_prefix):
msg = "Could not determine prefix for other rocm components\n"
@ -302,7 +312,13 @@ def get_paths(self):
msg += "a workaround."
raise RuntimeError(msg)
if hip_libs_at_top:
hip_path = "{0}/hip".format(self.spec.prefix)
else:
hip_path = self.spec.prefix
paths = {
"hip-path": hip_path,
"rocm-path": rocm_prefix,
"llvm-amdgpu": rocm_prefix.llvm,
"hsa-rocr-dev": rocm_prefix.hsa,
@ -311,6 +327,7 @@ def get_paths(self):
}
else:
paths = {
"hip-path": self.spec.prefix,
"rocm-path": self.spec.prefix,
"llvm-amdgpu": self.spec["llvm-amdgpu"].prefix,
"hsa-rocr-dev": self.spec["hsa-rocr-dev"].prefix,
@ -374,7 +391,7 @@ def set_variables(self, env):
env.set("HIP_DEVICE_LIB_PATH", paths["bitcode"])
# Just the prefix of hip (used in hipcc)
env.set("HIP_PATH", paths["rocm-path"])
env.set("HIP_PATH", paths["hip-path"])
# Used in comgr and seems necessary when using the JIT compiler, e.g.
# hiprtcCreateProgram: