Support ROCm backing in DiHydrogen (#33563)

* Added support for building the DiHydrogen package and LBANN extensions
to DiHydrogen with ROCm libraries.

Fixed a bug on Cray systems where CMake did not search thoroughly enough to
find an MPI-compatible compiler wrapper.  On platform=cray the packages now
pass -DMPI_ASSUME_NO_BUILTIN_MPI=ON so that CMake keeps looking for one.

Added support for the roctracer package when using ROCm libraries.

* Fixed how ROCm support is defined for pre-v0.3 versions.
This commit is contained in:
Brian Van Essen 2022-10-27 12:19:56 -07:00 committed by GitHub
parent 4be67facdc
commit 6408b51def
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 30 additions and 8 deletions

View File

@ -83,9 +83,9 @@ class Dihydrogen(CMakePackage, CudaPackage, ROCmPackage):
for val in ROCmPackage.amdgpu_targets:
depends_on("aluminum amdgpu_target=%s" % val, when="amdgpu_target=%s" % val)
for when in ["+cuda", "+distconv"]:
depends_on("cuda", when=when)
depends_on("cudnn", when=when)
depends_on("roctracer-dev", when="+rocm +distconv")
depends_on("cudnn", when="+cuda +distconv")
depends_on("cub", when="^cuda@:10")
# Note that #1712 forces us to enumerate the different blas variants
@ -108,8 +108,8 @@ class Dihydrogen(CMakePackage, CudaPackage, ROCmPackage):
depends_on("cray-libsci", when="blas=libsci")
depends_on("cray-libsci +openmp", when="blas=libsci +openmp_blas")
# Distconv builds require cuda
conflicts("~cuda", when="+distconv")
# Distconv builds require cuda or rocm
conflicts("+distconv", when="~cuda ~rocm")
conflicts("+distconv", when="+half")
conflicts("+rocm", when="+half")
@ -120,6 +120,8 @@ class Dihydrogen(CMakePackage, CudaPackage, ROCmPackage):
depends_on("ninja", type="build")
depends_on("cmake@3.17.0:", type="build")
depends_on("spdlog", when="@:0.1,0.2:")
depends_on("llvm-openmp", when="%apple-clang +openmp")
# TODO: Debug linker errors when NVSHMEM is built with UCX
@ -155,10 +157,14 @@ def cmake_args(self):
"-DH2_ENABLE_DISTCONV_LEGACY=%s" % ("+distconv" in spec),
"-DH2_ENABLE_OPENMP=%s" % ("+openmp" in spec),
"-DH2_ENABLE_FP16=%s" % ("+half" in spec),
"-DH2_ENABLE_HIP_ROCM=%s" % ("+rocm" in spec),
"-DH2_DEVELOPER_BUILD=%s" % ("+developer" in spec),
]
if spec.version < Version("0.3"):
args.append("-DH2_ENABLE_HIP_ROCM=%s" % ("+rocm" in spec))
else:
args.append("-DH2_ENABLE_ROCM=%s" % ("+rocm" in spec))
if not spec.satisfies("^cmake@3.23.0"):
# There is a bug with using Ninja generator in this version
# of CMake
@ -181,7 +187,7 @@ def cmake_args(self):
if spec.satisfies("%cce") and spec.satisfies("^cuda+allow-unsupported-compilers"):
args.append("-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler")
if "+cuda" in spec or "+distconv" in spec:
if "+cuda" in spec:
args.append("-DcuDNN_DIR={0}".format(spec["cudnn"].prefix))
if spec.satisfies("^cuda@:10"):
@ -209,6 +215,12 @@ def cmake_args(self):
"-DHIP_CXX_COMPILER={0}".format(self.spec["hip"].hipcc),
]
)
if "platform=cray" in spec:
args.extend(
[
"-DMPI_ASSUME_NO_BUILTIN_MPI=ON",
]
)
archs = self.spec.variants["amdgpu_target"].value
if archs != "none":
arch_str = ",".join(archs)

View File

@ -167,7 +167,8 @@ class Lbann(CMakePackage, CudaPackage, ROCmPackage):
depends_on("dihydrogen +cuda", when="+dihydrogen +cuda")
depends_on("dihydrogen ~al", when="+dihydrogen ~al")
depends_on("dihydrogen +al", when="+dihydrogen +al")
depends_on("dihydrogen +distconv +cuda", when="+distconv")
depends_on("dihydrogen +distconv +cuda", when="+distconv +cuda")
depends_on("dihydrogen +distconv +rocm", when="+distconv +rocm")
depends_on("dihydrogen ~half", when="+dihydrogen ~half")
depends_on("dihydrogen +half", when="+dihydrogen +half")
depends_on("dihydrogen ~nvshmem", when="+dihydrogen ~nvshmem")
@ -191,6 +192,8 @@ class Lbann(CMakePackage, CudaPackage, ROCmPackage):
depends_on("aluminum amdgpu_target=%s" % val, when="+al amdgpu_target=%s" % val)
depends_on("dihydrogen amdgpu_target=%s" % val, when="+dihydrogen amdgpu_target=%s" % val)
depends_on("roctracer-dev", when="+rocm +distconv")
depends_on("cudnn", when="@0.90:0.100 +cuda")
depends_on("cudnn@8.0.2:", when="@:0.90,0.101: +cuda")
depends_on("cub", when="@0.94:0.98.2 +cuda ^cuda@:10")
@ -334,6 +337,7 @@ def cmake_args(self):
"-DLBANN_WITH_ONNX:BOOL=%s" % ("+onnx" in spec),
"-DLBANN_WITH_EMBEDDED_PYTHON:BOOL=%s" % ("+python" in spec),
"-DLBANN_WITH_PYTHON_FRONTEND:BOOL=%s" % ("+pfe" in spec),
"-DLBANN_WITH_ROCTRACER:BOOL=%s" % ("+rocm +distconv" in spec),
"-DLBANN_WITH_TBINF=OFF",
"-DLBANN_WITH_UNIT_TESTING:BOOL=%s" % ("+unit_tests" in spec),
"-DLBANN_WITH_VISION:BOOL=%s" % ("+vision" in spec),
@ -424,6 +428,12 @@ def cmake_args(self):
"-DHIP_CXX_COMPILER={0}".format(self.spec["hip"].hipcc),
]
)
if "platform=cray" in spec:
args.extend(
[
"-DMPI_ASSUME_NO_BUILTIN_MPI=ON",
]
)
archs = self.spec.variants["amdgpu_target"].value
if archs != "none":
arch_str = ",".join(archs)