Support ROCm backing in DiHydrogen (#33563)
* Added support for building the DiHydrogen package, and the LBANN extensions to DiHydrogen, with ROCm libraries. Fixed a bug on Cray systems where CMake did not try hard enough to find an MPI-compatible compiler wrapper; it is now made to search further by passing `-DMPI_ASSUME_NO_BUILTIN_MPI=ON`. Added support for the roctracer package when using ROCm libraries.
* Fixed how ROCm support is defined for pre-v0.3 versions.
This commit is contained in:
parent
4be67facdc
commit
6408b51def
@ -83,9 +83,9 @@ class Dihydrogen(CMakePackage, CudaPackage, ROCmPackage):
|
|||||||
for val in ROCmPackage.amdgpu_targets:
|
for val in ROCmPackage.amdgpu_targets:
|
||||||
depends_on("aluminum amdgpu_target=%s" % val, when="amdgpu_target=%s" % val)
|
depends_on("aluminum amdgpu_target=%s" % val, when="amdgpu_target=%s" % val)
|
||||||
|
|
||||||
for when in ["+cuda", "+distconv"]:
|
depends_on("roctracer-dev", when="+rocm +distconv")
|
||||||
depends_on("cuda", when=when)
|
|
||||||
depends_on("cudnn", when=when)
|
depends_on("cudnn", when="+cuda +distconv")
|
||||||
depends_on("cub", when="^cuda@:10")
|
depends_on("cub", when="^cuda@:10")
|
||||||
|
|
||||||
# Note that #1712 forces us to enumerate the different blas variants
|
# Note that #1712 forces us to enumerate the different blas variants
|
||||||
@ -108,8 +108,8 @@ class Dihydrogen(CMakePackage, CudaPackage, ROCmPackage):
|
|||||||
depends_on("cray-libsci", when="blas=libsci")
|
depends_on("cray-libsci", when="blas=libsci")
|
||||||
depends_on("cray-libsci +openmp", when="blas=libsci +openmp_blas")
|
depends_on("cray-libsci +openmp", when="blas=libsci +openmp_blas")
|
||||||
|
|
||||||
# Distconv builds require cuda
|
# Distconv builds require cuda or rocm
|
||||||
conflicts("~cuda", when="+distconv")
|
conflicts("+distconv", when="~cuda ~rocm")
|
||||||
|
|
||||||
conflicts("+distconv", when="+half")
|
conflicts("+distconv", when="+half")
|
||||||
conflicts("+rocm", when="+half")
|
conflicts("+rocm", when="+half")
|
||||||
@ -120,6 +120,8 @@ class Dihydrogen(CMakePackage, CudaPackage, ROCmPackage):
|
|||||||
depends_on("ninja", type="build")
|
depends_on("ninja", type="build")
|
||||||
depends_on("cmake@3.17.0:", type="build")
|
depends_on("cmake@3.17.0:", type="build")
|
||||||
|
|
||||||
|
depends_on("spdlog", when="@:0.1,0.2:")
|
||||||
|
|
||||||
depends_on("llvm-openmp", when="%apple-clang +openmp")
|
depends_on("llvm-openmp", when="%apple-clang +openmp")
|
||||||
|
|
||||||
# TODO: Debug linker errors when NVSHMEM is built with UCX
|
# TODO: Debug linker errors when NVSHMEM is built with UCX
|
||||||
@ -155,10 +157,14 @@ def cmake_args(self):
|
|||||||
"-DH2_ENABLE_DISTCONV_LEGACY=%s" % ("+distconv" in spec),
|
"-DH2_ENABLE_DISTCONV_LEGACY=%s" % ("+distconv" in spec),
|
||||||
"-DH2_ENABLE_OPENMP=%s" % ("+openmp" in spec),
|
"-DH2_ENABLE_OPENMP=%s" % ("+openmp" in spec),
|
||||||
"-DH2_ENABLE_FP16=%s" % ("+half" in spec),
|
"-DH2_ENABLE_FP16=%s" % ("+half" in spec),
|
||||||
"-DH2_ENABLE_HIP_ROCM=%s" % ("+rocm" in spec),
|
|
||||||
"-DH2_DEVELOPER_BUILD=%s" % ("+developer" in spec),
|
"-DH2_DEVELOPER_BUILD=%s" % ("+developer" in spec),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
if spec.version < Version("0.3"):
|
||||||
|
args.append("-DH2_ENABLE_HIP_ROCM=%s" % ("+rocm" in spec))
|
||||||
|
else:
|
||||||
|
args.append("-DH2_ENABLE_ROCM=%s" % ("+rocm" in spec))
|
||||||
|
|
||||||
if not spec.satisfies("^cmake@3.23.0"):
|
if not spec.satisfies("^cmake@3.23.0"):
|
||||||
# There is a bug with using Ninja generator in this version
|
# There is a bug with using Ninja generator in this version
|
||||||
# of CMake
|
# of CMake
|
||||||
@ -181,7 +187,7 @@ def cmake_args(self):
|
|||||||
if spec.satisfies("%cce") and spec.satisfies("^cuda+allow-unsupported-compilers"):
|
if spec.satisfies("%cce") and spec.satisfies("^cuda+allow-unsupported-compilers"):
|
||||||
args.append("-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler")
|
args.append("-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler")
|
||||||
|
|
||||||
if "+cuda" in spec or "+distconv" in spec:
|
if "+cuda" in spec:
|
||||||
args.append("-DcuDNN_DIR={0}".format(spec["cudnn"].prefix))
|
args.append("-DcuDNN_DIR={0}".format(spec["cudnn"].prefix))
|
||||||
|
|
||||||
if spec.satisfies("^cuda@:10"):
|
if spec.satisfies("^cuda@:10"):
|
||||||
@ -209,6 +215,12 @@ def cmake_args(self):
|
|||||||
"-DHIP_CXX_COMPILER={0}".format(self.spec["hip"].hipcc),
|
"-DHIP_CXX_COMPILER={0}".format(self.spec["hip"].hipcc),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
if "platform=cray" in spec:
|
||||||
|
args.extend(
|
||||||
|
[
|
||||||
|
"-DMPI_ASSUME_NO_BUILTIN_MPI=ON",
|
||||||
|
]
|
||||||
|
)
|
||||||
archs = self.spec.variants["amdgpu_target"].value
|
archs = self.spec.variants["amdgpu_target"].value
|
||||||
if archs != "none":
|
if archs != "none":
|
||||||
arch_str = ",".join(archs)
|
arch_str = ",".join(archs)
|
||||||
|
@ -167,7 +167,8 @@ class Lbann(CMakePackage, CudaPackage, ROCmPackage):
|
|||||||
depends_on("dihydrogen +cuda", when="+dihydrogen +cuda")
|
depends_on("dihydrogen +cuda", when="+dihydrogen +cuda")
|
||||||
depends_on("dihydrogen ~al", when="+dihydrogen ~al")
|
depends_on("dihydrogen ~al", when="+dihydrogen ~al")
|
||||||
depends_on("dihydrogen +al", when="+dihydrogen +al")
|
depends_on("dihydrogen +al", when="+dihydrogen +al")
|
||||||
depends_on("dihydrogen +distconv +cuda", when="+distconv")
|
depends_on("dihydrogen +distconv +cuda", when="+distconv +cuda")
|
||||||
|
depends_on("dihydrogen +distconv +rocm", when="+distconv +rocm")
|
||||||
depends_on("dihydrogen ~half", when="+dihydrogen ~half")
|
depends_on("dihydrogen ~half", when="+dihydrogen ~half")
|
||||||
depends_on("dihydrogen +half", when="+dihydrogen +half")
|
depends_on("dihydrogen +half", when="+dihydrogen +half")
|
||||||
depends_on("dihydrogen ~nvshmem", when="+dihydrogen ~nvshmem")
|
depends_on("dihydrogen ~nvshmem", when="+dihydrogen ~nvshmem")
|
||||||
@ -191,6 +192,8 @@ class Lbann(CMakePackage, CudaPackage, ROCmPackage):
|
|||||||
depends_on("aluminum amdgpu_target=%s" % val, when="+al amdgpu_target=%s" % val)
|
depends_on("aluminum amdgpu_target=%s" % val, when="+al amdgpu_target=%s" % val)
|
||||||
depends_on("dihydrogen amdgpu_target=%s" % val, when="+dihydrogen amdgpu_target=%s" % val)
|
depends_on("dihydrogen amdgpu_target=%s" % val, when="+dihydrogen amdgpu_target=%s" % val)
|
||||||
|
|
||||||
|
depends_on("roctracer-dev", when="+rocm +distconv")
|
||||||
|
|
||||||
depends_on("cudnn", when="@0.90:0.100 +cuda")
|
depends_on("cudnn", when="@0.90:0.100 +cuda")
|
||||||
depends_on("cudnn@8.0.2:", when="@:0.90,0.101: +cuda")
|
depends_on("cudnn@8.0.2:", when="@:0.90,0.101: +cuda")
|
||||||
depends_on("cub", when="@0.94:0.98.2 +cuda ^cuda@:10")
|
depends_on("cub", when="@0.94:0.98.2 +cuda ^cuda@:10")
|
||||||
@ -334,6 +337,7 @@ def cmake_args(self):
|
|||||||
"-DLBANN_WITH_ONNX:BOOL=%s" % ("+onnx" in spec),
|
"-DLBANN_WITH_ONNX:BOOL=%s" % ("+onnx" in spec),
|
||||||
"-DLBANN_WITH_EMBEDDED_PYTHON:BOOL=%s" % ("+python" in spec),
|
"-DLBANN_WITH_EMBEDDED_PYTHON:BOOL=%s" % ("+python" in spec),
|
||||||
"-DLBANN_WITH_PYTHON_FRONTEND:BOOL=%s" % ("+pfe" in spec),
|
"-DLBANN_WITH_PYTHON_FRONTEND:BOOL=%s" % ("+pfe" in spec),
|
||||||
|
"-DLBANN_WITH_ROCTRACER:BOOL=%s" % ("+rocm +distconv" in spec),
|
||||||
"-DLBANN_WITH_TBINF=OFF",
|
"-DLBANN_WITH_TBINF=OFF",
|
||||||
"-DLBANN_WITH_UNIT_TESTING:BOOL=%s" % ("+unit_tests" in spec),
|
"-DLBANN_WITH_UNIT_TESTING:BOOL=%s" % ("+unit_tests" in spec),
|
||||||
"-DLBANN_WITH_VISION:BOOL=%s" % ("+vision" in spec),
|
"-DLBANN_WITH_VISION:BOOL=%s" % ("+vision" in spec),
|
||||||
@ -424,6 +428,12 @@ def cmake_args(self):
|
|||||||
"-DHIP_CXX_COMPILER={0}".format(self.spec["hip"].hipcc),
|
"-DHIP_CXX_COMPILER={0}".format(self.spec["hip"].hipcc),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
if "platform=cray" in spec:
|
||||||
|
args.extend(
|
||||||
|
[
|
||||||
|
"-DMPI_ASSUME_NO_BUILTIN_MPI=ON",
|
||||||
|
]
|
||||||
|
)
|
||||||
archs = self.spec.variants["amdgpu_target"].value
|
archs = self.spec.variants["amdgpu_target"].value
|
||||||
if archs != "none":
|
if archs != "none":
|
||||||
arch_str = ",".join(archs)
|
arch_str = ",".join(archs)
|
||||||
|
Loading…
Reference in New Issue
Block a user