Add aws ofi rccl (#32773)
* Added a package for the aws-ofi-rccl plug-in from the ROCm software stack. It allows RCCL to use the libfabric communication library. Added support for using libfabric in Aluminum. * Updated the run environment so that the plugin would get loaded. * Added support for setting up the LD_LIBRARY_PATH for dependent packages. * Added package for RCCL tests to assess the impact of OFI libfabric RCCL plug-in.
This commit is contained in:
parent
699f575976
commit
400a9f3df7
@ -50,7 +50,12 @@ class Aluminum(CMakePackage, CudaPackage, ROCmPackage):
|
||||
description="Builds with support for CUDA intra-node "
|
||||
" Put/Get and IPC RMA functionality",
|
||||
)
|
||||
variant("rccl", default=False, description="Builds with support for NCCL communication lib")
|
||||
variant("rccl", default=False, description="Builds with support for RCCL communication lib")
|
||||
variant(
|
||||
"ofi_rccl_plugin",
|
||||
default=False,
|
||||
description="Builds with support for OFI libfabric enhanced RCCL communication lib",
|
||||
)
|
||||
|
||||
depends_on("cmake@3.21.0:", type="build", when="@1.0.1:")
|
||||
depends_on("cmake@3.17.0:", type="build", when="@:1.0.0")
|
||||
@ -62,8 +67,13 @@ class Aluminum(CMakePackage, CudaPackage, ROCmPackage):
|
||||
depends_on("cub", when="@:0.1,0.6.0: +cuda ^cuda@:10")
|
||||
depends_on("hipcub", when="@:0.1,0.6.0: +rocm")
|
||||
|
||||
depends_on("rccl", when="+rccl")
|
||||
depends_on("aws-ofi-rccl", when="+ofi_rccl_plugin platform=cray")
|
||||
|
||||
conflicts("~cuda", when="+cuda_rma", msg="CUDA RMA support requires CUDA")
|
||||
conflicts("+cuda", when="+rocm", msg="CUDA and ROCm support are mutually exclusive")
|
||||
conflicts("+nccl", when="+rccl", msg="NCCL and RCCL support are mutually exclusive")
|
||||
conflicts("~rccl", when="+ofi_rccl_plugin", msg="libfabric enhancements require RCCL support")
|
||||
|
||||
generator = "Ninja"
|
||||
depends_on("ninja", type="build")
|
||||
|
64
var/spack/repos/builtin/packages/aws-ofi-rccl/package.py
Normal file
64
var/spack/repos/builtin/packages/aws-ofi-rccl/package.py
Normal file
@ -0,0 +1,64 @@
|
||||
# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other
|
||||
# Spack Project Developers. See the top-level COPYRIGHT file for details.
|
||||
#
|
||||
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
|
||||
|
||||
from spack.package import *
|
||||
|
||||
|
||||
class AwsOfiRccl(AutotoolsPackage):
    """AWS OFI RCCL is a plug-in which enables EC2 developers to use
    libfabric as a network provider while running AMD's RCCL based
    applications."""

    homepage = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl"
    git = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git"
    url = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git"
    tags = ["rocm"]

    maintainers = ["bvanessen"]

    # Both versions track a git branch rather than a released tarball.
    version("cxi", branch="cxi", default=True)
    version("master", branch="master")

    variant("enable-trace", default=False, description="Enable printing trace messages")
    variant("disable-tests", default=False, description="Disable build of tests")

    depends_on("libfabric")
    depends_on("hip")
    depends_on("rccl")
    depends_on("mpi")
    depends_on("autoconf", type="build")
    depends_on("automake", type="build")
    depends_on("libtool", type="build")

    def _expose_plugin_library(self, env):
        """Append this package's ``lib`` directory to ``LD_LIBRARY_PATH``.

        The plug-in only works with RCCL when its library directory is on
        ``LD_LIBRARY_PATH``; both run-environment hooks share this step.
        """
        env.append_path("LD_LIBRARY_PATH", self.spec.prefix.lib)

    def setup_run_environment(self, env):
        """Put the plug-in on ``LD_LIBRARY_PATH`` in this package's own
        run environment."""
        self._expose_plugin_library(env)

    def setup_dependent_run_environment(self, env, dependent_spec):
        """Put the plug-in on ``LD_LIBRARY_PATH`` in the run environment of
        packages that depend on this one."""
        self._expose_plugin_library(env)

    def configure_args(self):
        """Return the ``configure`` arguments for the plug-in build.

        Each of libfabric, hip, rccl and mpi is passed as a
        ``--with-<name>=<prefix>`` option, in that order. ``--enable-trace``
        and ``--disable-tests`` are appended when the variant of the same
        name is enabled.
        """
        spec = self.spec

        # One --with-<dep>=<prefix> option per dependency the build links to.
        located_deps = ("libfabric", "hip", "rccl", "mpi")
        flags = ["--with-{0}={1}".format(dep, spec[dep].prefix) for dep in located_deps]

        # Each variant name doubles as the configure switch it turns on.
        for switch in ("enable-trace", "disable-tests"):
            if "+" + switch in spec:
                flags.append("--" + switch)

        return flags
|
@ -13,7 +13,6 @@ class Libfabric(AutotoolsPackage):
|
||||
homepage = "https://libfabric.org/"
|
||||
url = "https://github.com/ofiwg/libfabric/releases/download/v1.8.0/libfabric-1.8.0.tar.bz2"
|
||||
git = "https://github.com/ofiwg/libfabric.git"
|
||||
|
||||
maintainers = ["rajachan"]
|
||||
|
||||
version("main", branch="main")
|
||||
|
40
var/spack/repos/builtin/packages/rccl-tests/package.py
Normal file
40
var/spack/repos/builtin/packages/rccl-tests/package.py
Normal file
@ -0,0 +1,40 @@
|
||||
# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other
|
||||
# Spack Project Developers. See the top-level COPYRIGHT file for details.
|
||||
#
|
||||
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
|
||||
|
||||
from spack.package import *
|
||||
|
||||
|
||||
class RcclTests(MakefilePackage):
    """These tests check both the performance and the correctness of RCCL
    operations. They can be compiled against RCCL."""

    homepage = "https://github.com/ROCmSoftwarePlatform/rccl-tests"
    git = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git"
    url = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git"
    tags = ["rocm"]

    maintainers = ["bvanessen"]

    # Both versions track a git branch rather than a released tarball.
    version("develop", branch="develop", default=True)
    version("master", branch="master")

    variant("mpi", default=True, description="with MPI support")

    depends_on("hip")
    depends_on("rccl")
    depends_on("mpi", when="+mpi")

    # MakefilePackage expands ``self.build_targets`` directly into the
    # ``make`` call, so it has to evaluate to a list. Left as a plain method
    # it would hand ``make`` a bound method and the build phase would fail,
    # hence the property.
    @property
    def build_targets(self):
        """Arguments passed to ``make`` during the build phase.

        Always sets ``HIP_HOME`` and ``RCCL_HOME`` to the prefixes of the
        corresponding dependencies. With the ``+mpi`` variant it also sets
        ``MPI_HOME`` and adds ``MPI=1``.
        """
        spec = self.spec
        targets = [
            "HIP_HOME={0}".format(spec["hip"].prefix),
            "RCCL_HOME={0}".format(spec["rccl"].prefix),
        ]
        if "+mpi" in spec:
            targets.append("MPI_HOME={0}".format(spec["mpi"].prefix))
            targets.append("MPI=1")
        return targets

    def install(self, spec, prefix):
        """Copy everything from the ``./build`` directory into
        ``prefix.bin``."""
        mkdirp(prefix.bin)
        install_tree("./build", prefix.bin)
|
Loading…
Reference in New Issue
Block a user