Add aws ofi rccl (#32773)

* Added a package for the aws-ofi-rccl plug-in from the ROCm software
stack.  It allows RCCL to use the libfabric communication library.

Added support for using libfabric in Aluminum.

* Updated the run environment so that the plugin would get loaded.

* Added support for setting up the LD_LIBRARY_PATH for dependent packages.

* Added package for RCCL tests to assess the impact of OFI libfabric RCCL plug-in.
This commit is contained in:
Brian Van Essen 2022-09-29 11:46:27 -06:00 committed by GitHub
parent 699f575976
commit 400a9f3df7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 115 additions and 2 deletions

View File

@ -50,7 +50,12 @@ class Aluminum(CMakePackage, CudaPackage, ROCmPackage):
description="Builds with support for CUDA intra-node "
" Put/Get and IPC RMA functionality",
)
variant("rccl", default=False, description="Builds with support for NCCL communication lib")
variant("rccl", default=False, description="Builds with support for RCCL communication lib")
variant(
"ofi_rccl_plugin",
default=False,
description="Builds with support for OFI libfabric enhanced RCCL communication lib",
)
depends_on("cmake@3.21.0:", type="build", when="@1.0.1:")
depends_on("cmake@3.17.0:", type="build", when="@:1.0.0")
@ -62,8 +67,13 @@ class Aluminum(CMakePackage, CudaPackage, ROCmPackage):
depends_on("cub", when="@:0.1,0.6.0: +cuda ^cuda@:10")
depends_on("hipcub", when="@:0.1,0.6.0: +rocm")
depends_on("rccl", when="+rccl")
depends_on("aws-ofi-rccl", when="+ofi_rccl_plugin platform=cray")
conflicts("~cuda", when="+cuda_rma", msg="CUDA RMA support requires CUDA")
conflicts("+cuda", when="+rocm", msg="CUDA and ROCm support are mutually exclusive")
conflicts("+nccl", when="+rccl", msg="NCCL and RCCL support are mutually exclusive")
conflicts("~rccl", when="+ofi_rccl_plugin", msg="libfabric enhancements require RCCL support")
generator = "Ninja"
depends_on("ninja", type="build")

View File

@ -0,0 +1,64 @@
# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
from spack.package import *
class AwsOfiRccl(AutotoolsPackage):
    """AWS OFI RCCL is a plug-in which enables EC2 developers to use
    libfabric as a network provider while running AMD's RCCL based
    applications."""

    homepage = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl"
    git = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git"
    url = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git"
    tags = ["rocm"]

    maintainers = ["bvanessen"]

    # ``preferred=True`` is the documented keyword for marking the version
    # that should be selected by default; ``default=True`` is not a keyword
    # the version directive acts on.
    version("cxi", branch="cxi", preferred=True)
    version("master", branch="master")

    variant("enable-trace", default=False, description="Enable printing trace messages")
    variant("disable-tests", default=False, description="Disable build of tests")

    depends_on("libfabric")
    depends_on("hip")
    depends_on("rccl")
    depends_on("mpi")

    # Both versions are git branches, so the configure script has to be
    # generated at build time.
    depends_on("autoconf", type="build")
    depends_on("automake", type="build")
    depends_on("libtool", type="build")

    def setup_run_environment(self, env):
        """Add this package's ``lib`` directory to ``LD_LIBRARY_PATH``.

        The plug-in has to be on ``LD_LIBRARY_PATH`` to work with RCCL.
        """
        env.append_path("LD_LIBRARY_PATH", self.prefix.lib)

    def setup_dependent_run_environment(self, env, dependent_spec):
        """Add this package's ``lib`` directory to ``LD_LIBRARY_PATH`` in
        the run environment of packages that depend on the plug-in.
        """
        env.append_path("LD_LIBRARY_PATH", self.prefix.lib)

    def configure_args(self):
        """Return the ``configure`` arguments.

        Each dependency's install prefix is passed explicitly, followed by
        the optional flags selected through variants.
        """
        spec = self.spec
        args = [
            "--with-libfabric={0}".format(spec["libfabric"].prefix),
            "--with-hip={0}".format(spec["hip"].prefix),
            "--with-rccl={0}".format(spec["rccl"].prefix),
            "--with-mpi={0}".format(spec["mpi"].prefix),
        ]

        if spec.satisfies("+enable-trace"):
            args.append("--enable-trace")

        if spec.satisfies("+disable-tests"):
            args.append("--disable-tests")

        return args

View File

@ -13,7 +13,6 @@ class Libfabric(AutotoolsPackage):
homepage = "https://libfabric.org/"
url = "https://github.com/ofiwg/libfabric/releases/download/v1.8.0/libfabric-1.8.0.tar.bz2"
git = "https://github.com/ofiwg/libfabric.git"
maintainers = ["rajachan"]
version("main", branch="main")

View File

@ -0,0 +1,40 @@
# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
from spack.package import *
class RcclTests(MakefilePackage):
    """These tests check both the performance and the correctness of RCCL
    operations. They can be compiled against RCCL."""

    homepage = "https://github.com/ROCmSoftwarePlatform/rccl-tests"
    git = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git"
    url = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git"
    tags = ["rocm"]

    maintainers = ["bvanessen"]

    # ``preferred=True`` is the documented keyword for marking the version
    # that should be selected by default; ``default=True`` is not a keyword
    # the version directive acts on.
    version("develop", branch="develop", preferred=True)
    version("master", branch="master")

    variant("mpi", default=True, description="with MPI support")

    depends_on("hip")
    depends_on("rccl")
    depends_on("mpi", when="+mpi")

    # MakefilePackage reads ``build_targets`` as an attribute and unpacks it
    # into the ``make`` call, so a computed value must be a property rather
    # than a plain method.
    @property
    def build_targets(self):
        """Arguments passed to ``make`` for the build phase.

        Supplies the HIP and RCCL install prefixes, and with ``+mpi`` also
        the MPI prefix together with ``MPI=1``.
        """
        spec = self.spec
        targets = [
            "HIP_HOME={0}".format(spec["hip"].prefix),
            "RCCL_HOME={0}".format(spec["rccl"].prefix),
        ]

        if spec.satisfies("+mpi"):
            targets.append("MPI_HOME={0}".format(spec["mpi"].prefix))
            targets.append("MPI=1")

        return targets

    def install(self, spec, prefix):
        """Copy everything under ``./build`` into the install ``bin``
        directory."""
        mkdirp(prefix.bin)
        install_tree("./build", prefix.bin)