Compare commits

...

3 Commits

Author SHA1 Message Date
Andrey Portnoy
5722c147de [CUDA] Update calls to cudaMemAdvise and cudaGraphAddDependencies for CUDA 13 (#2525)
* [CUDA] Update cudaMemAdvise and cudaGraphAddDependencies for CUDA 13

These functions' signatures changed in CUDA 13, so we differentiate
between CUDA 13 and preceding releases at compile time.

* Mention NVIDIA in ACKNOWLEDGMENTS.md
2025-08-21 19:57:20 -07:00
Cheng
f6819a1f26 Fix warning 186-D from nvcc (#2527) 2025-08-22 10:29:55 +09:00
Awni Hannun
f93f87c802 nccl dep + default for cuda (#2526) 2025-08-21 17:57:49 -07:00
7 changed files with 33 additions and 9 deletions

View File

@@ -25,6 +25,11 @@ MLX was developed with contributions from the following individuals:
<img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />
</a>
+ # Organizations
+ MLX has received contributions from the following companies:
+ - NVIDIA Corporation & Affiliates
# Third-Party Software
MLX leverages several third-party software, listed here together with

View File

@@ -30,8 +30,15 @@ SmallSizePool::SmallSizePool() {
next_free_ = buffer_;
CHECK_CUDA_ERROR(cudaMallocManaged(&data_, small_pool_size));
+ #if CUDART_VERSION >= 13000
+   cudaMemLocation loc;
+   loc.type = cudaMemLocationTypeDevice;
+   loc.id = 0;
+ #else
+   int loc = 0;
+ #endif // CUDART_VERSION >= 13000
CHECK_CUDA_ERROR(
-   cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetReadMostly, 0));
+   cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetReadMostly, loc));
auto curr = next_free_;
for (size_t i = 1; i < num_blocks; ++i) {
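For reference, the CUDA 13 change distilled into a standalone sketch (a minimal program, not MLX's allocator; the buffer size and device 0 are placeholders): cudaMemAdvise now takes a cudaMemLocation rather than a bare device ordinal, so the advice target is built differently depending on CUDART_VERSION.

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  void* data = nullptr;
  size_t size = 1 << 20;
  cudaMallocManaged(&data, size);

#if CUDART_VERSION >= 13000
  // CUDA 13: the advice target is a cudaMemLocation struct.
  cudaMemLocation loc;
  loc.type = cudaMemLocationTypeDevice;
  loc.id = 0;
#else
  // CUDA 12 and earlier: the advice target is a plain device ordinal.
  int loc = 0;
#endif

  cudaError_t err =
      cudaMemAdvise(data, size, cudaMemAdviseSetReadMostly, loc);
  if (err != cudaSuccess) {
    std::printf("cudaMemAdvise failed: %s\n", cudaGetErrorString(err));
  }
  cudaFree(data);
  return 0;
}
```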

View File

@@ -269,7 +269,13 @@ void CommandEncoder::commit() {
if (node_count_ > 0) {
if (!from_nodes_.empty()) {
CHECK_CUDA_ERROR(cudaGraphAddDependencies(
- graph_, from_nodes_.data(), to_nodes_.data(), from_nodes_.size()));
+ graph_,
+ from_nodes_.data(),
+ to_nodes_.data(),
+ #if CUDART_VERSION >= 13000
+ nullptr, // edgeData
+ #endif // CUDART_VERSION >= 13000
+ from_nodes_.size()));
}
graph_key_ += ".";
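The same pattern applies to cudaGraphAddDependencies, which gained a cudaGraphEdgeData* parameter in CUDA 13; passing nullptr keeps the default edge behavior. A minimal sketch assuming a hypothetical helper named add_dependencies (MLX does this inline in CommandEncoder::commit):

```cpp
#include <cuda_runtime.h>
#include <vector>

// Illustrative helper: add edges between two node lists, handling the
// CUDA 13 signature change (an extra cudaGraphEdgeData* parameter).
inline cudaError_t add_dependencies(
    cudaGraph_t graph,
    const std::vector<cudaGraphNode_t>& from,
    const std::vector<cudaGraphNode_t>& to) {
  return cudaGraphAddDependencies(
      graph,
      from.data(),
      to.data(),
#if CUDART_VERSION >= 13000
      nullptr, // edgeData: default edges, same behavior as before
#endif
      from.size());
}
```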

View File

@@ -205,8 +205,10 @@ struct Power {
if constexpr (cuda::std::is_integral_v<T>) {
T res = 1;
// Raising an integer to a negative power is undefined
- if (exp < 0) {
-   return 0;
- }
+ if constexpr (cuda::std::is_signed_v<T>) {
+   if (exp < 0) {
+     return 0;
+   }
+ }
while (exp) {
if (exp & 1) {
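The warning being silenced is nvcc's 186-D (a pointless comparison of an unsigned value with zero), which fires when `exp < 0` is instantiated with an unsigned T. A standalone sketch of the fixed integer-power branch, with an illustrative function name rather than the exact MLX functor:

```cpp
#include <cuda/std/type_traits>

// Sketch of the integer branch of the Power op after the fix
// (the function name is illustrative, not MLX's operator).
template <typename T>
__host__ __device__ T int_pow(T base, T exp) {
  T res = 1;
  // Raising an integer to a negative power is undefined; guarding the
  // comparison with a compile-time signedness check means unsigned T
  // never instantiates the `exp < 0` test that nvcc flags as 186-D.
  if constexpr (cuda::std::is_signed_v<T>) {
    if (exp < 0) {
      return 0;
    }
  }
  // Exponentiation by squaring.
  while (exp) {
    if (exp & 1) {
      res *= base;
    }
    exp >>= 1;
    base *= base;
  }
  return res;
}
```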

View File

@@ -20,6 +20,8 @@ from select import select
from subprocess import PIPE, Popen, run
from typing import Optional
+ import mlx.core as mx
@dataclass
class Host:
@@ -437,8 +439,8 @@ def launch_nccl(parser, hosts, args, command):
try:
for rank in range(world_size):
env = base_env.copy()
env["MLX_RANK"] = str(rank)
env["CUDA_VISIBLE_DEVICES"] = str(rank % args.nproc_per_node)
env["MLX_RANK"] = str(rank % args.repeat_hosts)
env["CUDA_VISIBLE_DEVICES"] = str(rank % args.repeat_hosts)
p = Popen(command, env=env)
procs.append(p)
@@ -708,7 +710,7 @@ def distributed_config():
parser.add_argument(
"--backend",
choices=["ring", "mpi", "nccl"],
default="ring",
default="nccl" if mx.cuda.is_available() else "ring",
help="Which distributed backend to configure",
)
parser.add_argument(
@@ -780,7 +782,7 @@ def main():
parser.add_argument(
"--backend",
choices=["ring", "mpi", "nccl"],
default="ring",
default="nccl" if mx.cuda.is_available() else "ring",
help="Which distributed backend to launch",
)
parser.add_argument(

View File

@@ -6,6 +6,7 @@ auditwheel repair dist/* \
--exclude libnvrtc* \
--exclude libcuda* \
--exclude libcudnn* \
+ --exclude libnccl* \
-w wheel_tmp
@@ -17,7 +18,7 @@ rm "${repaired_wheel}"
mlx_so="mlx/lib/libmlx.so"
rpath=$(patchelf --print-rpath "${mlx_so}")
base="\$ORIGIN/../../nvidia"
- rpath=$rpath:${base}/cublas/lib:${base}/cuda_nvrtc/lib:${base}/cudnn/lib
+ rpath=$rpath:${base}/cublas/lib:${base}/cuda_nvrtc/lib:${base}/cudnn/lib:${base}/nccl/lib
patchelf --force-rpath --set-rpath "$rpath" "$mlx_so"
python ../python/scripts/repair_record.py ${mlx_so}

View File

@@ -297,6 +297,7 @@ if __name__ == "__main__":
"nvidia-cublas-cu12==12.9.*",
"nvidia-cuda-nvrtc-cu12==12.9.*",
"nvidia-cudnn-cu12==9.*",
"nvidia-nccl-cu12",
]
else:
name = "mlx-cpu"