Make allocator::malloc throw on allocation failure (#2874)

[CUDA] Release build for cuda 13 (#2872)
[CUDA] Faster general copy (#2873)
2025-12-16 01:49:05 +08:00 · 2025-12-05 17:44:38 +09:00 · 2025-12-04 21:42:26 -08:00 · 2025-12-04 21:42:15 -08:00
10 changed files with 68 additions and 55 deletions
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -131,6 +131,7 @@ jobs:
    strategy:
      matrix:
        arch: ['x86_64', 'aarch64']
+        toolkit: ['cuda-12.9', 'cuda-13.0']
    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22-large' || 'ubuntu-22-large-arm' }}
    env:
      PYPI_RELEASE: 1
@@ -139,7 +140,7 @@ jobs:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-linux
        with:
-          toolkit: 'cuda-12.9'
+          toolkit: ${{ matrix.toolkit }}
      - name: Build Python package
        uses: ./.github/actions/build-cuda-release
        with:
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -29,17 +29,20 @@ MLX has a CUDA backend which you can install with:

 .. code-block:: shell

-    pip install mlx[cuda]
+    pip install mlx[cuda12]
+

 To install the CUDA package from PyPi your system must meet the following
 requirements:

-- Nvidia architecture >= SM 7.0 (Volta)
+- Nvidia architecture >= SM 7.5
 - Nvidia driver >= 550.54.14
 - CUDA toolkit >= 12.0
 - Linux distribution with glibc >= 2.35
 - Python >= 3.10

+For CUDA 13 use ``pip install mlx[cuda13]``. The CUDA 13 package requires
+an Nvidia driver >= 580 or an appropriate CUDA compatibility package.

 CPU-only (Linux)
 ^^^^^^^^^^^^^^^^
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@@ -1,7 +1,6 @@
 target_sources(
  mlx
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
--- a/mlx/allocator.cpp
+++ b/mlx/allocator.cpp
@@ -1,24 +0,0 @@
-// Copyright © 2023 Apple Inc.
-
-#include <cstdlib>
-#include <sstream>
-
-#include "mlx/allocator.h"
-
-namespace mlx::core::allocator {
-
-Buffer malloc(size_t size) {
-  auto buffer = allocator().malloc(size);
-  if (size && !buffer.ptr()) {
-    std::ostringstream msg;
-    msg << "[malloc] Unable to allocate " << size << " bytes.";
-    throw std::runtime_error(msg.str());
-  }
-  return buffer;
-}
-
-void free(Buffer buffer) {
-  allocator().free(buffer);
-}
-
-} // namespace mlx::core::allocator
--- a/mlx/allocator.h
+++ b/mlx/allocator.h
@@ -28,10 +28,6 @@ class Buffer {
  };
 };

-Buffer malloc(size_t size);
-
-void free(Buffer buffer);
-
 class Allocator {
  /** Abstract base class for a memory allocator. */
 public:
@@ -49,4 +45,12 @@ class Allocator {

 Allocator& allocator();

+inline Buffer malloc(size_t size) {
+  return allocator().malloc(size);
+}
+
+inline void free(Buffer buffer) {
+  allocator().free(buffer);
+}
+
 } // namespace mlx::core::allocator
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -157,16 +157,14 @@ CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
      cudaError_t err;
      void* data = nullptr;
      if (device == -1) {
-        err = cudaMallocManaged(&data, size);
+        CHECK_CUDA_ERROR(cudaMallocManaged(&data, size));
      } else {
-        err = cudaMallocAsync(&data, size, stream);
-      }
-      if (err != cudaSuccess && err != cudaErrorMemoryAllocation) {
-        throw std::runtime_error(fmt::format(
-            "cudaMallocManaged failed: {}.", cudaGetErrorString(err)));
+        CHECK_CUDA_ERROR(cudaMallocAsync(&data, size, stream));
      }
      if (!data) {
-        return Buffer{nullptr};
+        std::ostringstream msg;
+        msg << "[malloc] Unable to allocate " << size << " bytes.";
+        throw std::runtime_error(msg.str());
      }
      buf = new CudaBuffer{data, size, device};
    }
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -95,11 +95,14 @@ void copy_general_input(
            const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
            OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;
            int ndim = shape.size();
-            int work_per_thread = 1;
+
+            int work_per_thread = 8;
            auto dim0 = ndim > 0 ? shape.back() : 1;
            auto rest = out.size() / dim0;
-            if (dim0 >= 4) {
+            if (dim0 >= 4 && dim0 < 8) {
              work_per_thread = 4;
+            } else if (dim0 < 4) {
+              work_per_thread = 1;
            }
            dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
            auto block_dims = get_block_dims(dim0, rest, 1);
@@ -110,7 +113,10 @@ void copy_general_input(
              dispatch_1_2_3(ndim, [&](auto dims_constant) {
                auto kernel =
                    cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 1>;
-                if (work_per_thread == 4) {
+                if (work_per_thread == 8) {
+                  kernel =
+                      cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 8>;
+                } else if (work_per_thread == 4) {
                  kernel =
                      cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 4>;
                }
@@ -127,7 +133,9 @@ void copy_general_input(
              });
            } else { // ndim >= 4
              auto kernel = cu::copy_g<InType, OutType, IdxT, 1>;
-              if (work_per_thread == 4) {
+              if (work_per_thread == 8) {
+                kernel = cu::copy_g<InType, OutType, IdxT, 8>;
+              } else if (work_per_thread == 4) {
                kernel = cu::copy_g<InType, OutType, IdxT, 4>;
              }
              encoder.add_kernel_node(
--- a/mlx/backend/gpu/copy.cpp
+++ b/mlx/backend/gpu/copy.cpp
@@ -7,8 +7,6 @@

 namespace mlx::core {

-void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s);
-
 void copy_gpu(const array& in, array& out, CopyType ctype) {
  copy_gpu(in, out, ctype, out.primitive().stream());
 }
--- a/mlx/backend/metal/allocator.cpp
+++ b/mlx/backend/metal/allocator.cpp
@@ -149,7 +149,9 @@ Buffer MetalAllocator::malloc(size_t size) {
      buf = device_->newBuffer(size, resource_options);
    }
    if (!buf) {
-      return Buffer{nullptr};
+      std::ostringstream msg;
+      msg << "[malloc] Unable to allocate " << size << " bytes.";
+      throw std::runtime_error(msg.str());
    }
    lk.lock();
    num_resources_++;
--- a/setup.py
+++ b/setup.py
@@ -7,13 +7,21 @@ import re
 import subprocess
 from functools import partial
 from pathlib import Path
-from subprocess import run

 from setuptools import Command, Extension, find_namespace_packages, setup
 from setuptools.command.bdist_wheel import bdist_wheel
 from setuptools.command.build_ext import build_ext


+def cuda_toolkit_major_version():
+    out = subprocess.check_output(["nvcc", "--version"], stderr=subprocess.STDOUT)
+    text = out.decode()
+    m = re.search(r"release (\d+)", text)
+    if m:
+        return int(m.group(1))
+    return None
+
+
 def get_version():
    with open("mlx/version.h", "r") as fid:
        for l in fid:
@@ -31,7 +39,7 @@ def get_version():
        version = f"{version}.dev{today.year}{today.month:02d}{today.day:02d}"
    if not pypi_release and not dev_release:
        git_hash = (
-            run(
+            subprocess.run(
                "git rev-parse --short HEAD".split(),
                capture_output=True,
                check=True,
@@ -284,7 +292,11 @@ if __name__ == "__main__":
            install_requires.append(
                f'mlx-metal=={version}; platform_system == "Darwin"'
            )
-            extras["cuda"] = [f'mlx-cuda=={version}; platform_system == "Linux"']
+            extras["cuda"] = [f'mlx-cuda-12=={version}; platform_system == "Linux"']
+            for toolkit in [12, 13]:
+                extras[f"cuda{toolkit}"] = [
+                    f'mlx-cuda-{toolkit}=={version}; platform_system == "Linux"'
+                ]
            extras["cpu"] = [f'mlx-cpu=={version}; platform_system == "Linux"']

        _setup(
@@ -299,13 +311,25 @@ if __name__ == "__main__":
        if build_macos:
            name = "mlx-metal"
        elif build_cuda:
-            name = "mlx-cuda"
+            toolkit = cuda_toolkit_major_version()
+            name = f"mlx-cuda-{toolkit}"
+            if toolkit == 12:
                install_requires += [
                    "nvidia-cublas-cu12==12.9.*",
                    "nvidia-cuda-nvrtc-cu12==12.9.*",
-                "nvidia-cudnn-cu12==9.*",
-                "nvidia-nccl-cu12",
                ]
+            elif toolkit == 13:
+                install_requires += [
+                    "nvidia-cublas-cu13",
+                    "nvidia-cuda-nvrtc-cu13",
+                ]
+            else:
+                raise ValueError(f"Unknown toolkit {toolkit}")
+            install_requires += [
+                f"nvidia-cudnn-cu{toolkit}==9.*",
+                f"nvidia-nccl-cu{toolkit}",
+            ]
+
        else:
            name = "mlx-cpu"
        _setup(
Author	SHA1	Message	Date
Cheng	6245824d42	Make allocator::malloc throw on allocation failure (#2874). Some checks failed: Build and Test / Check Lint (push) has been cancelled; Build and Test / Linux (cpu, aarch64) (push) has been cancelled; Build and Test / Linux (cpu, x86_64) (push) has been cancelled; Build and Test / Linux (cuda-12.6, aarch64) (push) has been cancelled; Build and Test / Linux (cuda-12.9, aarch64) (push) has been cancelled; Build and Test / Linux (cuda-12.6, x86_64) (push) has been cancelled; Build and Test / Linux (cuda-12.9, x86_64) (push) has been cancelled; Build and Test / macOS (14.0) (push) has been cancelled; Build and Test / macOS (15.0) (push) has been cancelled; Build and Test / Build Documentation (push) has been cancelled; Build and Test / Linux Fedora (aarch64) (push) has been cancelled; Build and Test / Linux Fedora (x86_64) (push) has been cancelled.	2025-12-05 17:44:38 +09:00
Awni Hannun	39289ef025	[CUDA] Release build for cuda 13 (#2872)	2025-12-04 21:42:26 -08:00
Awni Hannun	aefc9bd3f6	[CUDA] Faster general copy (#2873)	2025-12-04 21:42:15 -08:00