Mirror of https://github.com/ml-explore/mlx.git, synced 2025-12-06 19:08:15 +08:00
Fix building with CUDA < 12.8 (#2782)

The allocator previously queried a buffer's device with cudaStreamGetDevice, which is not available in toolkits before CUDA 12.8. This change passes the device index down from the command encoder instead (malloc_async now takes the encoder rather than a raw stream), swaps the cuda-12.8 CI toolkit for cuda-12.6 so the older-toolkit build stays covered, and exports the fetched CCCL header path as a target property for the C++ tests.
Some checks failed
Build and Test / check_lint (push) Has been cancelled
Build and Test / linux_build_and_test (ubuntu-22.04) (push) Has been cancelled
Build and Test / linux_build_and_test (ubuntu-22.04-arm) (push) Has been cancelled
Build and Test / mac_build_and_test (14.0) (push) Has been cancelled
Build and Test / mac_build_and_test (15.0) (push) Has been cancelled
Build and Test / cuda_build_and_test (cuda-12.6) (push) Has been cancelled
Build and Test / cuda_build_and_test (cuda-12.9) (push) Has been cancelled
Build and Test / build_documentation (push) Has been cancelled
Build and Test / Linux Fedora CPP Build (aarch64) (push) Has been cancelled
Build and Test / Linux Fedora CPP Build (x86_64) (push) Has been cancelled
Nightly Build / build_linux_release (3.10) (push) Has been cancelled
Nightly Build / build_linux_release (3.14) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.11, ubuntu-22.04) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.11, ubuntu-22.04-arm) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.12, ubuntu-22.04) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.12, ubuntu-22.04-arm) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.13, ubuntu-22.04) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.13, ubuntu-22.04-arm) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.14, ubuntu-22.04) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.14, ubuntu-22.04-arm) (push) Has been cancelled
Nightly Build / build_mac_release (3.10) (push) Has been cancelled
Nightly Build / build_mac_release (3.13) (push) Has been cancelled
Nightly Build / build_cuda_release (push) Has been cancelled
1 .github/actions/setup-linux/action.yml (vendored)
@@ -56,7 +56,6 @@ runs:
       PACKAGES: |
         {
           "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-toolkit-12-6",
-          "cuda-12.8": "libcudnn9-dev-cuda-12 cuda-toolkit-12-8",
           "cuda-12.9": "libcudnn9-dev-cuda-12 cuda-toolkit-12-9",
           "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-toolkit-13-0"
         }
2 .github/workflows/pull_request.yml (vendored)
@@ -52,7 +52,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        toolkit: ['cuda-12.8', 'cuda-12.9']
+        toolkit: ['cuda-12.6', 'cuda-12.9']
     runs-on: gpu-t4-4-core
     needs: check_lint
     steps:
@@ -142,6 +142,7 @@ FetchContent_Declare(
     URL "https://github.com/NVIDIA/cccl/releases/download/v2.8.1/cccl-v2.8.1.zip")
   FetchContent_MakeAvailable(cccl)
   target_include_directories(mlx BEFORE PRIVATE "${cccl_SOURCE_DIR}/include")
+  set_target_properties(mlx PROPERTIES CCCL_DIR "${cccl_SOURCE_DIR}/include")

   # Use fixed version of NVTX.
   FetchContent_Declare(
@@ -119,7 +119,8 @@ void copy_to_managed(CudaBuffer& buf) {
   buf.data = new_data;
 }

-Buffer CudaAllocator::malloc_impl(size_t size, cudaStream_t stream) {
+Buffer
+CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
   if (size == 0) {
     return Buffer{new CudaBuffer{nullptr, 0, -1}};
   }
@@ -134,9 +135,8 @@ Buffer CudaAllocator::malloc_impl(size_t size, cudaStream_t stream) {
     size = page_size * ((size + page_size - 1) / page_size);
   }

-  int device = -1;
-  if (size > small_block_size && stream != nullptr) {
-    CHECK_CUDA_ERROR(cudaStreamGetDevice(stream, &device));
+  if (size <= small_block_size || stream == nullptr) {
+    device = -1;
   }

   CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
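The rewritten branch above inverts the old logic: instead of querying the device from the stream with cudaStreamGetDevice (an API absent before CUDA 12.8), the caller now supplies `device`, and the allocator only resets it to -1 for small-pool or stream-less allocations. A minimal sketch of that rule, with a hypothetical helper name:

#include <cstddef>
#include <cuda_runtime.h>

// Hedged restatement of the branch above. Buffers at or below the small-block
// threshold, or with no stream to order against, are tagged with device -1
// (not bound to a specific CUDA device); everything else keeps the device the
// caller passed in. resolve_device is a hypothetical name for illustration.
inline int resolve_device(
    int device,
    std::size_t size,
    cudaStream_t stream,
    std::size_t small_block_size) {
  if (size <= small_block_size || stream == nullptr) {
    return -1;
  }
  return device;
}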
@@ -182,12 +182,8 @@ Buffer CudaAllocator::malloc_impl(size_t size, cudaStream_t stream) {
   return Buffer{buf};
 }

-Buffer CudaAllocator::malloc_async(size_t size, cudaStream_t stream) {
-  return malloc_impl(size, stream);
-}
-
 Buffer CudaAllocator::malloc(size_t size) {
-  return malloc_impl(size, nullptr);
+  return malloc_async(size, -1, nullptr);
 }

 void CudaAllocator::free(Buffer buffer) {
@@ -277,8 +273,9 @@ CudaAllocator& allocator() {
   return *allocator_;
 }

-Buffer malloc_async(size_t size, cudaStream_t stream) {
-  auto buffer = allocator().malloc_async(size, stream);
+Buffer malloc_async(size_t size, CommandEncoder& encoder) {
+  auto buffer = allocator().malloc_async(
+      size, encoder.device().cuda_device(), encoder.stream());
   if (size && !buffer.ptr()) {
     std::ostringstream msg;
     msg << "[malloc_async] Unable to allocate " << size << " bytes.";
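With this overload the free function derives both device and stream from the encoder, so the allocator itself never has to ask CUDA which device a stream belongs to. A self-contained toy model of the pattern (all names hypothetical, not the MLX implementation):

#include <cstddef>
#include <cuda_runtime.h>

// Toy encoder that records its device at construction time, standing in for
// cu::CommandEncoder. Because the device index travels with the encoder,
// nothing here needs cudaStreamGetDevice (absent before CUDA 12.8).
struct ToyEncoder {
  int device;
  cudaStream_t stream;
};

// Toy allocation helper mirroring malloc_async(size, encoder). Whether the
// backing allocation is cudaMallocAsync or a custom pool is immaterial to
// the pattern being shown.
void* toy_malloc_async(std::size_t size, const ToyEncoder& enc) {
  void* ptr = nullptr;
  cudaSetDevice(enc.device); // device comes from the encoder, not the stream
  cudaMallocAsync(&ptr, size, enc.stream);
  return ptr;
}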
@@ -13,6 +13,8 @@

 namespace mlx::core::cu {

+class CommandEncoder;
+
 using allocator::Buffer;

 // Stores cuda-managed unified memory.
@@ -48,7 +50,7 @@ class SmallSizePool {
 class CudaAllocator : public allocator::Allocator {
  public:
   Buffer malloc(size_t size) override;
-  Buffer malloc_async(size_t size, cudaStream_t stream);
+  Buffer malloc_async(size_t size, int device, cudaStream_t stream);
   void free(Buffer buffer) override;
   size_t size(Buffer buffer) const override;
@@ -62,7 +64,6 @@ class CudaAllocator : public allocator::Allocator {
   void clear_cache();

  private:
-  Buffer malloc_impl(size_t size, cudaStream_t stream);
   void cuda_free(CudaBuffer* buf);

   CudaAllocator();
@@ -80,6 +81,6 @@ class CudaAllocator : public allocator::Allocator {

 CudaAllocator& allocator();

-Buffer malloc_async(size_t size, cudaStream_t stream);
+Buffer malloc_async(size_t size, CommandEncoder& encoder);

 } // namespace mlx::core::cu
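The header only ever takes CommandEncoder by reference, so the forward declaration added at the top of the file is enough; the .cpp, which actually calls encoder.device() and encoder.stream(), includes the full definition. A generic sketch of this pattern (names illustrative):

#include <cstddef>

// A reference to an incomplete type is legal in a declaration, so the header
// only needs this forward declaration, not an #include of the encoder header.
class CommandEncoder;

void* malloc_async_sketch(std::size_t size, CommandEncoder& encoder);

// Only the translation unit that defines malloc_async_sketch — and thus
// dereferences the encoder — must see the complete CommandEncoder type.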
@@ -42,7 +42,7 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
     return;
   }
   auto& encoder = cu::get_command_encoder(stream());
-  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));
   encoder.set_output_array(out);

   dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {

@@ -143,7 +143,7 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {

   auto& s = stream();
   auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));

   // Prepare the shapes, strides and axis arguments.
   Shape shape = remove_index(in.shape(), axis_);
@@ -367,9 +367,8 @@ void binary_op_gpu(
   auto bopt = get_binary_op_type(a, b);
   auto& encoder = cu::get_command_encoder(s);

-  set_binary_op_output_data(a, b, out, bopt, [&](auto n) {
-    return cu::malloc_async(n, encoder.stream());
-  });
+  set_binary_op_output_data(
+      a, b, out, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
   binary_op_gpu_inplace<Op>(inputs, out, op, s);
 }
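set_binary_op_output_data and the similar helpers in the hunks below take the allocator as a callback, which is why the only change needed at these call sites is the lambda body. A runnable toy version of the callback-allocator pattern (names hypothetical):

#include <cstddef>
#include <cstdlib>

// The helper decides how many bytes the output needs (donation and
// broadcasting logic elided here) and delegates the actual allocation to the
// caller-provided callback, just as the one-line lambdas above delegate to
// cu::malloc_async(n, encoder).
template <typename MallocFn>
void* allocate_output(std::size_t nbytes, MallocFn&& do_malloc) {
  return do_malloc(nbytes);
}

int main() {
  // Host-side stand-in for [&](auto n) { return cu::malloc_async(n, encoder); }
  void* buf = allocate_output(256, [](std::size_t n) { return std::malloc(n); });
  std::free(buf);
  return 0;
}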
@@ -246,12 +246,10 @@ void binary_two_op_gpu_inplace(
   auto& out_b = outputs[1];
   auto bopt = get_binary_op_type(a, b);
   auto& encoder = cu::get_command_encoder(s);
-  set_binary_op_output_data(a, b, out_a, bopt, [&](auto n) {
-    return cu::malloc_async(n, encoder.stream());
-  });
-  set_binary_op_output_data(a, b, out_b, bopt, [&](auto n) {
-    return cu::malloc_async(n, encoder.stream());
-  });
+  set_binary_op_output_data(
+      a, b, out_a, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
+  set_binary_op_output_data(
+      a, b, out_b, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });

   if (out_a.size() == 0) {
     return;

@@ -298,7 +298,7 @@ void Compiled::eval_gpu(
   // Put outputs.
   compiled_allocate_outputs(
       inputs, outputs, is_constant_, contiguous, [&](auto n) {
-        return cu::malloc_async(n, encoder.stream());
+        return cu::malloc_async(n, encoder);
       });
   for (auto& x : outputs) {
     args.append(x);
@@ -277,7 +277,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
   array in = inputs[0];
   array wt = inputs[1];
   array out = out_;
-  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));
   Dtype dtype = out.dtype();

   // Search cache.

@@ -86,7 +86,7 @@ array unfold_inputs_nd(
     int mat_N,
     ConvParams<NDIM>& params) {
   array unfolded({mat_M, mat_K}, in.dtype(), nullptr, {});
-  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder.stream()));
+  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder));
   encoder.add_temporary(unfolded);

   int filter_size = params.C;

@@ -89,7 +89,7 @@ array grouped_unfold_transpose_inputs_nd(
     int mat_N,
     ConvParams<NDIM>& params) {
   array unfolded({mat_M, mat_K * params.groups}, in.dtype(), nullptr, {});
-  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder.stream()));
+  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder));
   encoder.add_temporary(unfolded);

   int filter_size = params.C;

@@ -7,9 +7,8 @@ namespace mlx::core {

 void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
   auto& encoder = cu::get_command_encoder(s);
-  bool donated = set_copy_output_data(in, out, ctype, [&](auto n) {
-    return cu::malloc_async(n, encoder.stream());
-  });
+  bool donated = set_copy_output_data(
+      in, out, ctype, [&](auto n) { return cu::malloc_async(n, encoder); });
   if (donated && in.dtype() == out.dtype()) {
     // If the output has the same type as the input then there is nothing to
     // copy, just use the buffer.

@@ -104,7 +103,7 @@ void fill_gpu(const array& in, array& out, const Stream& s) {
     return;
   }
   auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));
   encoder.set_input_array(in);
   encoder.set_output_array(out);
   copy_contiguous(encoder, CopyType::Scalar, in, out, 0, 0);

@@ -114,7 +113,7 @@ void reshape_gpu(const array& in, array& out, Stream s) {
   auto [copy_necessary, out_strides] = prepare_reshape(in, out);
   if (copy_necessary) {
     auto& encoder = cu::get_command_encoder(s);
-    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder));
     copy_gpu_inplace(
         in,
         out,
@@ -135,9 +135,7 @@ bool prepare_cudnn_plan(
   void* workspace_ptr = nullptr;
   if (workspace_size > 0) {
     array workspace(
-        cu::malloc_async(workspace_size, encoder.stream()),
-        {workspace_size},
-        uint8);
+        cu::malloc_async(workspace_size, encoder), {workspace_size}, uint8);
     encoder.add_temporary(workspace);
     workspace_ptr = gpu_ptr<void>(workspace);
   }

@@ -289,7 +289,7 @@ void CustomKernel::eval_gpu(
       copies.emplace_back(init_value_.value(), out.dtype());
       fill_gpu(copies.back(), out, s);
     } else {
-      out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+      out.set_data(cu::malloc_async(out.nbytes(), encoder));
     }
   }
@@ -26,7 +26,7 @@ void AllReduce::eval_gpu(
       out.copy_shared_buffer(in);
       return {in, out};
     } else {
-      out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+      out.set_data(cu::malloc_async(out.nbytes(), encoder));
       return {in, out};
     }
   };

@@ -74,7 +74,7 @@ void AllGather::eval_gpu(
   };

   auto input = ensure_contiguous(inputs[0]);
-  outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder.stream()));
+  outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder));

   encoder.set_input_array(input);
   encoder.set_output_array(outputs[0]);

@@ -103,7 +103,7 @@ void ReduceScatter::eval_gpu(
   };

   auto input = ensure_contiguous(inputs[0]);
-  outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder.stream()));
+  outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder));

   encoder.set_input_array(input);
   encoder.set_output_array(outputs[0]);
@@ -370,7 +370,7 @@ void CublasGemm::execute(
   // Ensure workspace is 256-byte aligned
   int nbytes = cuda::ceil_div(heuristic_.workspaceSize, 256) * 256;
   array workspace(
-      cu::malloc_async(nbytes, encoder.stream()),
+      cu::malloc_async(nbytes, encoder),
       {static_cast<int>(heuristic_.workspaceSize)},
       int8);
   encoder.add_temporary(workspace);

@@ -163,7 +163,7 @@ void CublasGemm::run_batched(

   // Launch kernel to set device offsets
   auto pointers = array(
-      cu::malloc_async(batch_count * sizeof(void*) * 3, encoder.stream()),
+      cu::malloc_async(batch_count * sizeof(void*) * 3, encoder),
       {batch_count * 3},
       uint64);

@@ -251,7 +251,7 @@ void CublasGemm::run_batched(

   // Launch kernel to set device offsets
   auto pointers = array(
-      cu::malloc_async(batch_count * sizeof(uint64_t) * 4, encoder.stream()),
+      cu::malloc_async(batch_count * sizeof(uint64_t) * 4, encoder),
       {batch_count * 4},
       uint64);
@@ -61,7 +61,7 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {

   auto& s = stream();
   auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));
   if (out.size() == 0) {
     return;
   }

@@ -241,7 +241,7 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {

   auto& s = stream();
   auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));
   if (out.size() == 0) {
     return;
   }
@@ -244,7 +244,7 @@ void LayerNorm::eval_gpu(
     out.copy_shared_buffer(x);
   } else {
     out.set_data(
-        cu::malloc_async(x.data_size() * x.itemsize(), encoder.stream()),
+        cu::malloc_async(x.data_size() * x.itemsize(), encoder),
         x.data_size(),
         x.strides(),
         x.flags());

@@ -335,7 +335,7 @@ void LayerNormVJP::eval_gpu(
     gx.copy_shared_buffer(g);
     g_in_gx = true;
   } else {
-    gx.set_data(cu::malloc_async(gx.nbytes(), encoder.stream()));
+    gx.set_data(cu::malloc_async(gx.nbytes(), encoder));
   }
   if (g_copied && !g_in_gx) {
     encoder.add_temporary(g);

@@ -355,7 +355,7 @@ void LayerNormVJP::eval_gpu(
       g_in_gw = true;
       gw_temp.copy_shared_buffer(g);
     } else {
-      gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder.stream()));
+      gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder));
       encoder.add_temporary(gw_temp);
     }
   }
@@ -32,7 +32,7 @@ void Load::eval_gpu(const std::vector<array>& inputs, array& out) {
   auto& encoder = cu::get_command_encoder(stream());
   auto size = out.size();
   auto nbytes = size * out.itemsize();
-  out.set_data(cu::malloc_async(nbytes, encoder.stream()));
+  out.set_data(cu::malloc_async(nbytes, encoder));
   auto out_ptr = malloc(nbytes);
   reader_->read(static_cast<char*>(out_ptr), nbytes, offset_);
   if (swap_endianness_) {
@@ -115,7 +115,7 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {

   auto in = ensure_contiguous(inputs[0]);
   if (in.flags().row_contiguous) {
-    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder));
   } else {
     auto n = in.shape(-1);
     auto flags = in.flags();

@@ -130,7 +130,7 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
     }
     flags.col_contiguous = col_contig;
     out.set_data(
-        cu::malloc_async(in.nbytes() / n, encoder.stream()),
+        cu::malloc_async(in.nbytes() / n, encoder),
         in.data_size() / n,
         std::move(strides),
         flags);
@@ -121,7 +121,7 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
     return;
   }

-  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));

   int M = a_pre.shape(-2);
   int N = b_pre.shape(-1);

@@ -163,7 +163,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {

   if (beta_ == 1 && a.dtype() != complex64 && c.strides(-1) == 1 &&
       c.data_size() == out.shape(-1)) {
-    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder));
     gemm_and_bias(
         encoder,
         M,

@@ -187,10 +187,10 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
   auto sty = c.strides()[c.ndim() - 1];
   if (sty == 1 && stx == c.shape(-1)) {
     ldc = stx;
-    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder));
   } else if (sty == 1 && stx == 0) {
     ldc = 0;
-    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder));
   } else {
     // Copy C into out and set C to out
     ldc = c.shape(-1);
@@ -59,7 +59,7 @@ void fast::Quantize::eval_gpu(
   auto scales = ensure_row_contiguous(inputs[1], enc, s);
   auto& w = outputs[0];

-  w.set_data(cu::malloc_async(w.nbytes(), enc.stream()));
+  w.set_data(cu::malloc_async(w.nbytes(), enc));

   if (mode_ == QuantizationMode::Affine) {
     auto biases = ensure_row_contiguous(inputs[2], enc, s);

@@ -72,11 +72,11 @@ void fast::Quantize::eval_gpu(
   auto& wq = outputs[0];
   auto& scales = outputs[1];

-  wq.set_data(cu::malloc_async(wq.nbytes(), enc.stream()));
-  scales.set_data(cu::malloc_async(scales.nbytes(), enc.stream()));
+  wq.set_data(cu::malloc_async(wq.nbytes(), enc));
+  scales.set_data(cu::malloc_async(scales.nbytes(), enc));
   if (mode_ == QuantizationMode::Affine) {
     auto& biases = outputs[2];
-    biases.set_data(cu::malloc_async(biases.nbytes(), enc.stream()));
+    biases.set_data(cu::malloc_async(biases.nbytes(), enc));
     affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
   } else {
     fp_quantize(w, wq, scales, group_size_, bits_, enc, s);
@@ -145,7 +145,7 @@ void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
   uint32_t bytes_per_key = out.itemsize() * elems_per_key;
   auto& s = stream();
   auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));
   if (out.size() == 0) {
     return;
   }
@@ -66,7 +66,7 @@ void all_reduce(
     Reduce::ReduceType reduce_type) {
   constexpr int N_READS = 8;

-  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));

   auto get_args = [](size_t size, int N) {
     int threads = std::min(512UL, (size + N - 1) / N);

@@ -107,8 +107,7 @@ void all_reduce(
   encoder.set_input_array(in);
   if (blocks > 1) {
     array intermediate({blocks}, out.dtype(), nullptr, {});
-    intermediate.set_data(
-        cu::malloc_async(intermediate.nbytes(), encoder.stream()));
+    intermediate.set_data(cu::malloc_async(intermediate.nbytes(), encoder));
     encoder.add_temporary(intermediate);
     encoder.set_output_array(intermediate);
     dispatch_all_types(dt, [&](auto type_tag) {

@@ -28,7 +28,7 @@ void init_reduce(
     Reduce::ReduceType reduce_type) {
   // Allocate if needed
   if (out.data_shared_ptr() == nullptr) {
-    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder));
   }

   encoder.set_output_array(out);
@@ -96,7 +96,7 @@ inline void allocate_same_layout(
     const std::vector<int>& axes,
     cu::CommandEncoder& encoder) {
   if (in.flags().row_contiguous) {
-    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder));
     return;
   }

@@ -135,7 +135,7 @@ inline void allocate_same_layout(
   fl.col_contiguous = cc;
   fl.contiguous = true;
   out.set_data(
-      cu::malloc_async(out.nbytes(), encoder.stream()),
+      cu::malloc_async(out.nbytes(), encoder),
      data_size,
      final_strides,
      fl,
@@ -190,7 +190,7 @@ void RMSNorm::eval_gpu(
     out.copy_shared_buffer(x);
   } else {
     out.set_data(
-        cu::malloc_async(x.data_size() * x.itemsize(), encoder.stream()),
+        cu::malloc_async(x.data_size() * x.itemsize(), encoder),
         x.data_size(),
         x.strides(),
         x.flags());

@@ -274,7 +274,7 @@ void RMSNormVJP::eval_gpu(
     gx.copy_shared_buffer(g);
     g_in_gx = true;
   } else {
-    gx.set_data(cu::malloc_async(gx.nbytes(), encoder.stream()));
+    gx.set_data(cu::malloc_async(gx.nbytes(), encoder));
   }
   if (g_copied && !g_in_gx) {
     encoder.add_temporary(g);

@@ -292,7 +292,7 @@ void RMSNormVJP::eval_gpu(
     if (!g_in_gx && donate_g) {
       gw_temp.copy_shared_buffer(g);
     } else {
-      gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder.stream()));
+      gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder));
       encoder.add_temporary(gw_temp);
     }
   }
@@ -292,14 +292,14 @@ void RoPE::eval_gpu(
       donated = true;
       out.copy_shared_buffer(in);
     } else {
-      out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+      out.set_data(cu::malloc_async(out.nbytes(), encoder));
     }
     strides[0] = mat_size;
     strides[1] = in.strides()[ndim - 2];
     strides[2] = in.strides()[ndim - 1];
   } else if (dispatch_ndim == 3) {
     // Handle non-contiguous 3D inputs
-    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder));
     strides[0] = in.strides()[ndim - 3];
     strides[1] = in.strides()[ndim - 2];
     strides[2] = in.strides()[ndim - 1];
@@ -196,7 +196,7 @@ void sdpa_cudnn(
   auto& encoder = cu::get_command_encoder(s);
   // TODO: Handle donation.
   // TODO: Make O use same memory layout with Q.
-  o.set_data(cu::malloc_async(o.nbytes(), encoder.stream()));
+  o.set_data(cu::malloc_async(o.nbytes(), encoder));

   encoder.set_input_array(q);
   encoder.set_input_array(k);

@@ -240,7 +240,7 @@ void sdpa_cudnn(
   void* workspace_ptr = nullptr;
   if (workspace_size > 0) {
     array workspace(
-        cu::malloc_async(workspace_size, encoder.stream()),
+        cu::malloc_async(workspace_size, encoder),
         {static_cast<int>(workspace_size)},
         uint8);
     encoder.add_temporary(workspace);
@@ -561,10 +561,9 @@ void sdpa_vector_2pass_fallback(
   array sums(intermediate_shape, float32, nullptr, {});
   array maxs(std::move(intermediate_shape), float32, nullptr, {});

-  intermediate.set_data(
-      cu::malloc_async(intermediate.nbytes(), encoder.stream()));
-  sums.set_data(cu::malloc_async(sums.nbytes(), encoder.stream()));
-  maxs.set_data(cu::malloc_async(maxs.nbytes(), encoder.stream()));
+  intermediate.set_data(cu::malloc_async(intermediate.nbytes(), encoder));
+  sums.set_data(cu::malloc_async(sums.nbytes(), encoder));
+  maxs.set_data(cu::malloc_async(maxs.nbytes(), encoder));

   encoder.add_temporary(intermediate);
   encoder.add_temporary(sums);

@@ -769,7 +768,7 @@ void sdpa_vector(
   };

   o.set_data(
-      cu::malloc_async(o.nbytes(), encoder.stream()),
+      cu::malloc_async(o.nbytes(), encoder),
       o.size(),
       {str_oB, str_oH, str_oL, str_oD},
       flags);
@@ -374,7 +374,7 @@ void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
     out.copy_shared_buffer(in);
   } else {
     out.set_data(
-        cu::malloc_async(in.data_size() * out.itemsize(), encoder.stream()),
+        cu::malloc_async(in.data_size() * out.itemsize(), encoder),
         in.data_size(),
         in.strides(),
         in.flags());

@@ -24,7 +24,7 @@ void concatenate_gpu(
   std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());

   auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));

   auto strides = out.strides();
   auto flags = out.flags();
@@ -89,7 +89,7 @@ array compute_dynamic_offset(
   if (donate) {
     offset.copy_shared_buffer(indices);
   } else {
-    offset.set_data(cu::malloc_async(offset.itemsize(), encoder.stream()));
+    offset.set_data(cu::malloc_async(offset.itemsize(), encoder));
   }

   encoder.add_temporary(offset);

@@ -118,7 +118,7 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
     out.copy_shared_buffer(x);
   } else {
     out.set_data(
-        cu::malloc_async(x.data_size() * x.itemsize(), encoder.stream()),
+        cu::malloc_async(x.data_size() * x.itemsize(), encoder),
         x.data_size(),
         x.strides(),
         x.flags());
@@ -49,14 +49,12 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
     array trans = swapaxes_in_eval(in, axis, last_dim);
     in = contiguous_copy_gpu(trans, s);
     encoder.add_temporary(in);
-    out = array(
-        cu::malloc_async(out.nbytes(), encoder.stream()),
-        in.shape(),
-        out.dtype());
+    out =
+        array(cu::malloc_async(out.nbytes(), encoder), in.shape(), out.dtype());
     encoder.add_temporary(out);
   } else {
     out.set_data(
-        cu::malloc_async(in.data_size() * out.itemsize(), encoder.stream()),
+        cu::malloc_async(in.data_size() * out.itemsize(), encoder),
         in.data_size(),
         in.strides(),
         in.flags());

@@ -74,17 +72,13 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
   if (argsort) {
     // Indices in the sorted dimension.
     array indices(
-        cu::malloc_async(out.nbytes(), encoder.stream()),
-        in.shape(),
-        out.dtype());
+        cu::malloc_async(out.nbytes(), encoder), in.shape(), out.dtype());
     encoder.add_temporary(indices);

     // In argsort though we don't need the result of sorted values, the
     // API requires us to provide an array to store it.
     array discard(
-        cu::malloc_async(in.nbytes(), encoder.stream()),
-        in.shape(),
-        in.dtype());
+        cu::malloc_async(in.nbytes(), encoder), in.shape(), in.dtype());
     encoder.add_temporary(discard);

     size_t size;
@@ -104,9 +98,7 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
       stream));

   array temp(
-      cu::malloc_async(size, encoder.stream()),
-      {static_cast<int>(size)},
-      uint8);
+      cu::malloc_async(size, encoder), {static_cast<int>(size)}, uint8);
   encoder.add_temporary(temp);

   // Start capturing after allocations

@@ -148,9 +140,7 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
       stream));

   array temp(
-      cu::malloc_async(size, encoder.stream()),
-      {static_cast<int>(size)},
-      uint8);
+      cu::malloc_async(size, encoder), {static_cast<int>(size)}, uint8);
   encoder.add_temporary(temp);

   // Start capturing after allocations
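For context on the `temp` arrays in the two hunks above: CUB-style device algorithms use a two-phase convention, where a first call with a null workspace pointer only reports the required temporary-storage size and a second call does the work. A hedged sketch with cub::DeviceRadixSort (MLX's sort may use a different CUB entry point, but the size-query convention is the same):

#include <cub/device/device_radix_sort.cuh>

// Two-phase CUB call: query temp storage size, allocate, run, free.
void radix_sort_pairs_sketch(
    const int* d_keys_in,
    int* d_keys_out,
    const int* d_vals_in,
    int* d_vals_out,
    int n,
    cudaStream_t stream) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  // Phase 1: d_temp == nullptr, so CUB only writes temp_bytes.
  cub::DeviceRadixSort::SortPairs(
      d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, n,
      0, sizeof(int) * 8, stream);
  cudaMallocAsync(&d_temp, temp_bytes, stream);
  // Phase 2: the actual sort, using the allocated workspace.
  cub::DeviceRadixSort::SortPairs(
      d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, n,
      0, sizeof(int) * 8, stream);
  cudaFreeAsync(d_temp, stream);
}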
@@ -257,9 +257,8 @@ void ternary_op_gpu(
   auto& c = inputs[2];
   auto topt = get_ternary_op_type(a, b, c);
   auto& encoder = cu::get_command_encoder(s);
-  set_ternary_op_output_data(a, b, c, out, topt, [&](auto n) {
-    return cu::malloc_async(n, encoder.stream());
-  });
+  set_ternary_op_output_data(
+      a, b, c, out, topt, [&](auto n) { return cu::malloc_async(n, encoder); });
   ternary_op_gpu_inplace<Op>(inputs, out, s);
 }

@@ -208,9 +208,8 @@ void unary_op_gpu(
     const char* op,
     const Stream& s) {
   auto& encoder = cu::get_command_encoder(s);
-  set_unary_output_data(inputs[0], out, [&](auto n) {
-    return cu::malloc_async(n, encoder.stream());
-  });
+  set_unary_output_data(
+      inputs[0], out, [&](auto n) { return cu::malloc_async(n, encoder); });
   unary_op_gpu_inplace<Op>(inputs, out, op, s);
 }
@@ -37,11 +37,11 @@ target_sources(
   ${METAL_TEST_SOURCES})

 if(MLX_BUILD_CUDA)
-  # Find the CCCL headers in install dir.
-  target_compile_definitions(
-    mlx
-    PRIVATE
-    MLX_CCCL_DIR="${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/cccl")
+  # C++ tests are always built from source, so we have to specify where to find
+  # CCCL headers for JIT as they are not installed in system.
+  get_target_property(MLX_CCCL_DIR mlx CCCL_DIR)
+  target_compile_definitions(mlx PRIVATE MLX_CCCL_DIR="${MLX_CCCL_DIR}")
+  message(STATUS MLX_CCCL_DIR="${MLX_CCCL_DIR}")
 endif()

 target_link_libraries(tests PRIVATE mlx doctest)
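The MLX_CCCL_DIR definition exists so that JIT compilation inside the tests can find CCCL headers that were never installed system-wide. A hedged sketch of how such a define is typically consumed (the function name is hypothetical, not MLX's actual JIT code):

#include <string>
#include <vector>

// Turn the compile-time MLX_CCCL_DIR define into an include flag for a
// runtime (NVRTC-style) compile. If the define is absent, no flag is added
// and the JIT falls back to whatever include paths it already has.
std::vector<std::string> jit_include_flags() {
  std::vector<std::string> flags;
#ifdef MLX_CCCL_DIR
  flags.push_back(std::string("-I") + MLX_CCCL_DIR);
#endif
  return flags;
}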