Compare commits

...

4 Commits

Author        SHA1        Message                        Date
Cheng         99c33d011d  rebase + nit (#2260)           2025-06-10 10:51:51 -07:00
                          Co-authored-by: Awni Hannun <awni@apple.com>
Awni Hannun   62fecf3e13  fix conv export (#2265)        2025-06-10 09:34:01 -07:00
Cheng         7c4eb5d03e  CUDA backend: random (#2261)   2025-06-10 08:59:56 -07:00
Cheng         bae9a6b404  CUDA backend: sort (#2262)     2025-06-10 08:59:47 -07:00
                          Co-authored-by: Awni Hannun <awni@apple.com>
15 changed files with 1002 additions and 32 deletions

@@ -7,7 +7,11 @@ target_sources(
   mlx
   PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_contiguous.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_dynamic.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_input.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/event.cu
@@ -15,7 +19,9 @@ target_sources(
           ${CMAKE_CURRENT_SOURCE_DIR}/kernel_utils.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/random.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/unary.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
@@ -26,6 +32,15 @@ target_compile_definitions(mlx PRIVATE MLX_USE_CUDA)
 target_compile_options(mlx
                        PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>")
 
+# CUDA 12.8 emits warning #20280-D for copy kernels, which is a false positive.
+# Explicitly pass this flag to suppress the warning; it is safe to set it to
+# true, but then the warning would not be suppressed.
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
+  target_compile_options(
+    mlx
+    PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>")
+endif()
+
 # Compute capability 7 is required for synchronization between CPU/GPU with
 # managed memory. TODO: Add more architectures for potential performance gain.
 set(MLX_CUDA_ARCHITECTURES

@@ -1,26 +0,0 @@
// Copyright © 2025 Apple Inc.
#include "mlx/backend/gpu/copy.h"
namespace mlx::core {
void copy_gpu_inplace(
const array& in,
array& out,
const Shape& data_shape,
const Strides& strides_in_pre,
const Strides& strides_out_pre,
int64_t inp_offset,
int64_t out_offset,
CopyType ctype,
const Stream& s,
const std::optional<array>& dynamic_i_offset /* = std::nullopt */,
const std::optional<array>& dynamic_o_offset /* = std::nullopt */) {
throw std::runtime_error("copy_gpu_inplace not implemented in CUDA backend.");
}
void fill_gpu(const array& val, array& out, const Stream& s) {
throw std::runtime_error("fill_gpu not implemented in CUDA backend.");
}
} // namespace mlx::core

mlx/backend/cuda/copy.cu (new file, 89 lines)

@@ -0,0 +1,89 @@
// Copyright © 2025 Apple Inc.
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cuda/copy/copy.cuh"
namespace mlx::core {
void copy_gpu_inplace(
const array& in_,
array& out,
const Shape& shape,
const Strides& strides_in,
const Strides& strides_out,
int64_t offset_in,
int64_t offset_out,
CopyType ctype,
const Stream& s,
const std::optional<array>& dynamic_offset_in,
const std::optional<array>& dynamic_offset_out) {
if (out.size() == 0) {
return;
}
const array& in = in_.data_shared_ptr() ? in_ : out;
auto& encoder = cu::get_command_encoder(s);
encoder.set_input_array(in);
encoder.set_output_array(out);
if (ctype == CopyType::Scalar || ctype == CopyType::Vector) {
copy_contiguous(encoder, ctype, in, out, offset_in, offset_out);
return;
}
if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
auto [shape_collapsed, strides_vec] = collapse_contiguous_dims(
shape, std::vector{strides_in, strides_out}, INT32_MAX);
if (ctype == CopyType::General) {
copy_general_input(
encoder,
ctype,
in,
out,
offset_in,
offset_out,
shape_collapsed,
strides_vec[0]);
} else {
if (dynamic_offset_in || dynamic_offset_out) {
copy_general_dynamic(
encoder,
ctype,
in,
out,
offset_in,
offset_out,
shape_collapsed,
strides_vec[0],
strides_vec[1],
dynamic_offset_in ? *dynamic_offset_in : array(0, int64),
dynamic_offset_out ? *dynamic_offset_out : array(0, int64));
} else {
copy_general(
encoder,
ctype,
in,
out,
offset_in,
offset_out,
shape_collapsed,
strides_vec[0],
strides_vec[1]);
}
}
return;
}
}
void fill_gpu(const array& in, array& out, const Stream& s) {
if (out.size() == 0) {
return;
}
out.set_data(allocator::malloc(out.nbytes()));
auto& encoder = cu::get_command_encoder(s);
encoder.set_input_array(in);
encoder.set_output_array(out);
copy_contiguous(encoder, CopyType::Scalar, in, out, 0, 0);
}
} // namespace mlx::core
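
As a usage-level illustration (not part of the diff), the Python sketch below shows operations that exercise these copy paths on a CUDA-enabled MLX build; the mapping of each call to a CopyType is an assumption read off the dispatch logic above, not something stated in the commits.

import mlx.core as mx

x = mx.arange(12, dtype=mx.float32).reshape(3, 4)

# Casting a contiguous array element-wise presumably takes the contiguous
# (Scalar/Vector) path handled by copy_contiguous.
y = x.astype(mx.float16)

# Casting a transposed, non-contiguous view needs a strided copy, which
# presumably goes through one of the copy_general* kernels.
w = x.T.astype(mx.float16)

mx.eval(y, w)
print(y.dtype, w.shape)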

@@ -0,0 +1,71 @@
// Copyright © 2025 Apple Inc.
#pragma once
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/cuda/kernels/cast_op.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/dtype_utils.h"
namespace mlx::core {
#define MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, ...) \
MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE_IN, { \
MLX_SWITCH_ALL_TYPES(out.dtype(), CTYPE_OUT, { \
using InType = cuda_type_t<CTYPE_IN>; \
using OutType = cuda_type_t<CTYPE_OUT>; \
if constexpr (cu::CastOp<InType, OutType>::is_castable) { \
__VA_ARGS__; \
} else { \
throw std::runtime_error(fmt::format( \
"Cannot copy data from dtype {} to {}.", \
dtype_to_string(in.dtype()), \
dtype_to_string(out.dtype()))); \
} \
}); \
})
void copy_contiguous(
cu::CommandEncoder& encoder,
CopyType ctype,
const array& in,
array& out,
int64_t offset_in,
int64_t offset_out);
void copy_general(
cu::CommandEncoder& encoder,
CopyType ctype,
const array& in,
array& out,
int64_t offset_in,
int64_t offset_out,
const Shape& shape,
const Strides& strides_in,
const Strides& strides_out);
void copy_general_dynamic(
cu::CommandEncoder& encoder,
CopyType ctype,
const array& in,
array& out,
int64_t offset_in,
int64_t offset_out,
const Shape& shape,
const Strides& strides_in,
const Strides& strides_out,
const array& dynamic_offset_in,
const array& dynamic_offset_out);
void copy_general_input(
cu::CommandEncoder& encoder,
CopyType ctype,
const array& in,
array& out,
int64_t offset_in,
int64_t offset_out,
const Shape& shape,
const Strides& strides_in);
} // namespace mlx::core

@@ -0,0 +1,56 @@
// Copyright © 2025 Apple Inc.
#include "mlx/backend/cuda/copy/copy.cuh"
#include <cooperative_groups.h>
namespace mlx::core {
namespace cu {
namespace cg = cooperative_groups;
template <typename In, typename Out, typename IdxT>
__global__ void copy_s(const In* in, Out* out, IdxT size) {
IdxT index = cg::this_grid().thread_rank();
if (index < size) {
out[index] = CastOp<In, Out>{}(in[0]);
}
}
template <typename In, typename Out, typename IdxT>
__global__ void copy_v(const In* in, Out* out, IdxT size) {
IdxT index = cg::this_grid().thread_rank();
if (index < size) {
out[index] = CastOp<In, Out>{}(in[index]);
}
}
} // namespace cu
void copy_contiguous(
cu::CommandEncoder& encoder,
CopyType ctype,
const array& in,
array& out,
int64_t in_offset,
int64_t out_offset) {
encoder.launch_kernel([&](cudaStream_t stream) {
MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
auto kernel = cu::copy_s<InType, OutType, IdxT>;
if (ctype == CopyType::Vector) {
kernel = cu::copy_v<InType, OutType, IdxT>;
}
auto [num_blocks, block_dims] = get_launch_args(kernel, out, LARGE);
kernel<<<num_blocks, block_dims, 0, stream>>>(
in.data<InType>() + in_offset,
out.data<OutType>() + out_offset,
out.data_size());
});
});
});
}
} // namespace mlx::core

@@ -0,0 +1,95 @@
// Copyright © 2025 Apple Inc.
#include "mlx/backend/cuda/copy/copy.cuh"
#include <cooperative_groups.h>
namespace mlx::core {
namespace cu {
namespace cg = cooperative_groups;
template <typename In, typename Out, typename IdxT, int NDIM>
__global__ void copy_gg_nd(
const In* in,
Out* out,
IdxT size,
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out) {
IdxT index = cg::this_grid().thread_rank();
if (index < size) {
auto [idx_in, idx_out] = elem_to_loc_nd<NDIM>(
index, shape.data(), strides_in.data(), strides_out.data());
out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
}
}
template <typename In, typename Out, typename IdxT>
__global__ void copy_gg(
const In* in,
Out* out,
IdxT size,
const __grid_constant__ Shape shape,
const __grid_constant__ Strides strides_in,
const __grid_constant__ Strides strides_out,
int ndim) {
IdxT index = cg::this_grid().thread_rank();
if (index < size) {
auto [idx_in, idx_out] = elem_to_loc_4d(
index, shape.data(), strides_in.data(), strides_out.data(), ndim);
out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
}
}
} // namespace cu
void copy_general(
cu::CommandEncoder& encoder,
CopyType ctype,
const array& in,
array& out,
int64_t offset_in,
int64_t offset_out,
const Shape& shape,
const Strides& strides_in,
const Strides& strides_out) {
encoder.launch_kernel([&](cudaStream_t stream) {
MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
const InType* in_ptr = in.data<InType>() + offset_in;
OutType* out_ptr = out.data<OutType>() + offset_out;
bool large = in.data_size() > UINT32_MAX || out.data_size() > UINT32_MAX;
MLX_SWITCH_BOOL(large, LARGE, {
using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
int ndim = shape.size();
if (ndim <= 3) {
MLX_SWITCH_1_2_3(ndim, NDIM, {
auto kernel = cu::copy_gg_nd<InType, OutType, IdxT, NDIM>;
auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
kernel<<<num_blocks, block_dims, 0, stream>>>(
in_ptr,
out_ptr,
out.data_size(),
const_param<NDIM>(shape),
const_param<NDIM>(strides_in),
const_param<NDIM>(strides_out));
});
} else { // ndim >= 4
auto kernel = cu::copy_gg<InType, OutType, IdxT>;
auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
kernel<<<num_blocks, block_dims, 0, stream>>>(
in_ptr,
out_ptr,
out.data_size(),
const_param(shape),
const_param(strides_in),
const_param(strides_out),
ndim);
}
});
});
});
}
} // namespace mlx::core

@@ -0,0 +1,105 @@
// Copyright © 2025 Apple Inc.
#include "mlx/backend/cuda/copy/copy.cuh"
#include <cooperative_groups.h>
namespace mlx::core {
namespace cu {
namespace cg = cooperative_groups;
template <typename In, typename Out, typename IdxT, int NDIM>
__global__ void copy_gg_dynamic_nd(
const In* in,
Out* out,
IdxT size,
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out,
const int64_t* offset_in,
const int64_t* offset_out) {
IdxT index = cg::this_grid().thread_rank();
if (index < size) {
auto [idx_in, idx_out] = elem_to_loc_nd<NDIM>(
index, shape.data(), strides_in.data(), strides_out.data());
out[idx_out + *offset_out] = CastOp<In, Out>{}(in[idx_in + *offset_in]);
}
}
template <typename In, typename Out, typename IdxT>
__global__ void copy_gg_dynamic(
const In* in,
Out* out,
IdxT size,
const __grid_constant__ Shape shape,
const __grid_constant__ Strides strides_in,
const __grid_constant__ Strides strides_out,
int ndim,
const int64_t* offset_in,
const int64_t* offset_out) {
IdxT index = cg::this_grid().thread_rank();
if (index < size) {
auto [idx_in, idx_out] = elem_to_loc_4d(
index, shape.data(), strides_in.data(), strides_out.data(), ndim);
out[idx_out + *offset_out] = CastOp<In, Out>{}(in[idx_in + *offset_in]);
}
}
} // namespace cu
void copy_general_dynamic(
cu::CommandEncoder& encoder,
CopyType ctype,
const array& in,
array& out,
int64_t offset_in,
int64_t offset_out,
const Shape& shape,
const Strides& strides_in,
const Strides& strides_out,
const array& dynamic_offset_in,
const array& dynamic_offset_out) {
encoder.launch_kernel([&](cudaStream_t stream) {
MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
const InType* in_ptr = in.data<InType>() + offset_in;
OutType* out_ptr = out.data<OutType>() + offset_out;
bool large = in.data_size() > UINT32_MAX || out.data_size() > UINT32_MAX;
MLX_SWITCH_BOOL(large, LARGE, {
using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
int ndim = shape.size();
if (ndim <= 3) {
MLX_SWITCH_1_2_3(ndim, NDIM, {
auto kernel = cu::copy_gg_dynamic_nd<InType, OutType, IdxT, NDIM>;
auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
kernel<<<num_blocks, block_dims, 0, stream>>>(
in_ptr,
out_ptr,
out.data_size(),
const_param<NDIM>(shape),
const_param<NDIM>(strides_in),
const_param<NDIM>(strides_out),
dynamic_offset_in.data<int64_t>(),
dynamic_offset_out.data<int64_t>());
});
} else { // ndim >= 4
auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
kernel<<<num_blocks, block_dims, 0, stream>>>(
in_ptr,
out_ptr,
out.data_size(),
const_param(shape),
const_param(strides_in),
const_param(strides_out),
ndim,
dynamic_offset_in.data<int64_t>(),
dynamic_offset_out.data<int64_t>());
}
});
});
});
}
} // namespace mlx::core

@@ -0,0 +1,88 @@
// Copyright © 2025 Apple Inc.
#include "mlx/backend/cuda/copy/copy.cuh"
#include <cooperative_groups.h>
namespace mlx::core {
namespace cu {
namespace cg = cooperative_groups;
template <typename In, typename Out, typename IdxT, int NDIM>
__global__ void copy_g_nd(
const In* in,
Out* out,
IdxT size,
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in) {
IdxT index = cg::this_grid().thread_rank();
if (index < size) {
IdxT idx_in = elem_to_loc_nd<NDIM>(index, shape.data(), strides_in.data());
out[index] = CastOp<In, Out>{}(in[idx_in]);
}
}
template <typename In, typename Out, typename IdxT>
__global__ void copy_g(
const In* in,
Out* out,
IdxT size,
const __grid_constant__ Shape shape,
const __grid_constant__ Strides strides_in,
int ndim) {
IdxT index = cg::this_grid().thread_rank();
if (index < size) {
IdxT idx_in = elem_to_loc_4d(index, shape.data(), strides_in.data(), ndim);
out[index] = CastOp<In, Out>{}(in[idx_in]);
}
}
} // namespace cu
void copy_general_input(
cu::CommandEncoder& encoder,
CopyType ctype,
const array& in,
array& out,
int64_t offset_in,
int64_t offset_out,
const Shape& shape,
const Strides& strides_in) {
encoder.launch_kernel([&](cudaStream_t stream) {
MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
const InType* in_ptr = in.data<InType>() + offset_in;
OutType* out_ptr = out.data<OutType>() + offset_out;
bool large = in.data_size() > UINT32_MAX || out.data_size() > UINT32_MAX;
MLX_SWITCH_BOOL(large, LARGE, {
using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
int ndim = shape.size();
if (ndim <= 3) {
MLX_SWITCH_1_2_3(ndim, NDIM, {
auto kernel = cu::copy_g_nd<InType, OutType, IdxT, NDIM>;
auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
kernel<<<num_blocks, block_dims, 0, stream>>>(
in_ptr,
out_ptr,
out.data_size(),
const_param<NDIM>(shape),
const_param<NDIM>(strides_in));
});
} else { // ndim >= 4
auto kernel = cu::copy_g<InType, OutType, IdxT>;
auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
kernel<<<num_blocks, block_dims, 0, stream>>>(
in_ptr,
out_ptr,
out.data_size(),
const_param(shape),
const_param(strides_in),
ndim);
}
});
});
});
}
} // namespace mlx::core

@@ -0,0 +1,59 @@
// Copyright © 2025 Apple Inc.
#pragma once
#include <cuComplex.h>
#include <thrust/iterator/transform_iterator.h>
namespace mlx::core::cu {
// An op that does static_cast, with custom conversions for some types.
template <typename SrcT, typename DstT, typename = void>
struct CastOp {
static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, DstT>;
__device__ DstT operator()(SrcT x) {
return static_cast<DstT>(x);
}
};
// Converting a complex number to real number discards the imaginary part.
template <typename DstT>
struct CastOp<
cuComplex,
DstT,
cuda::std::enable_if_t<!cuda::std::is_same_v<cuComplex, DstT>>> {
static constexpr bool is_castable = cuda::std::is_convertible_v<float, DstT>;
__device__ DstT operator()(cuComplex x) {
static_assert(!cuda::std::is_same_v<cuComplex, DstT>);
return static_cast<DstT>(cuCrealf(x));
}
};
// Allow converting a real number to complex number.
template <typename SrcT>
struct CastOp<
SrcT,
cuComplex,
cuda::std::enable_if_t<!cuda::std::is_same_v<SrcT, cuComplex>>> {
static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, float>;
__device__ cuComplex operator()(SrcT x) {
static_assert(!cuda::std::is_same_v<SrcT, cuComplex>);
return cuComplex{static_cast<float>(x), 0};
}
};
// Return an iterator that casts the value to DstT using CastOp.
template <typename DstT, typename Iterator>
__host__ __device__ auto make_cast_iterator(Iterator it) {
using SrcT = typename cuda::std::iterator_traits<Iterator>::value_type;
if constexpr (std::is_same_v<SrcT, DstT>) {
return it;
} else {
return thrust::make_transform_iterator(it, CastOp<SrcT, DstT>{});
}
}
} // namespace mlx::core::cu
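
For orientation only, here is a small Python sketch of the complex/real conversions these CastOp specializations implement; it assumes a CUDA-enabled MLX build and that astype lowers to these copy kernels.

import mlx.core as mx

c = mx.array([1.0 + 2.0j, 3.0 - 4.0j])  # complex64

# Complex -> real keeps only the real part, matching the cuComplex -> DstT
# specialization above.
r = c.astype(mx.float32)

# Real -> complex sets the imaginary part to zero, matching the
# SrcT -> cuComplex specialization.
back = r.astype(mx.complex64)

mx.eval(r, back)
print(r.tolist(), back.tolist())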

@@ -73,7 +73,6 @@ bool fast::ScaledDotProductAttention::use_fallback(
 NO_GPU(ArgPartition)
 NO_GPU(ArgReduce)
-NO_GPU(ArgSort)
 NO_GPU(BlockMaskedMM)
 NO_GPU_MULTI(Compiled)
 NO_GPU(Convolution)
@@ -92,7 +91,6 @@ NO_GPU_MULTI(LUF)
 NO_GPU(Partition)
 NO_GPU_MULTI(QRF)
 NO_GPU(QuantizedMatmul)
-NO_GPU(RandomBits)
 NO_GPU(Reduce)
 NO_GPU(Scan)
 NO_GPU(Scatter)
@@ -100,7 +98,6 @@ NO_GPU(ScatterAxis)
 NO_GPU(Select)
 NO_GPU(SliceUpdate)
 NO_GPU(Softmax)
-NO_GPU(Sort)
 NO_GPU_MULTI(SVD)
 NO_GPU(Inverse)
 NO_GPU(Cholesky)

mlx/backend/cuda/random.cu (new file, 181 lines)

@@ -0,0 +1,181 @@
// Copyright © 2025 Apple Inc.
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/primitives.h"
#include <nvtx3/nvtx3.hpp>
#include <cassert>
namespace mlx::core {
namespace cu {
__constant__ constexpr uint32_t rotations[2][4] = {
{13, 15, 26, 6},
{17, 29, 16, 24}};
union rbits {
uint2 val;
uint8_t bytes[2][4];
};
__device__ rbits threefry2x32_hash(uint2 key, uint2 count) {
uint32_t ks[] = {key.x, key.y, key.x ^ key.y ^ 0x1BD11BDA};
rbits v;
v.val.x = count.x + ks[0];
v.val.y = count.y + ks[1];
for (int i = 0; i < 5; ++i) {
for (auto r : rotations[i % 2]) {
v.val.x += v.val.y;
v.val.y = (v.val.y << r) | (v.val.y >> (32 - r));
v.val.y ^= v.val.x;
}
v.val.x += ks[(i + 1) % 3];
v.val.y += ks[(i + 2) % 3] + i + 1;
}
return v;
}
__global__ void rbitsc(
const uint32_t* keys,
uint8_t* out,
dim3 grid_dims,
bool odd,
uint32_t bytes_per_key) {
uint2 index{
blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y};
if (index.x >= grid_dims.x || index.y >= grid_dims.y) {
return;
}
auto kidx = 2 * index.x;
auto key = uint2{keys[kidx], keys[kidx + 1]};
auto half_size = grid_dims.y - odd;
out += index.x * bytes_per_key;
bool drop_last = odd && (index.y == half_size);
auto bits = threefry2x32_hash(
key, uint2{index.y, drop_last ? 0 : index.y + grid_dims.y});
size_t idx = size_t(index.y) << 2;
for (int i = 0; i < 4; ++i) {
out[idx + i] = bits.bytes[0][i];
}
if (!drop_last) {
idx = (drop_last ? 0 : size_t(index.y) + grid_dims.y) << 2;
if ((index.y + 1) == half_size && (bytes_per_key % 4) > 0) {
int edge_bytes = (bytes_per_key % 4);
for (int i = 0; i < edge_bytes; ++i) {
out[idx + i] = bits.bytes[1][i];
}
} else {
for (int i = 0; i < 4; ++i) {
out[idx + i] = bits.bytes[1][i];
}
}
}
}
__global__ void rbits(
const uint32_t* keys,
uint8_t* out,
dim3 grid_dims,
bool odd,
uint32_t bytes_per_key,
int32_t ndim,
const __grid_constant__ Shape key_shape,
const __grid_constant__ Strides key_strides) {
uint2 index{
blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y};
if (index.x >= grid_dims.x || index.y >= grid_dims.y) {
return;
}
auto kidx = 2 * index.x;
auto k1_elem = elem_to_loc(kidx, key_shape.data(), key_strides.data(), ndim);
auto k2_elem =
elem_to_loc(kidx + 1, key_shape.data(), key_strides.data(), ndim);
auto key = uint2{keys[k1_elem], keys[k2_elem]};
auto half_size = grid_dims.y - odd;
out += size_t(index.x) * bytes_per_key;
bool drop_last = odd && (index.y == half_size);
auto bits = threefry2x32_hash(
key, uint2{index.y, drop_last ? 0 : index.y + grid_dims.y});
size_t idx = size_t(index.y) << 2;
for (int i = 0; i < 4; ++i) {
out[idx + i] = bits.bytes[0][i];
}
if (!drop_last) {
idx = (drop_last ? 0 : size_t(index.y) + grid_dims.y) << 2;
if ((index.y + 1) == half_size && (bytes_per_key % 4) > 0) {
int edge_bytes = (bytes_per_key % 4);
for (int i = 0; i < edge_bytes; ++i) {
out[idx + i] = bits.bytes[1][i];
}
} else {
for (int i = 0; i < 4; ++i) {
out[idx + i] = bits.bytes[1][i];
}
}
}
}
} // namespace cu
void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
nvtx3::scoped_range r("RandomBits::eval_gpu");
assert(inputs.size() == 1);
// keys has shape (N1, ..., NK, 2)
// out has shape (N1, ..., NK, M1, M2, ...)
auto& keys = inputs[0];
uint32_t num_keys = keys.size() / 2;
uint32_t elems_per_key = out.size() / num_keys;
uint32_t bytes_per_key = out.itemsize() * elems_per_key;
out.set_data(allocator::malloc(out.nbytes()));
if (out.size() == 0) {
return;
}
uint32_t out_per_key = (bytes_per_key + 4 - 1) / 4;
uint32_t half_size = out_per_key / 2;
bool odd = out_per_key % 2;
auto& s = stream();
auto& encoder = cu::get_command_encoder(s);
encoder.set_input_array(keys);
encoder.set_output_array(out);
encoder.launch_kernel([&](cudaStream_t stream) {
dim3 grid_dims{num_keys, half_size + odd};
dim3 block_dims = get_block_dims(grid_dims.x, grid_dims.y, 1);
dim3 num_blocks{
cuda::ceil_div(grid_dims.x, block_dims.x),
cuda::ceil_div(grid_dims.y, block_dims.y)};
if (keys.flags().row_contiguous) {
cu::rbitsc<<<num_blocks, block_dims, 0, stream>>>(
keys.data<uint32_t>(),
out.data<uint8_t>(),
grid_dims,
odd,
bytes_per_key);
} else {
cu::rbits<<<num_blocks, block_dims, 0, stream>>>(
keys.data<uint32_t>(),
out.data<uint8_t>(),
grid_dims,
odd,
bytes_per_key,
keys.ndim(),
const_param(keys.shape()),
const_param(keys.strides()));
}
});
}
} // namespace mlx::core
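
For context, a short Python sketch of the user-facing API that lowers to RandomBits::eval_gpu above; it assumes a CUDA-enabled build, and the claim that the higher-level samplers are built on the same bits is an inference from MLX's random module rather than something stated in this diff.

import mlx.core as mx

key = mx.random.key(0)

# Raw random bits: each element comes from a threefry2x32 hash of the key
# and a per-element counter, as in the rbits/rbitsc kernels above.
bits = mx.random.bits((4, 8), key=key)

# Higher-level samplers draw from the same bit stream.
u = mx.random.uniform(shape=(4, 8), key=key)

mx.eval(bits, u)
print(bits.dtype, u.dtype)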

@@ -1,7 +1,11 @@
 // Copyright © 2025 Apple Inc.
 
+#include "mlx/backend/common/slicing.h"
+#include "mlx/backend/gpu/copy.h"
 #include "mlx/backend/gpu/slicing.h"
 
+#include <numeric>
+
 namespace mlx::core {
 
 void concatenate_gpu(
@@ -9,7 +13,29 @@ void concatenate_gpu(
     array& out,
     int axis,
     const Stream& s) {
-  throw std::runtime_error("concatenate_gpu not implemented in CUDA backend.");
+  std::vector<int> sizes;
+  sizes.push_back(0);
+  for (auto& p : inputs) {
+    sizes.push_back(p.shape(axis));
+  }
+  std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());
+
+  out.set_data(allocator::malloc(out.nbytes()));
+
+  auto strides = out.strides();
+  auto flags = out.flags();
+  flags.row_contiguous = false;
+  flags.col_contiguous = false;
+  flags.contiguous = false;
+  // TODO: Handle concurrent outputs:
+  // https://github.com/ml-explore/mlx/pull/2145#discussion_r2070753816
+  for (int i = 0; i < inputs.size(); i++) {
+    array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
+    size_t data_offset = strides[axis] * sizes[i];
+    out_slice.copy_shared_buffer(
+        out, strides, flags, out_slice.size(), data_offset);
+    copy_gpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, s);
+  }
 }
 
 } // namespace mlx::core
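
At the Python level, the new concatenate_gpu simply backs mx.concatenate; a minimal sketch, assuming a CUDA-enabled build:

import mlx.core as mx

a = mx.zeros((2, 3))
b = mx.ones((4, 3))

# Each input is copied into a strided slice of the output buffer via
# copy_gpu_inplace, per the loop in concatenate_gpu above.
c = mx.concatenate([a, b], axis=0)

mx.eval(c)
print(c.shape)  # (6, 3)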

mlx/backend/cuda/sort.cu (new file, 180 lines)

@@ -0,0 +1,180 @@
// Copyright © 2025 Apple Inc.
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"
#include <nvtx3/nvtx3.hpp>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <cub/device/device_segmented_sort.cuh>
#include <cassert>
#include <numeric>
namespace mlx::core {
namespace {
template <typename T>
struct ModOp {
T divisor;
__device__ T operator()(T x) {
return x % divisor;
}
};
// We cannot use ops inside eval, so make a small utility instead.
array swapaxes_in_eval(const array& in, int axis1, int axis2) {
std::vector<int> axes(in.ndim());
std::iota(axes.begin(), axes.end(), 0);
std::swap(axes[axis1], axes[axis2]);
// TODO: Share the code with Transpose::eval.
Shape shape(axes.size());
Strides strides(in.ndim());
for (size_t ax = 0; ax < axes.size(); ++ax) {
shape[ax] = in.shape()[axes[ax]];
strides[ax] = in.strides()[axes[ax]];
}
auto flags = in.flags();
if (flags.contiguous) {
auto [_, row_contiguous, col_contiguous] = check_contiguity(shape, strides);
flags.row_contiguous = row_contiguous;
flags.col_contiguous = col_contiguous;
}
array out(shape, in.dtype(), nullptr, {});
out.copy_shared_buffer(in, strides, flags, in.data_size());
return out;
}
template <typename... Args>
void segmented_sort_pairs(cu::CommandEncoder& encoder, Args&&... args) {
// Allocate temporary storage.
size_t size;
CHECK_CUDA_ERROR(
cub::DeviceSegmentedSort::StableSortPairs(nullptr, size, args...));
array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
encoder.add_temporary(temp);
// Run op.
CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
temp.data<void>(), size, args...));
}
template <typename... Args>
void segmented_sort(cu::CommandEncoder& encoder, Args&&... args) {
// Allocate temporary storage.
size_t size;
CHECK_CUDA_ERROR(
cub::DeviceSegmentedSort::StableSortKeys(nullptr, size, args...));
array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
encoder.add_temporary(temp);
// Run op.
CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
temp.data<void>(), size, args...));
}
void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
array out = out_;
auto& encoder = cu::get_command_encoder(s);
encoder.set_input_array(in);
encoder.set_output_array(out);
if (axis < 0) {
axis += in.ndim();
}
int nsort = in.shape(axis);
int nsegments = in.data_size() / nsort;
int last_dim = in.ndim() - 1;
// If we are not sorting the innermost dimension of a contiguous array,
// transpose and make a copy.
bool is_segmented_sort = in.flags().contiguous && in.strides()[axis] == 1;
if (!is_segmented_sort) {
array trans = swapaxes_in_eval(in, axis, last_dim);
in = array(trans.shape(), trans.dtype(), nullptr, {});
copy_gpu(trans, in, CopyType::General, s);
encoder.add_temporary(in);
out = array(allocator::malloc(out.nbytes()), in.shape(), out.dtype());
encoder.add_temporary(out);
} else {
out.set_data(allocator::malloc(out.nbytes()));
}
encoder.launch_kernel([&](cudaStream_t stream) {
MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE, {
if constexpr (!std::is_same_v<CTYPE, complex64_t>) {
using Type = cuda_type_t<CTYPE>;
auto offsets = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[nsort] __device__(int i) { return i * nsort; });
if (argsort) {
// Indices in the sorted dimension.
array indices(
allocator::malloc(out.nbytes()), in.shape(), out.dtype());
encoder.add_temporary(indices);
thrust::transform(
cu::thrust_policy(stream),
thrust::counting_iterator<uint32_t>(0),
thrust::counting_iterator<uint32_t>(indices.data_size()),
thrust::device_pointer_cast(indices.data<uint32_t>()),
ModOp<uint32_t>{static_cast<uint32_t>(nsort)});
// In argsort we don't need the sorted values themselves, but the API
// requires an array to store them.
array discard(allocator::malloc(in.nbytes()), in.shape(), in.dtype());
encoder.add_temporary(discard);
segmented_sort_pairs(
encoder,
in.data<Type>(),
discard.data<Type>(),
indices.data<uint32_t>(),
out.data<uint32_t>(),
in.data_size(),
nsegments,
offsets,
offsets + 1,
stream);
} else {
segmented_sort(
encoder,
in.data<Type>(),
out.data<Type>(),
in.data_size(),
nsegments,
offsets,
offsets + 1,
stream);
}
} else {
throw std::runtime_error(
"CUDA backend does not support sorting complex numbers");
}
});
});
if (!is_segmented_sort) {
// Swap the sorted axis back.
// TODO: Do in-place transpose instead of using a temporary out array.
copy_gpu(swapaxes_in_eval(out, axis, last_dim), out_, CopyType::General, s);
}
}
} // namespace
void ArgSort::eval_gpu(const std::vector<array>& inputs, array& out) {
nvtx3::scoped_range r("ArgSort::eval_gpu");
assert(inputs.size() == 1);
gpu_sort(stream(), inputs[0], out, axis_, true);
}
void Sort::eval_gpu(const std::vector<array>& inputs, array& out) {
nvtx3::scoped_range r("Sort::eval_gpu");
assert(inputs.size() == 1);
gpu_sort(stream(), inputs[0], out, axis_, false);
}
} // namespace mlx::core
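
A usage-level sketch of the ops served by gpu_sort above, assuming a CUDA-enabled build; the note on which path each call takes is read from the is_segmented_sort branch in the code, not from the commit message.

import mlx.core as mx

x = mx.random.uniform(shape=(8, 128))

# Sorting the innermost axis of a contiguous array uses
# cub::DeviceSegmentedSort directly.
s_last = mx.sort(x, axis=-1)

# Sorting any other axis first swaps it to the end, copies, sorts, and
# copies back.
s_cols = mx.sort(x, axis=0)

# argsort takes the StableSortPairs path and returns uint32 indices.
idx = mx.argsort(x, axis=-1)

mx.eval(s_last, s_cols, idx)
print(idx.dtype)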

@@ -719,9 +719,9 @@ class Convolution : public UnaryPrimitive {
   bool is_equivalent(const Primitive& other) const override;
   auto state() const {
     return std::make_tuple(
+        kernel_strides_,
         padding_lo_,
         padding_hi_,
-        kernel_strides_,
         kernel_dilation_,
         input_dilation_,
         groups_,

@@ -6,6 +6,7 @@ import tempfile
 import unittest
 
 import mlx.core as mx
+import mlx.nn as nn
 import mlx_tests
 
@@ -312,6 +313,39 @@ class TestExportImport(mlx_tests.MLXTestCase):
         out = imported_fun(x, y, z)[0]
         self.assertTrue(mx.array_equal(expected, out))
 
+    def test_export_conv(self):
+        path = os.path.join(self.test_dir, "fn.mlxfn")
+
+        class Model(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.c1 = nn.Conv2d(
+                    3, 16, kernel_size=3, stride=1, padding=1, bias=False
+                )
+                self.c2 = nn.Conv2d(
+                    16, 16, kernel_size=3, stride=2, padding=1, bias=False
+                )
+                self.c3 = nn.Conv2d(
+                    16, 16, kernel_size=3, stride=1, padding=2, bias=False
+                )
+
+            def __call__(self, x):
+                return self.c3(self.c2(self.c1(x)))
+
+        model = Model()
+        mx.eval(model.parameters())
+
+        def forward(x):
+            return model(x)
+
+        input_data = mx.random.normal(shape=(4, 32, 32, 3))
+
+        mx.export_function(path, forward, input_data)
+        imported_fn = mx.import_function(path)
+        out = imported_fn(input_data)[0]
+        expected = forward(input_data)
+        self.assertTrue(mx.allclose(expected, out))
+
 
 if __name__ == "__main__":
     unittest.main()