rebase + nit (#2260 )

Co-authored-by: Awni Hannun <awni@apple.com>
fix conv export (#2265 )
2025-12-16 01:49:05 +08:00 · 2025-06-10 10:51:51 -07:00 · 2025-06-10 09:34:01 -07:00 · 2025-06-10 08:59:56 -07:00 · 2025-06-10 08:59:47 -07:00
15 changed files with 1002 additions and 32 deletions
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -7,7 +7,11 @@ target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_contiguous.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_dynamic.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_input.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/event.cu
@@ -15,7 +19,9 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/kernel_utils.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/random.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
@@ -26,6 +32,15 @@ target_compile_definitions(mlx PRIVATE MLX_USE_CUDA)
 target_compile_options(mlx
                       PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>")

+# CUDA 12.8 emits warning #20280-D for the copy kernels, which is a false
+# positive. Explicitly passing this flag as false suppresses the warning;
+# setting it to true is also safe, but the warning would not be suppressed.
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
+  target_compile_options(
+    mlx
+    PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>")
+endif()
+
 # Compute capability 7 is required for synchronization between CPU/GPU with
 # managed memory. TODO: Add more architectures for potential performance gain.
 set(MLX_CUDA_ARCHITECTURES
--- a/mlx/backend/cuda/copy.cpp
+++ b/mlx/backend/cuda/copy.cpp
@@ -1,26 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/gpu/copy.h"
-
-namespace mlx::core {
-
-void copy_gpu_inplace(
-    const array& in,
-    array& out,
-    const Shape& data_shape,
-    const Strides& strides_in_pre,
-    const Strides& strides_out_pre,
-    int64_t inp_offset,
-    int64_t out_offset,
-    CopyType ctype,
-    const Stream& s,
-    const std::optional<array>& dynamic_i_offset /* = std::nullopt */,
-    const std::optional<array>& dynamic_o_offset /* = std::nullopt */) {
-  throw std::runtime_error("copy_gpu_inplace not implemented in CUDA backend.");
-}
-
-void fill_gpu(const array& val, array& out, const Stream& s) {
-  throw std::runtime_error("fill_gpu not implemented in CUDA backend.");
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/copy.cu
+++ b/mlx/backend/cuda/copy.cu
@@ -0,0 +1,89 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+namespace mlx::core {
+
+// Copies `in_` into `out` on stream `s`, choosing a kernel family by `ctype`:
+//   - Scalar / Vector  -> copy_contiguous
+//   - General          -> copy_general_input (only the input strides are used)
+//   - GeneralGeneral   -> copy_general, or copy_general_dynamic when at least
+//                         one dynamic offset is supplied
+// `offset_in` / `offset_out` are forwarded to the kernels, which apply them as
+// element offsets to the typed data pointers. Does nothing when `out` has no
+// elements.
+void copy_gpu_inplace(
+    const array& in_,
+    array& out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out,
+    int64_t offset_in,
+    int64_t offset_out,
+    CopyType ctype,
+    const Stream& s,
+    const std::optional<array>& dynamic_offset_in,
+    const std::optional<array>& dynamic_offset_out) {
+  if (out.size() == 0) {
+    return;
+  }
+  // Fall back to `out` as the source when `in_` has no data buffer.
+  // NOTE(review): presumably this covers an input whose buffer was donated to
+  // `out` -- confirm against callers.
+  const array& in = in_.data_shared_ptr() ? in_ : out;
+
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+
+  if (ctype == CopyType::Scalar || ctype == CopyType::Vector) {
+    copy_contiguous(encoder, ctype, in, out, offset_in, offset_out);
+    return;
+  }
+
+  if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
+    // Collapse the shape and both stride sets together so they stay aligned.
+    // NOTE(review): INT32_MAX is presumably the size cap for a collapsed
+    // dimension so it fits the kernels' int32 shape -- confirm.
+    auto [shape_collapsed, strides_vec] = collapse_contiguous_dims(
+        shape, std::vector{strides_in, strides_out}, INT32_MAX);
+    if (ctype == CopyType::General) {
+      copy_general_input(
+          encoder,
+          ctype,
+          in,
+          out,
+          offset_in,
+          offset_out,
+          shape_collapsed,
+          strides_vec[0]);
+    } else {
+      if (dynamic_offset_in || dynamic_offset_out) {
+        // The dynamic kernel always takes both offsets, so a missing one is
+        // replaced by a zero-valued int64 scalar array.
+        // NOTE(review): the offset arrays are not registered with the encoder
+        // here and the substitutes are temporaries -- confirm that is safe for
+        // however launch_kernel schedules the work.
+        copy_general_dynamic(
+            encoder,
+            ctype,
+            in,
+            out,
+            offset_in,
+            offset_out,
+            shape_collapsed,
+            strides_vec[0],
+            strides_vec[1],
+            dynamic_offset_in ? *dynamic_offset_in : array(0, int64),
+            dynamic_offset_out ? *dynamic_offset_out : array(0, int64));
+      } else {
+        copy_general(
+            encoder,
+            ctype,
+            in,
+            out,
+            offset_in,
+            offset_out,
+            shape_collapsed,
+            strides_vec[0],
+            strides_vec[1]);
+      }
+    }
+    return;
+  }
+}
+
+// Allocates `out` and fills it from `in` using a Scalar-type contiguous copy
+// on stream `s`. An empty `out` is left untouched and unallocated.
+void fill_gpu(const array& in, array& out, const Stream& s) {
+  if (out.size() != 0) {
+    out.set_data(allocator::malloc(out.nbytes()));
+    auto& enc = cu::get_command_encoder(s);
+    enc.set_input_array(in);
+    enc.set_output_array(out);
+    copy_contiguous(
+        enc, CopyType::Scalar, in, out, /* offset_in */ 0, /* offset_out */ 0);
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy.cuh
+++ b/mlx/backend/cuda/copy/copy.cuh
@@ -0,0 +1,71 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/backend/cuda/kernels/cast_op.cuh"
+#include "mlx/backend/gpu/copy.h"
+#include "mlx/dtype_utils.h"
+
+namespace mlx::core {
+
+// Dispatches on the dtypes of `in` and `out`, binding the matching CUDA types
+// to the names given as `InType` and `OutType` before expanding the trailing
+// code block. Throws std::runtime_error when CastOp reports that the source
+// type cannot be cast to the destination type.
+#define MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, ...)    \
+  MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE_IN, {                  \
+    MLX_SWITCH_ALL_TYPES(out.dtype(), CTYPE_OUT, {              \
+      using InType = cuda_type_t<CTYPE_IN>;                     \
+      using OutType = cuda_type_t<CTYPE_OUT>;                   \
+      if constexpr (cu::CastOp<InType, OutType>::is_castable) { \
+        __VA_ARGS__;                                            \
+      } else {                                                  \
+        throw std::runtime_error(fmt::format(                   \
+            "Can not copy data from dtype {} to {}.",           \
+            dtype_to_string(in.dtype()),                        \
+            dtype_to_string(out.dtype())));                     \
+      }                                                         \
+    });                                                         \
+  })
+
+void copy_contiguous(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out);
+
+void copy_general(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out);
+
+void copy_general_dynamic(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out,
+    const array& dynamic_offset_in,
+    const array& dynamic_offset_out);
+
+void copy_general_input(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in);
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@@ -0,0 +1,56 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Scalar broadcast: every in-range thread writes the converted value of
+// in[0] to its own output slot.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_s(const In* in, Out* out, IdxT size) {
+  const IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  out[tid] = CastOp<In, Out>{}(in[0]);
+}
+
+// Element-wise copy: every in-range thread converts in[i] and stores it at
+// out[i] for its own grid rank i.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_v(const In* in, Out* out, IdxT size) {
+  const IdxT tid = cg::this_grid().thread_rank();
+  if (tid >= size) {
+    return;
+  }
+  out[tid] = CastOp<In, Out>{}(in[tid]);
+}
+
+} // namespace cu
+
+// Launches the contiguous copy kernel for `ctype`: copy_v for
+// CopyType::Vector, copy_s otherwise. The kernel receives out.data_size() as
+// its element count, and 64-bit indexing is selected only when that count
+// exceeds UINT32_MAX. `in_offset` / `out_offset` are element offsets applied
+// to the typed data pointers.
+void copy_contiguous(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t in_offset,
+    int64_t out_offset) {
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
+      MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
+        using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+        // Both kernels share a signature, so the pointer can be swapped.
+        auto kernel = cu::copy_s<InType, OutType, IdxT>;
+        if (ctype == CopyType::Vector) {
+          kernel = cu::copy_v<InType, OutType, IdxT>;
+        }
+        auto [num_blocks, block_dims] = get_launch_args(kernel, out, LARGE);
+        kernel<<<num_blocks, block_dims, 0, stream>>>(
+            in.data<InType>() + in_offset,
+            out.data<OutType>() + out_offset,
+            out.data_size());
+      });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_general.cu
+++ b/mlx/backend/cuda/copy/copy_general.cu
@@ -0,0 +1,95 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Strided-to-strided copy for a compile-time rank NDIM. Each in-range thread
+// maps its flat grid rank to an (input, output) location pair with
+// elem_to_loc_nd and copies one element, converting it with CastOp.
+template <typename In, typename Out, typename IdxT, int NDIM>
+__global__ void copy_gg_nd(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    auto [idx_in, idx_out] = elem_to_loc_nd<NDIM>(
+        index, shape.data(), strides_in.data(), strides_out.data());
+    out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
+  }
+}
+
+// Strided-to-strided copy with a runtime rank `ndim`. Same per-thread work as
+// copy_gg_nd, but the location pair comes from elem_to_loc_4d.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_gg(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides strides_in,
+    const __grid_constant__ Strides strides_out,
+    int ndim) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    auto [idx_in, idx_out] = elem_to_loc_4d(
+        index, shape.data(), strides_in.data(), strides_out.data(), ndim);
+    out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
+  }
+}
+
+} // namespace cu
+
+// Launches a strided-input, strided-output copy. Ranks 1-3 use the
+// rank-specialized copy_gg_nd kernel; higher ranks use copy_gg with the rank
+// passed at runtime. `offset_in` / `offset_out` are element offsets applied to
+// the typed data pointers.
+// NOTE(review): the kernels are given out.data_size() as the element count
+// rather than the product of `shape` -- confirm these always agree for the
+// arrays passed here.
+void copy_general(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out) {
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
+      const InType* in_ptr = in.data<InType>() + offset_in;
+      OutType* out_ptr = out.data<OutType>() + offset_out;
+      // Use 64-bit indexing only if either buffer exceeds the 32-bit range.
+      bool large = in.data_size() > UINT32_MAX || out.data_size() > UINT32_MAX;
+      MLX_SWITCH_BOOL(large, LARGE, {
+        using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+        int ndim = shape.size();
+        if (ndim <= 3) {
+          MLX_SWITCH_1_2_3(ndim, NDIM, {
+            auto kernel = cu::copy_gg_nd<InType, OutType, IdxT, NDIM>;
+            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+            kernel<<<num_blocks, block_dims, 0, stream>>>(
+                in_ptr,
+                out_ptr,
+                out.data_size(),
+                const_param<NDIM>(shape),
+                const_param<NDIM>(strides_in),
+                const_param<NDIM>(strides_out));
+          });
+        } else { // ndim >= 4
+          auto kernel = cu::copy_gg<InType, OutType, IdxT>;
+          auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+          kernel<<<num_blocks, block_dims, 0, stream>>>(
+              in_ptr,
+              out_ptr,
+              out.data_size(),
+              const_param(shape),
+              const_param(strides_in),
+              const_param(strides_out),
+              ndim);
+        }
+      });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_general_dynamic.cu
+++ b/mlx/backend/cuda/copy/copy_general_dynamic.cu
@@ -0,0 +1,105 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Strided-to-strided copy for a compile-time rank NDIM with extra offsets
+// that are read on the device. `offset_in` / `offset_out` each point to a
+// single int64 that is added to the computed input / output location.
+template <typename In, typename Out, typename IdxT, int NDIM>
+__global__ void copy_gg_dynamic_nd(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out,
+    const int64_t* offset_in,
+    const int64_t* offset_out) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    auto [idx_in, idx_out] = elem_to_loc_nd<NDIM>(
+        index, shape.data(), strides_in.data(), strides_out.data());
+    out[idx_out + *offset_out] = CastOp<In, Out>{}(in[idx_in + *offset_in]);
+  }
+}
+
+// Runtime-rank variant of copy_gg_dynamic_nd: locations come from
+// elem_to_loc_4d with `ndim`, and the device-side offsets are added in the
+// same way.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_gg_dynamic(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides strides_in,
+    const __grid_constant__ Strides strides_out,
+    int ndim,
+    const int64_t* offset_in,
+    const int64_t* offset_out) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    auto [idx_in, idx_out] = elem_to_loc_4d(
+        index, shape.data(), strides_in.data(), strides_out.data(), ndim);
+    out[idx_out + *offset_out] = CastOp<In, Out>{}(in[idx_in + *offset_in]);
+  }
+}
+
+} // namespace cu
+
+// Launches a strided-to-strided copy whose extra input/output offsets live in
+// the int64 arrays `dynamic_offset_in` / `dynamic_offset_out` and are read by
+// the kernel. The static `offset_in` / `offset_out` are still applied on the
+// host as element offsets to the typed data pointers. Ranks 1-3 use
+// copy_gg_dynamic_nd; higher ranks use copy_gg_dynamic.
+void copy_general_dynamic(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in,
+    const Strides& strides_out,
+    const array& dynamic_offset_in,
+    const array& dynamic_offset_out) {
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
+      const InType* in_ptr = in.data<InType>() + offset_in;
+      OutType* out_ptr = out.data<OutType>() + offset_out;
+      // Use 64-bit indexing only if either buffer exceeds the 32-bit range.
+      bool large = in.data_size() > UINT32_MAX || out.data_size() > UINT32_MAX;
+      MLX_SWITCH_BOOL(large, LARGE, {
+        using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+        int ndim = shape.size();
+        if (ndim <= 3) {
+          MLX_SWITCH_1_2_3(ndim, NDIM, {
+            auto kernel = cu::copy_gg_dynamic_nd<InType, OutType, IdxT, NDIM>;
+            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+            kernel<<<num_blocks, block_dims, 0, stream>>>(
+                in_ptr,
+                out_ptr,
+                out.data_size(),
+                const_param<NDIM>(shape),
+                const_param<NDIM>(strides_in),
+                const_param<NDIM>(strides_out),
+                dynamic_offset_in.data<int64_t>(),
+                dynamic_offset_out.data<int64_t>());
+          });
+        } else { // ndim >= 4
+          auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
+          auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+          kernel<<<num_blocks, block_dims, 0, stream>>>(
+              in_ptr,
+              out_ptr,
+              out.data_size(),
+              const_param(shape),
+              const_param(strides_in),
+              const_param(strides_out),
+              ndim,
+              dynamic_offset_in.data<int64_t>(),
+              dynamic_offset_out.data<int64_t>());
+        }
+      });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -0,0 +1,88 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/copy/copy.cuh"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Strided-input copy for a compile-time rank NDIM. Each in-range thread
+// computes its input location with elem_to_loc_nd and writes the converted
+// element to out[index], i.e. the output is addressed by the flat grid rank.
+template <typename In, typename Out, typename IdxT, int NDIM>
+__global__ void copy_g_nd(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    IdxT idx_in = elem_to_loc_nd<NDIM>(index, shape.data(), strides_in.data());
+    out[index] = CastOp<In, Out>{}(in[idx_in]);
+  }
+}
+
+// Runtime-rank variant of copy_g_nd: the input location comes from
+// elem_to_loc_4d with `ndim`; the output is addressed by the flat grid rank.
+template <typename In, typename Out, typename IdxT>
+__global__ void copy_g(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides strides_in,
+    int ndim) {
+  IdxT index = cg::this_grid().thread_rank();
+  if (index < size) {
+    IdxT idx_in = elem_to_loc_4d(index, shape.data(), strides_in.data(), ndim);
+    out[index] = CastOp<In, Out>{}(in[idx_in]);
+  }
+}
+
+} // namespace cu
+
+// Launches a copy from a strided input into an output that the kernels
+// address by flat index. Ranks 1-3 use copy_g_nd; higher ranks use copy_g
+// with the rank passed at runtime. `offset_in` / `offset_out` are element
+// offsets applied to the typed data pointers.
+void copy_general_input(
+    cu::CommandEncoder& encoder,
+    CopyType ctype,
+    const array& in,
+    array& out,
+    int64_t offset_in,
+    int64_t offset_out,
+    const Shape& shape,
+    const Strides& strides_in) {
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
+      const InType* in_ptr = in.data<InType>() + offset_in;
+      OutType* out_ptr = out.data<OutType>() + offset_out;
+      // Use 64-bit indexing only if either buffer exceeds the 32-bit range.
+      bool large = in.data_size() > UINT32_MAX || out.data_size() > UINT32_MAX;
+      MLX_SWITCH_BOOL(large, LARGE, {
+        using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+        int ndim = shape.size();
+        if (ndim <= 3) {
+          MLX_SWITCH_1_2_3(ndim, NDIM, {
+            auto kernel = cu::copy_g_nd<InType, OutType, IdxT, NDIM>;
+            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+            kernel<<<num_blocks, block_dims, 0, stream>>>(
+                in_ptr,
+                out_ptr,
+                out.data_size(),
+                const_param<NDIM>(shape),
+                const_param<NDIM>(strides_in));
+          });
+        } else { // ndim >= 4
+          auto kernel = cu::copy_g<InType, OutType, IdxT>;
+          auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+          kernel<<<num_blocks, block_dims, 0, stream>>>(
+              in_ptr,
+              out_ptr,
+              out.data_size(),
+              const_param(shape),
+              const_param(strides_in),
+              ndim);
+        }
+      });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/kernels/cast_op.cuh
+++ b/mlx/backend/cuda/kernels/cast_op.cuh
@@ -0,0 +1,59 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <cuComplex.h>
+#include <thrust/iterator/transform_iterator.h>
+
+namespace mlx::core::cu {
+
+// An op that does static_cast, with custom conversions for some types.
+// This primary template covers every pair not matched by a specialization.
+// `is_castable` lets callers reject unsupported pairs at compile time (see
+// its use with `if constexpr` in MLX_SWITCH_COPY_TYPES).
+template <typename SrcT, typename DstT, typename = void>
+struct CastOp {
+  static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, DstT>;
+
+  __device__ DstT operator()(SrcT x) {
+    return static_cast<DstT>(x);
+  }
+};
+
+// Converting a complex number to real number discards the imaginary part.
+// Selected for cuComplex -> DstT whenever DstT is not cuComplex itself; the
+// pair is castable when float (the type of the real part) converts to DstT.
+template <typename DstT>
+struct CastOp<
+    cuComplex,
+    DstT,
+    cuda::std::enable_if_t<!cuda::std::is_same_v<cuComplex, DstT>>> {
+  static constexpr bool is_castable = cuda::std::is_convertible_v<float, DstT>;
+
+  __device__ DstT operator()(cuComplex x) {
+    static_assert(!cuda::std::is_same_v<cuComplex, DstT>);
+    return static_cast<DstT>(cuCrealf(x));
+  }
+};
+
+// Allow converting a real number to complex number.
+// Selected for SrcT -> cuComplex whenever SrcT is not cuComplex itself; the
+// value becomes the real part and the imaginary part is set to 0.
+template <typename SrcT>
+struct CastOp<
+    SrcT,
+    cuComplex,
+    cuda::std::enable_if_t<!cuda::std::is_same_v<SrcT, cuComplex>>> {
+  static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, float>;
+
+  __device__ cuComplex operator()(SrcT x) {
+    static_assert(!cuda::std::is_same_v<SrcT, cuComplex>);
+    return cuComplex{static_cast<float>(x), 0};
+  }
+};
+
+// Returns an iterator that casts each value to DstT using CastOp. When the
+// iterator's value type already is DstT, `it` is returned unchanged, so the
+// return type differs between the two cases.
+template <typename DstT, typename Iterator>
+__host__ __device__ auto make_cast_iterator(Iterator it) {
+  using SrcT = typename cuda::std::iterator_traits<Iterator>::value_type;
+  // Use the cuda::std trait like the rest of this header, which does not
+  // include <type_traits> itself.
+  if constexpr (cuda::std::is_same_v<SrcT, DstT>) {
+    return it;
+  } else {
+    return thrust::make_transform_iterator(it, CastOp<SrcT, DstT>{});
+  }
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/primitives.cu
+++ b/mlx/backend/cuda/primitives.cu
@@ -73,7 +73,6 @@ bool fast::ScaledDotProductAttention::use_fallback(

 NO_GPU(ArgPartition)
 NO_GPU(ArgReduce)
-NO_GPU(ArgSort)
 NO_GPU(BlockMaskedMM)
 NO_GPU_MULTI(Compiled)
 NO_GPU(Convolution)
@@ -92,7 +91,6 @@ NO_GPU_MULTI(LUF)
 NO_GPU(Partition)
 NO_GPU_MULTI(QRF)
 NO_GPU(QuantizedMatmul)
-NO_GPU(RandomBits)
 NO_GPU(Reduce)
 NO_GPU(Scan)
 NO_GPU(Scatter)
@@ -100,7 +98,6 @@ NO_GPU(ScatterAxis)
 NO_GPU(Select)
 NO_GPU(SliceUpdate)
 NO_GPU(Softmax)
-NO_GPU(Sort)
 NO_GPU_MULTI(SVD)
 NO_GPU(Inverse)
 NO_GPU(Cholesky)
--- a/mlx/backend/cuda/random.cu
+++ b/mlx/backend/cuda/random.cu
@@ -0,0 +1,181 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/primitives.h"
+
+#include <nvtx3/nvtx3.hpp>
+
+#include <cassert>
+
+namespace mlx::core {
+
+namespace cu {
+
+// Left-rotation amounts used by threefry2x32_hash. The hash alternates
+// between the two rows, applying all four rotations of a row in order.
+__constant__ constexpr uint32_t rotations[2][4] = {
+    {13, 15, 26, 6},
+    {17, 29, 16, 24}};
+
+// Hash output viewed either as two 32-bit words (`val`) or as the same eight
+// bytes grouped per word (`bytes[word][byte]`), which is how the kernels
+// store it.
+union rbits {
+  uint2 val;
+  uint8_t bytes[2][4];
+};
+
+// Hashes the two-word counter `count` under the two-word `key` and returns
+// both result words. Runs 5 groups of 4 add-rotate-xor rounds; after each
+// group the key-schedule words are injected, together with the group number.
+__device__ rbits threefry2x32_hash(uint2 key, uint2 count) {
+  // Key schedule: the two key words plus their xor with a fixed constant.
+  uint32_t ks[] = {key.x, key.y, key.x ^ key.y ^ 0x1BD11BDA};
+
+  rbits v;
+  v.val.x = count.x + ks[0];
+  v.val.y = count.y + ks[1];
+
+  for (int i = 0; i < 5; ++i) {
+    // Even groups use rotations[0], odd groups use rotations[1].
+    for (auto r : rotations[i % 2]) {
+      v.val.x += v.val.y;
+      v.val.y = (v.val.y << r) | (v.val.y >> (32 - r));
+      v.val.y ^= v.val.x;
+    }
+    v.val.x += ks[(i + 1) % 3];
+    v.val.y += ks[(i + 2) % 3] + i + 1;
+  }
+
+  return v;
+}
+
+// Generates random bytes for keys stored contiguously as consecutive pairs.
+// Thread (x, y) hashes key number x with the counter pair
+// (y, y + grid_dims.y) and stores the two resulting 32-bit words in word
+// slots y and y + grid_dims.y of that key's `bytes_per_key`-byte output
+// region. When `odd` is set, the last row (y == grid_dims.y - 1) keeps only
+// its first word. The second word of row half_size - 1 is truncated to
+// bytes_per_key % 4 bytes when bytes_per_key is not a multiple of 4.
+// NOTE(review): the first word is always stored as 4 full bytes, even if
+// bytes_per_key < 4 -- confirm the output buffer tolerates that.
+__global__ void rbitsc(
+    const uint32_t* keys,
+    uint8_t* out,
+    dim3 grid_dims,
+    bool odd,
+    uint32_t bytes_per_key) {
+  uint2 index{
+      blockIdx.x * blockDim.x + threadIdx.x,
+      blockIdx.y * blockDim.y + threadIdx.y};
+  if (index.x >= grid_dims.x || index.y >= grid_dims.y) {
+    return;
+  }
+
+  auto kidx = 2 * index.x;
+  auto key = uint2{keys[kidx], keys[kidx + 1]};
+  auto half_size = grid_dims.y - odd;
+  // Widen before multiplying so the byte offset cannot wrap at 4 GiB; this
+  // matches the strided `rbits` kernel.
+  out += size_t(index.x) * bytes_per_key;
+  bool drop_last = odd && (index.y == half_size);
+  auto bits = threefry2x32_hash(
+      key, uint2{index.y, drop_last ? 0 : index.y + grid_dims.y});
+  size_t idx = size_t(index.y) << 2;
+  for (int i = 0; i < 4; ++i) {
+    out[idx + i] = bits.bytes[0][i];
+  }
+  if (!drop_last) {
+    // drop_last is false here, so the second word always goes to slot
+    // y + grid_dims.y.
+    idx = (size_t(index.y) + grid_dims.y) << 2;
+    if ((index.y + 1) == half_size && (bytes_per_key % 4) > 0) {
+      int edge_bytes = (bytes_per_key % 4);
+      for (int i = 0; i < edge_bytes; ++i) {
+        out[idx + i] = bits.bytes[1][i];
+      }
+    } else {
+      for (int i = 0; i < 4; ++i) {
+        out[idx + i] = bits.bytes[1][i];
+      }
+    }
+  }
+}
+
+// Strided-key variant of rbitsc: the two words of key number x are located
+// with elem_to_loc over `key_shape` / `key_strides` instead of being read
+// from consecutive positions. The output layout and the `odd` / edge-byte
+// handling are the same as in rbitsc.
+// NOTE(review): the first word is always stored as 4 full bytes, even if
+// bytes_per_key < 4 -- confirm the output buffer tolerates that.
+__global__ void rbits(
+    const uint32_t* keys,
+    uint8_t* out,
+    dim3 grid_dims,
+    bool odd,
+    uint32_t bytes_per_key,
+    int32_t ndim,
+    const __grid_constant__ Shape key_shape,
+    const __grid_constant__ Strides key_strides) {
+  uint2 index{
+      blockIdx.x * blockDim.x + threadIdx.x,
+      blockIdx.y * blockDim.y + threadIdx.y};
+  if (index.x >= grid_dims.x || index.y >= grid_dims.y) {
+    return;
+  }
+
+  auto kidx = 2 * index.x;
+  auto k1_elem = elem_to_loc(kidx, key_shape.data(), key_strides.data(), ndim);
+  auto k2_elem =
+      elem_to_loc(kidx + 1, key_shape.data(), key_strides.data(), ndim);
+  auto key = uint2{keys[k1_elem], keys[k2_elem]};
+  auto half_size = grid_dims.y - odd;
+  // Widened to size_t so the byte offset is computed in 64 bits.
+  out += size_t(index.x) * bytes_per_key;
+  // With an odd word count the last row only contributes its first word.
+  bool drop_last = odd && (index.y == half_size);
+  auto bits = threefry2x32_hash(
+      key, uint2{index.y, drop_last ? 0 : index.y + grid_dims.y});
+  size_t idx = size_t(index.y) << 2;
+  for (int i = 0; i < 4; ++i) {
+    out[idx + i] = bits.bytes[0][i];
+  }
+  if (!drop_last) {
+    idx = (drop_last ? 0 : size_t(index.y) + grid_dims.y) << 2;
+    // Only write the leftover bytes when the key's region does not end on a
+    // 4-byte boundary.
+    if ((index.y + 1) == half_size && (bytes_per_key % 4) > 0) {
+      int edge_bytes = (bytes_per_key % 4);
+      for (int i = 0; i < edge_bytes; ++i) {
+        out[idx + i] = bits.bytes[1][i];
+      }
+    } else {
+      for (int i = 0; i < 4; ++i) {
+        out[idx + i] = bits.bytes[1][i];
+      }
+    }
+  }
+}
+
+} // namespace cu
+
+// Fills `out` with random bytes derived from the keys in inputs[0]. Launches
+// a 2D grid with one row of threads per key; each thread produces up to two
+// 32-bit words. Row-contiguous keys use the rbitsc kernel, any other layout
+// uses rbits with the key shape and strides.
+void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("RandomBits::eval_gpu");
+  assert(inputs.size() == 1);
+
+  // keys has shape (N1, ..., NK, 2)
+  // out has shape (N1, ..., NK, M1, M2, ...)
+  auto& keys = inputs[0];
+
+  out.set_data(allocator::malloc(out.nbytes()));
+  // Return before the per-key arithmetic: an empty output can come from zero
+  // keys, and dividing by num_keys == 0 below would be undefined behavior.
+  if (out.size() == 0) {
+    return;
+  }
+
+  uint32_t num_keys = keys.size() / 2;
+  uint32_t elems_per_key = out.size() / num_keys;
+  uint32_t bytes_per_key = out.itemsize() * elems_per_key;
+
+  // Number of 32-bit words per key, rounded up. Each thread emits two words,
+  // so an odd count needs one extra thread that emits a single word.
+  uint32_t out_per_key = (bytes_per_key + 4 - 1) / 4;
+  uint32_t half_size = out_per_key / 2;
+  bool odd = out_per_key % 2;
+
+  auto& s = stream();
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(keys);
+  encoder.set_output_array(out);
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    dim3 grid_dims{num_keys, half_size + odd};
+    dim3 block_dims = get_block_dims(grid_dims.x, grid_dims.y, 1);
+    dim3 num_blocks{
+        cuda::ceil_div(grid_dims.x, block_dims.x),
+        cuda::ceil_div(grid_dims.y, block_dims.y)};
+    if (keys.flags().row_contiguous) {
+      cu::rbitsc<<<num_blocks, block_dims, 0, stream>>>(
+          keys.data<uint32_t>(),
+          out.data<uint8_t>(),
+          grid_dims,
+          odd,
+          bytes_per_key);
+    } else {
+      cu::rbits<<<num_blocks, block_dims, 0, stream>>>(
+          keys.data<uint32_t>(),
+          out.data<uint8_t>(),
+          grid_dims,
+          odd,
+          bytes_per_key,
+          keys.ndim(),
+          const_param(keys.shape()),
+          const_param(keys.strides()));
+    }
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/slicing.cpp
+++ b/mlx/backend/cuda/slicing.cpp
@@ -1,7 +1,11 @@
 // Copyright © 2025 Apple Inc.

+#include "mlx/backend/common/slicing.h"
+#include "mlx/backend/gpu/copy.h"
 #include "mlx/backend/gpu/slicing.h"

+#include <numeric>
+
 namespace mlx::core {

 void concatenate_gpu(
@@ -9,7 +13,29 @@ void concatenate_gpu(
    array& out,
    int axis,
    const Stream& s) {
-  throw std::runtime_error("concatenate_gpu not implemented in CUDA backend.");
+  std::vector<int> sizes;
+  sizes.push_back(0);
+  for (auto& p : inputs) {
+    sizes.push_back(p.shape(axis));
+  }
+  std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());
+
+  out.set_data(allocator::malloc(out.nbytes()));
+
+  auto strides = out.strides();
+  auto flags = out.flags();
+  flags.row_contiguous = false;
+  flags.col_contiguous = false;
+  flags.contiguous = false;
+  // TODO: Handle concurrent outputs:
+  // https://github.com/ml-explore/mlx/pull/2145#discussion_r2070753816
+  for (int i = 0; i < inputs.size(); i++) {
+    array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
+    size_t data_offset = strides[axis] * sizes[i];
+    out_slice.copy_shared_buffer(
+        out, strides, flags, out_slice.size(), data_offset);
+    copy_gpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, s);
+  }
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/sort.cu
+++ b/mlx/backend/cuda/sort.cu
@@ -0,0 +1,180 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/common/utils.h"
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/backend/gpu/copy.h"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include <nvtx3/nvtx3.hpp>
+#include <thrust/device_ptr.h>
+#include <thrust/transform.h>
+#include <cub/device/device_segmented_sort.cuh>
+
+#include <cassert>
+#include <numeric>
+
+namespace mlx::core {
+
+namespace {
+
+// Unary device functor returning `x % divisor`. gpu_sort uses it to turn a
+// flat element index into a position within its sorted segment.
+template <typename T>
+struct ModOp {
+  T divisor;
+  __device__ T operator()(T x) {
+    return x % divisor;
+  }
+};
+
+// Ops cannot be used inside eval, so this utility builds the swapped view
+// directly. Returns an array that shares `in`'s buffer with `axis1` and
+// `axis2` exchanged in both shape and strides; no data is moved. The axes are
+// used as direct indices, so they must already be non-negative.
+array swapaxes_in_eval(const array& in, int axis1, int axis2) {
+  std::vector<int> axes(in.ndim());
+  std::iota(axes.begin(), axes.end(), 0);
+  std::swap(axes[axis1], axes[axis2]);
+  // TODO: Share the code with Transpose::eval.
+  Shape shape(axes.size());
+  Strides strides(in.ndim());
+  for (size_t ax = 0; ax < axes.size(); ++ax) {
+    shape[ax] = in.shape()[axes[ax]];
+    strides[ax] = in.strides()[axes[ax]];
+  }
+  auto flags = in.flags();
+  // Row/column contiguity is only recomputed when the input is contiguous;
+  // otherwise the input's flags are carried over unchanged.
+  if (flags.contiguous) {
+    auto [_, row_contiguous, col_contiguous] = check_contiguity(shape, strides);
+    flags.row_contiguous = row_contiguous;
+    flags.col_contiguous = col_contiguous;
+  }
+  array out(shape, in.dtype(), nullptr, {});
+  out.copy_shared_buffer(in, strides, flags, in.data_size());
+  return out;
+}
+
+// Runs cub::DeviceSegmentedSort::StableSortPairs with `args`. The first call
+// passes a null temp-storage pointer to obtain the required byte count in
+// `size`; the second call does the sort using a buffer of that size, which is
+// handed to the encoder as a temporary.
+template <typename... Args>
+void segmented_sort_pairs(cu::CommandEncoder& encoder, Args&&... args) {
+  // Allocate temporary storage.
+  size_t size;
+  CHECK_CUDA_ERROR(
+      cub::DeviceSegmentedSort::StableSortPairs(nullptr, size, args...));
+  array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
+  encoder.add_temporary(temp);
+  // Run op.
+  CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
+      temp.data<void>(), size, args...));
+}
+
+// Runs cub::DeviceSegmentedSort::StableSortKeys with `args`. The first call
+// passes a null temp-storage pointer to obtain the required byte count in
+// `size`; the second call does the sort using a buffer of that size, which is
+// handed to the encoder as a temporary.
+template <typename... Args>
+void segmented_sort(cu::CommandEncoder& encoder, Args&&... args) {
+  // Allocate temporary storage.
+  size_t size;
+  CHECK_CUDA_ERROR(
+      cub::DeviceSegmentedSort::StableSortKeys(nullptr, size, args...));
+  array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
+  encoder.add_temporary(temp);
+  // Run op.
+  CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
+      temp.data<void>(), size, args...));
+}
+
+// Sorts `in` along `axis` on stream `s` and writes the result to `out_`.
+// With `argsort` set, `out_` receives uint32 positions within the sorted axis
+// instead of the sorted values. The sort itself is a CUB stable segmented
+// sort over segments of `in.shape(axis)` consecutive elements, so when the
+// sorted axis is not the unit-stride axis of a contiguous array the input is
+// first copied into a layout with that axis last, and the result is swapped
+// back afterwards. Throws std::runtime_error for complex input.
+void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
+  // Nothing to sort. Returning here also keeps the segment-count division
+  // below from dividing by a zero-length sort axis.
+  if (out_.size() == 0) {
+    out_.set_data(allocator::malloc(out_.nbytes()));
+    return;
+  }
+
+  array out = out_;
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+
+  if (axis < 0) {
+    axis += in.ndim();
+  }
+  int nsort = in.shape(axis);
+  int nsegments = in.data_size() / nsort;
+  int last_dim = in.ndim() - 1;
+
+  // If we are not sorting the innermost dimension of a contiguous array,
+  // transpose and make a copy.
+  bool is_segmented_sort = in.flags().contiguous && in.strides()[axis] == 1;
+  if (!is_segmented_sort) {
+    array trans = swapaxes_in_eval(in, axis, last_dim);
+    in = array(trans.shape(), trans.dtype(), nullptr, {});
+    copy_gpu(trans, in, CopyType::General, s);
+    encoder.add_temporary(in);
+    // Sort into a scratch array; it is copied into `out_` at the end.
+    out = array(allocator::malloc(out.nbytes()), in.shape(), out.dtype());
+    encoder.add_temporary(out);
+  } else {
+    out.set_data(allocator::malloc(out.nbytes()));
+  }
+
+  encoder.launch_kernel([&](cudaStream_t stream) {
+    MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE, {
+      if constexpr (!std::is_same_v<CTYPE, complex64_t>) {
+        using Type = cuda_type_t<CTYPE>;
+        // Segment i spans [i * nsort, (i + 1) * nsort); `offsets + 1` is used
+        // as the end-offset iterator.
+        auto offsets = thrust::make_transform_iterator(
+            thrust::make_counting_iterator(0),
+            [nsort] __device__(int i) { return i * nsort; });
+        if (argsort) {
+          // Indices in the sorted dimension.
+          array indices(
+              allocator::malloc(out.nbytes()), in.shape(), out.dtype());
+          encoder.add_temporary(indices);
+          thrust::transform(
+              cu::thrust_policy(stream),
+              thrust::counting_iterator<uint32_t>(0),
+              thrust::counting_iterator<uint32_t>(indices.data_size()),
+              thrust::device_pointer_cast(indices.data<uint32_t>()),
+              ModOp<uint32_t>{static_cast<uint32_t>(nsort)});
+
+          // In argsort though we don't need the result of sorted values, the
+          // API requires us to provide an array to store it.
+          array discard(allocator::malloc(in.nbytes()), in.shape(), in.dtype());
+          encoder.add_temporary(discard);
+
+          segmented_sort_pairs(
+              encoder,
+              in.data<Type>(),
+              discard.data<Type>(),
+              indices.data<uint32_t>(),
+              out.data<uint32_t>(),
+              in.data_size(),
+              nsegments,
+              offsets,
+              offsets + 1,
+              stream);
+        } else {
+          segmented_sort(
+              encoder,
+              in.data<Type>(),
+              out.data<Type>(),
+              in.data_size(),
+              nsegments,
+              offsets,
+              offsets + 1,
+              stream);
+        }
+      } else {
+        throw std::runtime_error(
+            "CUDA backend does not support sorting complex numbers");
+      }
+    });
+  });
+
+  if (!is_segmented_sort) {
+    // Swap the sorted axis back.
+    // TODO: Do in-place transpose instead of using a temporary out array.
+    copy_gpu(swapaxes_in_eval(out, axis, last_dim), out_, CopyType::General, s);
+  }
+}
+
+} // namespace
+
+// Evaluates ArgSort on the GPU by delegating to gpu_sort along `axis_` with
+// argsort enabled.
+void ArgSort::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("ArgSort::eval_gpu");
+  assert(inputs.size() == 1);
+  gpu_sort(stream(), inputs[0], out, axis_, true);
+}
+
+// Evaluates Sort on the GPU by delegating to gpu_sort along `axis_` with
+// argsort disabled.
+void Sort::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("Sort::eval_gpu");
+  assert(inputs.size() == 1);
+  gpu_sort(stream(), inputs[0], out, axis_, false);
+}
+
+} // namespace mlx::core
--- a/mlx/primitives.h
+++ b/mlx/primitives.h
@@ -719,9 +719,9 @@ class Convolution : public UnaryPrimitive {
  bool is_equivalent(const Primitive& other) const override;
  auto state() const {
    return std::make_tuple(
+        kernel_strides_,
        padding_lo_,
        padding_hi_,
-        kernel_strides_,
        kernel_dilation_,
        input_dilation_,
        groups_,
--- a/python/tests/test_export_import.py
+++ b/python/tests/test_export_import.py
@@ -6,6 +6,7 @@ import tempfile
 import unittest

 import mlx.core as mx
+import mlx.nn as nn
 import mlx_tests


@@ -312,6 +313,39 @@ class TestExportImport(mlx_tests.MLXTestCase):
        out = imported_fun(x, y, z)[0]
        self.assertTrue(mx.array_equal(expected, out))

+    def test_export_conv(self):
+        path = os.path.join(self.test_dir, "fn.mlxfn")
+
+        class Model(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.c1 = nn.Conv2d(
+                    3, 16, kernel_size=3, stride=1, padding=1, bias=False
+                )
+                self.c2 = nn.Conv2d(
+                    16, 16, kernel_size=3, stride=2, padding=1, bias=False
+                )
+                self.c3 = nn.Conv2d(
+                    16, 16, kernel_size=3, stride=1, padding=2, bias=False
+                )
+
+            def __call__(self, x):
+                return self.c3(self.c2(self.c1(x)))
+
+        model = Model()
+        mx.eval(model.parameters())
+
+        def forward(x):
+            return model(x)
+
+        input_data = mx.random.normal(shape=(4, 32, 32, 3))
+        mx.export_function(path, forward, input_data)
+
+        imported_fn = mx.import_function(path)
+        out = imported_fn(input_data)[0]
+        expected = forward(input_data)
+        self.assertTrue(mx.allclose(expected, out))
+

 if __name__ == "__main__":
    unittest.main()
Author	SHA1	Message	Date
Cheng	99c33d011d	rebase + nit (#2260 ) Co-authored-by: Awni Hannun <awni@apple.com>	2025-06-10 10:51:51 -07:00
Awni Hannun	62fecf3e13	fix conv export (#2265 )	2025-06-10 09:34:01 -07:00
Cheng	7c4eb5d03e	CUDA backend: random (#2261 )	2025-06-10 08:59:56 -07:00
Cheng	bae9a6b404	CUDA backend: sort (#2262 ) Co-authored-by: Awni Hannun <awni@apple.com>	2025-06-10 08:59:47 -07:00