[CUDA] Fix back-end bugs and enable corresponding tests (#2296)

* Fix some CUDA back-end bugs and enable corresponding tests
* more fixes
* enable more tests
* format
2025-12-16 01:49:05 +08:00 · 2025-06-16 08:45:40 -07:00
parent 4fda5fbdf9
commit c552ff2451
16 changed files with 115 additions and 98 deletions
--- a/mlx/backend/cuda/binary.cu
+++ b/mlx/backend/cuda/binary.cu
@@ -165,7 +165,7 @@ void binary_op_gpu_inplace(
                      a.data<InType>(),
                      b.data<InType>(),
                      out.data<OutType>(),
-                      out.data_size(),
+                      out.size(),
                      const_param<NDIM>(shape),
                      const_param<NDIM>(a_strides),
                      const_param<NDIM>(b_strides));
@@ -178,7 +178,7 @@ void binary_op_gpu_inplace(
                    a.data<InType>(),
                    b.data<InType>(),
                    out.data<OutType>(),
-                    out.data_size(),
+                    out.size(),
                    const_param(shape),
                    const_param(a_strides),
                    const_param(b_strides),
@@ -196,8 +196,8 @@ void binary_op_gpu_inplace(
              } else if (bopt == BinaryOpType::VectorVector) {
                kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
              }
-              auto [num_blocks, block_dims] =
-                  get_launch_args(kernel, out, LARGE);
+              auto [num_blocks, block_dims] = get_launch_args(
+                  kernel, out.data_size(), out.shape(), out.strides(), LARGE);
              kernel<<<num_blocks, block_dims, 0, stream>>>(
                  a.data<InType>(),
                  b.data<InType>(),
@@ -264,7 +264,6 @@ BINARY_GPU(Add)
 BINARY_GPU(ArcTan2)
 BINARY_GPU(Divide)
 BINARY_GPU(Remainder)
-BINARY_GPU(Equal)
 BINARY_GPU(Greater)
 BINARY_GPU(GreaterEqual)
 BINARY_GPU(Less)
@@ -279,6 +278,17 @@ BINARY_GPU(NotEqual)
 BINARY_GPU(Power)
 BINARY_GPU(Subtract)

+void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
+  // NVTX range labelled with this method's name, alive until the function
+  // returns.
+  nvtx3::scoped_range nvtx_range("Equal::eval_gpu");
+  auto& stream = out.primitive().stream();
+  auto op_name = get_primitive_string(this);
+  // equal_nan_ picks which comparison functor is handed to binary_op_gpu.
+  if (!equal_nan_) {
+    binary_op_gpu<cu::Equal>(inputs, out, op_name, stream);
+  } else {
+    binary_op_gpu<cu::NaNEqual>(inputs, out, op_name, stream);
+  }
+}
+
 void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("BitwiseBinary::eval_gpu");
  auto& s = out.primitive().stream();
--- a/mlx/backend/cuda/copy.cu
+++ b/mlx/backend/cuda/copy.cu
@@ -6,7 +6,7 @@
 namespace mlx::core {

 void copy_gpu_inplace(
-    const array& in_,
+    const array& in,
    array& out,
    const Shape& shape,
    const Strides& strides_in,
@@ -20,7 +20,6 @@ void copy_gpu_inplace(
  if (out.size() == 0) {
    return;
  }
-  const array& in = in_.data_shared_ptr() ? in_ : out;

  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
--- a/mlx/backend/cuda/copy/copy.cuh
+++ b/mlx/backend/cuda/copy/copy.cuh
@@ -10,20 +10,13 @@

 namespace mlx::core {

-#define MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, ...)    \
-  MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE_IN, {                  \
-    MLX_SWITCH_ALL_TYPES(out.dtype(), CTYPE_OUT, {              \
-      using InType = cuda_type_t<CTYPE_IN>;                     \
-      using OutType = cuda_type_t<CTYPE_OUT>;                   \
-      if constexpr (cu::CastOp<InType, OutType>::is_castable) { \
-        __VA_ARGS__;                                            \
-      } else {                                                  \
-        throw std::runtime_error(fmt::format(                   \
-            "Can not copy data from dtype {} to {}.",           \
-            dtype_to_string(out.dtype()),                       \
-            dtype_to_string(in.dtype())));                      \
-      }                                                         \
-    });                                                         \
+#define MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, ...) \
+  MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE_IN, {               \
+    MLX_SWITCH_ALL_TYPES(out.dtype(), CTYPE_OUT, {           \
+      using InType = cuda_type_t<CTYPE_IN>;                  \
+      using OutType = cuda_type_t<CTYPE_OUT>;                \
+      __VA_ARGS__;                                           \
+    });                                                      \
  })

 void copy_contiguous(
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@@ -43,7 +43,8 @@ void copy_contiguous(
        if (ctype == CopyType::Vector) {
          kernel = cu::copy_v<InType, OutType, IdxT>;
        }
-        auto [num_blocks, block_dims] = get_launch_args(kernel, out, LARGE);
+        auto [num_blocks, block_dims] = get_launch_args(
+            kernel, out.data_size(), out.shape(), out.strides(), LARGE);
        kernel<<<num_blocks, block_dims, 0, stream>>>(
            in.data<InType>() + in_offset,
            out.data<OutType>() + out_offset,
--- a/mlx/backend/cuda/copy/copy_general.cu
+++ b/mlx/backend/cuda/copy/copy_general.cu
@@ -59,9 +59,9 @@ void copy_general(
    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
      const InType* in_ptr = in.data<InType>() + offset_in;
      OutType* out_ptr = out.data<OutType>() + offset_out;
-      bool large = in.data_size() > UINT32_MAX || out.data_size() > UINT32_MAX;
+      bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
      MLX_SWITCH_BOOL(large, LARGE, {
-        using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+        using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
        int ndim = shape.size();
        if (ndim <= 3) {
          MLX_SWITCH_1_2_3(ndim, NDIM, {
@@ -70,7 +70,7 @@ void copy_general(
            kernel<<<num_blocks, block_dims, 0, stream>>>(
                in_ptr,
                out_ptr,
-                out.data_size(),
+                out.size(),
                const_param<NDIM>(shape),
                const_param<NDIM>(strides_in),
                const_param<NDIM>(strides_out));
@@ -81,7 +81,7 @@ void copy_general(
          kernel<<<num_blocks, block_dims, 0, stream>>>(
              in_ptr,
              out_ptr,
-              out.data_size(),
+              out.size(),
              const_param(shape),
              const_param(strides_in),
              const_param(strides_out),
--- a/mlx/backend/cuda/copy/copy_general_dynamic.cu
+++ b/mlx/backend/cuda/copy/copy_general_dynamic.cu
@@ -65,9 +65,9 @@ void copy_general_dynamic(
    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
      const InType* in_ptr = in.data<InType>() + offset_in;
      OutType* out_ptr = out.data<OutType>() + offset_out;
-      bool large = in.data_size() > UINT32_MAX || out.data_size() > UINT32_MAX;
+      bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
      MLX_SWITCH_BOOL(large, LARGE, {
-        using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+        using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
        int ndim = shape.size();
        if (ndim <= 3) {
          MLX_SWITCH_1_2_3(ndim, NDIM, {
@@ -76,7 +76,7 @@ void copy_general_dynamic(
            kernel<<<num_blocks, block_dims, 0, stream>>>(
                in_ptr,
                out_ptr,
-                out.data_size(),
+                out.size(),
                const_param<NDIM>(shape),
                const_param<NDIM>(strides_in),
                const_param<NDIM>(strides_out),
@@ -89,7 +89,7 @@ void copy_general_dynamic(
          kernel<<<num_blocks, block_dims, 0, stream>>>(
              in_ptr,
              out_ptr,
-              out.data_size(),
+              out.size(),
              const_param(shape),
              const_param(strides_in),
              const_param(strides_out),
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -54,9 +54,9 @@ void copy_general_input(
    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
      const InType* in_ptr = in.data<InType>() + offset_in;
      OutType* out_ptr = out.data<OutType>() + offset_out;
-      bool large = in.data_size() > UINT32_MAX || out.data_size() > UINT32_MAX;
+      bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
      MLX_SWITCH_BOOL(large, LARGE, {
-        using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+        using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
        int ndim = shape.size();
        if (ndim <= 3) {
          MLX_SWITCH_1_2_3(ndim, NDIM, {
@@ -65,7 +65,7 @@ void copy_general_input(
            kernel<<<num_blocks, block_dims, 0, stream>>>(
                in_ptr,
                out_ptr,
-                out.data_size(),
+                out.size(),
                const_param<NDIM>(shape),
                const_param<NDIM>(strides_in));
          });
@@ -75,7 +75,7 @@ void copy_general_input(
          kernel<<<num_blocks, block_dims, 0, stream>>>(
              in_ptr,
              out_ptr,
-              out.data_size(),
+              out.size(),
              const_param(shape),
              const_param(strides_in),
              ndim);
--- a/mlx/backend/cuda/device/cast_op.cuh
+++ b/mlx/backend/cuda/device/cast_op.cuh
@@ -45,6 +45,18 @@ struct CastOp<
  }
 };

+// Identity cast: chosen when the source and destination types are the same,
+// in which case the value is returned unchanged.
+template <typename SrcT, typename DstT>
+struct CastOp<
+    SrcT,
+    DstT,
+    cuda::std::enable_if_t<cuda::std::is_same<SrcT, DstT>::value>> {
+  static constexpr bool is_castable = true;
+
+  __device__ SrcT operator()(SrcT value) {
+    return value;
+  }
+};
+
 // Return an iterator that cast the value to DstT using CastOp.
 template <typename DstT, typename Iterator>
 __host__ __device__ auto make_cast_iterator(Iterator it) {
--- a/mlx/backend/cuda/device/unary_ops.cuh
+++ b/mlx/backend/cuda/device/unary_ops.cuh
@@ -5,6 +5,8 @@
 #include "mlx/backend/cuda/device/fp16_math.cuh"
 #include "mlx/backend/cuda/device/utils.cuh"

+#include <math_constants.h>
+
 namespace mlx::core::cu {

 struct Abs {
@@ -183,21 +185,38 @@ struct Imag {
 struct Log {
   template <typename T>
   __device__ T operator()(T x) {
-    return log(x);
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return log(x);
+    } else {
+      // Complex input: the real part is the log of the real component of
+      // Abs{}(x); the imaginary part is atan2f(imag, real).
+      auto log_magnitude = log(cuCrealf(Abs{}(x)));
+      auto phase = atan2f(cuCimagf(x), cuCrealf(x));
+      return {log_magnitude, phase};
+    }
   }
 };

 struct Log2 {
   template <typename T>
   __device__ T operator()(T x) {
-    return log2(x);
+    if constexpr (!cuda::std::is_same_v<T, cuComplex>) {
+      return log2(x);
+    } else {
+      // Complex input: reuse Log, then divide both parts by CUDART_LN2_F.
+      auto ln_x = Log{}(x);
+      return {cuCrealf(ln_x) / CUDART_LN2_F, cuCimagf(ln_x) / CUDART_LN2_F};
+    }
   }
 };

 struct Log10 {
   template <typename T>
   __device__ T operator()(T x) {
-    return log10(x);
+    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+      // Complex input: reuse Log, then divide both parts by CUDART_LNT_F
+      // (ln(10) from <math_constants.h>) to change base.
+      auto y = Log{}(x);
+      return {cuCrealf(y) / CUDART_LNT_F, cuCimagf(y) / CUDART_LNT_F};
+    } else {
+      // All other types use the log10 overload for T.
+      return log10(x);
+    }
   }
 };

--- a/mlx/backend/cuda/kernel_utils.cuh
+++ b/mlx/backend/cuda/kernel_utils.cuh
@@ -102,6 +102,11 @@ inline constexpr bool is_floating_v =
    cuda::std::is_same_v<T, float> || cuda::std::is_same_v<T, double> ||
    cuda::std::is_same_v<T, float16_t> || cuda::std::is_same_v<T, bfloat16_t>;

+// True when T is complex64_t or any type accepted by is_floating_v.
+template <typename T>
+inline constexpr bool is_inexact_v =
+    cuda::std::is_same_v<T, complex64_t> || is_floating_v<T>;
+
 // Utility to copy data from vector to array in host.
 template <int NDIM = MAX_NDIM, typename T = int32_t>
 inline cuda::std::array<T, NDIM> const_param(const std::vector<T>& vec) {
@@ -136,17 +141,19 @@ inline uint max_occupancy_block_dim(T kernel) {
 template <typename T>
 inline std::tuple<dim3, uint> get_launch_args(
    T kernel,
-    const array& arr,
+    size_t size,
+    const Shape& shape,
+    const Strides& strides,
    bool large,
    int work_per_thread = 1) {
-  size_t nthreads = cuda::ceil_div(arr.size(), work_per_thread);
+  size_t nthreads = cuda::ceil_div(size, work_per_thread);
  uint block_dim = max_occupancy_block_dim(kernel);
  if (block_dim > nthreads) {
    block_dim = nthreads;
  }
  dim3 num_blocks;
  if (large) {
-    num_blocks = get_2d_grid_dims(arr.shape(), arr.strides(), work_per_thread);
+    num_blocks = get_2d_grid_dims(shape, strides, work_per_thread);
    num_blocks.x = cuda::ceil_div(num_blocks.x, block_dim);
  } else {
    num_blocks.x = cuda::ceil_div(nthreads, block_dim);
@@ -154,4 +161,14 @@ inline std::tuple<dim3, uint> get_launch_args(
  return std::make_tuple(num_blocks, block_dim);
 }

+// Convenience overload: takes the element count, shape and strides from
+// `arr` and forwards them to the overload that accepts them explicitly.
+template <typename T>
+inline std::tuple<dim3, uint> get_launch_args(
+    T kernel,
+    const array& arr,
+    bool large,
+    int work_per_thread = 1) {
+  const size_t num_elements = arr.size();
+  const Shape& arr_shape = arr.shape();
+  const Strides& arr_strides = arr.strides();
+  return get_launch_args(
+      kernel, num_elements, arr_shape, arr_strides, large, work_per_thread);
+}
+
 } // namespace mlx::core
--- a/mlx/backend/cuda/ternary.cu
+++ b/mlx/backend/cuda/ternary.cu
@@ -116,7 +116,7 @@ void ternary_op_gpu_inplace(
                  b.data<DType>(),
                  c.data<DType>(),
                  out.data<DType>(),
-                  out.data_size(),
+                  out.size(),
                  const_param<NDIM>(shape),
                  const_param<NDIM>(a_strides),
                  const_param<NDIM>(b_strides),
@@ -142,7 +142,8 @@ void ternary_op_gpu_inplace(
        MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
          using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
          auto kernel = cu::ternary_v<Op, DType, IdxT>;
-          auto [num_blocks, block_dims] = get_launch_args(kernel, out, LARGE);
+          auto [num_blocks, block_dims] = get_launch_args(
+              kernel, out.data_size(), out.shape(), out.strides(), LARGE);
          kernel<<<num_blocks, block_dims, 0, stream>>>(
              a.data<bool>(),
              b.data<DType>(),
--- a/mlx/backend/cuda/unary.cu
+++ b/mlx/backend/cuda/unary.cu
@@ -28,11 +28,14 @@ constexpr bool supports_unary_op() {
      std::is_same_v<Op, ArcTan> || std::is_same_v<Op, ArcTanh> ||
      std::is_same_v<Op, Erf> || std::is_same_v<Op, ErfInv> ||
      std::is_same_v<Op, Expm1> || std::is_same_v<Op, Log1p> ||
-      std::is_same_v<Op, Log> || std::is_same_v<Op, Log2> ||
-      std::is_same_v<Op, Log10> || std::is_same_v<Op, Sigmoid> ||
-      std::is_same_v<Op, Sqrt> || std::is_same_v<Op, Rsqrt>) {
+      std::is_same_v<Op, Sigmoid> || std::is_same_v<Op, Sqrt> ||
+      std::is_same_v<Op, Rsqrt>) {
    return std::is_same_v<In, Out> && is_floating_v<In>;
  }
+  if (std::is_same_v<Op, Log> || std::is_same_v<Op, Log2> ||
+      std::is_same_v<Op, Log10>) {
+    return std::is_same_v<In, Out> && is_inexact_v<In>;
+  }
  if (std::is_same_v<Op, BitwiseInvert>) {
    return std::is_same_v<In, Out> && std::is_integral_v<In> &&
        !std::is_same_v<In, bool>;
@@ -91,7 +94,7 @@ void unary_op_gpu_inplace(
          } else {
            auto [shape, strides] = collapse_contiguous_dims(in);
            auto [in_begin, in_end] = cu::make_general_iterators<int64_t>(
-                in_ptr, in.data_size(), shape, strides);
+                in_ptr, in.size(), shape, strides);
            thrust::transform(policy, in_begin, in_end, out_ptr, Op());
          }
        } else {