Contig uses uint as index and non-contig uses int

2025-12-16 01:49:05 +08:00 · 2025-07-09 23:05:21 +00:00
parent 5c932c7bb0
commit e3534c2db8
4 changed files with 5 additions and 4 deletions
--- a/mlx/backend/cuda/binary.cu
+++ b/mlx/backend/cuda/binary.cu
@@ -247,7 +247,7 @@ void binary_op_gpu_inplace(
                }
              });
        } else {
-          dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
+          dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
            // TODO: Choose optimized value based on type size.
            constexpr int N_READS = 4;
--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@@ -269,7 +269,7 @@ void binary_two_op_gpu_inplace(
                }
              });
        } else {
-          dispatch_bool(out_a.data_size() > INT32_MAX, [&](auto large) {
+          dispatch_bool(out_a.data_size() > UINT32_MAX, [&](auto large) {
            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
            // TODO: Choose optimized value based on type size.
            constexpr int N_READS = 4;
--- a/mlx/backend/cuda/ternary.cu
+++ b/mlx/backend/cuda/ternary.cu
@@ -164,7 +164,7 @@ void ternary_op_gpu_inplace(
            }
          });
    } else {
-      dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
+      dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
        // TODO: Choose optimized value based on type size.
        constexpr int N_READS = 4;
--- a/mlx/backend/cuda/unary.cu
+++ b/mlx/backend/cuda/unary.cu
@@ -127,8 +127,8 @@ void unary_op_gpu_inplace(
        dispatch_bool(large, [&](auto large) {
          using InType = cuda_type_t<CTYPE_IN>;
          using OutType = cuda_type_t<CTYPE_OUT>;
          using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
          if (contig) {
            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
            // TODO: Choose optimized value based on type size.
            constexpr int N_READS = 4;
            auto kernel = cu::unary_v<Op, InType, OutType, IdxT, N_READS>;
@@ -147,6 +147,7 @@ void unary_op_gpu_inplace(
                out.data<OutType>(),
                out.data_size());
          } else {
            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
            auto [shape, strides] = collapse_contiguous_dims(in);
            auto kernel = cu::unary_g<Op, InType, OutType, IdxT>;
            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);