From 70ade3015fac472db14f11154c33e04945c4acd7 Mon Sep 17 00:00:00 2001
From: Cheng
Date: Tue, 8 Jul 2025 00:43:39 +0000
Subject: [PATCH] Use int32_t for IdxT

---
 mlx/backend/cuda/binary.cu               | 2 +-
 mlx/backend/cuda/binary_two.cu           | 2 +-
 mlx/backend/cuda/copy/copy_contiguous.cu | 4 ++--
 mlx/backend/cuda/ternary.cu              | 5 ++---
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/mlx/backend/cuda/binary.cu b/mlx/backend/cuda/binary.cu
index 0585dc76a..5f3f29094 100644
--- a/mlx/backend/cuda/binary.cu
+++ b/mlx/backend/cuda/binary.cu
@@ -268,7 +268,7 @@ void binary_op_gpu_inplace(
       });
     } else {
       dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
-        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+        using IdxT = std::conditional_t<large(), int64_t, int32_t>;
         // TODO: Choose optimized value based on type size.
         constexpr int N_READS = 4;
         auto kernel = cu::binary_ss;
diff --git a/mlx/backend/cuda/binary_two.cu b/mlx/backend/cuda/binary_two.cu
index 9776278ec..168b390e3 100644
--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@@ -286,7 +286,7 @@ void binary_op_gpu_inplace(
       });
     } else {
       dispatch_bool(out_a.data_size() > INT32_MAX, [&](auto large) {
-        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+        using IdxT = std::conditional_t<large(), int64_t, int32_t>;
         // TODO: Choose optimized value based on type size.
         constexpr int N_READS = 4;
         auto kernel = cu::binary_ss;
diff --git a/mlx/backend/cuda/copy/copy_contiguous.cu b/mlx/backend/cuda/copy/copy_contiguous.cu
index 60f66f984..2868b13a3 100644
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@@ -71,10 +71,10 @@ void copy_contiguous(
     int64_t out_offset) {
   dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
     dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-      dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
+      dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
         using InType = cuda_type_t;
         using OutType = cuda_type_t;
-        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+        using IdxT = std::conditional_t<large(), int64_t, int32_t>;
         // TODO: Choose optimized value based on type size.
         constexpr int N_READS = 4;
         auto kernel = cu::copy_s;
diff --git a/mlx/backend/cuda/ternary.cu b/mlx/backend/cuda/ternary.cu
index d2ba3e6a0..696eaa8ce 100644
--- a/mlx/backend/cuda/ternary.cu
+++ b/mlx/backend/cuda/ternary.cu
@@ -34,10 +34,9 @@ ternary_v(const bool* a, const T* b, const T* c, T* out, IdxT size) {
     auto b_vec = load_vector(b, index);
     auto c_vec = load_vector(c, index);
 
-    AlignedVector out_vec;
+    AlignedVector out_vec;
 #pragma unroll
     for (int i = 0; i < N_READS; ++i) {
-      out_vec.val[i] = CastOp{}(a_vec.val[i]);
       out_vec.val[i] = Op{}(a_vec.val[i], b_vec.val[i], c_vec.val[i]);
     }
 
@@ -171,7 +170,7 @@ void ternary_op_gpu_inplace(
     });
   } else {
     dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
-      using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+      using IdxT = std::conditional_t<large(), int64_t, int32_t>;
      // TODO: Choose optimized value based on type size.
       constexpr int N_READS = 4;
       auto kernel = cu::ternary_v;
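
Note (not part of the commit): every hunk above follows the same pattern, where a runtime check on the element count selects the kernel's index type at compile time, and the patch switches the small branch from an unsigned to a signed 32-bit index. The sketch below is illustrative only; dispatch_index_type, fill_kernel, and fill are hypothetical stand-ins rather than MLX's actual dispatch_bool or kernel APIs. It shows how the std::conditional_t<large(), int64_t, int32_t> selection is typically wired up, and why a signed index can help: signed arithmetic has no defined wraparound, so the compiler can widen 32-bit indices to 64-bit addresses without extra overflow handling.

// Minimal sketch of size-based index-type dispatch (assumed names, not MLX API).
#include <cstdint>
#include <type_traits>
#include <utility>
#include <cuda_runtime.h>

// Stand-in for a dispatch_bool-style helper: forwards the runtime flag as a
// compile-time constant so the lambda can use it in constant expressions.
template <typename F>
void dispatch_index_type(bool is_large, F&& f) {
  if (is_large) {
    std::forward<F>(f)(std::true_type{});
  } else {
    std::forward<F>(f)(std::false_type{});
  }
}

// Toy kernel parameterized on the index type chosen by the dispatcher.
template <typename T, typename IdxT>
__global__ void fill_kernel(T* out, T value, IdxT size) {
  IdxT i = static_cast<IdxT>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < size) {
    out[i] = value;
  }
}

template <typename T>
void fill(T* out, T value, size_t size, cudaStream_t stream) {
  dispatch_index_type(size > INT32_MAX, [&](auto large) {
    // The same selection the patch changes: int64_t for huge buffers,
    // otherwise signed int32_t instead of uint32_t.
    using IdxT = std::conditional_t<large(), int64_t, int32_t>;
    int block = 256;
    auto grid = static_cast<unsigned int>((size + block - 1) / block);
    fill_kernel<T, IdxT><<<grid, block, 0, stream>>>(
        out, value, static_cast<IdxT>(size));
  });
}

Instantiating the kernel with both index types keeps the common (small) case on cheap 32-bit arithmetic while still supporting buffers with more than INT32_MAX elements through the 64-bit branch.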