Mirror of https://github.com/ml-explore/mlx.git

Compare commits: ef631d63af ... a0ae49d397 (3 commits)

- a0ae49d397
- 254476718b
- 3adba92ebe
@@ -6,6 +6,7 @@
 target_sources(
   mlx
   PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/arange.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/binary_two.cu
@@ -29,7 +30,7 @@ target_sources(
           ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/layer_norm.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/random.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/reduce/all_reduce.cu
mlx/backend/cuda/arange.cu (new file, 55 lines)
@@ -0,0 +1,55 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/fp16_math.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include <nvtx3/nvtx3.hpp>
+#include <thrust/device_ptr.h>
+#include <thrust/transform.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+template <typename T>
+struct Arange {
+  const T start;
+  const T step;
+
+  __device__ T operator()(uint32_t i) const {
+    return start + i * step;
+  }
+};
+
+} // namespace cu
+
+void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("Arange::eval_gpu");
+  if (out.size() == 0) {
+    return;
+  }
+  out.set_data(allocator::malloc(out.nbytes()));
+
+  auto& encoder = cu::get_command_encoder(stream());
+  encoder.set_output_array(out);
+
+  auto capture = encoder.capture_context();
+  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
+    using CTYPE = MLX_GET_TYPE(type_tag);
+    using OutType = cuda_type_t<CTYPE>;
+    CTYPE step =
+        static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
+    thrust::transform(
+        cu::thrust_policy(encoder.stream()),
+        thrust::counting_iterator<uint32_t>(0),
+        thrust::counting_iterator<uint32_t>(out.data_size()),
+        thrust::device_pointer_cast(out.data<OutType>()),
+        cu::Arange<OutType>{
+            static_cast<OutType>(start_), static_cast<OutType>(step)});
+  });
+}
+
+} // namespace mlx::core
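The new kernel maps each output index i to start + i * step through the cu::Arange functor, and it recomputes the step through the output dtype, presumably so that integer outputs advance by the step as rounded in that dtype. Below is a standalone host-side sketch of the same pattern (plain C++, not MLX code; all names are illustrative).

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Same shape as the cu::Arange functor above: index -> start + i * step.
template <typename T>
struct Arange {
  T start;
  T step;
  T operator()(uint32_t i) const {
    return start + i * step;
  }
};

int main() {
  double start = 0.0, step = 1.5;
  // Casting (start + step) and start separately to the output type bakes the
  // dtype's rounding into the effective step: an int32 arange with step 1.5
  // advances by 1 per element.
  int int_step = static_cast<int>(start + step) - static_cast<int>(start);
  Arange<int> op{static_cast<int>(start), int_step};
  std::vector<int> out(5);
  for (uint32_t i = 0; i < out.size(); ++i) {
    out[i] = op(i); // plays the role of thrust::transform over a counting iterator
  }
  for (int v : out) {
    std::cout << v << ' '; // prints: 0 1 2 3 4
  }
  std::cout << '\n';
  return 0;
}
```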
@@ -211,12 +211,15 @@ void binary_op_gpu_inplace(
       int ndim = shape.size();
       if (ndim <= 3) {
         dispatch_1_2_3(ndim, [&](auto dims_constant) {
-          auto kernel = cu::
-              binary_g_nd<Op, InType, OutType, IdxT, dims_constant()>;
           auto [num_blocks, block_dims] =
-              get_launch_args(kernel, out, large());
+              get_launch_args(out, large());
           encoder.add_kernel_node(
-              kernel,
+              cu::binary_g_nd<
+                  Op,
+                  InType,
+                  OutType,
+                  IdxT,
+                  dims_constant()>,
               num_blocks,
               block_dims,
               a.data<InType>(),
@@ -228,11 +231,9 @@ void binary_op_gpu_inplace(
               const_param<dims_constant()>(b_strides));
         });
       } else {
-        auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
-        auto [num_blocks, block_dims] =
-            get_launch_args(kernel, out, large());
+        auto [num_blocks, block_dims] = get_launch_args(out, large());
         encoder.add_kernel_node(
-            kernel,
+            cu::binary_g<Op, InType, OutType, IdxT>,
             num_blocks,
             block_dims,
             a.data<InType>(),
@@ -258,12 +259,7 @@ void binary_op_gpu_inplace(
         kernel = cu::binary_vv<Op, InType, OutType, IdxT, N_READS>;
       }
       auto [num_blocks, block_dims] = get_launch_args(
-          kernel,
-          out.data_size(),
-          out.shape(),
-          out.strides(),
-          large(),
-          N_READS);
+          out.data_size(), out.shape(), out.strides(), large(), N_READS);
       encoder.add_kernel_node(
           kernel,
           num_blocks,
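The dispatch_1_2_3(ndim, ...) calls above convert a runtime ndim into a compile-time constant so that the *_g_nd kernel templates can be instantiated per dimensionality, which is why dims_constant() can appear as a template argument. A minimal standalone sketch of that dispatch pattern (plain C++; dispatch_1_2_3 and fake_kernel here are illustrative stand-ins, not the MLX implementations):

```cpp
#include <cstdio>
#include <type_traits>

// Illustrative stand-in for the MLX helper: map a runtime ndim onto a
// compile-time integral_constant so it can feed a template argument.
template <typename F>
void dispatch_1_2_3(int ndim, F&& f) {
  switch (ndim) {
    case 1: f(std::integral_constant<int, 1>{}); break;
    case 2: f(std::integral_constant<int, 2>{}); break;
    default: f(std::integral_constant<int, 3>{}); break;
  }
}

// Stand-in for a kernel template such as cu::binary_g_nd<..., NDIM>.
template <int NDIM>
void fake_kernel() {
  std::printf("instantiated for NDIM = %d\n", NDIM);
}

int main() {
  int ndim = 2; // runtime value
  dispatch_1_2_3(ndim, [&](auto dims_constant) {
    // dims_constant() is a constant expression, so it can be used as a
    // template argument, just like binary_g_nd<..., dims_constant()> above.
    fake_kernel<dims_constant()>();
  });
  return 0;
}
```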
@@ -227,16 +227,15 @@ void binary_two_op_gpu_inplace(
       int ndim = shape.size();
       if (ndim <= 3) {
         dispatch_1_2_3(ndim, [&](auto dims_constant) {
-          auto kernel = cu::binary_two_g_nd<
-              Op,
-              InType,
-              OutType,
-              IdxT,
-              dims_constant()>;
           auto [num_blocks, block_dims] =
-              get_launch_args(kernel, out_a, large());
+              get_launch_args(out_a, large());
           encoder.add_kernel_node(
-              kernel,
+              cu::binary_two_g_nd<
+                  Op,
+                  InType,
+                  OutType,
+                  IdxT,
+                  dims_constant()>,
               num_blocks,
               block_dims,
               a.data<InType>(),
@@ -249,11 +248,10 @@ void binary_two_op_gpu_inplace(
               const_param<dims_constant()>(b_strides));
         });
       } else {
-        auto kernel = cu::binary_two_g<Op, InType, OutType, IdxT>;
         auto [num_blocks, block_dims] =
-            get_launch_args(kernel, out_a, large());
+            get_launch_args(out_a, large());
         encoder.add_kernel_node(
-            kernel,
+            cu::binary_two_g<Op, InType, OutType, IdxT>,
             num_blocks,
             block_dims,
             a.data<InType>(),
@@ -280,7 +278,6 @@ void binary_two_op_gpu_inplace(
         kernel = cu::binary_two_vv<Op, InType, OutType, IdxT, N_READS>;
       }
       auto [num_blocks, block_dims] = get_launch_args(
-          kernel,
           out_a.data_size(),
           out_a.shape(),
           out_a.strides(),
@@ -294,7 +294,7 @@ void Compiled::eval_gpu(
 
   auto kernel = mod.get_kernel(kernel_name);
   auto [num_blocks, block_dims] =
-      get_launch_args(kernel, outputs[0], large, work_per_thread);
+      get_launch_args(outputs[0], large, work_per_thread);
   encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }
 
@@ -71,12 +71,7 @@ void copy_contiguous(
     kernel = cu::copy_v<InType, OutType, IdxT, N_READS>;
   }
   auto [num_blocks, block_dims] = get_launch_args(
-      kernel,
-      out.data_size(),
-      out.shape(),
-      out.strides(),
-      large(),
-      N_READS);
+      out.data_size(), out.shape(), out.strides(), large(), N_READS);
   encoder.add_kernel_node(
       kernel,
       num_blocks,
@@ -71,12 +71,10 @@ void copy_general(
     data_size *= s;
   if (ndim <= 3) {
     dispatch_1_2_3(ndim, [&](auto ndim_constant) {
-      auto kernel =
-          cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>;
-      auto [num_blocks, block_dims] = get_launch_args(
-          kernel, data_size, shape, out.strides(), large());
+      auto [num_blocks, block_dims] =
+          get_launch_args(data_size, shape, out.strides(), large());
       encoder.add_kernel_node(
-          kernel,
+          cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>,
          num_blocks,
          block_dims,
          in_ptr,
@@ -87,11 +85,10 @@ void copy_general(
          const_param<ndim_constant()>(strides_out));
     });
   } else { // ndim >= 4
-    auto kernel = cu::copy_gg<InType, OutType, IdxT>;
-    auto [num_blocks, block_dims] = get_launch_args(
-        kernel, data_size, shape, out.strides(), large());
+    auto [num_blocks, block_dims] =
+        get_launch_args(data_size, shape, out.strides(), large());
     encoder.add_kernel_node(
-        kernel,
+        cu::copy_gg<InType, OutType, IdxT>,
        num_blocks,
        block_dims,
        in_ptr,
@@ -74,12 +74,13 @@ void copy_general_dynamic(
   int ndim = shape.size();
   if (ndim <= 3) {
     dispatch_1_2_3(ndim, [&](auto dims_constant) {
-      auto kernel = cu::
-          copy_gg_dynamic_nd<InType, OutType, IdxT, dims_constant()>;
-      auto [num_blocks, block_dims] =
-          get_launch_args(kernel, out, large());
+      auto [num_blocks, block_dims] = get_launch_args(out, large());
       encoder.add_kernel_node(
-          kernel,
+          cu::copy_gg_dynamic_nd<
+              InType,
+              OutType,
+              IdxT,
+              dims_constant()>,
          num_blocks,
          block_dims,
          in_ptr,
@@ -92,11 +93,9 @@ void copy_general_dynamic(
          dynamic_offset_out.data<int64_t>());
     });
   } else { // ndim >= 4
-    auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
-    auto [num_blocks, block_dims] =
-        get_launch_args(kernel, out, large());
+    auto [num_blocks, block_dims] = get_launch_args(out, large());
     encoder.add_kernel_node(
-        kernel,
+        cu::copy_gg_dynamic<InType, OutType, IdxT>,
        num_blocks,
        block_dims,
        in_ptr,
@@ -63,12 +63,9 @@ void copy_general_input(
   int ndim = shape.size();
   if (ndim <= 3) {
     dispatch_1_2_3(ndim, [&](auto dims_constant) {
-      auto kernel =
-          cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>;
-      auto [num_blocks, block_dims] =
-          get_launch_args(kernel, out, large());
+      auto [num_blocks, block_dims] = get_launch_args(out, large());
       encoder.add_kernel_node(
-          kernel,
+          cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>,
          num_blocks,
          block_dims,
          in_ptr,
@@ -78,11 +75,9 @@ void copy_general_input(
          const_param<dims_constant()>(strides_in));
     });
   } else { // ndim >= 4
-    auto kernel = cu::copy_g<InType, OutType, IdxT>;
-    auto [num_blocks, block_dims] =
-        get_launch_args(kernel, out, large());
+    auto [num_blocks, block_dims] = get_launch_args(out, large());
     encoder.add_kernel_node(
-        kernel,
+        cu::copy_g<InType, OutType, IdxT>,
        num_blocks,
        block_dims,
        in_ptr,
mlx/backend/cuda/device/arange.cuh (deleted, 15 lines)
@@ -1,15 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-namespace mlx::core::cu {
-
-template <typename T>
-struct Arange {
-  const T start;
-  const T step;
-
-  __device__ T operator()(uint32_t i) const {
-    return start + i * step;
-  }
-};
-
-} // namespace mlx::core::cu
@@ -128,7 +128,7 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
   encoder.set_output_array(out);
 
   auto kernel = mod.get_kernel(kernel_name);
-  auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+  auto [num_blocks, block_dims] = get_launch_args(out, large);
   encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }
 
@@ -229,7 +229,7 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
   }
   encoder.set_output_array(out);
   auto kernel = mod.get_kernel(kernel_name);
-  auto [num_blocks, block_dims] = get_launch_args(kernel, upd, large);
+  auto [num_blocks, block_dims] = get_launch_args(upd, large);
   encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }
 
@@ -317,7 +317,7 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
   }
   encoder.set_output_array(out);
   auto kernel = mod.get_kernel(kernel_name);
-  auto [num_blocks, block_dims] = get_launch_args(kernel, idx, large);
+  auto [num_blocks, block_dims] = get_launch_args(idx, large);
   encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }
 
@@ -421,7 +421,7 @@ void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
   }
   encoder.set_output_array(out);
   auto kernel = mod.get_kernel(kernel_name);
-  auto [num_blocks, block_dims] = get_launch_args(kernel, idx, large);
+  auto [num_blocks, block_dims] = get_launch_args(idx, large);
   encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }
 
@@ -30,4 +30,25 @@ std::pair<dim3, dim3> get_grid_and_block(int dim0, int dim1, int dim2) {
   return std::make_pair(dim3(gx, gy, gz), dim3(bx, by, bz));
 }
 
+std::tuple<dim3, uint> get_launch_args(
+    size_t size,
+    const Shape& shape,
+    const Strides& strides,
+    bool large,
+    int work_per_thread) {
+  size_t nthreads = cuda::ceil_div(size, work_per_thread);
+  uint block_dim = 1024;
+  if (block_dim > nthreads) {
+    block_dim = nthreads;
+  }
+  dim3 num_blocks;
+  if (large) {
+    num_blocks = get_2d_grid_dims(shape, strides, work_per_thread);
+    num_blocks.x = cuda::ceil_div(num_blocks.x, block_dim);
+  } else {
+    num_blocks.x = cuda::ceil_div(nthreads, block_dim);
+  }
+  return std::make_tuple(num_blocks, block_dim);
+}
+
 } // namespace mlx::core
@@ -122,37 +122,17 @@ std::pair<dim3, dim3> get_grid_and_block(int dim0, int dim1, int dim2);
 
 // Get the num_blocks and block_dims that maximize occupancy for |kernel|,
 // assuming each thread handles |work_per_thread| elements of |arr|.
-template <typename T>
-inline std::tuple<dim3, uint> get_launch_args(
-    T kernel,
+std::tuple<dim3, uint> get_launch_args(
     size_t size,
     const Shape& shape,
     const Strides& strides,
     bool large,
-    int work_per_thread = 1) {
-  size_t nthreads = cuda::ceil_div(size, work_per_thread);
-  uint block_dim = 1024;
-  if (block_dim > nthreads) {
-    block_dim = nthreads;
-  }
-  dim3 num_blocks;
-  if (large) {
-    num_blocks = get_2d_grid_dims(shape, strides, work_per_thread);
-    num_blocks.x = cuda::ceil_div(num_blocks.x, block_dim);
-  } else {
-    num_blocks.x = cuda::ceil_div(nthreads, block_dim);
-  }
-  return std::make_tuple(num_blocks, block_dim);
-}
+    int work_per_thread = 1);
 
-template <typename T>
-inline std::tuple<dim3, uint> get_launch_args(
-    T kernel,
-    const array& arr,
-    bool large,
-    int work_per_thread = 1) {
+inline std::tuple<dim3, uint>
+get_launch_args(const array& arr, bool large, int work_per_thread = 1) {
   return get_launch_args(
-      kernel, arr.size(), arr.shape(), arr.strides(), large, work_per_thread);
+      arr.size(), arr.shape(), arr.strides(), large, work_per_thread);
 }
 
 } // namespace mlx::core
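Worth noting: the old inline template above never used its kernel argument in the body shown, which is presumably what made it possible to drop the template parameter and move the definition out of line into kernel_utils.cu; the grid is sized purely from the element count with a 1024-thread block cap. Below is a standalone sketch of that arithmetic for the non-large path (plain C++; ceil_div and launch_args are local stand-ins, not the MLX functions).

```cpp
#include <cstddef>
#include <cstdio>
#include <utility>

// Illustrative stand-in for cuda::ceil_div.
size_t ceil_div(size_t a, size_t b) {
  return (a + b - 1) / b;
}

// Returns {num_blocks, block_dim} the same way the non-"large" branch does.
std::pair<size_t, unsigned> launch_args(size_t size, int work_per_thread) {
  size_t nthreads = ceil_div(size, work_per_thread);
  unsigned block_dim = 1024; // cap threads per block at 1024
  if (block_dim > nthreads) {
    block_dim = static_cast<unsigned>(nthreads);
  }
  size_t num_blocks = ceil_div(nthreads, block_dim);
  return {num_blocks, block_dim};
}

int main() {
  // A 1,000,000-element array with 4 elements handled per thread:
  auto [blocks, threads] = launch_args(1000000, 4);
  std::printf("%zu blocks x %u threads\n", blocks, threads); // 245 blocks x 1024 threads
  return 0;
}
```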
@@ -43,20 +43,19 @@ __global__ void logsumexp(const T* in, T* out, int axis_size) {
   AccT maxval = Limits<AccT>::finite_min();
   AccT normalizer = 0;
   for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
-    AccT vals[N_READS];
-    cub::LoadDirectBlocked(
-        r * BLOCK_DIM + block.thread_rank(),
-        make_cast_iterator<AccT>(in),
-        vals,
-        axis_size,
-        Limits<AccT>::min());
+    auto index = r * BLOCK_DIM + block.thread_rank();
+    auto vals = load_vector<N_READS>(in, index, axis_size, Limits<T>::min());
     prevmax = maxval;
-    maxval = max_op(maxval, cub::ThreadReduce(vals, max_op));
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      maxval = max_op(maxval, static_cast<AccT>(vals[i]));
+    }
     // Online normalizer calculation for softmax:
     // https://github.com/NVIDIA/online-softmax
     normalizer = normalizer * softmax_exp(prevmax - maxval);
     for (int i = 0; i < N_READS; i++) {
-      normalizer = normalizer + softmax_exp(vals[i] - maxval);
+      normalizer =
+          normalizer + softmax_exp(static_cast<AccT>(vals[i]) - maxval);
     }
   }
 
@@ -143,9 +142,9 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
   encoder.set_input_array(in);
   encoder.set_output_array(out);
   dispatch_float_types(out.dtype(), "logsumexp", [&](auto type_tag) {
-    constexpr int N_READS = 4;
+    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+    constexpr int N_READS = 16 / sizeof(DataType);
     dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
       auto kernel = cu::logsumexp<DataType, float, block_dim(), N_READS>;
       encoder.add_kernel_node(
           kernel,
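The "online normalizer" comment in the logsumexp and softmax kernels above refers to keeping a running maximum m and a running normalizer n in a single pass over the row; the standard recurrence is sketched below (with softmax_exp standing in for the exponential).

```latex
m_k = \max(m_{k-1}, x_k), \qquad
n_k = n_{k-1}\, e^{\,m_{k-1} - m_k} + e^{\,x_k - m_k}
\quad\Longrightarrow\quad
n_k = \sum_{j \le k} e^{\,x_j - m_k}
```

Rescaling by e^{m_{k-1} - m_k} whenever the maximum grows is exactly what the `normalizer = normalizer * softmax_exp(prevmax - maxval)` line does before the per-element accumulation loop.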
@@ -1,47 +1,11 @@
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/arange.cuh"
#include "mlx/backend/cuda/device/fp16_math.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/distributed/primitives.h"
#include "mlx/dtype_utils.h"
#include "mlx/fast_primitives.h"
#include "mlx/primitives.h"

#include <nvtx3/nvtx3.hpp>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>

#include <cassert>

namespace mlx::core {

void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Arange::eval_gpu");
  assert(inputs.size() == 0);
  out.set_data(allocator::malloc(out.nbytes()));
  if (out.size() == 0) {
    return;
  }
  auto& encoder = cu::get_command_encoder(stream());
  encoder.set_output_array(out);
  auto capture = encoder.capture_context();
  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
    using CTYPE = MLX_GET_TYPE(type_tag);
    using OutType = cuda_type_t<CTYPE>;
    CTYPE step =
        static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
    thrust::transform(
        cu::thrust_policy(encoder.stream()),
        thrust::counting_iterator<uint32_t>(0),
        thrust::counting_iterator<uint32_t>(out.data_size()),
        thrust::device_pointer_cast(out.data<OutType>()),
        cu::Arange<OutType>{
            static_cast<OutType>(start_), static_cast<OutType>(step)});
  });
}

bool fast::ScaledDotProductAttention::use_fallback(
    const array& q,
    const array& k,
@@ -350,12 +350,10 @@ void fast::AffineQuantize::eval_gpu(
     dispatch_bits(bits_, [&](auto bits) {
       using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
       if (dequantize_) {
-        auto kernel =
-            cu::affine_dequantize<DataType, group_size.value, bits.value>;
         auto [num_blocks, block_dims] =
-            get_launch_args(kernel, size, grid_shape, w.strides(), large);
+            get_launch_args(size, grid_shape, w.strides(), large);
         enc.add_kernel_node(
-            kernel,
+            cu::affine_dequantize<DataType, group_size.value, bits.value>,
             num_blocks,
             block_dims,
             w.data<uint8_t>(),
@@ -364,12 +362,10 @@ void fast::AffineQuantize::eval_gpu(
             out.data<DataType>(),
             out.size());
       } else {
-        auto kernel =
-            cu::affine_quantize<DataType, group_size.value, bits.value>;
         auto [num_blocks, block_dims] =
-            get_launch_args(kernel, size, grid_shape, w.strides(), large);
+            get_launch_args(size, grid_shape, w.strides(), large);
         enc.add_kernel_node(
-            kernel,
+            cu::affine_quantize<DataType, group_size.value, bits.value>,
             num_blocks,
             block_dims,
             w.data<DataType>(),
@@ -11,7 +11,6 @@
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 #include <nvtx3/nvtx3.hpp>
-#include <cub/block/block_load.cuh>
 
 #include <cassert>
 
@@ -45,20 +44,21 @@ __global__ void softmax(const T* in, T* out, int axis_size) {
   AccT maxval = Limits<AccT>::finite_min();
   AccT normalizer = cast_to<AccT>(0);
   for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
-    AccT vals[N_READS];
-    cub::LoadDirectBlocked(
-        r * BLOCK_DIM + block.thread_rank(),
-        make_cast_iterator<AccT>(in),
-        vals,
-        axis_size,
-        Limits<AccT>::min());
+    auto index = r * BLOCK_DIM + block.thread_rank();
+    auto vals = load_vector<N_READS>(in, index, axis_size, Limits<T>::min());
     prevmax = maxval;
-    maxval = max_op(maxval, cub::ThreadReduce(vals, max_op));
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      maxval = max_op(maxval, static_cast<AccT>(vals[i]));
+    }
 
     // Online normalizer calculation for softmax:
     // https://github.com/NVIDIA/online-softmax
     normalizer = normalizer * softmax_exp(prevmax - maxval);
+#pragma unroll
     for (int i = 0; i < N_READS; i++) {
-      normalizer = normalizer + softmax_exp(vals[i] - maxval);
+      normalizer =
+          normalizer + softmax_exp(static_cast<AccT>(vals[i]) - maxval);
     }
   }
 
@@ -95,12 +95,11 @@ __global__ void softmax(const T* in, T* out, int axis_size) {
   // Write output.
   for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
     auto index = r * BLOCK_DIM + block.thread_rank();
-    T vals[N_READS];
-    cub::LoadDirectBlocked(index, in, vals, axis_size);
+    auto vals = load_vector<N_READS>(in, index, axis_size, T(0));
     for (int i = 0; i < N_READS; i++) {
       vals[i] = softmax_exp(static_cast<AccT>(vals[i]) - maxval) * normalizer;
     }
-    cub::StoreDirectBlocked(index, out, vals, axis_size);
+    store_vector<N_READS>(out, index, vals, axis_size);
   }
 }
 
@@ -141,9 +140,9 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
   encoder.set_input_array(in);
   encoder.set_output_array(out);
   dispatch_float_types(out.dtype(), "softmax", [&](auto type_tag) {
-    constexpr int N_READS = 4;
+    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+    constexpr int N_READS = 16 / sizeof(DataType);
     dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
       auto kernel = cu::softmax<DataType, DataType, block_dim(), N_READS>;
       if (precise) {
         kernel = cu::softmax<DataType, float, block_dim(), N_READS>;
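The switch from a fixed N_READS = 4 to 16 / sizeof(DataType) sizes the per-thread load to 16 bytes regardless of element width, matching the load_vector/store_vector change in the kernels. A tiny standalone check of the widths this implies (plain C++; int16_t stands in for the 16-bit half/bfloat16 types):

```cpp
#include <cstdint>
#include <cstdio>

// One 16-byte vectorized load per thread, regardless of element type.
template <typename T>
constexpr int n_reads() {
  return 16 / sizeof(T);
}

int main() {
  std::printf("float: %d, 16-bit: %d, double: %d\n",
              n_reads<float>(), n_reads<int16_t>(), n_reads<double>());
  // prints: float: 4, 16-bit: 8, double: 2
  return 0;
}
```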
@@ -125,12 +125,9 @@ void ternary_op_gpu_inplace(
     int ndim = shape.size();
     if (ndim <= 3) {
       dispatch_1_2_3(ndim, [&](auto dims_constant) {
-        auto kernel =
-            cu::ternary_g_nd<Op, DType, IdxT, dims_constant()>;
-        auto [num_blocks, block_dims] =
-            get_launch_args(kernel, out, large());
+        auto [num_blocks, block_dims] = get_launch_args(out, large());
         encoder.add_kernel_node(
-            kernel,
+            cu::ternary_g_nd<Op, DType, IdxT, dims_constant()>,
             num_blocks,
             block_dims,
             a.data<bool>(),
@@ -144,11 +141,9 @@ void ternary_op_gpu_inplace(
             const_param<dims_constant()>(c_strides));
       });
     } else {
-      auto kernel = cu::ternary_g<Op, DType, IdxT>;
-      auto [num_blocks, block_dims] =
-          get_launch_args(kernel, out, large());
+      auto [num_blocks, block_dims] = get_launch_args(out, large());
       encoder.add_kernel_node(
-          kernel,
+          cu::ternary_g<Op, DType, IdxT>,
           num_blocks,
           block_dims,
           a.data<bool>(),
@@ -167,16 +162,10 @@ void ternary_op_gpu_inplace(
   dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
     using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
     constexpr int N_READS = 16 / sizeof(DType);
-    auto kernel = cu::ternary_v<Op, DType, IdxT, N_READS>;
     auto [num_blocks, block_dims] = get_launch_args(
-        kernel,
-        out.data_size(),
-        out.shape(),
-        out.strides(),
-        large(),
-        N_READS);
+        out.data_size(), out.shape(), out.strides(), large(), N_READS);
     encoder.add_kernel_node(
-        kernel,
+        cu::ternary_v<Op, DType, IdxT, N_READS>,
         num_blocks,
         block_dims,
         a.data<bool>(),
@@ -129,16 +129,10 @@ void unary_op_gpu_inplace(
       using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
       // TODO: Choose optimized value based on type size.
       constexpr int N_READS = 4;
-      auto kernel = cu::unary_v<Op, InType, OutType, IdxT, N_READS>;
       auto [num_blocks, block_dims] = get_launch_args(
-          kernel,
-          out.data_size(),
-          out.shape(),
-          out.strides(),
-          large,
-          N_READS);
+          out.data_size(), out.shape(), out.strides(), large, N_READS);
       encoder.add_kernel_node(
-          kernel,
+          cu::unary_v<Op, InType, OutType, IdxT, N_READS>,
           num_blocks,
           block_dims,
           in.data<InType>(),
@@ -147,10 +141,9 @@ void unary_op_gpu_inplace(
     } else {
       using IdxT = std::conditional_t<large(), int64_t, int32_t>;
       auto [shape, strides] = collapse_contiguous_dims(in);
-      auto kernel = cu::unary_g<Op, InType, OutType, IdxT>;
-      auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+      auto [num_blocks, block_dims] = get_launch_args(out, large);
       encoder.add_kernel_node(
-          kernel,
+          cu::unary_g<Op, InType, OutType, IdxT>,
           num_blocks,
           block_dims,
           in.data<InType>(),