Cuda faster softmax (#2435)

* faster softmax and logsumexp * faster softmax and logsumexp * format
2025-12-16 01:49:05 +08:00 · 2025-07-29 17:18:12 -07:00
parent ef631d63af
commit 3adba92ebe
2 changed files with 24 additions and 26 deletions
--- a/mlx/backend/cuda/logsumexp.cu
+++ b/mlx/backend/cuda/logsumexp.cu
@@ -43,20 +43,19 @@ __global__ void logsumexp(const T* in, T* out, int axis_size) {
  AccT maxval = Limits<AccT>::finite_min();
  AccT normalizer = 0;
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
-    AccT vals[N_READS];
-    cub::LoadDirectBlocked(
-        r * BLOCK_DIM + block.thread_rank(),
-        make_cast_iterator<AccT>(in),
-        vals,
-        axis_size,
-        Limits<AccT>::min());
+    auto index = r * BLOCK_DIM + block.thread_rank();
+    auto vals = load_vector<N_READS>(in, index, axis_size, Limits<T>::min());
    prevmax = maxval;
-    maxval = max_op(maxval, cub::ThreadReduce(vals, max_op));
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      maxval = max_op(maxval, static_cast<AccT>(vals[i]));
+    }
    // Online normalizer calculation for softmax:
    // https://github.com/NVIDIA/online-softmax
    normalizer = normalizer * softmax_exp(prevmax - maxval);
    for (int i = 0; i < N_READS; i++) {
-      normalizer = normalizer + softmax_exp(vals[i] - maxval);
+      normalizer =
+          normalizer + softmax_exp(static_cast<AccT>(vals[i]) - maxval);
    }
  }

@@ -143,9 +142,9 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  dispatch_float_types(out.dtype(), "logsumexp", [&](auto type_tag) {
-    constexpr int N_READS = 4;
+    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+    constexpr int N_READS = 16 / sizeof(DataType);
    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      auto kernel = cu::logsumexp<DataType, float, block_dim(), N_READS>;
      encoder.add_kernel_node(
          kernel,
--- a/mlx/backend/cuda/softmax.cu
+++ b/mlx/backend/cuda/softmax.cu
@@ -11,7 +11,6 @@
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 #include <nvtx3/nvtx3.hpp>
-#include <cub/block/block_load.cuh>

 #include <cassert>

@@ -45,20 +44,21 @@ __global__ void softmax(const T* in, T* out, int axis_size) {
  AccT maxval = Limits<AccT>::finite_min();
  AccT normalizer = cast_to<AccT>(0);
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
-    AccT vals[N_READS];
-    cub::LoadDirectBlocked(
-        r * BLOCK_DIM + block.thread_rank(),
-        make_cast_iterator<AccT>(in),
-        vals,
-        axis_size,
-        Limits<AccT>::min());
+    auto index = r * BLOCK_DIM + block.thread_rank();
+    auto vals = load_vector<N_READS>(in, index, axis_size, Limits<T>::min());
    prevmax = maxval;
-    maxval = max_op(maxval, cub::ThreadReduce(vals, max_op));
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      maxval = max_op(maxval, static_cast<AccT>(vals[i]));
+    }
+
    // Online normalizer calculation for softmax:
    // https://github.com/NVIDIA/online-softmax
    normalizer = normalizer * softmax_exp(prevmax - maxval);
+#pragma unroll
    for (int i = 0; i < N_READS; i++) {
-      normalizer = normalizer + softmax_exp(vals[i] - maxval);
+      normalizer =
+          normalizer + softmax_exp(static_cast<AccT>(vals[i]) - maxval);
    }
  }

@@ -95,12 +95,11 @@ __global__ void softmax(const T* in, T* out, int axis_size) {
  // Write output.
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
    auto index = r * BLOCK_DIM + block.thread_rank();
-    T vals[N_READS];
-    cub::LoadDirectBlocked(index, in, vals, axis_size);
+    auto vals = load_vector<N_READS>(in, index, axis_size, T(0));
    for (int i = 0; i < N_READS; i++) {
      vals[i] = softmax_exp(static_cast<AccT>(vals[i]) - maxval) * normalizer;
    }
-    cub::StoreDirectBlocked(index, out, vals, axis_size);
+    store_vector<N_READS>(out, index, vals, axis_size);
  }
 }

@@ -141,9 +140,9 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  dispatch_float_types(out.dtype(), "softmax", [&](auto type_tag) {
-    constexpr int N_READS = 4;
+    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+    constexpr int N_READS = 16 / sizeof(DataType);
    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      auto kernel = cu::softmax<DataType, DataType, block_dim(), N_READS>;
      if (precise) {
        kernel = cu::softmax<DataType, float, block_dim(), N_READS>;