faster softmax and logsumexp

Awni Hannun
2025-07-29 14:34:00 -07:00
parent 483221631a
commit 5694f764fc
2 changed files with 3 additions and 2 deletions

@@ -54,7 +54,7 @@ __global__ void logsumexp(const T* in, T* out, int axis_size) {
       // https://github.com/NVIDIA/online-softmax
       normalizer = normalizer * softmax_exp(prevmax - maxval);
       for (int i = 0; i < N_READS; i++) {
-        normalizer = normalizer + softmax_exp(vals[i] - maxval);
+        normalizer = normalizer + softmax_exp(static_cast<AccT>(vals[i]) - maxval);
       }
     }
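Both kernels accumulate the normalizer with the single-pass "online softmax" recurrence referenced in the comment above: whenever a larger running maximum is found, the accumulated sum of exponentials is rescaled by softmax_exp(prevmax - maxval) before the new terms are added. The following is a minimal host-side sketch of that recurrence, with illustrative names rather than the kernel's own; the CUDA kernels additionally vectorize reads (N_READS per thread) and reduce across the block.

// Minimal sketch of the online normalizer recurrence
// (after https://github.com/NVIDIA/online-softmax).
// Illustrative only; not the kernel code.
#include <cmath>
#include <limits>
#include <vector>

float online_logsumexp(const std::vector<float>& x) {
  float maxval = -std::numeric_limits<float>::infinity();
  float normalizer = 0.0f; // running sum of exp(x[i] - maxval)
  for (float v : x) {
    float prevmax = maxval;
    maxval = std::fmax(maxval, v);
    // Rescale the accumulated sum to the new maximum, then add the new term.
    normalizer = normalizer * std::exp(prevmax - maxval);
    normalizer = normalizer + std::exp(v - maxval);
  }
  return maxval + std::log(normalizer); // logsumexp(x)
}

For softmax, the same running maxval and normalizer are reused at the end to write exp(x[i] - maxval) / normalizer for each element.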

@@ -57,7 +57,8 @@ __global__ void softmax(const T* in, T* out, int axis_size) {
       normalizer = normalizer * softmax_exp(prevmax - maxval);
 #pragma unroll
       for (int i = 0; i < N_READS; i++) {
-        normalizer = normalizer + softmax_exp(static_cast<AccT>(vals[i]) - maxval);
+        normalizer =
+            normalizer + softmax_exp(static_cast<AccT>(vals[i]) - maxval);
       }
     }
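The static_cast suggests the per-thread values vals are held in the input type T while maxval and normalizer are kept in the accumulation type AccT, so converting vals[i] before the subtraction keeps both the x - max and the exponential in the accumulation type. Below is a hypothetical sketch of that pattern; accumulate_normalizer is an illustrative name, and std::exp stands in for the kernels' softmax_exp helper.

// Hypothetical sketch of accumulating the normalizer in a wider type than
// the stored elements. Not the kernel code; names are illustrative.
#include <cmath>

template <typename T, typename AccT, int N_READS>
AccT accumulate_normalizer(const T (&vals)[N_READS], AccT maxval, AccT normalizer) {
  for (int i = 0; i < N_READS; i++) {
    // Cast first so the subtraction and the exponential evaluate in AccT.
    normalizer = normalizer + std::exp(static_cast<AccT>(vals[i]) - maxval);
  }
  return normalizer;
}

For example, instantiating with T = float and AccT = double on the host mirrors the kernels' split between a narrow storage type and a wider accumulation type.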