Custom logsumexp (#2028)

* initial custom logsumexp * more tests * comments + fix
2025-12-16 01:49:05 +08:00 · 2025-03-31 07:36:55 -07:00
parent ec2854b13a
commit de5f38fd48
27 changed files with 590 additions and 255 deletions
--- a/mlx/backend/metal/CMakeLists.txt
+++ b/mlx/backend/metal/CMakeLists.txt
@@ -47,6 +47,7 @@ if(MLX_METAL_JIT)
  make_jit_source(binary)
  make_jit_source(binary_two)
  make_jit_source(fft kernels/fft/radix.h kernels/fft/readwrite.h)
+  make_jit_source(logsumexp)
  make_jit_source(ternary)
  make_jit_source(softmax)
  make_jit_source(scan)
@@ -95,6 +96,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/metal.cpp
--- a/mlx/backend/metal/jit/arange.h
+++ b/mlx/backend/metal/jit/arange.h
@@ -1,9 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-constexpr std::string_view arange_kernels = R"(
-template [[host_name("{0}")]] [[kernel]] void arange<{1}>(
-    constant const {1}& start,
-    constant const {1}& step,
-    device {1}* out,
-    uint index [[thread_position_in_grid]]);
-)";
--- a/mlx/backend/metal/jit/includes.h
+++ b/mlx/backend/metal/jit/includes.h
@@ -20,6 +20,7 @@ const char* copy();
 const char* fft();
 const char* gather_axis();
 const char* hadamard();
+const char* logsumexp();
 const char* quantized();
 const char* ternary();
 const char* scan();
--- a/mlx/backend/metal/jit/softmax.h
+++ b/mlx/backend/metal/jit/softmax.h
@@ -1,23 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-constexpr std::string_view softmax_kernels = R"(
-template [[host_name("block_{0}")]] [[kernel]] void
-softmax_single_row<{1}, {2}>(
-    const device {1}* in,
-    device {1}* out,
-    constant int& axis_size,
-    uint gid [[thread_position_in_grid]],
-    uint _lid [[thread_position_in_threadgroup]],
-    uint simd_lane_id [[thread_index_in_simdgroup]],
-    uint simd_group_id [[simdgroup_index_in_threadgroup]]);
-template [[host_name("looped_{0}")]] [[kernel]] void
-softmax_looped<{1}, {2}>(
-    const device {1}* in,
-    device {1}* out,
-    constant int& axis_size,
-    uint gid [[threadgroup_position_in_grid]],
-    uint lid [[thread_position_in_threadgroup]],
-    uint lsize [[threads_per_threadgroup]],
-    uint simd_lane_id [[thread_index_in_simdgroup]],
-    uint simd_group_id [[simdgroup_index_in_threadgroup]]);
-)";
--- a/mlx/backend/metal/jit_kernels.cpp
+++ b/mlx/backend/metal/jit_kernels.cpp
@@ -1,8 +1,6 @@
 // Copyright © 2024 Apple Inc.
 #include "mlx/backend/common/compiled.h"
-#include "mlx/backend/metal/jit/arange.h"
 #include "mlx/backend/metal/jit/includes.h"
-#include "mlx/backend/metal/jit/softmax.h"
 #include "mlx/backend/metal/kernels.h"
 #include "mlx/backend/metal/utils.h"

@@ -21,13 +19,11 @@ MTL::ComputePipelineState* get_arange_kernel(
    const std::string& kernel_name,
    const array& out) {
  auto lib = d.get_library(kernel_name, [&]() {
-    std::ostringstream kernel_source;
-    kernel_source << metal::utils() << metal::arange()
-                  << fmt::format(
-                         arange_kernels,
-                         kernel_name,
-                         get_type_string(out.dtype()));
-    return kernel_source.str();
+    std::string kernel_source = metal::utils();
+    kernel_source += metal::arange();
+    kernel_source += get_template_definition(
+        kernel_name, "arange", get_type_string(out.dtype()));
+    return kernel_source;
  });
  return d.get_kernel(kernel_name, lib);
 }
@@ -259,14 +255,34 @@ MTL::ComputePipelineState* get_softmax_kernel(
    const array& out) {
  std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto lib = d.get_library(lib_name, [&] {
-    std::ostringstream kernel_source;
-    kernel_source << metal::utils() << metal::softmax()
-                  << fmt::format(
-                         softmax_kernels,
-                         lib_name,
-                         get_type_string(out.dtype()),
-                         get_type_string(precise ? float32 : out.dtype()));
-    return kernel_source.str();
+    std::string kernel_source = metal::utils();
+    auto in_type = get_type_string(out.dtype());
+    auto acc_type = get_type_string(precise ? float32 : out.dtype());
+    kernel_source += metal::softmax();
+    kernel_source += get_template_definition(
+        "block_" + lib_name, "softmax_single_row", in_type, acc_type);
+    kernel_source += get_template_definition(
+        "looped_" + lib_name, "softmax_looped", in_type, acc_type);
+    return kernel_source;
+  });
+  return d.get_kernel(kernel_name, lib);
+}
+
+MTL::ComputePipelineState* get_logsumexp_kernel(
+    metal::Device& d,
+    const std::string& kernel_name,
+    const array& out) {
+  // Drop the "block_"/"looped_" prefix: both variants live in one library.
+  auto lib_name = kernel_name.substr(kernel_name.find("_") + 1);
+  auto lib = d.get_library(lib_name, [&] {
+    auto type_name = get_type_string(out.dtype());
+    std::string kernel_source = metal::utils();
+    kernel_source += metal::logsumexp();
+    kernel_source +=
+        get_template_definition("block_" + lib_name, "logsumexp", type_name);
+    kernel_source += get_template_definition(
+        "looped_" + lib_name, "logsumexp_looped", type_name);
+    return kernel_source;
  });
  return d.get_kernel(kernel_name, lib);
 }
--- a/mlx/backend/metal/kernels.h
+++ b/mlx/backend/metal/kernels.h
@@ -59,6 +59,11 @@ MTL::ComputePipelineState* get_softmax_kernel(
    bool precise,
    const array& out);

+MTL::ComputePipelineState* get_logsumexp_kernel(
+    metal::Device& d,
+    const std::string& kernel_name,
+    const array& out);
+
 MTL::ComputePipelineState* get_scan_kernel(
    metal::Device& d,
    const std::string& kernel_name,
--- a/mlx/backend/metal/kernels/CMakeLists.txt
+++ b/mlx/backend/metal/kernels/CMakeLists.txt
@@ -109,6 +109,7 @@ if(NOT MLX_METAL_JIT)
  build_kernel(quantized quantized.h ${STEEL_HEADERS})
  build_kernel(scan scan.h)
  build_kernel(softmax softmax.h)
+  build_kernel(logsumexp logsumexp.h)
  build_kernel(sort sort.h)
  build_kernel(ternary ternary.h ternary_ops.h)
  build_kernel(unary unary.h unary_ops.h)
--- a/mlx/backend/metal/kernels/arange.metal
+++ b/mlx/backend/metal/kernels/arange.metal
@@ -5,11 +5,7 @@
 #include "mlx/backend/metal/kernels/arange.h"

 #define instantiate_arange(tname, type)                                 \
-  template [[host_name("arange" #tname)]] [[kernel]] void arange<type>( \
-      constant const type& start,                                       \
-      constant const type& step,                                        \
-      device type* out,                                                 \
-      uint index [[thread_position_in_grid]]);
+  instantiate_kernel("arange" #tname, arange, type)

 instantiate_arange(uint8, uint8_t)
 instantiate_arange(uint16, uint16_t)
--- a/mlx/backend/metal/kernels/layer_norm.metal
+++ b/mlx/backend/metal/kernels/layer_norm.metal
@@ -493,71 +493,11 @@ template <typename T, int N_READS = RMS_N_READS>
 }

 // clang-format off
-#define instantiate_layer_norm_single_row(name, itype)            \
-  template [[host_name("layer_norm" #name)]] [[kernel]] void      \
-  layer_norm_single_row<itype>(                                   \
-      const device itype* x,                                      \
-      const device itype* w,                                      \
-      const device itype* b,                                      \
-      device itype* out,                                          \
-      constant float& eps,                                        \
-      constant uint& axis_size,                                   \
-      constant uint& w_stride,                                    \
-      constant uint& b_stride,                                    \
-      uint gid [[thread_position_in_grid]],                       \
-      uint lid [[thread_position_in_threadgroup]],                \
-      uint simd_lane_id [[thread_index_in_simdgroup]],            \
-      uint simd_group_id [[simdgroup_index_in_threadgroup]]);     \
-  template [[host_name("vjp_layer_norm" #name)]] [[kernel]] void  \
-  vjp_layer_norm_single_row<itype>(                               \
-      const device itype* x,                                      \
-      const device itype* w,                                      \
-      const device itype* g,                                      \
-      device itype* gx,                                           \
-      device itype* gw,                                           \
-      constant float& eps,                                        \
-      constant uint& axis_size,                                   \
-      constant uint& w_stride,                                    \
-      uint gid [[thread_position_in_grid]],                       \
-      uint lid [[thread_position_in_threadgroup]],                \
-      uint simd_lane_id [[thread_index_in_simdgroup]],            \
-      uint simd_group_id [[simdgroup_index_in_threadgroup]]);
-
-#define instantiate_layer_norm_looped(name, itype)                       \
-  template [[host_name("layer_norm_looped" #name)]] [[kernel]] void      \
-  layer_norm_looped<itype>(                                              \
-      const device itype* x,                                             \
-      const device itype* w,                                             \
-      const device itype* b,                                             \
-      device itype* out,                                                 \
-      constant float& eps,                                               \
-      constant uint& axis_size,                                          \
-      constant uint& w_stride,                                           \
-      constant uint& b_stride,                                           \
-      uint gid [[thread_position_in_grid]],                              \
-      uint lid [[thread_position_in_threadgroup]],                       \
-      uint lsize [[threads_per_threadgroup]],                            \
-      uint simd_lane_id [[thread_index_in_simdgroup]],                   \
-      uint simd_group_id [[simdgroup_index_in_threadgroup]]);            \
-  template [[host_name("vjp_layer_norm_looped" #name)]] [[kernel]] void  \
-  vjp_layer_norm_looped<itype>(                                          \
-      const device itype* x,                                             \
-      const device itype* w,                                             \
-      const device itype* g,                                             \
-      device itype* gx,                                                  \
-      device itype* gb,                                                  \
-      constant float& eps,                                               \
-      constant uint& axis_size,                                          \
-      constant uint& w_stride,                                           \
-      uint gid [[thread_position_in_grid]],                              \
-      uint lid [[thread_position_in_threadgroup]],                       \
-      uint lsize [[threads_per_threadgroup]],                            \
-      uint simd_lane_id [[thread_index_in_simdgroup]],                   \
-      uint simd_group_id [[simdgroup_index_in_threadgroup]]);
-
-#define instantiate_layer_norm(name, itype)      \
-  instantiate_layer_norm_single_row(name, itype) \
-  instantiate_layer_norm_looped(name, itype)
+#define instantiate_layer_norm(name, itype)                                       \
+  instantiate_kernel("layer_norm" #name, layer_norm_single_row, itype)            \
+  instantiate_kernel("vjp_layer_norm" #name, vjp_layer_norm_single_row, itype)    \
+  instantiate_kernel("layer_norm_looped" #name, layer_norm_looped, itype)         \
+  instantiate_kernel("vjp_layer_norm_looped" #name, vjp_layer_norm_looped, itype)

 instantiate_layer_norm(float32, float)
 instantiate_layer_norm(float16, half)
--- a/mlx/backend/metal/kernels/logsumexp.h
+++ b/mlx/backend/metal/kernels/logsumexp.h
@@ -0,0 +1,174 @@
+// Copyright © 2025 Apple Inc.
+
+// Computes log(sum(exp(x))) over rows of axis_size contiguous elements.
+// Each threadgroup reduces one row (gid selects it) and every thread loads at
+// most N_READS consecutive elements exactly once, so the threadgroup has to
+// span at least ceil(axis_size / N_READS) threads. Values are accumulated as
+// AccT and the result for the row is stored in out[gid].
+template <typename T, typename AccT = float, int N_READS = 4>
+[[kernel]] void logsumexp(
+    const device T* in,
+    device T* out,
+    constant int& axis_size,
+    uint gid [[threadgroup_position_in_grid]],
+    uint _lid [[thread_position_in_threadgroup]],
+    uint simd_lane_id [[thread_index_in_simdgroup]],
+    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
+  int lid = _lid;
+
+  constexpr int SIMD_SIZE = 32;
+
+  threadgroup AccT local_max[SIMD_SIZE];
+  threadgroup AccT local_normalizer[SIMD_SIZE];
+
+  AccT ld[N_READS];
+
+  // Load this thread's slice of the row; slots past the end of the row are
+  // padded with Limits<AccT>::min.
+  in += gid * size_t(axis_size) + lid * N_READS;
+  if (lid * N_READS + N_READS <= axis_size) {
+    for (int i = 0; i < N_READS; i++) {
+      ld[i] = AccT(in[i]);
+    }
+  } else {
+    for (int i = 0; i < N_READS; i++) {
+      ld[i] =
+          ((lid * N_READS + i) < axis_size) ? AccT(in[i]) : Limits<AccT>::min;
+    }
+  }
+
+  // Fill every slot with a neutral value: the cross-simdgroup reductions read
+  // all SIMD_SIZE slots, including those that no simdgroup writes.
+  if (simd_group_id == 0) {
+    local_max[simd_lane_id] = Limits<AccT>::min;
+    local_normalizer[simd_lane_id] = 0;
+  }
+  // Without this barrier simdgroup 0 could overwrite a partial max that
+  // another simdgroup has already published below.
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+
+  // Get the max
+  AccT maxval = Limits<AccT>::finite_min;
+  for (int i = 0; i < N_READS; i++) {
+    maxval = (maxval < ld[i]) ? ld[i] : maxval;
+  }
+  maxval = simd_max(maxval);
+  if (simd_lane_id == 0) {
+    local_max[simd_group_id] = maxval;
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+  if (simd_group_id == 0) {
+    maxval = simd_max(local_max[simd_lane_id]);
+    if (simd_lane_id == 0) {
+      local_max[0] = maxval;
+    }
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+  maxval = local_max[0];
+
+  // Compute exp(x_i - maxval) and store the partial sums in local_normalizer
+  AccT normalizer = 0;
+  for (int i = 0; i < N_READS; i++) {
+    normalizer += fast::exp(ld[i] - maxval);
+  }
+  normalizer = simd_sum(normalizer);
+  if (simd_lane_id == 0) {
+    local_normalizer[simd_group_id] = normalizer;
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+  if (simd_group_id == 0) {
+    normalizer = simd_sum(local_normalizer[simd_lane_id]);
+    if (simd_lane_id == 0) {
+      // An infinite max is passed through instead of adding log(normalizer).
+      out[gid] = isinf(maxval) ? T(maxval) : T(log(normalizer) + maxval);
+    }
+  }
+}
+
+// Looped variant of logsumexp: the threads of a threadgroup walk over the row
+// in steps of lsize * N_READS elements, each keeping a running max and a sum
+// of exp(x - max) that is rescaled whenever its max grows.
+template <typename T, typename AccT = float, int N_READS = 4>
+[[kernel]] void logsumexp_looped(
+    const device T* in,
+    device T* out,
+    constant int& axis_size,
+    uint gid [[threadgroup_position_in_grid]],
+    uint lid [[thread_position_in_threadgroup]],
+    uint lsize [[threads_per_threadgroup]],
+    uint simd_lane_id [[thread_index_in_simdgroup]],
+    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
+  in += gid * size_t(axis_size);
+
+  constexpr int SIMD_SIZE = 32;
+
+  threadgroup AccT local_max[SIMD_SIZE];
+  threadgroup AccT local_normalizer[SIMD_SIZE];
+
+  // Fill every slot with a neutral value: the final reductions read all
+  // SIMD_SIZE slots, which no simdgroup writes when the threadgroup has fewer
+  // than SIMD_SIZE simdgroups.
+  if (simd_group_id == 0) {
+    local_max[simd_lane_id] = Limits<AccT>::min;
+    local_normalizer[simd_lane_id] = 0;
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+
+  // Get the max and the normalizer in one go
+  AccT prevmax;
+  AccT maxval = Limits<AccT>::finite_min;
+  AccT normalizer = 0;
+  for (int r = 0; r < static_cast<int>(ceildiv(axis_size, N_READS * lsize));
+       r++) {
+    int offset = r * lsize * N_READS + lid * N_READS;
+    AccT vals[N_READS];
+    if (offset + N_READS <= axis_size) {
+      for (int i = 0; i < N_READS; i++) {
+        vals[i] = AccT(in[offset + i]);
+      }
+    } else {
+      for (int i = 0; i < N_READS; i++) {
+        vals[i] = (offset + i < axis_size) ? AccT(in[offset + i])
+                                           : Limits<AccT>::finite_min;
+      }
+    }
+    prevmax = maxval;
+    for (int i = 0; i < N_READS; i++) {
+      maxval = (maxval < vals[i]) ? vals[i] : maxval;
+    }
+    // Rescale the running sum to the new max before adding this chunk.
+    normalizer *= fast::exp(prevmax - maxval);
+    for (int i = 0; i < N_READS; i++) {
+      normalizer += fast::exp(vals[i] - maxval);
+    }
+  }
+
+  // Combine the per-thread partials within each simdgroup, rescaling every
+  // partial sum to the simdgroup-wide max first.
+  prevmax = maxval;
+  maxval = simd_max(maxval);
+  normalizer *= fast::exp(prevmax - maxval);
+  normalizer = simd_sum(normalizer);
+
+  // Publish the per-simdgroup max and rescaled sum through threadgroup memory
+  // so they can be combined across simdgroups.
+  prevmax = maxval;
+  if (simd_lane_id == 0) {
+    local_max[simd_group_id] = maxval;
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+  maxval = simd_max(local_max[simd_lane_id]);
+  normalizer *= fast::exp(prevmax - maxval);
+  if (simd_lane_id == 0) {
+    local_normalizer[simd_group_id] = normalizer;
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+
+  // The first simdgroup adds up the partial sums and writes the row result.
+  if (simd_group_id == 0) {
+    normalizer = simd_sum(local_normalizer[simd_lane_id]);
+    if (simd_lane_id == 0) {
+      out[gid] = isinf(maxval) ? T(maxval) : T(log(normalizer) + maxval);
+    }
+  }
+}
--- a/mlx/backend/metal/kernels/logsumexp.metal
+++ b/mlx/backend/metal/kernels/logsumexp.metal
@@ -0,0 +1,20 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include <metal_common>
+#include <metal_simdgroup>
+
+using namespace metal;
+
+// clang-format off
+#include "mlx/backend/metal/kernels/utils.h"
+#include "mlx/backend/metal/kernels/logsumexp.h"
+
+// Instantiates the single-pass ("block_") and the looped ("looped_") kernel
+// for one element type; the host names are "<variant>_logsumexp_<name>".
+#define instantiate_logsumexp(name, itype)                       \
+  instantiate_kernel("block_logsumexp_" #name, logsumexp, itype) \
+  instantiate_kernel("looped_logsumexp_" #name, logsumexp_looped, itype)
+
+instantiate_logsumexp(float32, float)
+instantiate_logsumexp(float16, half)
+instantiate_logsumexp(bfloat16, bfloat16_t) // clang-format on
--- a/mlx/backend/metal/kernels/rms_norm.metal
+++ b/mlx/backend/metal/kernels/rms_norm.metal
@@ -380,69 +380,11 @@ template <typename T, int N_READS = RMS_N_READS>
 }

 // clang-format off
-#define instantiate_rms_single_row(name, itype)               \
-  template [[host_name("rms" #name)]] [[kernel]] void         \
-  rms_single_row<itype>(                                      \
-      const device itype* x,                                  \
-      const device itype* w,                                  \
-      device itype* out,                                      \
-      constant float& eps,                                    \
-      constant uint& axis_size,                               \
-      constant uint& w_stride,                                \
-      uint gid [[thread_position_in_grid]],                   \
-      uint lid [[thread_position_in_threadgroup]],            \
-      uint simd_lane_id [[thread_index_in_simdgroup]],        \
-      uint simd_group_id [[simdgroup_index_in_threadgroup]]); \
-                                                              \
-  template [[host_name("vjp_rms" #name)]] [[kernel]] void     \
-  vjp_rms_single_row<itype>(                                  \
-      const device itype* x,                                  \
-      const device itype* w,                                  \
-      const device itype* g,                                  \
-      device itype* gx,                                       \
-      device itype* gw,                                       \
-      constant float& eps,                                    \
-      constant uint& axis_size,                               \
-      constant uint& w_stride,                                \
-      uint gid [[thread_position_in_grid]],                   \
-      uint lid [[thread_position_in_threadgroup]],            \
-      uint simd_lane_id [[thread_index_in_simdgroup]],        \
-      uint simd_group_id [[simdgroup_index_in_threadgroup]]);
-
-#define instantiate_rms_looped(name, itype)                      \
-  template [[host_name("rms_looped" #name)]] [[kernel]] void     \
-  rms_looped<itype>(                                             \
-      const device itype* x,                                     \
-      const device itype* w,                                     \
-      device itype* out,                                         \
-      constant float& eps,                                       \
-      constant uint& axis_size,                                  \
-      constant uint& w_stride,                                   \
-      uint gid [[thread_position_in_grid]],                      \
-      uint lid [[thread_position_in_threadgroup]],               \
-      uint lsize [[threads_per_threadgroup]],                    \
-      uint simd_lane_id [[thread_index_in_simdgroup]],           \
-      uint simd_group_id [[simdgroup_index_in_threadgroup]]);    \
-                                                                 \
-  template [[host_name("vjp_rms_looped" #name)]] [[kernel]] void \
-  vjp_rms_looped<itype>(                                         \
-      const device itype* x,                                     \
-      const device itype* w,                                     \
-      const device itype* g,                                     \
-      device itype* gx,                                          \
-      device itype* gw,                                          \
-      constant float& eps,                                       \
-      constant uint& axis_size,                                  \
-      constant uint& w_stride,                                   \
-      uint gid [[thread_position_in_grid]],                      \
-      uint lid [[thread_position_in_threadgroup]],               \
-      uint lsize [[threads_per_threadgroup]],                    \
-      uint simd_lane_id [[thread_index_in_simdgroup]],           \
-      uint simd_group_id [[simdgroup_index_in_threadgroup]]);
-
-#define instantiate_rms(name, itype)      \
-  instantiate_rms_single_row(name, itype) \
-  instantiate_rms_looped(name, itype)
+#define instantiate_rms(name, itype)                                \
+  instantiate_kernel("rms" #name, rms_single_row, itype)            \
+  instantiate_kernel("vjp_rms" #name, vjp_rms_single_row, itype)    \
+  instantiate_kernel("rms_looped" #name, rms_looped, itype)         \
+  instantiate_kernel("vjp_rms_looped" #name, vjp_rms_looped, itype)

 instantiate_rms(float32, float)
 instantiate_rms(float16, half)
--- a/mlx/backend/metal/kernels/softmax.h
+++ b/mlx/backend/metal/kernels/softmax.h
@@ -40,7 +40,6 @@ template <typename T, typename AccT = T, int N_READS = SOFTMAX_N_READS>
    local_max[simd_lane_id] = Limits<AccT>::min;
    local_normalizer[simd_lane_id] = 0;
  }
-  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Get the max
  AccT maxval = Limits<AccT>::finite_min;
--- a/mlx/backend/metal/kernels/softmax.metal
+++ b/mlx/backend/metal/kernels/softmax.metal
@@ -9,47 +9,13 @@ using namespace metal;
 #include "mlx/backend/metal/kernels/utils.h"
 #include "mlx/backend/metal/kernels/softmax.h"

-#define instantiate_softmax(name, itype)                          \
-  template [[host_name("block_softmax_" #name)]] [[kernel]] void        \
-  softmax_single_row<itype>(                                      \
-      const device itype* in,                                     \
-      device itype* out,                                          \
-      constant int& axis_size,                                    \
-      uint gid [[thread_position_in_grid]],                       \
-      uint _lid [[thread_position_in_threadgroup]],               \
-      uint simd_lane_id [[thread_index_in_simdgroup]],            \
-      uint simd_group_id [[simdgroup_index_in_threadgroup]]);     \
-  template [[host_name("looped_softmax_" #name)]] [[kernel]] void \
-  softmax_looped<itype>(                                          \
-      const device itype* in,                                     \
-      device itype* out,                                          \
-      constant int& axis_size,                                    \
-      uint gid [[threadgroup_position_in_grid]],                  \
-      uint lid [[thread_position_in_threadgroup]],                \
-      uint lsize [[threads_per_threadgroup]],                     \
-      uint simd_lane_id [[thread_index_in_simdgroup]],            \
-      uint simd_group_id [[simdgroup_index_in_threadgroup]]);
+#define instantiate_softmax(name, itype)                                \
+  instantiate_kernel("block_softmax_" #name, softmax_single_row, itype) \
+  instantiate_kernel("looped_softmax_" #name, softmax_looped, itype)

-#define instantiate_softmax_precise(name, itype)                          \
-  template [[host_name("block_softmax_precise_" #name)]] [[kernel]] void        \
-  softmax_single_row<itype, float>(                                       \
-      const device itype* in,                                             \
-      device itype* out,                                                  \
-      constant int& axis_size,                                            \
-      uint gid [[thread_position_in_grid]],                               \
-      uint _lid [[thread_position_in_threadgroup]],                       \
-      uint simd_lane_id [[thread_index_in_simdgroup]],                    \
-      uint simd_group_id [[simdgroup_index_in_threadgroup]]);             \
-  template [[host_name("looped_softmax_precise_" #name)]] [[kernel]] void \
-  softmax_looped<itype, float>(                                           \
-      const device itype* in,                                             \
-      device itype* out,                                                  \
-      constant int& axis_size,                                            \
-      uint gid [[threadgroup_position_in_grid]],                          \
-      uint lid [[thread_position_in_threadgroup]],                        \
-      uint lsize [[threads_per_threadgroup]],                             \
-      uint simd_lane_id [[thread_index_in_simdgroup]],                    \
-      uint simd_group_id [[simdgroup_index_in_threadgroup]]);
+#define instantiate_softmax_precise(name, itype)                                       \
+  instantiate_kernel("block_softmax_precise_" #name, softmax_single_row, itype, float) \
+  instantiate_kernel("looped_softmax_precise_" #name, softmax_looped, itype, float)

 instantiate_softmax(float32, float)
 instantiate_softmax(float16, half)
--- a/mlx/backend/metal/logsumexp.cpp
+++ b/mlx/backend/metal/logsumexp.cpp
@@ -0,0 +1,104 @@
+// Copyright © 2023-2024 Apple Inc.
+#include <algorithm>
+#include <cassert>
+
+#include "mlx/backend/metal/copy.h"
+#include "mlx/backend/metal/device.h"
+#include "mlx/backend/metal/kernels.h"
+#include "mlx/backend/metal/utils.h"
+#include "mlx/primitives.h"
+
+namespace mlx::core {
+
+// Rows with more elements than this are reduced by the looped kernel.
+constexpr int LOGSUMEXP_LOOPED_LIMIT = 4096;
+
+// Evaluates logsumexp on the GPU: every row of inputs[0] along its last axis
+// is reduced to a single value of out by one threadgroup. Inputs whose last
+// axis is not densely laid out are copied first. Throws std::runtime_error
+// for non-floating point types.
+void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), floating)) {
+    throw std::runtime_error(
+        "[logsumexp] Does not support non-floating point types.");
+  }
+  auto& s = stream();
+  auto& d = metal::device(s.device);
+
+  // Make sure that the last dimension is contiguous
+  auto ensure_contiguous = [&s, &d](const array& x) {
+    if (x.flags().contiguous && x.strides()[x.ndim() - 1] == 1) {
+      return x;
+    } else {
+      auto x_copy = array(x.shape(), x.dtype(), nullptr, {});
+      copy_gpu(x, x_copy, CopyType::General, s);
+      d.add_temporary(x_copy, s.index);
+      return x_copy;
+    }
+  };
+
+  auto in = ensure_contiguous(inputs[0]);
+  if (in.flags().row_contiguous) {
+    out.set_data(allocator::malloc(out.nbytes()));
+  } else {
+    // The input is dense but not row contiguous: the output reuses the input's
+    // layout with the reduced axis collapsed, i.e. its strides divided by n.
+    auto n = in.shape(-1);
+    auto flags = in.flags();
+    auto strides = in.strides();
+    for (auto& stride : strides) {
+      stride /= n;
+    }
+    bool col_contig = strides[0] == 1;
+    const int ndim = static_cast<int>(strides.size());
+    for (int i = 1; col_contig && i < ndim; ++i) {
+      col_contig &=
+          (out.shape(i) == 1 || strides[i - 1] == out.shape(i) * strides[i]);
+    }
+    flags.col_contiguous = col_contig;
+    out.set_data(
+        allocator::malloc(in.nbytes() / n),
+        in.data_size() / n,
+        std::move(strides),
+        flags);
+  }
+
+  int axis_size = in.shape().back();
+  size_t n_rows = in.data_size() / axis_size;
+
+  const int simd_size = 32;
+  // Must match the N_READS the logsumexp kernels are instantiated with.
+  const int n_reads = 4;
+  const bool looped = axis_size > LOGSUMEXP_LOOPED_LIMIT;
+
+  std::string kernel_name = looped ? "looped_" : "block_";
+  kernel_name += "logsumexp_";
+  kernel_name += type_to_name(out);
+
+  auto kernel = get_logsumexp_kernel(d, kernel_name, out);
+  auto& compute_encoder = d.get_command_encoder(s.index);
+
+  // The looped kernel uses the largest threadgroup the pipeline allows.
+  const size_t max_threads = kernel->maxTotalThreadsPerThreadgroup();
+  size_t threadgroup_size = max_threads;
+  if (!looped) {
+    // One thread per n_reads elements, rounded up to whole simdgroups.
+    size_t threads_needed = (axis_size + n_reads - 1) / n_reads;
+    size_t simds_needed = (threads_needed + simd_size - 1) / simd_size;
+    threadgroup_size = simd_size * simds_needed;
+    assert(threadgroup_size <= max_threads);
+  }
+
+  // One threadgroup per row.
+  auto group_dims = MTL::Size(threadgroup_size, 1, 1);
+  auto grid_dims = MTL::Size(n_rows * threadgroup_size, 1, 1);
+
+  compute_encoder.set_compute_pipeline_state(kernel);
+  compute_encoder.set_input_array(in, 0);
+  compute_encoder.set_output_array(out, 1);
+  compute_encoder.set_bytes(axis_size, 2);
+  compute_encoder.dispatch_threads(grid_dims, group_dims);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/metal/nojit_kernels.cpp
+++ b/mlx/backend/metal/nojit_kernels.cpp
@@ -72,6 +72,13 @@ MTL::ComputePipelineState* get_softmax_kernel(
  return d.get_kernel(kernel_name);
 }

+MTL::ComputePipelineState* get_logsumexp_kernel(
+    metal::Device& d,
+    const std::string& kernel_name,
+    const array&) { // array argument unused: lookup is by kernel name only
+  return d.get_kernel(kernel_name);
+}
+
 MTL::ComputePipelineState* get_scan_kernel(
    metal::Device& d,
    const std::string& kernel_name,
--- a/mlx/backend/metal/softmax.cpp
+++ b/mlx/backend/metal/softmax.cpp
@@ -23,12 +23,7 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {

  // Make sure that the last dimension is contiguous
  auto set_output = [&s, &out](const array& x) {
-    bool no_copy = x.flags().contiguous && x.strides()[x.ndim() - 1] == 1;
-    if (no_copy && x.ndim() > 1) {
-      auto s = x.strides()[x.ndim() - 2];
-      no_copy &= (s == 0 || s == x.shape().back());
-    }
-    if (no_copy) {
+    if (x.flags().contiguous && x.strides()[x.ndim() - 1] == 1) {
      if (x.is_donatable()) {
        out.copy_shared_buffer(x);
      } else {