Centralize NAX condition (#2811)

2025-12-16 01:49:05 +08:00 · 2025-11-21 13:28:15 -08:00
parent 0d68efd461
commit 0dbc7e5bee
6 changed files with 135 additions and 191 deletions
--- a/mlx/backend/metal/CMakeLists.txt
+++ b/mlx/backend/metal/CMakeLists.txt
@@ -121,14 +121,6 @@ if(NOT MLX_METAL_PATH)
  set(MLX_METAL_PATH ${CMAKE_CURRENT_BINARY_DIR}/kernels/)
 endif()

-if((MLX_METAL_VERSION GREATER_EQUAL 400) AND (MACOS_SDK_VERSION GREATER_EQUAL
-                                              26.2))
-  set(MLX_ENABLE_NAX TRUE)
-  target_compile_definitions(mlx PRIVATE MLX_ENABLE_NAX)
-else()
-  set(MLX_ENABLE_NAX FALSE)
-endif()
-
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels)

 target_compile_definitions(mlx
--- a/mlx/backend/metal/device.h
+++ b/mlx/backend/metal/device.h
@@ -265,14 +265,32 @@ Device& device(mlx::core::Device);

 std::unique_ptr<void, std::function<void(void*)>> new_scoped_memory_pool();

-#ifdef MLX_ENABLE_NAX
-
+// Reports whether the NAX kernel paths may be used in this process.
+//
+// Returns true only when both conditions hold:
+//   - the running OS is at least macOS / iOS / tvOS / visionOS 26.2, and
+//   - the GPU reports an architecture generation of 17 or newer.
+// The result is computed on the first call and cached in a function-local
+// static for every later call.
+//
+// NOTE(review): this is now a runtime-only check. The NAX kernels are still
+// only built when MLX_METAL_VERSION >= 400 and MACOS_SDK_VERSION >= 26.2
+// (kernels/CMakeLists.txt) -- confirm that a build made without them cannot
+// reach the NAX dispatch paths on a machine where this returns true.
 inline bool is_nax_available() {
-  static bool is_nax_available_ =
-      metal::device(mlx::core::Device::gpu).get_architecture_gen() >= 17;
+  static const bool is_nax_available_ = []() {
+    // Keep __builtin_available as the direct condition of the `if` so the
+    // compiler treats it as an availability guard.
+    if (__builtin_available(
+            macOS 26.2, iOS 26.2, tvOS 26.2, visionOS 26.2, *)) {
+      return metal::device(mlx::core::Device::gpu).get_architecture_gen() >= 17;
+    }
+    // The OS is too old: answer without querying the GPU device at all.
+    return false;
+  }();
  return is_nax_available_;
 }

-#endif // MLX_ENABLE_NAX
-
 } // namespace mlx::core::metal
--- a/mlx/backend/metal/kernels/CMakeLists.txt
+++ b/mlx/backend/metal/kernels/CMakeLists.txt
@@ -9,13 +9,17 @@ set(BASE_HEADERS
    utils.h)

 function(build_kernel_base TARGET SRCFILE DEPS)
-  set(METAL_FLAGS -x metal -Wall -Wextra -fno-fast-math -Wno-c++17-extensions)
+  set(METAL_FLAGS
+      -x
+      metal
+      -Wall
+      -Wextra
+      -fno-fast-math
+      -Wno-c++17-extensions
+      -Wno-c++20-extensions)
  if(MLX_METAL_DEBUG)
    set(METAL_FLAGS ${METAL_FLAGS} -gline-tables-only -frecord-sources)
  endif()
-  if(MLX_ENABLE_NAX)
-    set(METAL_FLAGS ${METAL_FLAGS} -Wno-c++20-extensions -std=metal4.0)
-  endif()
  if(NOT CMAKE_OSX_DEPLOYMENT_TARGET STREQUAL "")
    set(METAL_FLAGS ${METAL_FLAGS}
                    "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
@@ -123,8 +127,8 @@ if(NOT MLX_METAL_JIT)
  build_kernel(gemv_masked steel/utils.h)
 endif()

-if(MLX_ENABLE_NAX)
-
+if((MLX_METAL_VERSION GREATER_EQUAL 400) AND (MACOS_SDK_VERSION GREATER_EQUAL
+                                              26.2))
  set(STEEL_NAX_HEADERS
      steel/defines.h
      steel/utils.h
--- a/mlx/backend/metal/matmul.cpp
+++ b/mlx/backend/metal/matmul.cpp
@@ -172,8 +172,6 @@ ensure_batch_contiguous(const array& x, metal::Device& d, const Stream& s) {
 // Regular steel matmul dispatch
 ///////////////////////////////////////////////////////////////////////////////

-#ifdef MLX_ENABLE_NAX
-
 template <bool CHECK_AB>
 void steel_matmul_regular_axpby_nax(
    const Stream& s,
@@ -329,8 +327,6 @@ void steel_matmul_regular_axpby_nax(
  d.add_temporaries(std::move(copies), s.index);
 }

-#endif // MLX_ENABLE_NAX
-
 template <bool CHECK_AB>
 void steel_matmul_regular_axpby(
    const Stream& s,
@@ -357,41 +353,35 @@ void steel_matmul_regular_axpby(
    int64_t C_batch_stride /* = 0*/,
    float alpha /* = 1.0f */,
    float beta /* = 0.0f */) {
-#ifdef MLX_ENABLE_NAX
-
-  if (__builtin_available(macOS 26.2, iOS 26.2, tvOS 26.2, visionOS 26.2, *)) {
-    if (metal::is_nax_available() && !issubdtype(a.dtype(), complexfloating) &&
-        (env::enable_tf32() || a.dtype() != float32)) {
-      return steel_matmul_regular_axpby_nax<CHECK_AB>(
-          /* const Stream& s = */ s,
-          /* metal::Device& d = */ d,
-          /* const array& a = */ a,
-          /* const array& b = */ b,
-          /* const array& c = */ c,
-          /* array& out = */ out,
-          /* int M = */ M,
-          /* int N = */ N,
-          /* int K = */ K,
-          /* int batch_size_out = */ batch_size_out,
-          /* int lda = */ lda,
-          /* int ldb = */ ldb,
-          /* int ldd = */ ldd,
-          /* bool transpose_a = */ transpose_a,
-          /* bool transpose_b = */ transpose_b,
-          /* std::vector<array>& copies = */ copies,
-          /* Shape batch_shape = */ batch_shape,
-          /* Strides batch_strides = */ batch_strides,
-          /* int64_t A_batch_stride = */ A_batch_stride,
-          /* int64_t B_batch_stride = */ B_batch_stride,
-          /* int64_t matrix_stride_out = */ matrix_stride_out,
-          /* int64_t C_batch_stride = */ C_batch_stride,
-          /* float alpha = */ alpha,
-          /* float beta = */ beta);
-    }
+  if (metal::is_nax_available() && !issubdtype(a.dtype(), complexfloating) &&
+      (env::enable_tf32() || a.dtype() != float32)) {
+    return steel_matmul_regular_axpby_nax<CHECK_AB>(
+        /* const Stream& s = */ s,
+        /* metal::Device& d = */ d,
+        /* const array& a = */ a,
+        /* const array& b = */ b,
+        /* const array& c = */ c,
+        /* array& out = */ out,
+        /* int M = */ M,
+        /* int N = */ N,
+        /* int K = */ K,
+        /* int batch_size_out = */ batch_size_out,
+        /* int lda = */ lda,
+        /* int ldb = */ ldb,
+        /* int ldd = */ ldd,
+        /* bool transpose_a = */ transpose_a,
+        /* bool transpose_b = */ transpose_b,
+        /* std::vector<array>& copies = */ copies,
+        /* Shape batch_shape = */ batch_shape,
+        /* Strides batch_strides = */ batch_strides,
+        /* int64_t A_batch_stride = */ A_batch_stride,
+        /* int64_t B_batch_stride = */ B_batch_stride,
+        /* int64_t matrix_stride_out = */ matrix_stride_out,
+        /* int64_t C_batch_stride = */ C_batch_stride,
+        /* float alpha = */ alpha,
+        /* float beta = */ beta);
  }

-#endif // MLX_ENABLE_NAX
-
  using namespace mlx::steel;

  // Determine dispatch kernel
@@ -1766,8 +1756,6 @@ void gather_mm_rhs(
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
 }

-#ifdef MLX_ENABLE_NAX
-
 void gather_mm_rhs_nax(
    const array& a_,
    const array& b_,
@@ -1911,8 +1899,6 @@ void gather_mm_rhs_nax(
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
 }

-#endif // MLX_ENABLE_NAX
-
 void gather_mv(
    const array& mat_,
    const array& vec_,
@@ -2196,19 +2182,14 @@ void GatherMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  // We are walking a in order and b is also in order so we can batch up the
  // matmuls and reuse reading a and b.
  if (M == 1 && right_sorted_ == true) {
-#ifdef MLX_ENABLE_NAX
-
-    if (__builtin_available(
-            macOS 26.2, iOS 26.2, tvOS 26.2, visionOS 26.2, *)) {
-      if (metal::is_nax_available() &&
-          !issubdtype(a.dtype(), complexfloating) &&
-          (env::enable_tf32() || a.dtype() != float32)) {
-        return gather_mm_rhs_nax(a, b, rhs_indices, out, d, s);
-      }
+    // Take the NAX kernel only under the same conditions as before the
+    // refactor: NAX is available, the input is not complex, and float32
+    // inputs are allowed only when env::enable_tf32() is set. Everything
+    // else falls through to the regular gather_mm_rhs kernel below.
+    if (metal::is_nax_available() &&
+        !issubdtype(a.dtype(), complexfloating) &&
+        (env::enable_tf32() || a.dtype() != float32)) {
+      return gather_mm_rhs_nax(a, b, rhs_indices, out, d, s);
    }
-
-#endif // MLX_ENABLE_NAX
-
    gather_mm_rhs(a, b, rhs_indices, out, d, s);
    return;
  }
--- a/mlx/backend/metal/quantized.cpp
+++ b/mlx/backend/metal/quantized.cpp
@@ -451,8 +451,6 @@ void qvm(
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
 }

-#ifdef MLX_ENABLE_NAX
-
 void qmm_nax(
    const array& x,
    const array& w,
@@ -653,8 +651,6 @@ void gather_qmm_nax(
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
 }

-#endif // MLX_ENABLE_NAX
-
 void qmm(
    const array& x,
    const array& w,
@@ -670,31 +666,25 @@ void qmm(
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
-#ifdef MLX_ENABLE_NAX
-
-  if (__builtin_available(macOS 26.2, iOS 26.2, tvOS 26.2, visionOS 26.2, *)) {
-    if (metal::is_nax_available() && transpose && (K % 64 == 0) &&
-        (env::enable_tf32() || x.dtype() != float32)) {
-      return qmm_nax(
-          /* const array& x = */ x,
-          /* const array& w = */ w,
-          /* const array& scales = */ scales,
-          /* const std::optional<array>& biases = */ biases,
-          /* array& out = */ out,
-          /* bool transpose = */ transpose,
-          /* int group_size = */ group_size,
-          /* int bits = */ bits,
-          /* int M = */ M,
-          /* int N = */ N,
-          /* int K = */ K,
-          /* metal::Device& d = */ d,
-          /* const Stream& s = */ s,
-          /* const std::string& mode = */ mode);
-    }
+  if (metal::is_nax_available() && transpose && (K % 64 == 0) &&
+      (env::enable_tf32() || x.dtype() != float32)) {
+    return qmm_nax(
+        /* const array& x = */ x,
+        /* const array& w = */ w,
+        /* const array& scales = */ scales,
+        /* const std::optional<array>& biases = */ biases,
+        /* array& out = */ out,
+        /* bool transpose = */ transpose,
+        /* int group_size = */ group_size,
+        /* int bits = */ bits,
+        /* int M = */ M,
+        /* int N = */ N,
+        /* int K = */ K,
+        /* metal::Device& d = */ d,
+        /* const Stream& s = */ s,
+        /* const std::string& mode = */ mode);
  }

-#endif // MLX_ENABLE_NAX
-
  int B = out.size() / M / N;

  int wm = 2;
@@ -772,33 +762,27 @@ void gather_qmm(
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
-#ifdef MLX_ENABLE_NAX
-
-  if (__builtin_available(macOS 26.2, iOS 26.2, tvOS 26.2, visionOS 26.2, *)) {
-    if (metal::is_nax_available() && transpose && (K % 64 == 0) &&
-        (env::enable_tf32() || x.dtype() != float32)) {
-      return gather_qmm_nax(
-          /* const array& x = */ x,
-          /* const array& w = */ w,
-          /* const array& scales = */ scales,
-          /* const std::optional<array>& biases = */ biases,
-          /* const array& lhs_indices = */ lhs_indices,
-          /* const array& rhs_indices = */ rhs_indices,
-          /* array& out = */ out,
-          /* bool transpose = */ transpose,
-          /* int group_size = */ group_size,
-          /* int bits = */ bits,
-          /* int M = */ M,
-          /* int N = */ N,
-          /* int K = */ K,
-          /* metal::Device& d = */ d,
-          /* const Stream& s = */ s,
-          /* const std::string& mode = */ mode);
-    }
+  if (metal::is_nax_available() && transpose && (K % 64 == 0) &&
+      (env::enable_tf32() || x.dtype() != float32)) {
+    return gather_qmm_nax(
+        /* const array& x = */ x,
+        /* const array& w = */ w,
+        /* const array& scales = */ scales,
+        /* const std::optional<array>& biases = */ biases,
+        /* const array& lhs_indices = */ lhs_indices,
+        /* const array& rhs_indices = */ rhs_indices,
+        /* array& out = */ out,
+        /* bool transpose = */ transpose,
+        /* int group_size = */ group_size,
+        /* int bits = */ bits,
+        /* int M = */ M,
+        /* int N = */ N,
+        /* int K = */ K,
+        /* metal::Device& d = */ d,
+        /* const Stream& s = */ s,
+        /* const std::string& mode = */ mode);
  }

-#endif // MLX_ENABLE_NAX
-
  int B = out.size() / M / N;

  int wm = 2;
@@ -975,8 +959,6 @@ void gather_qvm(
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
 }

-#ifdef MLX_ENABLE_NAX
-
 void gather_qmm_rhs_nax(
    const array& x_,
    const array& w_,
@@ -1108,8 +1090,6 @@ void gather_qmm_rhs_nax(
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
 }

-#endif // MLX_ENABLE_NAX
-
 void gather_qmm_rhs(
    const array& x_,
    const array& w_,
@@ -1126,32 +1106,26 @@ void gather_qmm_rhs(
    metal::Device& d,
    const Stream& s,
    const std::string mode) {
-#ifdef MLX_ENABLE_NAX
-
-  if (__builtin_available(macOS 26.2, iOS 26.2, tvOS 26.2, visionOS 26.2, *)) {
-    if (metal::is_nax_available() && transpose &&
-        (env::enable_tf32() || x_.dtype() != float32)) {
-      return gather_qmm_rhs_nax(
-          /* const array& x_ = */ x_,
-          /* const array& w_ = */ w_,
-          /* const array& scales_ = */ scales_,
-          /* const std::optional<array>& biases_ = */ biases_,
-          /* const array& indices_ = */ indices_,
-          /* array& out = */ out,
-          /* bool transpose = */ transpose,
-          /* int group_size = */ group_size,
-          /* int bits = */ bits,
-          /* int M = */ M,
-          /* int N = */ N,
-          /* int K = */ K,
-          /* metal::Device& d = */ d,
-          /* const Stream& s = */ s,
-          /* const std::string mode = */ mode);
-    }
+  if (metal::is_nax_available() && transpose &&
+      (env::enable_tf32() || x_.dtype() != float32)) {
+    return gather_qmm_rhs_nax(
+        /* const array& x_ = */ x_,
+        /* const array& w_ = */ w_,
+        /* const array& scales_ = */ scales_,
+        /* const std::optional<array>& biases_ = */ biases_,
+        /* const array& indices_ = */ indices_,
+        /* array& out = */ out,
+        /* bool transpose = */ transpose,
+        /* int group_size = */ group_size,
+        /* int bits = */ bits,
+        /* int M = */ M,
+        /* int N = */ N,
+        /* int K = */ K,
+        /* metal::Device& d = */ d,
+        /* const Stream& s = */ s,
+        /* const std::string mode = */ mode);
  }

-#endif // MLX_ENABLE_NAX
-
  // Start by normalizing the indices
  array indices = ensure_row_contiguous(indices_, d, s);

--- a/mlx/backend/metal/scaled_dot_product_attention.cpp
+++ b/mlx/backend/metal/scaled_dot_product_attention.cpp
@@ -13,8 +13,6 @@ namespace mlx::core::fast {

 namespace {

-#ifdef MLX_ENABLE_NAX
-
 void sdpa_full_self_attention_nax(
    const Stream& s,
    metal::Device& d,
@@ -150,8 +148,6 @@ void sdpa_full_self_attention_nax(
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
 }

-#endif // MLX_ENABLE_NAX
-
 void sdpa_full_self_attention_metal(
    const Stream& s,
    metal::Device& d,
@@ -163,24 +159,20 @@ void sdpa_full_self_attention_metal(
    bool do_causal_,
    const std::optional<array>& mask,
    const std::optional<array>& sinks) {
-#ifdef MLX_ENABLE_NAX
-  if (__builtin_available(macOS 26.2, iOS 26.2, tvOS 26.2, visionOS 26.2, *)) {
-    if (metal::is_nax_available() && q.shape(3) != 80 &&
-        (env::enable_tf32() || q.dtype() != float32)) {
-      return sdpa_full_self_attention_nax(
-          /* const Stream& s = */ s,
-          /* metal::Device& d = */ d,
-          /* const array& q = */ q,
-          /* const array& k = */ k,
-          /* const array& v = */ v,
-          /* const float scale = */ scale,
-          /* array& o = */ o,
-          /* bool do_causal_ = */ do_causal_,
-          /* const std::optional<array>& mask = */ mask,
-          /* const std::optional<array>& sinks = */ sinks);
-    }
+  if (metal::is_nax_available() && q.shape(3) != 80 &&
+      (env::enable_tf32() || q.dtype() != float32)) {
+    return sdpa_full_self_attention_nax(
+        /* const Stream& s = */ s,
+        /* metal::Device& d = */ d,
+        /* const array& q = */ q,
+        /* const array& k = */ k,
+        /* const array& v = */ v,
+        /* const float scale = */ scale,
+        /* array& o = */ o,
+        /* bool do_causal_ = */ do_causal_,
+        /* const std::optional<array>& mask = */ mask,
+        /* const std::optional<array>& sinks = */ sinks);
  }
-#endif // MLX_ENABLE_NAX

  using namespace mlx::steel;