MLX_SWITCH macros to templates (#2320)

2025-12-16 01:49:05 +08:00 · 2025-07-01 01:33:44 -07:00
parent 33bf1a244b
commit 3d5e17e507
27 changed files with 693 additions and 692 deletions
--- a/mlx/backend/cuda/reduce/col_reduce.cu
+++ b/mlx/backend/cuda/reduce/col_reduce.cu
@@ -215,11 +215,12 @@ void col_reduce_looped(
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE, {
-      MLX_SWITCH_REDUCE_OPS(reduce_type, OP, {
-        MLX_SWITCH_REDUCE_NDIM(args.reduce_ndim, NDIM, {
-          using T = cuda_type_t<CTYPE>;
-          using U = cu::ReduceResult<OP, T>::type;
+    dispatch_all_types(in.dtype(), [&](auto type_tag) {
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
+          using OP = MLX_GET_TYPE(reduce_type_tag);
+          using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+          using U = typename cu::ReduceResult<OP, T>::type;

          // Cub doesn't like const pointers for vectorized loads. (sigh)
          T* indata = const_cast<T*>(in.data<T>());
@@ -229,7 +230,8 @@ void col_reduce_looped(
          constexpr int BN = 32;
          dim3 grid = output_grid_for_col_reduce(out, args, BN);
          int blocks = BM * BN / N_READS;
-          auto kernel = cu::col_reduce_looped<T, U, OP, NDIM, BM, BN, N_READS>;
+          auto kernel =
+              cu::col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS>;
          kernel<<<grid, blocks, 0, stream>>>(indata, out.data<U>(), args);
        });
      });