Move all type switching to templates

2025-12-16 01:49:05 +08:00 · 2025-06-29 04:04:15 -07:00
parent 45c43dd24a
commit ef813b6d13
19 changed files with 474 additions and 431 deletions
--- a/mlx/backend/cuda/arg_reduce.cu
+++ b/mlx/backend/cuda/arg_reduce.cu
@@ -155,25 +155,33 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
    dispatch_real_types(in.dtype(), "ArgReduce", [&](auto type_tag) {
      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      constexpr uint32_t N_READS = 4;
-      MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-        dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
-        dim3 block_dims{BLOCK_DIM, 1, 1};
-        auto kernel =
-            cu::arg_reduce_general<T, cu::ArgMax<T>, BLOCK_DIM, N_READS>;
-        if (reduce_type_ == ArgReduce::ArgMin) {
-          kernel = cu::arg_reduce_general<T, cu::ArgMin<T>, BLOCK_DIM, N_READS>;
-        }
-        kernel<<<num_blocks, block_dims, 0, stream>>>(
-            in.data<T>(),
-            out.data<uint32_t>(),
-            out.size(),
-            const_param(shape),
-            const_param(in_strides),
-            const_param(out_strides),
-            ndim,
-            axis_stride,
-            axis_size);
-      });
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim_constant) {
+            dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
+            dim3 block_dims{block_dim_constant(), 1, 1};
+            auto kernel = cu::arg_reduce_general<
+                T,
+                cu::ArgMax<T>,
+                block_dim_constant(),
+                N_READS>;
+            if (reduce_type_ == ArgReduce::ArgMin) {
+              kernel = cu::arg_reduce_general<
+                  T,
+                  cu::ArgMin<T>,
+                  block_dim_constant(),
+                  N_READS>;
+            }
+            kernel<<<num_blocks, block_dims, 0, stream>>>(
+                in.data<T>(),
+                out.data<uint32_t>(),
+                out.size(),
+                const_param(shape),
+                const_param(in_strides),
+                const_param(out_strides),
+                ndim,
+                axis_stride,
+                axis_size);
+          });
    });
  });
 }
--- a/mlx/backend/cuda/binary.cu
+++ b/mlx/backend/cuda/binary.cu
@@ -149,47 +149,55 @@ void binary_op_gpu_inplace(
          using OutType = cuda_type_t<CTYPE_OUT>;
          auto bopt = get_binary_op_type(a, b);
          if (bopt == BinaryOpType::General) {
-            auto [shape, strides] = collapse_contiguous_dims(a, b, out);
-            auto& a_strides = strides[0];
-            auto& b_strides = strides[1];
-            bool large = a.data_size() > INT32_MAX ||
-                b.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
-            MLX_SWITCH_BOOL(large, LARGE, {
-              using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
-              int ndim = shape.size();
-              if (ndim <= 3) {
-                MLX_SWITCH_1_2_3(ndim, NDIM, {
-                  auto kernel =
-                      &cu::binary_g_nd<Op, InType, OutType, IdxT, NDIM>;
-                  auto [num_blocks, block_dims] =
-                      get_launch_args(kernel, out, large);
-                  kernel<<<num_blocks, block_dims, 0, stream>>>(
-                      a.data<InType>(),
-                      b.data<InType>(),
-                      out.data<OutType>(),
-                      out.size(),
-                      const_param<NDIM>(shape),
-                      const_param<NDIM>(a_strides),
-                      const_param<NDIM>(b_strides));
+            dispatch_bool(
+                a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+                    out.data_size() > INT32_MAX,
+                [&](auto large) {
+                  using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+                  Shape shape;
+                  std::vector<Strides> strides;
+                  std::tie(shape, strides) =
+                      collapse_contiguous_dims(a, b, out);
+                  auto& a_strides = strides[0];
+                  auto& b_strides = strides[1];
+                  int ndim = shape.size();
+                  if (ndim <= 3) {
+                    dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                      auto kernel = cu::binary_g_nd<
+                          Op,
+                          InType,
+                          OutType,
+                          IdxT,
+                          dims_constant()>;
+                      auto [num_blocks, block_dims] =
+                          get_launch_args(kernel, out, large());
+                      kernel<<<num_blocks, block_dims, 0, stream>>>(
+                          a.data<InType>(),
+                          b.data<InType>(),
+                          out.data<OutType>(),
+                          out.size(),
+                          const_param<dims_constant()>(shape),
+                          const_param<dims_constant()>(a_strides),
+                          const_param<dims_constant()>(b_strides));
+                    });
+                  } else {
+                    auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
+                    auto [num_blocks, block_dims] =
+                        get_launch_args(kernel, out, large());
+                    kernel<<<num_blocks, block_dims, 0, stream>>>(
+                        a.data<InType>(),
+                        b.data<InType>(),
+                        out.data<OutType>(),
+                        out.size(),
+                        const_param(shape),
+                        const_param(a_strides),
+                        const_param(b_strides),
+                        ndim);
+                  }
                });
-              } else {
-                auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
-                auto [num_blocks, block_dims] =
-                    get_launch_args(kernel, out, large);
-                kernel<<<num_blocks, block_dims, 0, stream>>>(
-                    a.data<InType>(),
-                    b.data<InType>(),
-                    out.data<OutType>(),
-                    out.size(),
-                    const_param(shape),
-                    const_param(a_strides),
-                    const_param(b_strides),
-                    ndim);
-              }
-            });
          } else {
-            MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
-              using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+            dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
+              using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
              auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
              if (bopt == BinaryOpType::ScalarVector) {
                kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
@@ -199,7 +207,7 @@ void binary_op_gpu_inplace(
                kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
              }
              auto [num_blocks, block_dims] = get_launch_args(
-                  kernel, out.data_size(), out.shape(), out.strides(), LARGE);
+                  kernel, out.data_size(), out.shape(), out.strides(), large());
              kernel<<<num_blocks, block_dims, 0, stream>>>(
                  a.data<InType>(),
                  b.data<InType>(),
--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@@ -148,49 +148,59 @@ void binary_op_gpu_inplace(

          auto bopt = get_binary_op_type(a, b);
          if (bopt == BinaryOpType::General) {
-            auto [shape, strides] = collapse_contiguous_dims(a, b, out_a);
-            auto& a_strides = strides[0];
-            auto& b_strides = strides[1];
-            bool large = a.data_size() > INT32_MAX ||
-                b.data_size() > INT32_MAX || out_a.data_size() > INT32_MAX;
-            MLX_SWITCH_BOOL(large, LARGE, {
-              using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
-              int ndim = shape.size();
-              if (ndim <= 3) {
-                MLX_SWITCH_1_2_3(ndim, NDIM, {
-                  auto kernel =
-                      cu::binary_g_nd<Op, InType, OutType, IdxT, NDIM>;
-                  auto [num_blocks, block_dims] =
-                      get_launch_args(kernel, out_a, large);
-                  kernel<<<num_blocks, block_dims, 0, stream>>>(
-                      a.data<InType>(),
-                      b.data<InType>(),
-                      out_a.data<OutType>(),
-                      out_b.data<OutType>(),
-                      out_a.size(),
-                      const_param<NDIM>(shape),
-                      const_param<NDIM>(a_strides),
-                      const_param<NDIM>(b_strides));
+            dispatch_bool(
+                a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+                    out_a.data_size() > INT32_MAX,
+                [&](auto large) {
+                  using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+                  // Use std::tie instead of a structured binding: lambdas
+                  // cannot capture structured bindings in C++17.
+                  Shape shape;
+                  std::vector<Strides> strides;
+                  std::tie(shape, strides) =
+                      collapse_contiguous_dims(a, b, out_a);
+                  auto& a_strides = strides[0];
+                  auto& b_strides = strides[1];
+                  int ndim = shape.size();
+                  if (ndim <= 3) {
+                    dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                      auto kernel = cu::binary_g_nd<
+                          Op,
+                          InType,
+                          OutType,
+                          IdxT,
+                          dims_constant()>;
+                      auto [num_blocks, block_dims] =
+                          get_launch_args(kernel, out_a, large());
+                      kernel<<<num_blocks, block_dims, 0, stream>>>(
+                          a.data<InType>(),
+                          b.data<InType>(),
+                          out_a.data<OutType>(),
+                          out_b.data<OutType>(),
+                          out_a.size(),
+                          const_param<dims_constant()>(shape),
+                          const_param<dims_constant()>(a_strides),
+                          const_param<dims_constant()>(b_strides));
+                    });
+                  } else {
+                    auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
+                    auto [num_blocks, block_dims] =
+                        get_launch_args(kernel, out_a, large());
+                    kernel<<<num_blocks, block_dims, 0, stream>>>(
+                        a.data<InType>(),
+                        b.data<InType>(),
+                        out_a.data<OutType>(),
+                        out_b.data<OutType>(),
+                        out_a.size(),
+                        const_param(shape),
+                        const_param(a_strides),
+                        const_param(b_strides),
+                        ndim);
+                  }
                });
-              } else {
-                auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
-                auto [num_blocks, block_dims] =
-                    get_launch_args(kernel, out_a, large);
-                kernel<<<num_blocks, block_dims, 0, stream>>>(
-                    a.data<InType>(),
-                    b.data<InType>(),
-                    out_a.data<OutType>(),
-                    out_b.data<OutType>(),
-                    out_a.size(),
-                    const_param(shape),
-                    const_param(a_strides),
-                    const_param(b_strides),
-                    ndim);
-              }
-            });
          } else {
-            MLX_SWITCH_BOOL(out_a.data_size() > UINT32_MAX, LARGE, {
-              using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+            dispatch_bool(out_a.data_size() > UINT32_MAX, [&](auto large) {
+              using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
              auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
              if (bopt == BinaryOpType::ScalarVector) {
                kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
@@ -204,7 +214,7 @@ void binary_op_gpu_inplace(
                  out_a.data_size(),
                  out_a.shape(),
                  out_a.strides(),
-                  LARGE);
+                  large());
              kernel<<<num_blocks, block_dims, 0, stream>>>(
                  a.data<InType>(),
                  b.data<InType>(),
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@@ -38,16 +38,16 @@ void copy_contiguous(
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-        using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-        using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-        MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
-          using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+        dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
+          using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+          using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+          using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
          auto kernel = cu::copy_s<InType, OutType, IdxT>;
          if (ctype == CopyType::Vector) {
            kernel = cu::copy_v<InType, OutType, IdxT>;
          }
          auto [num_blocks, block_dims] = get_launch_args(
-              kernel, out.data_size(), out.shape(), out.strides(), LARGE);
+              kernel, out.data_size(), out.shape(), out.strides(), large());
          kernel<<<num_blocks, block_dims, 0, stream>>>(
              in.data<InType>() + in_offset,
              out.data<OutType>() + out_offset,
--- a/mlx/backend/cuda/copy/copy_general.cu
+++ b/mlx/backend/cuda/copy/copy_general.cu
@@ -58,44 +58,46 @@ void copy_general(
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-        using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-        using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-        const InType* in_ptr = in.data<InType>() + offset_in;
-        OutType* out_ptr = out.data<OutType>() + offset_out;
-        bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
-        MLX_SWITCH_BOOL(large, LARGE, {
-          using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
-          int ndim = shape.size();
-          size_t data_size = 1;
-          for (auto& s : shape)
-            data_size *= s;
-          if (ndim <= 3) {
-            MLX_SWITCH_1_2_3(ndim, NDIM, {
-              auto kernel = cu::copy_gg_nd<InType, OutType, IdxT, NDIM>;
-              auto [num_blocks, block_dims] = get_launch_args(
-                  kernel, data_size, shape, out.strides(), large);
-              kernel<<<num_blocks, block_dims, 0, stream>>>(
-                  in_ptr,
-                  out_ptr,
-                  data_size,
-                  const_param<NDIM>(shape),
-                  const_param<NDIM>(strides_in),
-                  const_param<NDIM>(strides_out));
+        dispatch_bool(
+            in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+            [&](auto large) {
+              using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+              using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+              const InType* in_ptr = in.data<InType>() + offset_in;
+              OutType* out_ptr = out.data<OutType>() + offset_out;
+              int ndim = shape.size();
+              size_t data_size = 1;
+              for (auto& s : shape)
+                data_size *= s;
+              if (ndim <= 3) {
+                dispatch_1_2_3(ndim, [&](auto ndim_constant) {
+                  auto kernel =
+                      cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>;
+                  auto [num_blocks, block_dims] = get_launch_args(
+                      kernel, data_size, shape, out.strides(), large());
+                  kernel<<<num_blocks, block_dims, 0, stream>>>(
+                      in_ptr,
+                      out_ptr,
+                      data_size,
+                      const_param<ndim_constant()>(shape),
+                      const_param<ndim_constant()>(strides_in),
+                      const_param<ndim_constant()>(strides_out));
+                });
+              } else { // ndim >= 4
+                auto kernel = cu::copy_gg<InType, OutType, IdxT>;
+                auto [num_blocks, block_dims] = get_launch_args(
+                    kernel, data_size, shape, out.strides(), large());
+                kernel<<<num_blocks, block_dims, 0, stream>>>(
+                    in_ptr,
+                    out_ptr,
+                    data_size,
+                    const_param(shape),
+                    const_param(strides_in),
+                    const_param(strides_out),
+                    ndim);
+              }
            });
-          } else { // ndim >= 4
-            auto kernel = cu::copy_gg<InType, OutType, IdxT>;
-            auto [num_blocks, block_dims] =
-                get_launch_args(kernel, data_size, shape, out.strides(), large);
-            kernel<<<num_blocks, block_dims, 0, stream>>>(
-                in_ptr,
-                out_ptr,
-                data_size,
-                const_param(shape),
-                const_param(strides_in),
-                const_param(strides_out),
-                ndim);
-          }
-        });
      });
    });
  });
--- a/mlx/backend/cuda/copy/copy_general_dynamic.cu
+++ b/mlx/backend/cuda/copy/copy_general_dynamic.cu
@@ -64,44 +64,50 @@ void copy_general_dynamic(
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-        using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-        using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-        const InType* in_ptr = in.data<InType>() + offset_in;
-        OutType* out_ptr = out.data<OutType>() + offset_out;
-        bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
-        MLX_SWITCH_BOOL(large, LARGE, {
-          using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
-          int ndim = shape.size();
-          if (ndim <= 3) {
-            MLX_SWITCH_1_2_3(ndim, NDIM, {
-              auto kernel = cu::copy_gg_dynamic_nd<InType, OutType, IdxT, NDIM>;
-              auto [num_blocks, block_dims] =
-                  get_launch_args(kernel, out, large);
-              kernel<<<num_blocks, block_dims, 0, stream>>>(
-                  in_ptr,
-                  out_ptr,
-                  out.size(),
-                  const_param<NDIM>(shape),
-                  const_param<NDIM>(strides_in),
-                  const_param<NDIM>(strides_out),
-                  dynamic_offset_in.data<int64_t>(),
-                  dynamic_offset_out.data<int64_t>());
+        dispatch_bool(
+            in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+            [&](auto large) {
+              using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+              using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+              const InType* in_ptr = in.data<InType>() + offset_in;
+              OutType* out_ptr = out.data<OutType>() + offset_out;
+              int ndim = shape.size();
+              if (ndim <= 3) {
+                dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                  auto kernel = cu::copy_gg_dynamic_nd<
+                      InType,
+                      OutType,
+                      IdxT,
+                      dims_constant()>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out, large());
+                  kernel<<<num_blocks, block_dims, 0, stream>>>(
+                      in_ptr,
+                      out_ptr,
+                      out.size(),
+                      const_param<dims_constant()>(shape),
+                      const_param<dims_constant()>(strides_in),
+                      const_param<dims_constant()>(strides_out),
+                      dynamic_offset_in.data<int64_t>(),
+                      dynamic_offset_out.data<int64_t>());
+                });
+              } else { // ndim >= 4
+                auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
+                auto [num_blocks, block_dims] =
+                    get_launch_args(kernel, out, large());
+                kernel<<<num_blocks, block_dims, 0, stream>>>(
+                    in_ptr,
+                    out_ptr,
+                    out.size(),
+                    const_param(shape),
+                    const_param(strides_in),
+                    const_param(strides_out),
+                    ndim,
+                    dynamic_offset_in.data<int64_t>(),
+                    dynamic_offset_out.data<int64_t>());
+              }
            });
-          } else { // ndim >= 4
-            auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
-            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-            kernel<<<num_blocks, block_dims, 0, stream>>>(
-                in_ptr,
-                out_ptr,
-                out.size(),
-                const_param(shape),
-                const_param(strides_in),
-                const_param(strides_out),
-                ndim,
-                dynamic_offset_in.data<int64_t>(),
-                dynamic_offset_out.data<int64_t>());
-          }
-        });
      });
    });
  });
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -53,38 +53,41 @@ void copy_general_input(
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-        using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-        using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-        const InType* in_ptr = in.data<InType>() + offset_in;
-        OutType* out_ptr = out.data<OutType>() + offset_out;
-        bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
-        MLX_SWITCH_BOOL(large, LARGE, {
-          using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
-          int ndim = shape.size();
-          if (ndim <= 3) {
-            MLX_SWITCH_1_2_3(ndim, NDIM, {
-              auto kernel = cu::copy_g_nd<InType, OutType, IdxT, NDIM>;
-              auto [num_blocks, block_dims] =
-                  get_launch_args(kernel, out, large);
-              kernel<<<num_blocks, block_dims, 0, stream>>>(
-                  in_ptr,
-                  out_ptr,
-                  out.size(),
-                  const_param<NDIM>(shape),
-                  const_param<NDIM>(strides_in));
+        dispatch_bool(
+            in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+            [&](auto large) {
+              using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+              using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+              const InType* in_ptr = in.data<InType>() + offset_in;
+              OutType* out_ptr = out.data<OutType>() + offset_out;
+              int ndim = shape.size();
+              if (ndim <= 3) {
+                dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                  auto kernel =
+                      cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out, large());
+                  kernel<<<num_blocks, block_dims, 0, stream>>>(
+                      in_ptr,
+                      out_ptr,
+                      out.size(),
+                      const_param<dims_constant()>(shape),
+                      const_param<dims_constant()>(strides_in));
+                });
+              } else { // ndim >= 4
+                auto kernel = cu::copy_g<InType, OutType, IdxT>;
+                auto [num_blocks, block_dims] =
+                    get_launch_args(kernel, out, large());
+                kernel<<<num_blocks, block_dims, 0, stream>>>(
+                    in_ptr,
+                    out_ptr,
+                    out.size(),
+                    const_param(shape),
+                    const_param(strides_in),
+                    ndim);
+              }
            });
-          } else { // ndim >= 4
-            auto kernel = cu::copy_g<InType, OutType, IdxT>;
-            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-            kernel<<<num_blocks, block_dims, 0, stream>>>(
-                in_ptr,
-                out_ptr,
-                out.size(),
-                const_param(shape),
-                const_param(strides_in),
-                ndim);
-          }
-        });
      });
    });
  });
--- a/mlx/backend/cuda/kernel_utils.cuh
+++ b/mlx/backend/cuda/kernel_utils.cuh
@@ -6,6 +6,8 @@

 #pragma once

+#include <type_traits>
+
 #include "mlx/array.h"
 #include "mlx/backend/cuda/device/utils.cuh"

@@ -17,60 +19,53 @@

 namespace mlx::core {

-// Convert a number between 1~3 to constexpr.
-#define MLX_SWITCH_1_2_3(N, NDIM, ...) \
-  switch (N) {                         \
-    case 1: {                          \
-      constexpr int NDIM = 1;          \
-      __VA_ARGS__;                     \
-      break;                           \
-    }                                  \
-    case 2: {                          \
-      constexpr int NDIM = 2;          \
-      __VA_ARGS__;                     \
-      break;                           \
-    }                                  \
-    case 3: {                          \
-      constexpr int NDIM = 3;          \
-      __VA_ARGS__;                     \
-      break;                           \
-    }                                  \
+// Calls f with n wrapped in a std::integral_constant<int, n> so that the
+// callee can use it as a compile-time constant. Only n in [1, 3] is handled;
+// f is not called for any other value.
+template <typename F>
+void dispatch_1_2_3(int n, F&& f) {
+  switch (n) {
+    case 1:
+      f(std::integral_constant<int, 1>{});
+      break;
+    case 2:
+      f(std::integral_constant<int, 2>{});
+      break;
+    case 3:
+      f(std::integral_constant<int, 3>{});
+      break;
  }
+}

-// Like MLX_SWITCH_ALL_TYPES but for booleans.
-#define MLX_SWITCH_BOOL(BOOL, BOOL_ALIAS, ...) \
-  if (BOOL) {                                  \
-    constexpr bool BOOL_ALIAS = true;          \
-    __VA_ARGS__;                               \
-  } else {                                     \
-    constexpr bool BOOL_ALIAS = false;         \
-    __VA_ARGS__;                               \
+// Calls f with v converted to std::true_type or std::false_type.
+template <typename F>
+void dispatch_bool(bool v, F&& f) {
+  if (v) {
+    f(std::true_type{});
+  } else {
+    f(std::false_type{});
  }
+}

-// Convert a block_dim to constexpr between WARP_SIZE and WARP_SIZE ^ 2.
-#define MLX_SWITCH_BLOCK_DIM(NUM_THREADS, BLOCK_DIM, ...)   \
-  {                                                         \
-    uint32_t _num_threads = NUM_THREADS;                    \
-    if (_num_threads <= WARP_SIZE) {                        \
-      constexpr uint32_t BLOCK_DIM = WARP_SIZE;             \
-      __VA_ARGS__;                                          \
-    } else if (_num_threads <= WARP_SIZE * 2) {             \
-      constexpr uint32_t BLOCK_DIM = WARP_SIZE * 2;         \
-      __VA_ARGS__;                                          \
-    } else if (_num_threads <= WARP_SIZE * 4) {             \
-      constexpr uint32_t BLOCK_DIM = WARP_SIZE * 4;         \
-      __VA_ARGS__;                                          \
-    } else if (_num_threads <= WARP_SIZE * 8) {             \
-      constexpr uint32_t BLOCK_DIM = WARP_SIZE * 8;         \
-      __VA_ARGS__;                                          \
-    } else if (_num_threads <= WARP_SIZE * 16) {            \
-      constexpr uint32_t BLOCK_DIM = WARP_SIZE * 16;        \
-      __VA_ARGS__;                                          \
-    } else {                                                \
-      constexpr uint32_t BLOCK_DIM = WARP_SIZE * WARP_SIZE; \
-      __VA_ARGS__;                                          \
-    }                                                       \
+// Calls f with a compile-time block size, passed as a
+// std::integral_constant<int, N>: the smallest of WARP_SIZE * {1, 2, 4, 8, 16}
+// that is >= threads, or WARP_SIZE * WARP_SIZE when threads is larger.
+template <typename F>
+void dispatch_block_dim(int threads, F&& f) {
+  if (threads <= WARP_SIZE) {
+    f(std::integral_constant<int, WARP_SIZE>{});
+  } else if (threads <= WARP_SIZE * 2) {
+    f(std::integral_constant<int, WARP_SIZE * 2>{});
+  } else if (threads <= WARP_SIZE * 4) {
+    f(std::integral_constant<int, WARP_SIZE * 4>{});
+  } else if (threads <= WARP_SIZE * 8) {
+    f(std::integral_constant<int, WARP_SIZE * 8>{});
+  } else if (threads <= WARP_SIZE * 16) {
+    f(std::integral_constant<int, WARP_SIZE * 16>{});
+  } else {
+    f(std::integral_constant<int, WARP_SIZE * WARP_SIZE>{});
  }
+}

 // Maps CPU types to CUDA types.
 template <typename T>
--- a/mlx/backend/cuda/layer_norm.cu
+++ b/mlx/backend/cuda/layer_norm.cu
@@ -260,20 +260,21 @@ void LayerNorm::eval_gpu(
  encoder.set_output_array(out);
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_float_types(out.dtype(), "layernorm", [&](auto type_tag) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      constexpr uint32_t N_READS = 4;
-      MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-        auto kernel = cu::layer_norm<DataType, BLOCK_DIM, N_READS>;
-        kernel<<<n_rows, BLOCK_DIM, 0, stream>>>(
-            x.data<DataType>(),
-            w.data<DataType>(),
-            b.data<DataType>(),
-            out.data<DataType>(),
-            eps_,
-            axis_size,
-            w_stride,
-            b_stride);
-      });
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+            auto kernel = cu::layer_norm<DataType, block_dim(), N_READS>;
+            kernel<<<n_rows, block_dim(), 0, stream>>>(
+                x.data<DataType>(),
+                w.data<DataType>(),
+                b.data<DataType>(),
+                out.data<DataType>(),
+                eps_,
+                axis_size,
+                w_stride,
+                b_stride);
+          });
    });
  });
 }
@@ -358,21 +359,26 @@ void LayerNormVJP::eval_gpu(
  encoder.set_output_array(gw_temp);
  encoder.launch_kernel([&, x = x, g = g](cudaStream_t stream) {
    dispatch_float_types(gx.dtype(), "layernorm_vjp", [&](auto type_tag) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-      constexpr int N_READS = 4;
-      MLX_SWITCH_BOOL(has_w, HAS_W, {
-        MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-          auto kernel = cu::layer_norm_vjp<DataType, HAS_W, BLOCK_DIM, N_READS>;
-          kernel<<<n_rows, BLOCK_DIM, 0, stream>>>(
-              x.data<DataType>(),
-              w.data<DataType>(),
-              g.data<DataType>(),
-              gx.data<DataType>(),
-              gw_temp.data<DataType>(),
-              eps_,
-              axis_size,
-              w_stride);
-        });
+      dispatch_bool(has_w, [&](auto has_w_constant) {
+        constexpr int N_READS = 4;
+        dispatch_block_dim(
+            cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+              using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+              auto kernel = cu::layer_norm_vjp<
+                  DataType,
+                  has_w_constant(),
+                  block_dim(),
+                  N_READS>;
+              kernel<<<n_rows, block_dim(), 0, stream>>>(
+                  x.data<DataType>(),
+                  w.data<DataType>(),
+                  g.data<DataType>(),
+                  gx.data<DataType>(),
+                  gw_temp.data<DataType>(),
+                  eps_,
+                  axis_size,
+                  w_stride);
+            });
      });
    });
  });
--- a/mlx/backend/cuda/logsumexp.cu
+++ b/mlx/backend/cuda/logsumexp.cu
@@ -145,13 +145,14 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
  encoder.set_output_array(out);
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_float_types(out.dtype(), "logsumexp", [&](auto type_tag) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      constexpr int N_READS = 4;
-      MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-        auto kernel = cu::logsumexp<DataType, float, BLOCK_DIM, N_READS>;
-        kernel<<<n_rows, BLOCK_DIM, 0, stream>>>(
-            in.data<DataType>(), out.data<DataType>(), axis_size);
-      });
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+            auto kernel = cu::logsumexp<DataType, float, block_dim(), N_READS>;
+            kernel<<<n_rows, block_dim(), 0, stream>>>(
+                in.data<DataType>(), out.data<DataType>(), axis_size);
+          });
    });
  });
 }
--- a/mlx/backend/cuda/reduce/all_reduce.cu
+++ b/mlx/backend/cuda/reduce/all_reduce.cu
@@ -112,7 +112,8 @@ void all_reduce(
    encoder.set_output_array(intermediate);
    encoder.launch_kernel([&](cudaStream_t stream) {
      dispatch_all_types(dt, [&](auto type_tag) {
-        MLX_SWITCH_REDUCE_OPS(reduce_type, OP, {
+        dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+          using OP = MLX_GET_TYPE(reduce_type_tag);
          using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
          using U = typename cu::ReduceResult<OP, T>::type;
          auto kernel = cu::all_reduce<T, U, OP, N_READS>;
@@ -136,7 +137,8 @@ void all_reduce(
  encoder.set_output_array(out);
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_all_types(dt, [&](auto type_tag) {
-      MLX_SWITCH_REDUCE_OPS(reduce_type, OP, {
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        using OP = MLX_GET_TYPE(reduce_type_tag);
        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
        using U = typename cu::ReduceResult<OP, T>::type;
        auto kernel = cu::all_reduce<T, U, OP, N_READS>;
--- a/mlx/backend/cuda/reduce/col_reduce.cu
+++ b/mlx/backend/cuda/reduce/col_reduce.cu
@@ -216,10 +216,10 @@ void col_reduce_looped(
  encoder.set_output_array(out);
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_all_types(in.dtype(), [&](auto type_tag) {
-      using CTYPE = MLX_GET_TYPE(type_tag);
-      MLX_SWITCH_REDUCE_OPS(reduce_type, OP, {
-        MLX_SWITCH_REDUCE_NDIM(args.reduce_ndim, NDIM, {
-          using T = cuda_type_t<CTYPE>;
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
+          using OP = MLX_GET_TYPE(reduce_type_tag);
+          using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
          using U = typename cu::ReduceResult<OP, T>::type;

          // Cub doesn't like const pointers for vectorized loads. (sigh)
@@ -230,7 +230,8 @@ void col_reduce_looped(
          constexpr int BN = 32;
          dim3 grid = output_grid_for_col_reduce(out, args, BN);
          int blocks = BM * BN / N_READS;
-          auto kernel = cu::col_reduce_looped<T, U, OP, NDIM, BM, BN, N_READS>;
+          auto kernel =
+              cu::col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS>;
          kernel<<<grid, blocks, 0, stream>>>(indata, out.data<U>(), args);
        });
      });
--- a/mlx/backend/cuda/reduce/init_reduce.cu
+++ b/mlx/backend/cuda/reduce/init_reduce.cu
@@ -34,7 +34,8 @@ void init_reduce(
  encoder.set_output_array(out);
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_all_types(in.dtype(), [&](auto type_tag) {
-      MLX_SWITCH_REDUCE_OPS(reduce_type, OP, {
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        using OP = MLX_GET_TYPE(reduce_type_tag);
        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
        using U = typename cu::ReduceResult<OP, T>::type;
        auto kernel = cu::init_reduce<T, U, OP>;
--- a/mlx/backend/cuda/reduce/reduce.cuh
+++ b/mlx/backend/cuda/reduce/reduce.cuh
@@ -1,5 +1,7 @@
 // Copyright © 2025 Apple Inc.

+#include <type_traits>
+
 #include "mlx/backend/common/reduce.h"
 #include "mlx/backend/cuda/device/cucomplex_math.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
@@ -9,43 +11,35 @@

 namespace mlx::core {

-// Dispatch dynamic ndim to constexpr.
-// The behavior follows get_kernel_reduce_ndim in metal/reduce.cpp file.
-#define MLX_SWITCH_REDUCE_NDIM(ndim, NDIM, ...) \
-  if (ndim == 1) {                              \
-    constexpr uint32_t NDIM = 1;                \
-    __VA_ARGS__;                                \
-  } else if (ndim == 2) {                       \
-    constexpr uint32_t NDIM = 2;                \
-    __VA_ARGS__;                                \
-  } else {                                      \
-    constexpr uint32_t NDIM = 5;                \
-    __VA_ARGS__;                                \
+template <typename F> // Turns a runtime reduce ndim into a compile-time int.
+void dispatch_reduce_ndim(int ndim, F&& f) { // f gets std::integral_constant
+  if (ndim == 1) {
+    f(std::integral_constant<int, 1>{});
+  } else if (ndim == 2) {
+    f(std::integral_constant<int, 2>{});
+  } else { // every other ndim is dispatched with the constant 5
+    f(std::integral_constant<int, 5>{});
  }
+}

-// Dispatch reduce ops to constexpr.
-#define MLX_SWITCH_REDUCE_OPS(REDUCE, OP, ...)           \
-  if (REDUCE == Reduce::ReduceType::And) {               \
-    using OP = cu::And;                                  \
-    __VA_ARGS__;                                         \
-  } else if (REDUCE == Reduce::ReduceType::Or) {         \
-    using OP = cu::Or;                                   \
-    __VA_ARGS__;                                         \
-  } else if (REDUCE == Reduce::ReduceType::Sum) {        \
-    using OP = cu::Sum;                                  \
-    __VA_ARGS__;                                         \
-  } else if (REDUCE == Reduce::ReduceType::Prod) {       \
-    using OP = cu::Prod;                                 \
-    __VA_ARGS__;                                         \
-  } else if (REDUCE == Reduce::ReduceType::Max) {        \
-    using OP = cu::Max;                                  \
-    __VA_ARGS__;                                         \
-  } else if (REDUCE == Reduce::ReduceType::Min) {        \
-    using OP = cu::Min;                                  \
-    __VA_ARGS__;                                         \
-  } else {                                               \
-    throw std::invalid_argument("Unknown reduce type."); \
+template <typename F> // Turns a runtime ReduceType into a compile-time op tag.
+void dispatch_reduce_ops(Reduce::ReduceType reduce_type, F&& f) {
+  if (reduce_type == Reduce::ReduceType::And) { // f gets type_identity<cu::Op>
+    f(type_identity<cu::And>{});
+  } else if (reduce_type == Reduce::ReduceType::Or) {
+    f(type_identity<cu::Or>{});
+  } else if (reduce_type == Reduce::ReduceType::Sum) {
+    f(type_identity<cu::Sum>{});
+  } else if (reduce_type == Reduce::ReduceType::Prod) {
+    f(type_identity<cu::Prod>{});
+  } else if (reduce_type == Reduce::ReduceType::Max) {
+    f(type_identity<cu::Max>{});
+  } else if (reduce_type == Reduce::ReduceType::Min) {
+    f(type_identity<cu::Min>{});
+  } else { // any other enum value has no cu:: op here; f is never called
+    throw std::invalid_argument("Unknown reduce type.");
  }
+}

 void all_reduce(
    cu::CommandEncoder& encoder,
--- a/mlx/backend/cuda/reduce/row_reduce.cu
+++ b/mlx/backend/cuda/reduce/row_reduce.cu
@@ -247,9 +247,9 @@ void row_reduce_simple(
  encoder.set_output_array(out);
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_all_types(in.dtype(), [&](auto type_tag) {
-      using CTYPE = MLX_GET_TYPE(type_tag);
-      MLX_SWITCH_REDUCE_OPS(reduce_type, OP, {
-        using T = cuda_type_t<CTYPE>;
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        using OP = MLX_GET_TYPE(reduce_type_tag);
+        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
        using U = typename cu::ReduceResult<OP, T>::type;

        // Cub doesn't like const pointers for vectorized loads. (sigh)
@@ -295,9 +295,9 @@ void row_reduce_looped(
  encoder.set_output_array(out);
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_all_types(in.dtype(), [&](auto type_tag) {
-      using CTYPE = MLX_GET_TYPE(type_tag);
-      MLX_SWITCH_REDUCE_OPS(reduce_type, OP, {
-        using T = cuda_type_t<CTYPE>;
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        using OP = MLX_GET_TYPE(reduce_type_tag);
+        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
        using U = typename cu::ReduceResult<OP, T>::type;

        // Cub doesn't like const pointers for vectorized loads. (sigh)
@@ -313,10 +313,16 @@ void row_reduce_looped(

        // Pick the kernel
        auto kernel = cu::row_reduce_looped<T, U, OP, 1, 32, N_READS>;
-        MLX_SWITCH_REDUCE_NDIM(args.reduce_ndim, NDIM, {
-          MLX_SWITCH_BLOCK_DIM(threads, THREADS, {
-            kernel = cu::row_reduce_looped<T, U, OP, NDIM, THREADS, N_READS>;
-            block.x = THREADS;
+        dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
+          dispatch_block_dim(threads, [&](auto threads_constant) {
+            kernel = cu::row_reduce_looped<
+                T,
+                U,
+                OP,
+                reduce_ndim(),
+                threads_constant(),
+                N_READS>;
+            block.x = threads_constant();
          });
        });

--- a/mlx/backend/cuda/rms_norm.cu
+++ b/mlx/backend/cuda/rms_norm.cu
@@ -226,18 +226,19 @@ void RMSNorm::eval_gpu(
  encoder.set_output_array(out);
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_float_types(out.dtype(), "rms_norm", [&](auto type_tag) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      constexpr uint32_t N_READS = 4;
-      MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-        auto kernel = cu::rms_norm<DataType, BLOCK_DIM, N_READS>;
-        kernel<<<n_rows, BLOCK_DIM, 0, stream>>>(
-            x.data<DataType>(),
-            w.data<DataType>(),
-            out.data<DataType>(),
-            eps_,
-            axis_size,
-            w_stride);
-      });
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+            auto kernel = cu::rms_norm<DataType, block_dim(), N_READS>;
+            kernel<<<n_rows, block_dim(), 0, stream>>>(
+                x.data<DataType>(),
+                w.data<DataType>(),
+                out.data<DataType>(),
+                eps_,
+                axis_size,
+                w_stride);
+          });
    });
  });
 }
@@ -312,21 +313,27 @@ void RMSNormVJP::eval_gpu(
  encoder.set_output_array(gw_temp);
  encoder.launch_kernel([&, x = x, g = g](cudaStream_t stream) {
    dispatch_float_types(gx.dtype(), "rms_norm_vjp", [&](auto type_tag) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-      constexpr int N_READS = 4;
-      MLX_SWITCH_BOOL(has_w, HAS_W, {
-        MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-          auto kernel = cu::rms_norm_vjp<DataType, HAS_W, BLOCK_DIM, N_READS>;
-          kernel<<<n_rows, BLOCK_DIM, 0, stream>>>(
-              x.data<DataType>(),
-              w.data<DataType>(),
-              g.data<DataType>(),
-              gx.data<DataType>(),
-              gw_temp.data<DataType>(),
-              eps_,
-              axis_size,
-              w_stride);
-        });
+      dispatch_bool(has_w, [&](auto has_w_constant) { // has_w -> constexpr bool
+        constexpr int N_READS = 4;
+        dispatch_block_dim(
+            cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+              using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+              // N_READS below is the constexpr from the enclosing lambda.
+              auto kernel = cu::rms_norm_vjp<
+                  DataType,
+                  has_w_constant(),
+                  block_dim(),
+                  N_READS>;
+              kernel<<<n_rows, block_dim(), 0, stream>>>( // grid = n_rows
+                  x.data<DataType>(),
+                  w.data<DataType>(),
+                  g.data<DataType>(),
+                  gx.data<DataType>(),
+                  gw_temp.data<DataType>(),
+                  eps_,
+                  axis_size,
+                  w_stride);
+            });
      });
    });
  });
--- a/mlx/backend/cuda/rope.cu
+++ b/mlx/backend/cuda/rope.cu
@@ -311,11 +311,11 @@ void RoPE::eval_gpu(
  encoder.set_output_array(out);
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_float_types(out.dtype(), "rope", [&](auto type_tag) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-      MLX_SWITCH_BOOL(traditional_, TRADITIONAL, {
-        MLX_SWITCH_BOOL(forward_, FORWARD, {
+      dispatch_bool(traditional_, [&](auto traditional) {
+        dispatch_bool(forward_, [&](auto forward) {
+          using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
          if (single && !with_freqs) {
-            auto kernel = cu::rope_single<DataType, TRADITIONAL, FORWARD>;
+            auto kernel = cu::rope_single<DataType, traditional(), forward()>;
            uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
            auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
            kernel<<<grid, block, 0, stream>>>(
@@ -327,7 +327,8 @@ void RoPE::eval_gpu(
                mat_size,
                dims);
          } else if (single) {
-            auto kernel = cu::rope_single_freqs<DataType, TRADITIONAL, FORWARD>;
+            auto kernel =
+                cu::rope_single_freqs<DataType, traditional(), forward()>;
            uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
            auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
            kernel<<<grid, block, 0, stream>>>(
@@ -340,7 +341,7 @@ void RoPE::eval_gpu(
                dims,
                inputs[2].strides(0));
          } else if (with_freqs) {
-            auto kernel = cu::rope_freqs<DataType, TRADITIONAL, FORWARD>;
+            auto kernel = cu::rope_freqs<DataType, traditional(), forward()>;
            uint3 dims =
                make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
            dims.z = (dims.z + 3) / 4;
@@ -358,7 +359,7 @@ void RoPE::eval_gpu(
                dims,
                inputs[2].strides(0));
          } else {
-            auto kernel = cu::rope<DataType, TRADITIONAL, FORWARD>;
+            auto kernel = cu::rope<DataType, traditional(), forward()>;
            uint3 dims =
                make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
            dims.z = (dims.z + 3) / 4;
--- a/mlx/backend/cuda/softmax.cu
+++ b/mlx/backend/cuda/softmax.cu
@@ -143,16 +143,17 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
  encoder.set_output_array(out);
  encoder.launch_kernel([&](cudaStream_t stream) {
    dispatch_float_types(out.dtype(), "softmax", [&](auto type_tag) {
-      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      constexpr int N_READS = 4;
-      MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
-        auto kernel = cu::softmax<DataType, DataType, BLOCK_DIM, N_READS>;
-        if (precise) {
-          kernel = cu::softmax<DataType, float, BLOCK_DIM, N_READS>;
-        }
-        kernel<<<n_rows, BLOCK_DIM, 0, stream>>>(
-            in.data<DataType>(), out.data<DataType>(), axis_size);
-      });
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+            auto kernel = cu::softmax<DataType, DataType, block_dim(), N_READS>;
+            if (precise) {
+              kernel = cu::softmax<DataType, float, block_dim(), N_READS>;
+            }
+            kernel<<<n_rows, block_dim(), 0, stream>>>(
+                in.data<DataType>(), out.data<DataType>(), axis_size);
+          });
    });
  });
 }
--- a/mlx/backend/cuda/ternary.cu
+++ b/mlx/backend/cuda/ternary.cu
@@ -97,53 +97,56 @@ void ternary_op_gpu_inplace(

      auto topt = get_ternary_op_type(a, b, c);
      if (topt == TernaryOpType::General) {
-        auto [shape, strides] = collapse_contiguous_dims(a, b, c, out);
-        auto& a_strides = strides[0];
-        auto& b_strides = strides[1];
-        auto& c_strides = strides[2];
-        bool large = a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
-            c.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
-        MLX_SWITCH_BOOL(large, LARGE, {
-          using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
-          int ndim = shape.size();
-          if (ndim <= 3) {
-            MLX_SWITCH_1_2_3(ndim, NDIM, {
-              auto kernel = cu::ternary_g_nd<Op, DType, IdxT, NDIM>;
-              auto [num_blocks, block_dims] =
-                  get_launch_args(kernel, out, large);
-              kernel<<<num_blocks, block_dims, 0, stream>>>(
-                  a.data<bool>(),
-                  b.data<DType>(),
-                  c.data<DType>(),
-                  out.data<DType>(),
-                  out.size(),
-                  const_param<NDIM>(shape),
-                  const_param<NDIM>(a_strides),
-                  const_param<NDIM>(b_strides),
-                  const_param<NDIM>(c_strides));
+        dispatch_bool(
+            a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+                c.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+            [&](auto large) {
+              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+              auto [shape, strides] = collapse_contiguous_dims(a, b, c, out);
+              auto& a_strides = strides[0];
+              auto& b_strides = strides[1];
+              auto& c_strides = strides[2];
+              int ndim = shape.size();
+              if (ndim <= 3) {
+                dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                  auto kernel =
+                      cu::ternary_g_nd<Op, DType, IdxT, dims_constant()>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out, large());
+                  kernel<<<num_blocks, block_dims, 0, stream>>>(
+                      a.data<bool>(),
+                      b.data<DType>(),
+                      c.data<DType>(),
+                      out.data<DType>(),
+                      out.size(),
+                      const_param<dims_constant()>(shape),
+                      const_param<dims_constant()>(a_strides),
+                      const_param<dims_constant()>(b_strides),
+                      const_param<dims_constant()>(c_strides));
+                });
+              } else {
+                auto kernel = cu::ternary_g<Op, DType, IdxT>;
+                auto [num_blocks, block_dims] =
+                    get_launch_args(kernel, out, large());
+                kernel<<<num_blocks, block_dims, 0, stream>>>(
+                    a.data<bool>(),
+                    b.data<DType>(),
+                    c.data<DType>(),
+                    out.data<DType>(),
+                    out.data_size(),
+                    const_param(shape),
+                    const_param(a_strides),
+                    const_param(b_strides),
+                    const_param(c_strides),
+                    ndim);
+              }
            });
-          } else {
-            auto kernel = cu::ternary_g<Op, DType, IdxT>;
-            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-            kernel<<<num_blocks, block_dims, 0, stream>>>(
-                a.data<bool>(),
-                b.data<DType>(),
-                c.data<DType>(),
-                out.data<DType>(),
-                out.data_size(),
-                const_param(shape),
-                const_param(a_strides),
-                const_param(b_strides),
-                const_param(c_strides),
-                ndim);
-          }
-        });
      } else {
-        MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
-          using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
+        dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
+          using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
          auto kernel = cu::ternary_v<Op, DType, IdxT>;
          auto [num_blocks, block_dims] = get_launch_args(
-              kernel, out.data_size(), out.shape(), out.strides(), LARGE);
+              kernel, out.data_size(), out.shape(), out.strides(), large());
          kernel<<<num_blocks, block_dims, 0, stream>>>(
              a.data<bool>(),
              b.data<DType>(),