MLX_SWITCH macros to templates (#2320)

2025-12-16 01:49:05 +08:00 · 2025-07-01 01:33:44 -07:00
parent 33bf1a244b
commit 3d5e17e507
27 changed files with 693 additions and 692 deletions
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -51,35 +51,43 @@ void copy_general_input(
    const Shape& shape,
    const Strides& strides_in) {
  encoder.launch_kernel([&](cudaStream_t stream) {
-    MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
-      const InType* in_ptr = in.data<InType>() + offset_in;
-      OutType* out_ptr = out.data<OutType>() + offset_out;
-      bool large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
-      MLX_SWITCH_BOOL(large, LARGE, {
-        using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
-        int ndim = shape.size();
-        if (ndim <= 3) {
-          MLX_SWITCH_1_2_3(ndim, NDIM, {
-            auto kernel = cu::copy_g_nd<InType, OutType, IdxT, NDIM>;
-            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-            kernel<<<num_blocks, block_dims, 0, stream>>>(
-                in_ptr,
-                out_ptr,
-                out.size(),
-                const_param<NDIM>(shape),
-                const_param<NDIM>(strides_in));
-          });
-        } else { // ndim >= 4
-          auto kernel = cu::copy_g<InType, OutType, IdxT>;
-          auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-          kernel<<<num_blocks, block_dims, 0, stream>>>(
-              in_ptr,
-              out_ptr,
-              out.size(),
-              const_param(shape),
-              const_param(strides_in),
-              ndim);
-        }
+    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+        dispatch_bool(
+            in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+            [&](auto large) {
+              using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+              using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+              const InType* in_ptr = in.data<InType>() + offset_in;
+              OutType* out_ptr = out.data<OutType>() + offset_out;
+              int ndim = shape.size();
+              if (ndim <= 3) {
+                dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                  auto kernel =
+                      cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out, large());
+                  kernel<<<num_blocks, block_dims, 0, stream>>>(
+                      in_ptr,
+                      out_ptr,
+                      out.size(),
+                      const_param<dims_constant()>(shape),
+                      const_param<dims_constant()>(strides_in));
+                });
+              } else { // ndim >= 4
+                auto kernel = cu::copy_g<InType, OutType, IdxT>;
+                auto [num_blocks, block_dims] =
+                    get_launch_args(kernel, out, large());
+                kernel<<<num_blocks, block_dims, 0, stream>>>(
+                    in_ptr,
+                    out_ptr,
+                    out.size(),
+                    const_param(shape),
+                    const_param(strides_in),
+                    ndim);
+              }
+            });
      });
    });
  });