Fix unintuitive metal kernel caching

2025-12-16 01:49:05 +08:00 · 2025-06-03 08:01:53 -07:00
parent c6a20b427a
commit ac1117b224
4 changed files with 313 additions and 244 deletions
--- a/docs/src/dev/custom_metal_kernels.rst
+++ b/docs/src/dev/custom_metal_kernels.rst
@@ -8,11 +8,12 @@ MLX supports writing custom Metal kernels through the Python and C++ APIs.
 Simple Example
 --------------

+.. currentmodule:: mlx.core
+
 Let's write a custom kernel that computes ``exp`` elementwise:

 .. code-block:: python

-  def exp_elementwise(a: mx.array):
  source = """
      uint elem = thread_position_in_grid.x;
      T tmp = inp[elem];
@@ -25,6 +26,8 @@ Let's write a custom kernel that computes ``exp`` elementwise:
      output_names=["out"],
      source=source,
  )
+
+  def exp_elementwise(a: mx.array):
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
@@ -39,8 +42,13 @@ Let's write a custom kernel that computes ``exp`` elementwise:
  b = exp_elementwise(a)
  assert mx.allclose(b, mx.exp(a))

+Every time you make a kernel, a new Metal library is created and possibly
+JIT compiled. To reduce the overhead from that, build the kernel once with
+:func:`fast.metal_kernel` and then use it many times.
+
 .. note::
-    We are only required to pass the body of the Metal kernel in ``source``.
+   Only pass the body of the Metal kernel in ``source``. The function
+   signature is generated automatically.

 The full function signature will be generated using:

@@ -78,29 +86,34 @@ Putting this all together, the generated function signature for ``myexp`` is as

  template [[host_name("custom_kernel_myexp_float")]] [[kernel]] decltype(custom_kernel_myexp_float<float>) custom_kernel_myexp_float<float>;

-Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads <https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_ function.
-This means we will launch ``mx.prod(grid)`` threads, subdivided into ``threadgroup`` size threadgroups.
-For optimal performance, each thread group dimension should be less than or equal to the corresponding grid dimension.
+Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads
+<https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_
+function. This means we will launch ``mx.prod(grid)`` threads, subdivided into
+``threadgroup`` size threadgroups.  For optimal performance, each thread group
+dimension should be less than or equal to the corresponding grid dimension.

-Passing ``verbose=True`` to ``mx.fast.metal_kernel.__call__`` will print the generated code for debugging purposes.
+Passing ``verbose=True`` to :func:`ast.metal_kernel.__call__` will print the
+generated code for debugging purposes.

 Using Shape/Strides
 -------------------

-``mx.fast.metal_kernel`` supports an argument ``ensure_row_contiguous`` which is ``True`` by default.
-This will copy the ``mx.array`` inputs if needed before the kernel is launched to ensure that the memory layout is row contiguous.
-Generally this makes writing the kernel easier, since we don't have to worry about gaps or the ordering of the dims
-when indexing.
+:func:`fast.metal_kernel` supports an argument ``ensure_row_contiguous`` which
+is ``True`` by default. This will copy the array inputs if needed
+before the kernel is launched to ensure that the memory layout is row
+contiguous.  Generally this makes writing the kernel easier, since we don't
+have to worry about gaps or the ordering of the dims when indexing.

-If we want to avoid this copy, ``metal_kernel`` automatically passes ``a_shape``, ``a_strides`` and ``a_ndim`` for each
-input array ``a`` if any are present in ``source``.
-We can then use MLX's built in indexing utils to fetch the right elements for each thread.
+If we want to avoid this copy, :func:`fast.metal_kernel` automatically passes
+``a_shape``, ``a_strides`` and ``a_ndim`` for each input array ``a`` if any are
+present in ``source``. We can then use MLX's built in indexing utils to fetch
+the right elements for each thread.

-Let's convert ``myexp`` above to support arbitrarily strided arrays without relying on a copy from ``ensure_row_contiguous``:
+Let's convert ``myexp`` above to support arbitrarily strided arrays without
+relying on a copy from ``ensure_row_contiguous``:

 .. code-block:: python
   
-  def exp_elementwise(a: mx.array):
  source = """
      uint elem = thread_position_in_grid.x;
      // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
@@ -116,6 +129,8 @@ Let's convert ``myexp`` above to support arbitrarily strided arrays without rely
      output_names=["out"],
      source=source
  )
+
+  def exp_elementwise(a: mx.array):
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
@@ -183,25 +198,13 @@ We'll start with the following MLX implementation using standard ops:

      return output

-Now let's use ``mx.custom_function`` together with ``mx.fast.metal_kernel``
+Now let's use :func:`custom_function` together with :func:`fast.metal_kernel`
 to write a fast GPU kernel for both the forward and backward passes.

 First we'll implement the forward pass as a fused kernel:

 .. code-block:: python

-    @mx.custom_function
-    def grid_sample(x, grid):
-
-        assert x.ndim == 4, "`x` must be 4D."
-        assert grid.ndim == 4, "`grid` must be 4D."
-
-        B, _, _, C = x.shape
-        _, gN, gM, D = grid.shape
-        out_shape = (B, gN, gM, C)
-
-        assert D == 2, "Last dim of `grid` must be size 2."
-
  source = """
      uint elem = thread_position_in_grid.x;
      int H = x_shape[1];
@@ -251,12 +254,26 @@ First we'll implement the forward pass as a fused kernel:

      out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
  """
+
  kernel = mx.fast.metal_kernel(
      name="grid_sample",
      input_names=["x", "grid"],
      output_names=["out"],
      source=source,
  )
+
+  @mx.custom_function
+  def grid_sample(x, grid):
+
+      assert x.ndim == 4, "`x` must be 4D."
+      assert grid.ndim == 4, "`grid` must be 4D."
+
+      B, _, _, C = x.shape
+      _, gN, gM, D = grid.shape
+      out_shape = (B, gN, gM, C)
+
+      assert D == 2, "Last dim of `grid` must be size 2."
+
      outputs = kernel(
          inputs=[x, grid],
          template=[("T", x.dtype)],
@@ -281,11 +298,11 @@ On an M1 Max, we see a big performance improvement:
 Grid Sample VJP
 ---------------

-Since we decorated ``grid_sample`` with ``mx.custom_function``, we can now define
-its custom vjp transform so MLX can differentiate it.
+Since we decorated ``grid_sample`` with :func:`custom_function`, we can now
+define its custom vjp transform so MLX can differentiate it.

 The backwards pass requires atomically updating ``x_grad``/``grid_grad`` and so
-requires a few extra ``mx.fast.metal_kernel`` features:
+requires a few extra :func:`fast.metal_kernel` features:

 * ``init_value=0``
    Initialize all of the kernel's outputs to this value before it runs. This allows us to update only part of the output arrays with the kernel.
@@ -299,14 +316,6 @@ We can then implement the backwards pass as follows:

 .. code-block:: python

-    @grid_sample.vjp
-    def grid_sample_vjp(primals, cotangent, _):
-        x, grid = primals
-        B, _, _, C = x.shape
-        _, gN, gM, D = grid.shape
-
-        assert D == 2, "Last dim of `grid` must be size 2."
-
  source = """
      uint elem = thread_position_in_grid.x;
      int H = x_shape[1];
@@ -406,6 +415,15 @@ We can then implement the backwards pass as follows:
      source=source,
      atomic_outputs=True,
  )
+
+  @grid_sample.vjp
+  def grid_sample_vjp(primals, cotangent, _):
+      x, grid = primals
+      B, _, _, C = x.shape
+      _, gN, gM, D = grid.shape
+
+      assert D == 2, "Last dim of `grid` must be size 2."
+
      # pad the output channels to simd group size
      # so that our `simd_sum`s don't overlap.
      simdgroup_size = 32
--- a/mlx/backend/metal/custom_kernel.cpp
+++ b/mlx/backend/metal/custom_kernel.cpp
@@ -73,6 +73,16 @@ void CustomKernel::eval_gpu(
  }

  const auto [tx, ty, tz] = threadgroup_;
+  auto tg_size = tx * ty * tz;
+  auto max_tg_size = kernel->maxTotalThreadsPerThreadgroup();
+  if (tg_size > max_tg_size) {
+    std::ostringstream msg;
+    msg << "Thread group size (" << tg_size << ") is greater than "
+        << " the maximum allowed threads per threadgroup (" << max_tg_size
+        << ").";
+    throw std::invalid_argument(msg.str());
+  }
+
  const auto [gx, gy, gz] = grid_;
  MTL::Size group_dims =
      MTL::Size(std::min(tx, gx), std::min(ty, gy), std::min(tz, gz));
--- a/mlx/fast.cpp
+++ b/mlx/fast.cpp
@@ -1,5 +1,6 @@
 // Copyright © 2023-2024 Apple Inc.
 #include <cassert>
+#include <chrono>
 #include <iostream>
 #include <numeric>
 #include <regex>
@@ -1228,6 +1229,10 @@ MetalKernelFunction metal_kernel(
      attributes.push_back("  " + dtype + " " + attr + " [[" + attr + "]]");
    }
  }
+  auto now = std::chrono::system_clock::now();
+  int64_t timestamp = std::chrono::duration_cast<std::chrono::milliseconds>(
+                          now.time_since_epoch())
+                          .count();

  return [=,
          shape_infos = std::move(shape_infos),
@@ -1271,14 +1276,15 @@ MetalKernelFunction metal_kernel(

    std::ostringstream func_name;
    std::string template_def = "";
-    std::string hash_key = "";
+    std::string template_hash = "";
    if (!template_args.empty()) {
      std::regex disallowed_chars("\\<|\\>|(, )");
      template_def = write_template(template_args);
-      hash_key = std::regex_replace(template_def, disallowed_chars, "_");
-      hash_key.pop_back();
+      template_hash = std::regex_replace(template_def, disallowed_chars, "_");
+      template_hash.pop_back();
    }
-    func_name << "custom_kernel_" << name << hash_key;
+    func_name << "custom_kernel_" << name << "_" << template_hash << "_"
+              << timestamp;
    std::string kernel_name = func_name.str();

    std::string kernel_source = write_signature(
--- a/python/tests/test_fast.py
+++ b/python/tests/test_fast.py
@@ -735,6 +735,41 @@ class TestFast(mlx_tests.MLXTestCase):
        )[0]
        self.assertEqual(out.item(), 2)

+    @unittest.skipIf(not mx.metal.is_available(), "Metal is not available")
+    def test_custom_kernel_caching(self):
+        def call_kernel(a: mx.array, source):
+            kernel = mx.fast.metal_kernel(
+                name="my_kernel",
+                input_names=["inp"],
+                output_names=["out"],
+                source=source,
+            )
+            return kernel(
+                inputs=[a],
+                grid=(a.size, 1, 1),
+                threadgroup=(a.size, 1, 1),
+                output_shapes=[a.shape],
+                output_dtypes=[a.dtype],
+                stream=mx.gpu,
+            )[0]
+
+        a = mx.random.normal(shape=(32,))
+
+        source = """
+            uint elem = thread_position_in_grid.x;
+            out[elem] = 0.0;
+        """
+
+        out = call_kernel(a, source)
+        self.assertTrue(mx.array_equal(out, mx.zeros_like(out)))
+
+        source = """
+            uint elem = thread_position_in_grid.x;
+            out[elem] = 1.0;
+        """
+        out = call_kernel(a, source)
+        self.assertTrue(mx.array_equal(out, mx.ones_like(out)))
+

 if __name__ == "__main__":
    unittest.main()