Implement no_grad functionality following PyTorch API

This PR adds gradient control functionality to MLX that matches PyTorch's
no_grad, enable_grad, and set_grad_enabled APIs.

## Features Added

### C++ Backend
- New AutogradState class with thread-local gradient state management
- RAII guard classes: NoGradGuard, EnableGradGuard, AutoGradMode
- GradMode static class with is_enabled() and set_enabled() methods
- Integration with existing transforms (vjp, jvp) to respect gradient state

### Python Frontend
- Python bindings: mx.is_grad_enabled(), mx.set_grad_enabled()
- Context managers and decorators: no_grad(), enable_grad(), set_grad_enabled()
- Matches the PyTorch API for `no_grad`, `enable_grad`, and `set_grad_enabled`

### Tests
- Comprehensive C++ tests for gradient state control and transform integration
- Python tests covering context managers, decorators, and nested scenarios
- All existing functionality preserved

## Usage
```python
import mlx.core as mx
from mlx.autograd import no_grad, enable_grad, set_grad_enabled

# Context manager
with no_grad():
    output = model(input)  # No gradients computed

# Decorator
@no_grad()
def inference(x):
    return model(x)

# Conditional gradients
with set_grad_enabled(is_training):
    loss = compute_loss(predictions, targets)
```
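
The context managers are thin wrappers around the two new bindings, `mx.is_grad_enabled()` and `mx.set_grad_enabled()`, which can also be driven by hand. A minimal sketch (not part of the diff) of the save/flip/restore pattern the context managers implement, and of the effect on `vjp`, which returns zero gradients while grad mode is off:

```python
import mlx.core as mx

fun = lambda x: x * x
x = mx.array(2.0)

prev = mx.is_grad_enabled()          # thread-local flag, True by default
mx.set_grad_enabled(False)
try:
    # vjp still evaluates the function, but the returned cotangents are zeros
    outs, grads = mx.vjp(fun, [x], [mx.array(1.0)])
    print(outs[0].item(), grads[0].item())  # 4.0 0.0
finally:
    mx.set_grad_enabled(prev)        # restore, exactly as the context managers do
```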

## Implementation Details
- Thread-local gradient state (each thread is independent; see the sketch below)
- Negligible overhead when gradients are enabled (a single thread-local boolean check at the top of `vjp`/`jvp`)
- Automatic state restoration via RAII
- Backward compatible with existing MLX code
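
An illustrative sketch of the thread-local behavior (not part of the diff): disabling gradients on one thread does not affect other threads, each of which starts with its own default-enabled state.

```python
import threading

import mlx.core as mx

mx.set_grad_enabled(False)           # affects only the calling thread

def check_worker():
    # A fresh thread sees its own thread-local state, enabled by default.
    assert mx.is_grad_enabled()

t = threading.Thread(target=check_worker)
t.start()
t.join()

assert not mx.is_grad_enabled()      # main thread is still disabled
mx.set_grad_enabled(True)            # restore the default
```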

Co-Authored-By: Yannick Muller <yannick@yajm.ch>
Author: Yannick Müller
Date: 2025-08-02 23:27:37 -04:00
Parent: aaf78f4c6b
Commit: 1e4bd653db
8 changed files with 578 additions and 0 deletions


@@ -2,6 +2,7 @@ target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/autograd_state.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp

mlx/autograd_state.cpp Normal file

@@ -0,0 +1,24 @@
// Copyright © 2023-2024 Apple Inc.

#include "mlx/autograd_state.h"

namespace mlx::core {

AutogradState& AutogradState::get_tls_state() {
  thread_local static AutogradState tls_state{true}; // gradients enabled by default
  return tls_state;
}

void AutogradState::set_tls_state(AutogradState state) {
  get_tls_state() = state;
}

bool GradMode::is_enabled() {
  return AutogradState::get_tls_state().get_grad_mode();
}

void GradMode::set_enabled(bool enabled) {
  AutogradState::get_tls_state().set_grad_mode(enabled);
}

} // namespace mlx::core

mlx/autograd_state.h Normal file

@@ -0,0 +1,63 @@
// Copyright © 2023-2024 Apple Inc.

#pragma once

namespace mlx::core {

/**
 * Structure used to manage thread-local autograd state flags
 * similar to PyTorch's autograd state management.
 */
struct AutogradState {
  static AutogradState& get_tls_state();
  static void set_tls_state(AutogradState state);

  AutogradState(bool grad_mode = true) : grad_mode_(grad_mode) {}

  void set_grad_mode(bool enabled) { grad_mode_ = enabled; }
  bool get_grad_mode() const { return grad_mode_; }

 private:
  bool grad_mode_;
};

/**
 * Global gradient mode control functions
 */
struct GradMode {
  static bool is_enabled();
  static void set_enabled(bool enabled);
};

/**
 * A RAII, thread local guard that enables or disables grad mode upon
 * construction, and sets it back to the original value upon destruction.
 */
struct AutoGradMode {
  AutoGradMode(bool enabled) : prev_mode(GradMode::is_enabled()) {
    GradMode::set_enabled(enabled);
  }
  AutoGradMode(const AutoGradMode&) = delete;
  AutoGradMode(AutoGradMode&&) = delete;
  AutoGradMode& operator=(const AutoGradMode&) = delete;
  AutoGradMode& operator=(AutoGradMode&&) = delete;
  ~AutoGradMode() { GradMode::set_enabled(prev_mode); }

  bool prev_mode;
};

/**
 * A RAII, thread local guard that stops future operations from building
 * gradients.
 */
struct NoGradGuard : public AutoGradMode {
  NoGradGuard() : AutoGradMode(/*enabled=*/false) {}
};

/**
 * A RAII, thread local guard that enables gradient computation.
 */
struct EnableGradGuard : public AutoGradMode {
  EnableGradGuard() : AutoGradMode(/*enabled=*/true) {}
};

} // namespace mlx::core


@@ -9,6 +9,7 @@
#include <unordered_map>
#include <unordered_set>
#include "mlx/autograd_state.h"
#include "mlx/backend/cpu/eval.h"
#include "mlx/backend/gpu/eval.h"
#include "mlx/fence.h"
@@ -323,6 +324,17 @@ std::pair<std::vector<array>, std::vector<array>> vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotans,
    const std::vector<int>& argnums) {
  // Check if gradients are enabled
  if (!GradMode::is_enabled()) {
    // If gradients are disabled, run function normally and return zero gradients
    auto outputs = fun(primals);
    std::vector<array> vjps;
    for (int argnum : argnums) {
      vjps.push_back(zeros_like(primals[argnum]));
    }
    return {outputs, vjps};
  }

  // Set the global tracing flag.
  detail::InTracing in_tracing{false, true};

@@ -521,6 +533,17 @@ std::pair<std::vector<array>, std::vector<array>> jvp(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<array>& primals,
    const std::vector<array>& tangents) {
  // Check if gradients are enabled
  if (!GradMode::is_enabled()) {
    // If gradients are disabled, run function normally and return zero tangents
    auto outputs = fun(primals);
    std::vector<array> jvps;
    for (const auto& output : outputs) {
      jvps.push_back(zeros_like(output));
    }
    return {outputs, jvps};
  }

  // Set the global tracing flag.
  detail::InTracing in_tracing{false, true};

python/mlx/autograd.py Normal file

@@ -0,0 +1,164 @@
# Copyright © 2023-2024 Apple Inc.

"""Gradient computation control utilities for MLX."""

from typing import Any, Callable, TypeVar

import mlx.core as mx

F = TypeVar("F", bound=Callable[..., Any])

# Export the core functions
is_grad_enabled = mx.is_grad_enabled
set_grad_enabled = mx.set_grad_enabled

__all__ = [
    "no_grad",
    "enable_grad",
    "set_grad_enabled",
    "is_grad_enabled",
]


class _NoParamDecoratorContextManager:
    """Base class for context managers that can also be used as decorators."""

    def __call__(self, func: F) -> F:
        """Decorator usage."""

        def wrapper(*args, **kwargs):
            with self:
                return func(*args, **kwargs)

        return wrapper


class no_grad(_NoParamDecoratorContextManager):
    r"""Context manager that disables gradient calculation.

    Disabling gradient calculation is useful for inference, when you are sure
    that you will not call backward passes. It will reduce memory consumption
    for computations that would otherwise compute gradients.

    In this mode, gradient computation will be disabled for all operations.

    This context manager is thread local; it will not affect computation
    in other threads.

    Also functions as a decorator.

    Example::

        >>> import mlx.core as mx
        >>> x = mx.array([1.], requires_grad=True)  # MLX doesn't have requires_grad, but for illustration
        >>> with mx.no_grad():
        ...     y = x * 2
        >>> # y won't have gradients computed for it
        >>> @mx.no_grad()
        ... def doubler(x):
        ...     return x * 2
        >>> z = doubler(x)
        >>> # z won't have gradients computed for it
    """

    def __init__(self) -> None:
        self.prev = False

    def __enter__(self) -> None:
        self.prev = mx.is_grad_enabled()
        mx.set_grad_enabled(False)

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        mx.set_grad_enabled(self.prev)


class enable_grad(_NoParamDecoratorContextManager):
    r"""Context manager that enables gradient calculation.

    Enables gradient calculation, if it has been disabled via :class:`~no_grad`
    or :class:`~set_grad_enabled`.

    This context manager is thread local; it will not affect computation
    in other threads.

    Also functions as a decorator.

    Example::

        >>> import mlx.core as mx
        >>> x = mx.array([1.])
        >>> with mx.no_grad():
        ...     with mx.enable_grad():
        ...         y = x * 2
        >>> # y will have gradients computed for it
        >>> @mx.enable_grad()
        ... def doubler(x):
        ...     return x * 2
        >>> with mx.no_grad():
        ...     z = doubler(x)
        >>> # z will have gradients computed for it
    """

    def __init__(self) -> None:
        self.prev = False

    def __enter__(self) -> None:
        self.prev = mx.is_grad_enabled()
        mx.set_grad_enabled(True)

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        mx.set_grad_enabled(self.prev)


class set_grad_enabled:
    r"""Context manager that sets gradient calculation on or off.

    ``set_grad_enabled`` will enable or disable grads based on its argument :attr:`mode`.
    It can be used as a context manager or as a function.

    This context manager is thread local; it will not affect computation
    in other threads.

    Args:
        mode (bool): Flag whether to enable grad (``True``), or disable
            (``False``). This can be used to conditionally enable
            gradients.

    Example::

        >>> import mlx.core as mx
        >>> x = mx.array([1.])
        >>> is_train = False
        >>> with mx.set_grad_enabled(is_train):
        ...     y = x * 2
        >>> # y won't have gradients computed
        >>> _ = mx.set_grad_enabled(True)
        >>> y = x * 2
        >>> # y will have gradients computed
        >>> _ = mx.set_grad_enabled(False)
        >>> y = x * 2
        >>> # y won't have gradients computed
    """

    def __init__(self, mode: bool) -> None:
        self.prev = mx.is_grad_enabled()
        self.mode = mode
        mx.set_grad_enabled(mode)

    def __call__(self, func: F) -> F:
        """Decorator usage."""
        mx.set_grad_enabled(self.prev)

        def wrapper(*args, **kwargs):
            with set_grad_enabled(self.mode):
                return func(*args, **kwargs)

        return wrapper

    def __enter__(self) -> None:
        mx.set_grad_enabled(self.mode)

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        mx.set_grad_enabled(self.prev)

    def __str__(self) -> str:
        return f"set_grad_enabled(mode={self.mode})"

    def __repr__(self) -> str:
        return str(self)

    def clone(self) -> "set_grad_enabled":
        """Create a copy of this class."""
        return self.__class__(self.mode)


@@ -14,6 +14,7 @@
#include <nanobind/stl/vector.h>
#include "mlx/array.h"
#include "mlx/autograd_state.h"
#include "mlx/compile.h"
#include "mlx/compile_impl.h"
#include "mlx/transforms.h"
@@ -1500,6 +1501,27 @@ void init_transforms(nb::module_& m) {
      [](nb::callable fun) { return mlx_func(PyCheckpointedFun{fun}, fun); },
      "fun"_a);

  // Gradient control functions
  m.def(
      "is_grad_enabled",
      &mx::GradMode::is_enabled,
      R"pbdoc(
        Returns True if gradient computation is currently enabled globally.

        Returns:
            bool: True if gradient computation is enabled, False otherwise.
      )pbdoc");
  m.def(
      "set_grad_enabled",
      &mx::GradMode::set_enabled,
      "enabled"_a,
      R"pbdoc(
        Enables or disables gradient computation globally.

        Args:
            enabled (bool): Whether to enable gradient computation.
      )pbdoc");

  // Register static Python object cleanup before the interpreter exits
  auto atexit = nb::module_::import_("atexit");
  atexit.attr("register")(nb::cpp_function([]() {


@@ -797,6 +797,194 @@ class TestAutograd(mlx_tests.MLXTestCase):
        grad_fn(model)
        self.assertEqual(model[1].item(), 2.0)

    def test_gradient_mode_control(self):
        """Test gradient mode control functions."""
        # Test default state
        self.assertTrue(mx.is_grad_enabled())

        # Test set_grad_enabled
        mx.set_grad_enabled(False)
        self.assertFalse(mx.is_grad_enabled())
        mx.set_grad_enabled(True)
        self.assertTrue(mx.is_grad_enabled())

    def test_no_grad_context_manager(self):
        """Test no_grad context manager."""
        from mlx.autograd import no_grad

        x = mx.array(2.0)
        fun = lambda x: x * x

        # Test with gradients enabled (default)
        out, grad = mx.vjp(fun, x, mx.array(1.0))
        self.assertEqual(out.item(), 4.0)
        self.assertEqual(grad.item(), 4.0)

        # Test with no_grad context manager
        with no_grad():
            self.assertFalse(mx.is_grad_enabled())
            out, grad = mx.vjp(fun, x, mx.array(1.0))
            self.assertEqual(out.item(), 4.0)
            self.assertEqual(grad.item(), 0.0)

        # Check that gradient mode is restored
        self.assertTrue(mx.is_grad_enabled())

    def test_no_grad_decorator(self):
        """Test no_grad as a decorator."""
        from mlx.autograd import no_grad

        x = mx.array(2.0)

        @no_grad()
        def square_no_grad(x):
            return x * x

        # Function should run without gradients
        out, grad = mx.vjp(square_no_grad, x, mx.array(1.0))
        self.assertEqual(out.item(), 4.0)
        self.assertEqual(grad.item(), 0.0)

        # Gradient mode should be restored after function
        self.assertTrue(mx.is_grad_enabled())

    def test_enable_grad_context_manager(self):
        """Test enable_grad context manager."""
        from mlx.autograd import enable_grad, no_grad

        x = mx.array(2.0)
        fun = lambda x: x * x

        with no_grad():
            self.assertFalse(mx.is_grad_enabled())

            # Test enable_grad within no_grad
            with enable_grad():
                self.assertTrue(mx.is_grad_enabled())
                out, grad = mx.vjp(fun, x, mx.array(1.0))
                self.assertEqual(out.item(), 4.0)
                self.assertEqual(grad.item(), 4.0)

            # Should be back to no_grad
            self.assertFalse(mx.is_grad_enabled())
            out, grad = mx.vjp(fun, x, mx.array(1.0))
            self.assertEqual(out.item(), 4.0)
            self.assertEqual(grad.item(), 0.0)

        # Should be back to enabled
        self.assertTrue(mx.is_grad_enabled())

    def test_set_grad_enabled_context_manager(self):
        """Test set_grad_enabled as context manager."""
        from mlx.autograd import set_grad_enabled

        x = mx.array(2.0)
        fun = lambda x: x * x

        # Test conditional gradient disabling
        is_train = False
        with set_grad_enabled(is_train):
            self.assertFalse(mx.is_grad_enabled())
            out, grad = mx.vjp(fun, x, mx.array(1.0))
            self.assertEqual(out.item(), 4.0)
            self.assertEqual(grad.item(), 0.0)

        # Test conditional gradient enabling
        is_train = True
        with set_grad_enabled(is_train):
            self.assertTrue(mx.is_grad_enabled())
            out, grad = mx.vjp(fun, x, mx.array(1.0))
            self.assertEqual(out.item(), 4.0)
            self.assertEqual(grad.item(), 4.0)

        # Gradient mode should be restored
        self.assertTrue(mx.is_grad_enabled())

    def test_set_grad_enabled_decorator(self):
        """Test set_grad_enabled as decorator."""
        from mlx.autograd import set_grad_enabled

        x = mx.array(2.0)

        @set_grad_enabled(False)
        def square_no_grad(x):
            return x * x

        # Function should run without gradients
        out, grad = mx.vjp(square_no_grad, x, mx.array(1.0))
        self.assertEqual(out.item(), 4.0)
        self.assertEqual(grad.item(), 0.0)

        # Gradient mode should be restored
        self.assertTrue(mx.is_grad_enabled())

    def test_jvp_with_no_grad(self):
        """Test JVP with gradient mode disabled."""
        from mlx.autograd import no_grad

        x = mx.array(2.0)
        fun = lambda x: x * x

        # Test with gradients enabled
        out, jvp_result = mx.jvp(fun, x, mx.array(1.0))
        self.assertEqual(out.item(), 4.0)
        self.assertEqual(jvp_result.item(), 4.0)

        # Test with gradients disabled
        with no_grad():
            out, jvp_result = mx.jvp(fun, x, mx.array(1.0))
            self.assertEqual(out.item(), 4.0)
            self.assertEqual(jvp_result.item(), 0.0)

    def test_value_and_grad_with_no_grad(self):
        """Test value_and_grad with gradient mode disabled."""
        from mlx.autograd import no_grad

        fun = lambda x: mx.sum(x * x)
        x = mx.array([1.0, 2.0, 3.0])

        # Test with gradients enabled
        value, grad = mx.value_and_grad(fun)(x)
        self.assertEqual(value.item(), 14.0)  # 1 + 4 + 9
        self.assertTrue(mx.allclose(grad, mx.array([2.0, 4.0, 6.0])).item())

        # Test with gradients disabled
        with no_grad():
            value, grad = mx.value_and_grad(fun)(x)
            self.assertEqual(value.item(), 14.0)
            self.assertTrue(mx.allclose(grad, mx.zeros_like(x)).item())

    def test_nested_gradient_contexts(self):
        """Test complex nesting of gradient contexts."""
        from mlx.autograd import no_grad, enable_grad, set_grad_enabled

        x = mx.array(2.0)
        fun = lambda x: x * x

        self.assertTrue(mx.is_grad_enabled())

        with no_grad():
            self.assertFalse(mx.is_grad_enabled())

            with enable_grad():
                self.assertTrue(mx.is_grad_enabled())

                with set_grad_enabled(False):
                    self.assertFalse(mx.is_grad_enabled())
                    out, grad = mx.vjp(fun, x, mx.array(1.0))
                    self.assertEqual(out.item(), 4.0)
                    self.assertEqual(grad.item(), 0.0)

                self.assertTrue(mx.is_grad_enabled())
                out, grad = mx.vjp(fun, x, mx.array(1.0))
                self.assertEqual(out.item(), 4.0)
                self.assertEqual(grad.item(), 4.0)

            self.assertFalse(mx.is_grad_enabled())

        self.assertTrue(mx.is_grad_enabled())


if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


@@ -10,6 +10,7 @@
#include <vector>
#include "doctest/doctest.h"
#include "mlx/autograd_state.h"
#include "mlx/graph_utils.h"
#include "mlx/mlx.h"
@@ -1353,3 +1354,95 @@ TEST_CASE("test grad dynamic slices") {
    CHECK(allclose(outs[1], ones({1, 2})).item<bool>());
  }
}

TEST_CASE("test gradient mode control") {
  // Test basic gradient mode functions
  CHECK(GradMode::is_enabled() == true); // Default should be enabled

  GradMode::set_enabled(false);
  CHECK(GradMode::is_enabled() == false);
  GradMode::set_enabled(true);
  CHECK(GradMode::is_enabled() == true);

  // Test NoGradGuard
  {
    CHECK(GradMode::is_enabled() == true);
    {
      NoGradGuard no_grad;
      CHECK(GradMode::is_enabled() == false);
    }
    CHECK(GradMode::is_enabled() == true);
  }

  // Test EnableGradGuard
  {
    GradMode::set_enabled(false);
    CHECK(GradMode::is_enabled() == false);
    {
      EnableGradGuard enable_grad;
      CHECK(GradMode::is_enabled() == true);
    }
    CHECK(GradMode::is_enabled() == false);
    GradMode::set_enabled(true); // Reset for other tests
  }

  // Test AutoGradMode
  {
    CHECK(GradMode::is_enabled() == true);
    {
      AutoGradMode auto_grad(false);
      CHECK(GradMode::is_enabled() == false);
    }
    CHECK(GradMode::is_enabled() == true);
  }
}

TEST_CASE("test no_grad with transforms") {
  auto x = array(2.0);
  auto fun = [](array input) { return multiply(input, input); };

  // Test with gradients enabled (default)
  {
    auto [output, grad] = vjp(fun, x, array(1.0));
    CHECK(array_equal(output, array(4.0)).item<bool>());
    CHECK(array_equal(grad, array(4.0)).item<bool>());
  }

  // Test with gradients disabled
  {
    NoGradGuard no_grad;
    auto [output, grad] = vjp(fun, x, array(1.0));
    CHECK(array_equal(output, array(4.0)).item<bool>());
    CHECK(array_equal(grad, array(0.0)).item<bool>());
  }

  // Test JVP with no_grad
  {
    NoGradGuard no_grad;
    auto [output, jvp_result] = jvp(fun, x, array(1.0));
    CHECK(array_equal(output, array(4.0)).item<bool>());
    CHECK(array_equal(jvp_result, array(0.0)).item<bool>());
  }

  // Test nested gradient contexts
  {
    CHECK(GradMode::is_enabled() == true);
    {
      NoGradGuard no_grad;
      CHECK(GradMode::is_enabled() == false);
      {
        EnableGradGuard enable_grad;
        CHECK(GradMode::is_enabled() == true);
        auto [output, grad] = vjp(fun, x, array(1.0));
        CHECK(array_equal(output, array(4.0)).item<bool>());
        CHECK(array_equal(grad, array(4.0)).item<bool>());
      }
      CHECK(GradMode::is_enabled() == false);
      auto [output, grad] = vjp(fun, x, array(1.0));
      CHECK(array_equal(output, array(4.0)).item<bool>());
      CHECK(array_equal(grad, array(0.0)).item<bool>());
    }
    CHECK(GradMode::is_enabled() == true);
  }
}