fix: prevent gradients from flowing through the reference model's logits

This commit is contained in:
Goekdeniz-Guelmez 2025-02-09 17:02:58 +01:00
parent 54179901b5
commit a527cdb39b

View File

@ -189,7 +189,7 @@ def compute_kl(logprobs1, logprobs2):
def compute_policy_ratio(current_logprobs, ref_logprobs):
    """Return the policy ratio exp(log p_current - log p_ref) as float32.

    mx.stop_gradient blocks gradients from flowing through the reference
    model's log-probabilities, so the reference model is treated as a
    constant during optimization (the point of this fix).

    Args:
        current_logprobs: log-probabilities from the policy being trained.
        ref_logprobs: log-probabilities from the frozen reference model.

    Returns:
        An mx.array of element-wise ratios, dtype float32.
    """
    # NOTE(review): the diff rendering lost its +/- markers; the stale
    # pre-fix return (no stop_gradient) has been dropped here, keeping
    # only the corrected post-commit line.
    return mx.exp(
        mx.array(current_logprobs - mx.stop_gradient(ref_logprobs), dtype=mx.float32)
    )
def grpo_loss(