Mirror of https://github.com/ml-explore/mlx-examples.git (synced 2025-06-24 09:21:18 +08:00)
Adding PPO-like clipping adapted from trl
This commit is contained in:
parent
06ff47012f
commit
9fd6a5b6d0
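For context, the per-token objective this change implements is the standard PPO clipped surrogate with an optional KL penalty (generic notation, not taken from the commit itself):

    \mathcal{L}_t = -\min\big( r_t A,\ \mathrm{clip}(r_t,\, 1-\varepsilon,\, 1+\varepsilon)\, A \big) + \beta\, \mathrm{KL}_t

where r_t = exp(token_log_probs_t - ref_token_log_probs_t) is the policy ratio already computed in the hunk below, A is the sequence-level advantage broadcast over tokens, and the result is masked and averaged over valid tokens.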
@@ -375,10 +375,22 @@ def grpo_loss(
         mx.array(token_log_probs - mx.stop_gradient(ref_token_log_probs))
     )

-    # Compute per-token loss
-    per_token_loss = -(
-        (policy_ratio * advantages.reshape(-1, 1) - beta * kl_div) * length_mask
-    )
+    # Apply PPO-like clipping
+    policy_ratio_clipped = mx.clip(policy_ratio, 1 - epsilon, 1 + epsilon)
+
+    # Calculate both unclipped and clipped objectives
+    unclipped_obj = policy_ratio * advantages.reshape(-1, 1)
+    clipped_obj = policy_ratio_clipped * advantages.reshape(-1, 1)
+
+    # Take the minimum (pessimistic bound)
+    per_token_loss = -mx.minimum(unclipped_obj, clipped_obj)
+
+    # Add KL penalty if beta is non-zero
+    if beta != 0.0:
+        per_token_loss = per_token_loss + beta * kl_div
+
+
+    per_token_loss = per_token_loss * length_mask

     # Average over tokens
     loss = (per_token_loss * length_mask).sum() / length_mask.sum()  # Matches the PyTorch implementation
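Below is a minimal, standalone sketch of the clipped objective in the hunk above, using mlx.core on toy inputs. The function name, the epsilon/beta defaults, and the tensor shapes are illustrative only and are not part of the commit.

import mlx.core as mx

def clipped_per_token_loss(policy_ratio, advantages, kl_div, length_mask,
                           epsilon=0.2, beta=0.04):
    # PPO-style clipped surrogate with an optional KL penalty (illustrative sketch).
    ratio_clipped = mx.clip(policy_ratio, 1 - epsilon, 1 + epsilon)
    unclipped_obj = policy_ratio * advantages.reshape(-1, 1)
    clipped_obj = ratio_clipped * advantages.reshape(-1, 1)
    per_token_loss = -mx.minimum(unclipped_obj, clipped_obj)  # pessimistic bound
    if beta != 0.0:
        per_token_loss = per_token_loss + beta * kl_div       # optional KL penalty
    per_token_loss = per_token_loss * length_mask             # zero out padding positions
    return per_token_loss.sum() / length_mask.sum()           # average over valid tokens

# Toy example: 2 sequences, 3 token positions each
ratio = mx.array([[1.3, 0.9, 0.7], [1.1, 1.4, 1.0]])
adv = mx.array([0.5, -0.2])          # one advantage per sequence
kl = mx.zeros_like(ratio)            # no KL term in this toy run
mask = mx.array([[1.0, 1.0, 0.0], [1.0, 1.0, 1.0]])
print(clipped_per_token_loss(ratio, adv, kl, mask))

One consequence of taking mx.minimum over the two objectives: for positive advantages the gain from a large ratio is capped at 1 + epsilon, while for negative advantages the unclipped, more penalizing term is kept, which matches the usual PPO behavior.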