From 00712522ba5871b0de4c3bd5a6406b9574aea055 Mon Sep 17 00:00:00 2001
From: Goekdeniz-Guelmez
Date: Sun, 9 Feb 2025 17:13:05 +0100
Subject: [PATCH] rebase loss calculation

---
 llms/mlx_lm/tuner/grpo_trainer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llms/mlx_lm/tuner/grpo_trainer.py b/llms/mlx_lm/tuner/grpo_trainer.py
index b7bdc7dc..b3619e0a 100644
--- a/llms/mlx_lm/tuner/grpo_trainer.py
+++ b/llms/mlx_lm/tuner/grpo_trainer.py
@@ -317,7 +317,9 @@ def grpo_loss(
     per_token_loss = -((policy_ratio * advantages.reshape(-1, 1) - beta * kl_div) * length_mask)

     # Average over tokens
-    loss = per_token_loss.sum().mean()
+    sequence_sums = per_token_loss.sum(axis=1)
+    sequence_lengths = length_mask.sum(axis=1)
+    loss = (sequence_sums / sequence_lengths).mean()

     # Calculate mean KL divergence for metrics
     mean_kl = ((kl_div * length_mask).sum(axis=1) / length_mask.sum(axis=1)).mean()
@@ -343,7 +345,7 @@ def grpo_loss(
     }

     mx.metal.clear_cache()
-    return loss, length_mask.sum(axis=1).sum(), metrics
+    return loss, sequence_lengths.sum(), metrics


 def iterate_grpo_batches(dataset, tokenizer, batch_size, max_seq_length, train=False):
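For context, a minimal sketch of the two reductions this patch swaps, on a toy batch of masked per-token losses (the shapes and values below are made up for illustration, not taken from the trainer). The old `per_token_loss.sum().mean()` reduces the whole batch to a single sum (the trailing `.mean()` of a scalar is a no-op), so long sequences dominate the loss; the new code normalizes each sequence by its own unmasked length before averaging, so every sequence contributes equally, and the precomputed `sequence_lengths` is reused for the token count returned alongside the metrics.

```python
import mlx.core as mx

# Toy batch: 2 sequences, padded to 4 tokens (hypothetical values).
# Padded positions are zero, mirroring the mask already applied upstream.
per_token_loss = mx.array([[0.5, 0.5, 0.5, 0.0],
                           [2.0, 0.0, 0.0, 0.0]])
length_mask = mx.array([[1, 1, 1, 0],
                        [1, 0, 0, 0]], dtype=mx.float32)

# Old reduction: one sum over every token in the batch.
# Sequence 1 (3 tokens) contributes 3x more terms than sequence 2.
old_loss = per_token_loss.sum().mean()                 # -> 3.5

# New reduction: per-sequence mean over unmasked tokens, then batch mean.
sequence_sums = per_token_loss.sum(axis=1)             # [1.5, 2.0]
sequence_lengths = length_mask.sum(axis=1)             # [3.0, 1.0]
new_loss = (sequence_sums / sequence_lengths).mean()   # -> (0.5 + 2.0) / 2 = 1.25

print(old_loss.item(), new_loss.item())
```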