mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-10-24 06:28:07 +08:00
LoRA: Add printing and callbacks for learning rate during training (#457)
* LoRA: Refactor TrainingCallback to enhance flexibility and extensibility. This commit refactors the TrainingCallback class so that both the on_train_loss_report and on_val_loss_report methods accept a single dictionary parameter. Switching from multiple parameters to a single dict parameter significantly improves the class's flexibility and makes it easier to extend with new training or validation metrics in the future without altering the method signatures. This approach simplifies adding new information to be logged or processed, and aligns with best practices for scalable and maintainable code design. * LoRA: Add printing and callbacks for learning rate during training
This commit is contained in:
@@ -121,18 +121,11 @@ def evaluate(
|
|||||||
|
|
||||||
class TrainingCallback:
|
class TrainingCallback:
|
||||||
|
|
||||||
def on_train_loss_report(
|
def on_train_loss_report(self, train_info: dict):
|
||||||
self,
|
|
||||||
steps: int,
|
|
||||||
loss: float,
|
|
||||||
it_sec: float,
|
|
||||||
tokens_sec: float,
|
|
||||||
trained_tokens: int,
|
|
||||||
):
|
|
||||||
"""Called to report training loss at specified intervals."""
|
"""Called to report training loss at specified intervals."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def on_val_loss_report(self, steps: int, loss: float, val_time: float):
|
def on_val_loss_report(self, val_info: dict):
|
||||||
"""Called to report validation loss at specified intervals or the beginning."""
|
"""Called to report validation loss at specified intervals or the beginning."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -146,7 +139,7 @@ def train(
|
|||||||
args: TrainingArgs = TrainingArgs(),
|
args: TrainingArgs = TrainingArgs(),
|
||||||
loss: callable = default_loss,
|
loss: callable = default_loss,
|
||||||
iterate_batches: callable = iterate_batches,
|
iterate_batches: callable = iterate_batches,
|
||||||
training_callback=None,
|
training_callback: TrainingCallback = None,
|
||||||
):
|
):
|
||||||
print(f"Starting training..., iters: {args.iters}")
|
print(f"Starting training..., iters: {args.iters}")
|
||||||
|
|
||||||
@@ -189,20 +182,28 @@ def train(
|
|||||||
train_loss = np.mean(losses)
|
train_loss = np.mean(losses)
|
||||||
|
|
||||||
stop = time.perf_counter()
|
stop = time.perf_counter()
|
||||||
|
learning_rate = optimizer.learning_rate.item()
|
||||||
it_sec = args.steps_per_report / (stop - start)
|
it_sec = args.steps_per_report / (stop - start)
|
||||||
tokens_sec = float(n_tokens) / (stop - start)
|
tokens_sec = float(n_tokens) / (stop - start)
|
||||||
trained_tokens += n_tokens
|
trained_tokens += n_tokens
|
||||||
print(
|
print(
|
||||||
f"Iter {it + 1}: Train loss {train_loss:.3f}, "
|
f"Iter {it + 1}: Train loss {train_loss:.3f}, "
|
||||||
|
f"Learning Rate {learning_rate:.3e}, "
|
||||||
f"It/sec {it_sec:.3f}, "
|
f"It/sec {it_sec:.3f}, "
|
||||||
f"Tokens/sec {tokens_sec:.3f}, "
|
f"Tokens/sec {tokens_sec:.3f}, "
|
||||||
f"Trained Tokens {trained_tokens}"
|
f"Trained Tokens {trained_tokens}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if training_callback is not None:
|
if training_callback is not None:
|
||||||
training_callback.on_train_loss_report(
|
train_info = {
|
||||||
it + 1, train_loss, it_sec, tokens_sec, trained_tokens
|
"iteration": it + 1,
|
||||||
)
|
"train_loss": train_loss,
|
||||||
|
"learning_rate": learning_rate,
|
||||||
|
"iterations_per_second": it_sec,
|
||||||
|
"tokens_per_second": tokens_sec,
|
||||||
|
"trained_tokens": trained_tokens,
|
||||||
|
}
|
||||||
|
training_callback.on_train_loss_report(train_info)
|
||||||
|
|
||||||
losses = []
|
losses = []
|
||||||
n_tokens = 0
|
n_tokens = 0
|
||||||
@@ -229,7 +230,12 @@ def train(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if training_callback is not None:
|
if training_callback is not None:
|
||||||
training_callback.on_val_loss_report(it + 1, val_loss, val_time)
|
val_info = {
|
||||||
|
"iteration": it + 1,
|
||||||
|
"val_loss": val_loss,
|
||||||
|
"val_time": val_time
|
||||||
|
}
|
||||||
|
training_callback.on_val_loss_report(val_info)
|
||||||
|
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user