mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-09-01 12:49:50 +08:00
@@ -4,6 +4,7 @@ import time
|
||||
from dataclasses import dataclass, field
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import mlx.core as mx
|
||||
import mlx.nn as nn
|
||||
@@ -54,7 +55,7 @@ class TrainingArgs:
|
||||
default=2048, metadata={"help": "Maximum sequence length."}
|
||||
)
|
||||
adapter_file: str = field(
|
||||
default="adapter.npz",
|
||||
default="adapters.safetensors",
|
||||
metadata={"help": "Save/load path for the trained adapter weights."},
|
||||
)
|
||||
grad_checkpoint: bool = field(
|
||||
@@ -172,18 +173,6 @@ def train(
|
||||
):
|
||||
print(f"Starting training..., iters: {args.iters}")
|
||||
|
||||
def checkpoints_path(adapter_file) -> str:
|
||||
checkpoints_path = Path("checkpoints")
|
||||
if Path(adapter_file).parent:
|
||||
checkpoints_path = Path(adapter_file).parent / "checkpoints"
|
||||
|
||||
checkpoints_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
return str(checkpoints_path)
|
||||
|
||||
# Create checkpoints directory if it does not exist
|
||||
adapter_path = checkpoints_path(args.adapter_file)
|
||||
|
||||
if args.grad_checkpoint:
|
||||
grad_checkpoint(model.layers[0])
|
||||
|
||||
@@ -206,7 +195,7 @@ def train(
|
||||
# Main training loop
|
||||
start = time.perf_counter()
|
||||
for it, batch in zip(
|
||||
range(args.iters),
|
||||
range(1, args.iters + 1),
|
||||
iterate_batches(
|
||||
dataset=train_dataset,
|
||||
tokenizer=tokenizer,
|
||||
@@ -223,7 +212,7 @@ def train(
|
||||
n_tokens += toks.item()
|
||||
|
||||
# Report training loss if needed
|
||||
if ((it + 1) % args.steps_per_report == 0) or (it + 1 == args.iters):
|
||||
if it % args.steps_per_report == 0 or it == args.iters:
|
||||
train_loss = np.mean(losses)
|
||||
|
||||
stop = time.perf_counter()
|
||||
@@ -233,7 +222,7 @@ def train(
|
||||
trained_tokens += n_tokens
|
||||
peak_mem = mx.metal.get_peak_memory() / 2**30
|
||||
print(
|
||||
f"Iter {it + 1}: Train loss {train_loss:.3f}, "
|
||||
f"Iter {it}: Train loss {train_loss:.3f}, "
|
||||
f"Learning Rate {learning_rate:.3e}, "
|
||||
f"It/sec {it_sec:.3f}, "
|
||||
f"Tokens/sec {tokens_sec:.3f}, "
|
||||
@@ -243,7 +232,7 @@ def train(
|
||||
|
||||
if training_callback is not None:
|
||||
train_info = {
|
||||
"iteration": it + 1,
|
||||
"iteration": it,
|
||||
"train_loss": train_loss,
|
||||
"learning_rate": learning_rate,
|
||||
"iterations_per_second": it_sec,
|
||||
@@ -258,7 +247,7 @@ def train(
|
||||
start = time.perf_counter()
|
||||
|
||||
# Report validation loss if needed
|
||||
if it == 0 or ((it + 1) % args.steps_per_eval == 0) or (it + 1 == args.iters):
|
||||
if it == 1 or it % args.steps_per_eval == 0 or it == args.iters:
|
||||
stop = time.perf_counter()
|
||||
val_loss = evaluate(
|
||||
model=model,
|
||||
@@ -272,14 +261,12 @@ def train(
|
||||
)
|
||||
val_time = time.perf_counter() - stop
|
||||
print(
|
||||
f"Iter {it + 1}: "
|
||||
f"Val loss {val_loss:.3f}, "
|
||||
f"Val took {val_time:.3f}s"
|
||||
f"Iter {it}: " f"Val loss {val_loss:.3f}, " f"Val took {val_time:.3f}s"
|
||||
)
|
||||
|
||||
if training_callback is not None:
|
||||
val_info = {
|
||||
"iteration": it + 1,
|
||||
"iteration": it,
|
||||
"val_loss": val_loss,
|
||||
"val_time": val_time,
|
||||
}
|
||||
@@ -287,23 +274,26 @@ def train(
|
||||
|
||||
start = time.perf_counter()
|
||||
|
||||
# Save adapter weights if needed
|
||||
if (it + 1) % args.steps_per_save == 0:
|
||||
checkpoint_adapter_file = (
|
||||
f"{adapter_path}/{it + 1}_{Path(args.adapter_file).name}"
|
||||
# Save adapter weights
|
||||
if it % args.steps_per_save == 0:
|
||||
save_adapter(model, args.adapter_file)
|
||||
checkpoint = (
|
||||
Path(args.adapter_file).parent / f"{it:07d}_adapters.safetensors"
|
||||
)
|
||||
save_adapter(model, checkpoint)
|
||||
print(
|
||||
f"Iter {it}: Saved adapter weights to "
|
||||
f"{args.adapter_file} and {checkpoint}."
|
||||
)
|
||||
save_adapter(model=model, adapter_file=checkpoint_adapter_file)
|
||||
print(f"Iter {it + 1}: Saved adapter weights to {checkpoint_adapter_file}.")
|
||||
|
||||
# save final adapter weights
|
||||
save_adapter(model=model, adapter_file=args.adapter_file)
|
||||
save_adapter(model, args.adapter_file)
|
||||
print(f"Saved final adapter weights to {args.adapter_file}.")
|
||||
|
||||
|
||||
def save_adapter(
|
||||
model: nn.Module,
|
||||
adapter_file: str,
|
||||
adapter_file: Union[str, Path],
|
||||
):
|
||||
flattened_tree = tree_flatten(model.trainable_parameters())
|
||||
|
||||
mx.savez(adapter_file, **dict(flattened_tree))
|
||||
mx.save_safetensors(str(adapter_file), dict(flattened_tree))
|
||||
|
@@ -1,4 +1,7 @@
|
||||
import os
|
||||
# Copyright © 2024 Apple Inc.
|
||||
import json
|
||||
import types
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
import mlx.core as mx
|
||||
@@ -91,40 +94,28 @@ def linear_to_lora_layers(
|
||||
raise ValueError(f"Lora does not support {model.model_type}")
|
||||
|
||||
for l in model.layers[num_layers - num_lora_layers :]:
|
||||
modules = l.named_modules()
|
||||
lora_layers = [(k, to_lora(m)) for k, m in l.named_modules() if k in keys]
|
||||
l.update_modules(tree_unflatten(lora_layers))
|
||||
|
||||
|
||||
def apply_lora_layers(model: nn.Module, adapter_file: str) -> nn.Module:
|
||||
def apply_lora_layers(model: nn.Module, adapter_path: str) -> nn.Module:
|
||||
"""
|
||||
Apply LoRA layers to the model.
|
||||
|
||||
Args:
|
||||
model (nn.Module): The neural network model.
|
||||
adapter_file (str): Path to the adapter configuration file.
|
||||
adapter_path (str): Path to the adapter configuration file.
|
||||
|
||||
Returns:
|
||||
nn.Module: The updated model with LoRA layers applied.
|
||||
"""
|
||||
if not os.path.exists(adapter_file):
|
||||
raise FileNotFoundError(f"The adapter file does not exist: {adapter_file}")
|
||||
|
||||
adapters = list(mx.load(adapter_file).items())
|
||||
|
||||
linear_replacements = []
|
||||
lora_layers = set(
|
||||
[name.replace(".lora_a", "").replace(".lora_b", "") for name, _ in adapters]
|
||||
)
|
||||
for name, module in model.named_modules():
|
||||
if name in lora_layers:
|
||||
replacement_module = LoRALinear.from_linear(module)
|
||||
linear_replacements.append((name, replacement_module))
|
||||
|
||||
model.update_modules(tree_unflatten(linear_replacements))
|
||||
|
||||
model.update(tree_unflatten(adapters))
|
||||
|
||||
adapter_path = Path(adapter_path)
|
||||
if not adapter_path.exists():
|
||||
raise FileNotFoundError(f"The adapter path does not exist: {adapter_path}")
|
||||
with open(adapter_path / "adapter_config.json", "r") as fid:
|
||||
config = types.SimpleNamespace(**json.load(fid))
|
||||
linear_to_lora_layers(model, config.lora_layers, config.lora_parameters)
|
||||
model.load_weights(str(adapter_path / "adapters.safetensors"), strict=False)
|
||||
return model
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user