* Adding full model weights fine-tuning
* Updating the LORA.md and ACKNOWLEDGMENTS.md files
* Removing --use-dora and --fulll-training and adding --fine-tune-type
* Some clean up
* Reformatting and fixing DoRA training
* Updated CONFIG_DEFAULTS
* Update config example
* Update in the config example file
* Update LORA.md
* Merge and commit
* Adding argument for DoRA linear layer
* Clean up
* Clean up in the example yaml file
* Fix
* Final fix before sending
* Small addition to the md file
* Fix for loading the fully trained model by saving all the files and configs correctly
* Clean up
* Removing the unnecessary files
* Changing lora layers back to 16
* Removed max file size
* Nits
* Resolve merge
* Some consistency changes

Co-authored-by: Awni Hannun <awni@apple.com>
# The path to the local model directory or Hugging Face repo.
model: "mlx_model"

# Whether or not to train (boolean)
train: true

# The fine-tuning method: "lora", "dora", or "full".
fine_tune_type: lora

# Directory with {train, valid, test}.jsonl files
data: "/path/to/training/data"
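
# Illustrative note (assumed layout, not exhaustive): one commonly used format
# for these .jsonl files is one JSON object per line with a "text" field, e.g.
#   {"text": "An example training document goes here."}
# Other layouts (such as prompt/completion pairs) may be supported depending
# on the mlx_lm version.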

# The PRNG seed
seed: 0

# Number of layers to fine-tune
lora_layers: 16

# Minibatch size.
batch_size: 4

# Iterations to train for.
iters: 1000

# Number of validation batches, -1 uses the entire validation set.
val_batches: 25

# Adam learning rate.
learning_rate: 1e-5

# Number of training steps between loss reporting.
steps_per_report: 10

# Number of training steps between validations.
steps_per_eval: 200

# Load path to resume training with the given adapter weights.
resume_adapter_file: null

# Save/load path for the trained adapter weights.
adapter_path: "adapters"

# Save the model every N iterations.
save_every: 100

# Evaluate on the test set after training
test: false

# Number of test set batches, -1 uses the entire test set.
test_batches: 100

# Maximum sequence length.
max_seq_length: 2048

# Use gradient checkpointing to reduce memory use.
grad_checkpoint: false

# LoRA parameters can only be specified in a config file.
lora_parameters:
  # The layer keys to apply LoRA to.
  # These will be applied to the last lora_layers layers.
  keys: ["self_attn.q_proj", "self_attn.v_proj"]
  rank: 8
  scale: 20.0
  dropout: 0.0
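
# The available keys are model dependent; the list below is an illustrative
# sketch for a Llama-style model, not a recommendation:
#   keys: ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj",
#          "self_attn.o_proj", "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj"]
# Adapting more keys increases the number of trainable parameters and the
# memory required for fine-tuning.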

# Schedule can only be specified in a config file, uncomment to use.
#lr_schedule:
#  name: cosine_decay
#  warmup: 100 # 0 for no warmup
#  warmup_init: 1e-7 # 0 if not specified
#  arguments: [1e-5, 1000, 1e-7] # passed to scheduler
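
# Assuming the schedule name is resolved against mlx.optimizers and the
# `arguments` list is passed positionally, the commented example above maps
# roughly to mlx.optimizers.cosine_decay(1e-5, 1000, 1e-7): start at 1e-5 and
# decay over 1000 steps toward 1e-7, after the optional warmup phase.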

# Uncomment to train on a Hugging Face dataset instead of local .jsonl files.
#hf_dataset:
#  name: "billsum"
#  train_split: "train[:1000]"
#  valid_split: "train[-100:]"
#  prompt_feature: "text"
#  completion_feature: "summary"
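
# To launch training with this configuration (assuming the mlx_lm package is
# installed and this file is saved as, e.g., lora_config.yaml), a typical
# invocation looks like:
#   mlx_lm.lora --config lora_config.yaml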