Goekdeniz-Guelmez 2025-01-24 16:57:18 +01:00
parent e3688293ed
commit 09ed837896
3 changed files with 88 additions and 81 deletions


@@ -19,7 +19,6 @@ LoRA (QLoRA).[^qlora] LoRA fine-tuning works with the following model families:
- [Run](#Run)
- [Fine-tune](#Fine-tune)
- [DPO Training](#DPO-Training)
- [ORPO Training](#ORPO-Training)
- [Evaluate](#Evaluate)
- [Generate](#Generate)
@@ -79,64 +78,56 @@ You can specify the output location with `--adapter-path`.
You can resume fine-tuning with an existing adapter with
`--resume-adapter-file <path_to_adapters.safetensors>`.
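For example, a run that resumes training from a previously saved adapter might look like this (the adapter path is illustrative):
```shell
mlx_lm.lora \
    --model <path_to_model> \
    --train \
    --data <path_to_data> \
    --resume-adapter-file adapters/adapters.safetensors
```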
### DPO Training
Direct Preference Optimization (DPO) training allows you to fine-tune models using human preference data. To use DPO training, set the training mode to `dpo`:
```shell
mlx_lm.lora \
    --model <path_to_model> \
    --train \
    --training-mode dpo \
    --data <path_to_data> \
    --beta 0.1
```
DPO training accepts the following additional parameters:
- `--beta`: Controls the strength of the DPO loss (default: 0.1)
- `--dpo-loss-type`: Choose between "sigmoid" (default), "hinge", "ipo", or "dpop" loss functions
- `--is-reference-free`: Enable reference-free DPO training
- `--delta`: Margin parameter for hinge loss (default: 50.0)
- `--reference-model-path`: Path to a reference model for DPO training
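For example, a DPO run that switches to the hinge loss and uses an explicit reference model might look like the following (paths and values are illustrative):
```shell
mlx_lm.lora \
    --model <path_to_model> \
    --train \
    --training-mode dpo \
    --data <path_to_data> \
    --dpo-loss-type hinge \
    --delta 50.0 \
    --reference-model-path <path_to_reference_model>
```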
For DPO training, the data should be in JSONL format with the following structure:
```jsonl
{"prompt": "User prompt", "chosen": "Preferred response", "rejected": "Less preferred response"}
```
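A minimal sketch for producing data in this format, assuming the `--data` directory follows the usual `train.jsonl`/`valid.jsonl` layout (the records are hypothetical):
```python
import json
from pathlib import Path

# Hypothetical preference pairs in the DPO format described above.
records = [
    {
        "prompt": "What is the capital of France?",
        "chosen": "The capital of France is Paris.",
        "rejected": "I am not sure.",
    },
]

data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
with open(data_dir / "train.jsonl", "w") as f:  # repeat for valid.jsonl as needed
    for record in records:
        f.write(json.dumps(record) + "\n")
```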
### ORPO Training
Odds Ratio Preference Optimization (ORPO) training fine-tunes models using human preference data. To use ORPO training, set the training mode to `orpo`:
```shell
mlx_lm.lora \
    --model <path_to_model> \
    --train \
    --training-mode orpo \
    --data <path_to_data> \
    --beta 0.1
```
ORPO training accepts the following additional parameter:
- `--beta`: Controls the temperature parameter for the logistic function (default: 0.1)

For ORPO training, the data should be in JSONL format. The following structures are supported:
```jsonl
# Basic format with string responses
{"prompt": "User prompt", "chosen": "Preferred response", "rejected": "Less preferred response"}
# With custom preference score
{"prompt": "User prompt", "chosen": "Preferred response", "rejected": "Less preferred response", "preference_score": 8.0}
# With system message
{"prompt": "User prompt", "chosen": "Preferred response", "rejected": "Less preferred response", "system": "System instruction"}
# With full conversation objects
{
  "prompt": "User prompt",
  "chosen": {
    "messages": [
      {"role": "system", "content": "System instruction"},
      {"role": "user", "content": "User message"},
      {"role": "assistant", "content": "Assistant response"}
    ]
  },
  "rejected": {
    "messages": [
      {"role": "system", "content": "System instruction"},
      {"role": "user", "content": "User message"},
      {"role": "assistant", "content": "Assistant response"}
    ]
  }
}
```
If no explicit reward is provided via `preference_score`, the trainer automatically assigns binary rewards (1.0 for the chosen response and 0.0 for the rejected response). When a `preference_score` is given, it is used as the chosen reward and `1 - preference_score` as the rejected reward.
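A minimal sketch of this mapping, mirroring the `iterate_orpo_batches` logic included in this commit (the batch contents are hypothetical):
```python
import numpy as np

# Hypothetical mini-batch of items as returned by ORPODataset.__getitem__
batch = [
    {"chosen": [1, 2, 3], "rejected": [4, 5, 6]},                      # no score -> defaults to 1.0
    {"chosen": [7, 8], "rejected": [9, 10], "preference_score": 0.8},  # explicit score
]

preference_scores = [x.get("preference_score", 1.0) for x in batch]
chosen_rewards = np.array(preference_scores, np.float32)                       # [1.0, 0.8]
rejected_rewards = np.array([1.0 - s for s in preference_scores], np.float32)  # [0.0, 0.2]
```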
### Evaluate


@@ -1,50 +1,66 @@
import json
from pathlib import Path
from typing import Dict, List, Optional, Union
from transformers import PreTrainedTokenizer
class ORPODataset:
    def __init__(
        self,
        data: List[Dict[str, Union[str, Dict]]],
        tokenizer: PreTrainedTokenizer,
        prompt_key: str = "prompt",
        chosen_key: str = "chosen",
        rejected_key: str = "rejected",
        preference_score_key: str = "preference_score",
        system_key: Optional[str] = None,
    ):
        self._chosen_data = []
        self._rejected_data = []
        self._scores = []

        for d in data:
            if system_key and system_key in d:
                # Build full chat-template conversations, prepending the system message.
                base_messages = [{"role": "system", "content": d[system_key]}]

                chosen_messages = base_messages + [{"role": "user", "content": d[prompt_key]}]
                if isinstance(d[chosen_key], str):
                    chosen_messages.append({"role": "assistant", "content": d[chosen_key]})
                else:
                    chosen_messages.extend(d[chosen_key]["messages"])

                rejected_messages = base_messages + [{"role": "user", "content": d[prompt_key]}]
                if isinstance(d[rejected_key], str):
                    rejected_messages.append({"role": "assistant", "content": d[rejected_key]})
                else:
                    rejected_messages.extend(d[rejected_key]["messages"])

                chosen_text = tokenizer.apply_chat_template(chosen_messages)
                rejected_text = tokenizer.apply_chat_template(rejected_messages)
            else:
                # No system message: build a plain user/assistant pair. For
                # conversation-object responses, use the content of the last message.
                chosen_text = tokenizer.apply_chat_template([
                    {"role": "user", "content": d[prompt_key]},
                    {"role": "assistant", "content": d[chosen_key] if isinstance(d[chosen_key], str) else d[chosen_key]["messages"][-1]["content"]},
                ])
                rejected_text = tokenizer.apply_chat_template([
                    {"role": "user", "content": d[prompt_key]},
                    {"role": "assistant", "content": d[rejected_key] if isinstance(d[rejected_key], str) else d[rejected_key]["messages"][-1]["content"]},
                ])

            self._chosen_data.append(chosen_text)
            self._rejected_data.append(rejected_text)

            if preference_score_key in d:
                self._scores.append(float(d[preference_score_key]))
            else:
                self._scores.append(1.0)
    def __len__(self):
        return len(self._chosen_data)

    def __getitem__(self, idx: int):
        return {
            "chosen": self._chosen_data[idx],
            "rejected": self._rejected_data[idx],
            "preference_score": self._scores[idx],
        }
class Dataset:


@@ -40,7 +40,7 @@ def orpo_loss(model, chosen, rejected, chosen_masks, rejected_masks, chosen_rewa
    loss = -beta * ratio
    accuracies = (log_odds > 0).astype(mx.float32)
    margins = mx.mean(ratio - 1)
    metrics = {
        'accuracies': mx.mean(accuracies),
        'margins': margins,
@@ -107,9 +107,9 @@ def iterate_orpo_batches(dataset, tokenizer, batch_size, max_seq_length, train=F
        rejected_masks = np.zeros((batch_size // step, max_length_in_batch), np.float32)

        # Get preference scores and convert to rewards
        preference_scores = [x.get('preference_score', 1.0) for x in batch]
        chosen_rewards = np.array(preference_scores, np.float32)
        rejected_rewards = np.array([1.0 - score for score in preference_scores], np.float32)

        for j in range(batch_size // step):
            # Use pre-tokenized sequences directly