Mirror of https://github.com/ml-explore/mlx-examples.git, synced 2025-06-25 01:41:19 +08:00
chore(mlx-lm): truncate the input sentence to max seq len in lora iterate_batches (#373)
* chore(mlx-lm): pass max seq len to evaluate in training loop
* chore: make sure the batch seq does not exceed max len
* chore: update comment
* chore: add warning before truncating input
parent 0f19237fb8
commit f51e98fcf1
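Below is a minimal, self-contained sketch of the truncate-and-pad step this commit introduces. It uses plain numpy (no mlx), and the cap of 8 tokens plus the two toy sequences are illustrative values, not taken from the repo.

import numpy as np

# Illustrative values, not from the repo: a cap of 8 tokens and two toy
# sequences, the second of which is longer than the cap.
max_seq_length = 8
batch = [list(range(5)), list(range(12))]

lengths = [len(x) for x in batch]
if max(lengths) > max_seq_length:
    print(
        f"[WARNING] Some sequences are longer than {max_seq_length} tokens. "
        f"The longest sentence {max(lengths)} will be truncated to {max_seq_length}. "
        "Consider pre-splitting your data to save memory."
    )

# Pad to the longest sequence in the batch, capped at max_seq_length.
max_length_in_batch = min(max(lengths), max_seq_length)
batch_arr = np.zeros((len(batch), max_length_in_batch), np.int32)
for j in range(len(batch)):
    truncated_length = min(lengths[j], max_seq_length)
    batch_arr[j, :truncated_length] = batch[j][:truncated_length]
    lengths[j] = truncated_length  # keep lengths in sync with the stored tokens

print(batch_arr.shape)  # (2, 8): the 12-token sequence was cut to 8
print(lengths)          # [5, 8]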
@@ -66,19 +66,25 @@ def iterate_batches(dataset, tokenizer, batch_size, max_seq_length, train=False):
             lengths = [len(x) for x in batch]
 
             # Check if any sequence is longer than max_seq_length
             if max(lengths) > max_seq_length:
                 print(
-                    "[WARNING] Some sequences are longer than 2048 tokens. "
+                    f"[WARNING] Some sequences are longer than {max_seq_length} tokens. "
+                    f"The longest sentence {max(lengths)} will be truncated to {max_seq_length}. "
                     "Consider pre-splitting your data to save memory."
                 )
 
             # Pad to the max length
-            batch_arr = np.zeros((batch_size, max(lengths)), np.int32)
+            max_length_in_batch = min(max(lengths), max_seq_length)
+            batch_arr = np.zeros((batch_size, max_length_in_batch), np.int32)
 
             for j in range(batch_size):
-                batch_arr[j, : lengths[j]] = batch[j]
+                truncated_length = min(lengths[j], max_seq_length)
+                batch_arr[j, :truncated_length] = batch[j][:truncated_length]
+                lengths[
+                    j
+                ] = truncated_length  # Update lengths to match truncated lengths
             batch = mx.array(batch_arr)
 
             yield batch[:, :-1], batch[:, 1:], mx.array(lengths)
 
         if not train:
@@ -175,6 +181,7 @@ def train(
                 tokenizer=tokenizer,
                 batch_size=args.batch_size,
                 num_batches=args.val_batches,
+                max_seq_length=args.max_seq_length,
             )
             print(
                 f"Iter {it + 1}: "
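The second hunk just threads args.max_seq_length through to the evaluate call, so validation batches are truncated the same way training batches are. For context, here is a small sketch of what the generator's final yield produces; numpy stands in for mx (the slicing semantics are identical), and the padded toy row is illustrative:

import numpy as np

batch = np.array([[1, 2, 3, 4, 0, 0]], np.int32)  # one padded toy sequence
inputs, targets = batch[:, :-1], batch[:, 1:]     # shift by one token position
print(inputs)   # [[1 2 3 4 0]]
print(targets)  # [[2 3 4 0 0]]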