mirror of
https://github.com/ml-explore/mlx-examples.git
synced 2025-08-30 02:53:41 +08:00
in place
This commit is contained in:
parent
1d53354b51
commit
8444ff0f6a
@ -160,7 +160,7 @@ def main():
|
|||||||
max_msg_len = max(max_msg_len, len(msg))
|
max_msg_len = max(max_msg_len, len(msg))
|
||||||
print(msg + " " * (max_msg_len - len(msg)), end="", flush=True)
|
print(msg + " " * (max_msg_len - len(msg)), end="", flush=True)
|
||||||
|
|
||||||
cache = maybe_quantize_kv_cache(
|
maybe_quantize_kv_cache(
|
||||||
cache, args.quantized_kv_start, args.kv_group_size, args.kv_bits
|
cache, args.quantized_kv_start, args.kv_group_size, args.kv_bits
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -165,10 +165,10 @@ def maybe_quantize_kv_cache(prompt_cache, quantized_kv_start, kv_group_size, kv_
|
|||||||
and not isinstance(prompt_cache[0], cache.QuantizedKVCache)
|
and not isinstance(prompt_cache[0], cache.QuantizedKVCache)
|
||||||
and prompt_cache[0].offset > quantized_kv_start
|
and prompt_cache[0].offset > quantized_kv_start
|
||||||
):
|
):
|
||||||
return [
|
for i in range(len(prompt_cache)):
|
||||||
c.to_quantized(group_size=kv_group_size, bits=kv_bits) for c in prompt_cache
|
prompt_cache[i] = prompt_cache[i].to_quantized(
|
||||||
]
|
group_size=kv_group_size, bits=kv_bits
|
||||||
return prompt_cache
|
)
|
||||||
|
|
||||||
|
|
||||||
def generate_step(
|
def generate_step(
|
||||||
@ -290,7 +290,7 @@ def generate_step(
|
|||||||
for processor in logits_processor:
|
for processor in logits_processor:
|
||||||
logits = processor(tokens, logits)
|
logits = processor(tokens, logits)
|
||||||
|
|
||||||
prompt_cache = maybe_quantize_kv_cache(
|
maybe_quantize_kv_cache(
|
||||||
prompt_cache, quantized_kv_start, kv_group_size, kv_bits
|
prompt_cache, quantized_kv_start, kv_group_size, kv_bits
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user