Mirror of https://github.com/ml-explore/mlx-examples.git, synced 2025-09-01 04:14:38 +08:00
Quantized KV Cache (#1075)
* add QuantizedKVCache
* simplify
* add tests
* single sdpa function
* fix sed
* in place
* fix tests
* support different k and v head dims
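For context, the central idea of the PR is to keep the KV cache in group-quantized form and dequantize it on the fly during attention. Below is a minimal sketch of what such a QuantizedKVCache might look like, assuming MLX's mx.quantize group-quantization API (which returns a (packed, scales, biases) tuple). The class name comes from the commit message, but the fields and the update_and_fetch method here are illustrative, not the repository's actual implementation:

import mlx.core as mx


class QuantizedKVCache:
    # Illustrative sketch: keep K/V group-quantized to cut cache memory.
    def __init__(self, group_size: int = 64, bits: int = 8):
        self.group_size = group_size
        self.bits = bits
        self.keys = None    # (packed, scales, biases) tuple once populated
        self.values = None
        self.offset = 0     # number of cached time steps

    def update_and_fetch(self, keys, values):
        # mx.quantize packs along the last axis (head_dim) in groups,
        # returning (packed_weights, scales, biases).
        k_q = mx.quantize(keys, group_size=self.group_size, bits=self.bits)
        v_q = mx.quantize(values, group_size=self.group_size, bits=self.bits)
        if self.keys is None:
            self.keys, self.values = k_q, v_q
        else:
            # Append along the sequence axis; the packed data, scales, and
            # biases all share that axis, so concatenate each component.
            self.keys = tuple(
                mx.concatenate([a, b], axis=-2) for a, b in zip(self.keys, k_q)
            )
            self.values = tuple(
                mx.concatenate([a, b], axis=-2) for a, b in zip(self.values, v_q)
            )
        self.offset += keys.shape[-2]
        return self.keys, self.values

The "in place" item in the commit message suggests the real implementation avoids reallocating on every step, e.g. by preallocating and writing into slices; the concatenation above is just the simplest way to show the bookkeeping.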
@@ -8,7 +8,7 @@ from typing import Tuple
 import mlx.core as mx
 import mlx.nn as nn
 
-from .base import create_attention_mask
+from .base import create_attention_mask, scaled_dot_product_attention
 from .switch_layers import SwitchMLP
 
 
@@ -71,8 +71,13 @@ class RoPEAttention(nn.Module):
         # Finally perform the attention computation
         scale = math.sqrt(1 / queries.shape[-1])
 
-        output = mx.fast.scaled_dot_product_attention(
-            queries.astype(mx.float32), keys, values, scale=scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries.astype(mx.float32),
+            keys,
+            values,
+            cache=cache,
+            scale=scale,
+            mask=mask,
         ).astype(values.dtype)
         output = output.moveaxis(2, 1).reshape(B, L, -1)
 
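The call-site change above routes attention through a single scaled_dot_product_attention helper in .base that also receives the cache, so one function can serve both regular and quantized caches (the "single sdpa function" item in the commit message). Below is a minimal sketch of how such a dispatcher might work, assuming the QuantizedKVCache sketch above and MLX's mx.dequantize; the dispatch test and the dequantize-then-fuse strategy are illustrative, not the repository's actual code:

import mlx.core as mx


def scaled_dot_product_attention(queries, keys, values, cache=None, scale=1.0, mask=None):
    # Single attention entry point: dispatch on the cache type so model
    # code does not need to know how K/V are stored.
    if cache is not None and getattr(cache, "bits", None) is not None:
        # Quantized cache: keys/values arrive as (packed, scales, biases)
        # tuples. Dequantize before the fused kernel; a production version
        # could instead use mx.quantized_matmul to avoid materializing
        # full-precision K/V. Cast to the query dtype so the kernel sees
        # matching input types.
        keys = mx.dequantize(
            *keys, group_size=cache.group_size, bits=cache.bits
        ).astype(queries.dtype)
        values = mx.dequantize(
            *values, group_size=cache.group_size, bits=cache.bits
        ).astype(queries.dtype)
    return mx.fast.scaled_dot_product_attention(
        queries, keys, values, scale=scale, mask=mask
    )

With a wrapper like this, the per-model attention code stays unchanged apart from passing cache=, which is exactly the shape of the diff above.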