Length masking for batch inputs (#1173)

* length masking

* add mask to mlx_lm model interface

* remove lengths

* fix test

* comment + fix
Author: Alex Barron
Date: 2024-12-18 19:43:52 -08:00
Committed by: GitHub
Parent: db109184b7
Commit: d4ef909d4a
34 changed files with 191 additions and 72 deletions
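
The diff below threads an optional mask argument through the model's __call__ and only builds the default causal mask when no mask is supplied. As a rough illustration of what a caller-supplied length mask for a padded batch might look like, here is a minimal sketch; the additive-float convention, the broadcast shape, and the helper name length_mask are assumptions for illustration, not taken from this commit:

import mlx.core as mx

def length_mask(lengths: mx.array, max_len: int) -> mx.array:
    # lengths: (B,) number of real tokens per sequence; max_len: padded length L.
    positions = mx.arange(max_len)[None, :]        # (1, L)
    valid = positions < lengths[:, None]           # (B, L), True for real tokens
    # Additive mask: 0 where the token is real, a large negative value where it
    # is padding, shaped (B, 1, 1, L) to broadcast over attention scores.
    return mx.where(valid[:, None, None, :], 0.0, -1e9)

# Hypothetical usage with the interface added here:
# logits = model(padded_inputs, mask=length_mask(lengths, padded_inputs.shape[1]))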


@@ -137,6 +137,7 @@ class GPTBigCodeModel(nn.Module):
     def __call__(
         self,
         inputs: mx.array,
+        mask: mx.array = None,
         cache=None,
     ):
         B, L = inputs.shape
@@ -149,7 +150,8 @@ class GPTBigCodeModel(nn.Module):
         position_ids = mx.array(np.arange(L))
         hidden_states += self.wpe(position_ids)
-        mask = create_attention_mask(hidden_states, cache)
+        if mask is None:
+            mask = create_attention_mask(hidden_states, cache)
         if cache is None:
             cache = [None] * len(self.h)
@@ -172,9 +174,10 @@ class Model(nn.Module):
     def __call__(
        self,
         inputs: mx.array,
+        mask: mx.array = None,
         cache=None,
     ):
-        out = self.transformer(inputs, cache)
+        out = self.transformer(inputs, mask, cache)
         if self.args.tie_word_embeddings:
             out = self.transformer.wte.as_linear(out)
         else:
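
Because the new argument defaults to None and falls back to create_attention_mask(hidden_states, cache), existing callers that pass no mask keep the previous behavior unchanged. A caller-supplied mask replaces that default entirely, so it would typically need to encode the causal structure as well as the padding.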