Make attention faster for a couple models (#574)

* make attention faster for a couple models

* remove unused generation flags

* add comment on lora

* include text files as well
Awni Hannun
2024-03-14 21:35:54 -07:00
committed by GitHub
parent 3f3741d229
commit e4b19bb9e1
6 changed files with 35 additions and 56 deletions
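The model files that actually receive the attention speedup are among the six changed files but are not shown in this excerpt, which only covers the utils.py hunks. As a rough sketch of what such a change typically looks like in MLX (assumed here, not copied from the diff), a hand-rolled softmax(QK^T)V path can be swapped for the fused `mx.fast.scaled_dot_product_attention` kernel; the function and variable names below are illustrative.

```python
import mlx.core as mx


def attention(queries, keys, values, scale, mask=None):
    # Hand-rolled attention: builds the full (L, L) score matrix as
    # separate matmul / softmax / matmul ops.
    scores = (queries * scale) @ keys.transpose(0, 1, 3, 2)
    if mask is not None:
        scores = scores + mask
    weights = mx.softmax(scores, axis=-1)
    return weights @ values


def fast_attention(queries, keys, values, scale, mask=None):
    # Fused kernel: the same scores/softmax/values pipeline runs as a
    # single MLX fast op instead of several separate ops.
    return mx.fast.scaled_dot_product_attention(
        queries, keys, values, scale=scale, mask=mask
    )
```

Whether these are the exact edits made in the unshown model files is an assumption; the commit message only states that attention was made faster for a couple models.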


@@ -81,6 +81,7 @@ def get_model_path(path_or_hf_repo: str, revision: Optional[str] = None) -> Path
"*.py",
"tokenizer.model",
"*.tiktoken",
"*.txt",
],
)
)
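For context on this hunk: the list being extended is the `allow_patterns` argument passed to `huggingface_hub.snapshot_download` inside `get_model_path`, so the new `"*.txt"` entry simply widens which repo files are downloaded, matching the "include text files as well" note in the commit message. A minimal usage sketch follows; the repo id and the reduced pattern list are purely illustrative.

```python
from huggingface_hub import snapshot_download

# Only files matching these glob patterns are fetched; "*.txt" now also
# pulls plain-text assets such as merges.txt or added_tokens files.
local_dir = snapshot_download(
    repo_id="some-org/some-model",  # illustrative repo id
    allow_patterns=["*.json", "*.safetensors", "*.txt"],
)
print(local_dir)  # path to the locally cached snapshot
```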
@@ -396,7 +397,6 @@ def fetch_from_hub(
 model_path: Path, lazy: bool = False
 ) -> Tuple[nn.Module, dict, PreTrainedTokenizer]:
 model = load_model(model_path, lazy)
-config = AutoConfig.from_pretrained(model_path)
 tokenizer = AutoTokenizer.from_pretrained(model_path)