Make attention faster for some models (#574)

* make attention faster for a couple of models
* remove unused generation flags
* add comment on lora
* include text files as well
2025-09-01 04:14:38 +08:00 · 2024-03-14 21:35:54 -07:00
parent 3f3741d229
commit e4b19bb9e1
6 changed files with 35 additions and 56 deletions
--- a/llms/mlx_lm/lora.py
+++ b/llms/mlx_lm/lora.py
@@ -61,19 +61,6 @@ def build_parser():
        "--model",
        help="The path to the local model directory or Hugging Face repo.",
    )
-    parser.add_argument(
-        "--max-tokens",
-        "-m",
-        type=int,
-        help="The maximum number of tokens to generate",
-    )
-    parser.add_argument("--temp", type=float, help="The sampling temperature")
-    parser.add_argument(
-        "--prompt",
-        "-p",
-        type=str,
-        help="The prompt for generation",
-    )

    # Training args
    parser.add_argument(