Handle longer prompt/generation (#931)

* rebase

* nits

* nit

* fix rotating cache with step prefill

* update version
This commit is contained in:
Awni Hannun
2024-08-16 15:28:39 -07:00
committed by GitHub
parent e196fa3208
commit 7be292c0c9
32 changed files with 255 additions and 13 deletions

View File

@@ -76,7 +76,12 @@ def setup_arg_parser():
type=int,
default=None,
help="Set the MLX cache limit in GB",
required=False,
)
parser.add_argument(
"--max-kv-size",
type=int,
default=1024,
help="Set the maximum key-value cache size",
)
return parser
@@ -154,6 +159,7 @@ def main():
formatter=formatter,
temp=args.temp,
top_p=args.top_p,
max_kv_size=args.max_kv_size,
)