Merge branch 'ml-explore:main' into completion_only

2025-09-02 05:04:37 +08:00 · 2024-11-04 08:48:26 -05:00
parent 1929f5351c 331148d8ec
commit 95fb22449b
135 changed files with 11626 additions and 1465 deletions
--- a/llms/README.md
+++ b/llms/README.md
@@ -16,10 +16,35 @@ conda install -c conda-forge mlx-lm

 The `mlx-lm` package also has:

- [LoRA and QLoRA fine-tuning](https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/LORA.md)
+- [LoRA, QLoRA, and full fine-tuning](https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/LORA.md)
 - [Merging models](https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/MERGE.md)
 - [HTTP model serving](https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/SERVER.md)

+### Quick Start
+
+To generate text with an LLM use:
+
+```bash
+mlx_lm.generate --prompt "Hi!"
+```
+
+To chat with an LLM use:
+
+```bash
+mlx_lm.chat
+```
+
+This will give you a chat REPL that you can use to interact with the LLM. The
+chat context is preserved during the lifetime of the REPL.
+
+Commands in `mlx-lm` typically take command line options which let you specify
+the model, sampling parameters, and more. Use `-h` to see a list of available
+options for a command, e.g.:
+
+```bash
+mlx_lm.generate -h
+```
+
 ### Python API

 You can use `mlx-lm` as a module:
@@ -29,7 +54,14 @@ from mlx_lm import load, generate

 model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")

-response = generate(model, tokenizer, prompt="hello", verbose=True)
+prompt = "Write a story about Einstein"
+
+messages = [{"role": "user", "content": prompt}]
+prompt = tokenizer.apply_chat_template(
+    messages, tokenize=False, add_generation_prompt=True
+)
+
+response = generate(model, tokenizer, prompt=prompt, verbose=True)
 ```

 To see a description of all the arguments you can do:
@@ -38,6 +70,10 @@ To see a description of all the arguments you can do:
 >>> help(generate)
 ```

+Check out the [generation
+example](https://github.com/ml-explore/mlx-examples/tree/main/llms/mlx_lm/examples/generate_response.py)
+to see how to use the API in more detail.
+
 The `mlx-lm` package also comes with functionality to quantize and optionally
 upload models to the Hugging Face Hub.

@@ -75,6 +111,11 @@ model, tokenizer = load(repo)

 prompt = "Write a story about Einstein"

+messages = [{"role": "user", "content": prompt}]
+prompt = tokenizer.apply_chat_template(
+    messages, tokenize=False, add_generation_prompt=True
+)
+
 for t in stream_generate(model, tokenizer, prompt, max_tokens=512):
    print(t, end="", flush=True)
 print()
@@ -120,10 +161,50 @@ mlx_lm.convert \
    --upload-repo mlx-community/my-4bit-mistral
 ```

+### Long Prompts and Generations 
+
+`mlx-lm` has some tools to scale efficiently to long prompts and generations:
+
+- A rotating fixed-size key-value cache.
+- Prompt caching
+
+To use the rotating key-value cache pass the argument `--max-kv-size n` where
+`n` can be any integer. Smaller values like `512` will use very little RAM but
+result in worse quality. Larger values like `4096` or higher will use more RAM
+but have better quality.
+
+Caching prompts can substantially speed up reusing the same long context with
+different queries. To cache a prompt use `mlx_lm.cache_prompt`. For example:
+
+```bash
+cat prompt.txt | mlx_lm.cache_prompt \
+  --model mistralai/Mistral-7B-Instruct-v0.3 \
+  --prompt - \
+  --prompt-cache-file mistral_prompt.safetensors
+``` 
+
+Then use the cached prompt with `mlx_lm.generate`:
+
+```bash
+mlx_lm.generate \
+    --prompt-cache-file mistral_prompt.safetensors \
+    --prompt "\nSummarize the above text."
+```
+
+The cached prompt is treated as a prefix to the supplied prompt. Also notice
+when using a cached prompt, the model to use is read from the cache and need
+not be supplied explicitly.
+
+Prompt caching can also be used in the Python API in order to avoid
+recomputing the prompt. This is useful in multi-turn dialogues or across
+requests that use the same context. See the
+[example](https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/examples/chat.py)
+for more usage details.
+
 ### Supported Models

-The example supports Hugging Face format Mistral, Llama, and Phi-2 style
-models.  If the model you want to run is not supported, file an
+`mlx-lm` supports thousands of Hugging Face format LLMs. If the model you want to
+run is not supported, file an
 [issue](https://github.com/ml-explore/mlx-examples/issues/new) or better yet,
 submit a pull request.

@@ -167,3 +248,28 @@ model, tokenizer = load(
    tokenizer_config={"eos_token": "<|endoftext|>", "trust_remote_code": True},
 )
 ```
+
+### Large Models
+
+> [!NOTE]
+> This requires macOS 15.0 or higher to work.
+
+Models which are large relative to the total RAM available on the machine can
+be slow. `mlx-lm` will attempt to make them faster by wiring the memory
+occupied by the model and cache. This requires macOS 15 or higher to
+work.
+
+If you see the following warning message:
+
+> [WARNING] Generating with a model that requires ...
+
+then the model will likely be slow on the given machine. If the model fits in
+RAM then it can often be sped up by increasing the system wired memory limit.
+To increase the limit, set the following `sysctl`:
+
+```bash
+sudo sysctl iogpu.wired_limit_mb=N
+```
+
+The value `N` should be larger than the size of the model in megabytes but
+smaller than the memory size of the machine.
--- a/llms/mlx_lm/LORA.md
+++ b/llms/mlx_lm/LORA.md
@@ -57,6 +57,9 @@ mlx_lm.lora \
    --iters 600
 ```

+To fine-tune the full model weights, add the `--fine-tune-type full` flag.
+Currently supported fine-tuning types are `lora` (default), `dora`, and `full`.
+
 The `--data` argument must specify a path to a `train.jsonl`, `valid.jsonl`
 when using `--train` and a path to a `test.jsonl` when using `--test`. For more
 details on the data format see the section on [Data](#Data).
@@ -67,8 +70,8 @@ mistralai/Mistral-7B-v0.1`.
 If `--model` points to a quantized model, then the training will use QLoRA,
 otherwise it will use regular LoRA.

-By default, the adapter config and weights are saved in `adapters/`. You can
-specify the output location with `--adapter-path`.
+By default, the adapter config and learned weights are saved in `adapters/`.
+You can specify the output location with `--adapter-path`.

 You can resume fine-tuning with an existing adapter with
 `--resume-adapter-file <path_to_adapters.safetensors>`.
@@ -118,7 +121,7 @@ mlx_lm.fuse --model <path_to_model>
 ```

 This will by default load the adapters from `adapters/`, and save the fused
-model in the path `lora_fused_model/`. All of these are configurable.
+model in the path `fused_model/`. All of these are configurable.

 To upload a fused model, supply the `--upload-repo` and `--hf-path` arguments
 to `mlx_lm.fuse`. The latter is the repo name of the original model, which is
@@ -141,7 +144,7 @@ mlx_lm.fuse \
    --export-gguf
 ```

-This will save the GGUF model in `lora_fused_model/ggml-model-f16.gguf`. You
+This will save the GGUF model in `fused_model/ggml-model-f16.gguf`. You
 can specify the file name with `--gguf-path`.

 ## Data
@@ -160,50 +163,97 @@ For fine-tuning (`--train`), the data loader expects a `train.jsonl` and a
 `valid.jsonl` to be in the data directory. For evaluation (`--test`), the data
 loader expects a `test.jsonl` in the data directory. 

-Currently, `*.jsonl` files support three data formats: `chat`,
-`completions`, and `text`. Here are three examples of these formats:
+Currently, `*.jsonl` files support `chat`, `tools`, `completions`, and `text`
+data formats. Here are examples of these formats:

 `chat`:

+```jsonl
+{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello."}, {"role": "assistant", "content": "How can I assist you today?"}]}
+```
+
+`tools`:
+
+```jsonl
+{"messages":[{"role":"user","content":"What is the weather in San Francisco?"},{"role":"assistant","tool_calls":[{"id":"call_id","type":"function","function":{"name":"get_current_weather","arguments":"{\"location\": \"San Francisco, USA\", \"format\": \"celsius\"}"}}]}],"tools":[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city and country, eg. San Francisco, USA"},"format":{"type":"string","enum":["celsius","fahrenheit"]}},"required":["location","format"]}}}]}
+```
+
+<details>
+<summary>View the expanded single data tool format</summary>
+
 ```jsonl
 {
-  "messages": [
-    {
-      "role": "system",
-      "content": "You are a helpful assistant."
-    },
-    {
-      "role": "user",
-      "content": "Hello."
-    },
-    {
-      "role": "assistant",
-      "content": "How can I assistant you today."
-    }
-  ]
+    "messages": [
+        { "role": "user", "content": "What is the weather in San Francisco?" },
+        {
+            "role": "assistant",
+            "tool_calls": [
+                {
+                    "id": "call_id",
+                    "type": "function",
+                    "function": {
+                        "name": "get_current_weather",
+                        "arguments": "{\"location\": \"San Francisco, USA\", \"format\": \"celsius\"}"
+                    }
+                }
+            ]
+        }
+    ],
+    "tools": [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_current_weather",
+                "description": "Get the current weather",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The city and country, eg. San Francisco, USA"
+                        },
+                        "format": { "type": "string", "enum": ["celsius", "fahrenheit"] }
+                    },
+                    "required": ["location", "format"]
+                }
+            }
+        }
+    ]
 }
 ```

+
+The format for the `arguments` field in a function varies for different models.
+Common formats include JSON strings and dictionaries. The example provided
+follows the format used by
+[OpenAI](https://platform.openai.com/docs/guides/fine-tuning/fine-tuning-examples)
+and [Mistral
+AI](https://github.com/mistralai/mistral-finetune?tab=readme-ov-file#instruct).
+A dictionary format is used in Hugging Face's [chat
+templates](https://huggingface.co/docs/transformers/main/en/chat_templating#a-complete-tool-use-example).
+Refer to the documentation for the model you are fine-tuning for more details.
+
+</details>
+
 `completions`:

 ```jsonl
-{
-  "prompt": "What is the capital of France?",
-  "completion": "Paris."
-}
+{"prompt": "What is the capital of France?", "completion": "Paris."}
 ```

 `text`:

 ```jsonl
-{
-  "text": "This is an example for the model."
-}
+{"text": "This is an example for the model."}
 ```

 Note, the format is automatically determined by the dataset. Note also, keys in
 each line not expected by the loader will be ignored.

+> [!NOTE]
+> Each example in the datasets must be on a single line. Do not put more than
+> one example per line and do not split an example across multiple lines.
+
 ### Hugging Face Datasets

 To use Hugging Face datasets, first install the `datasets` package:
@@ -212,7 +262,13 @@ To use Hugging Face datasets, first install the `datasets` package:
 pip install datasets
 ```

-Specify the Hugging Face dataset arguments in a YAML config. For example:
+If the Hugging Face dataset is already in a supported format, you can specify
+it on the command line. For example, pass `--data mlx-community/wikisql` to
+train on the pre-formatted WikiSQL data.
+
+Otherwise, provide a mapping of keys in the dataset to the features MLX LM
+expects. Use a YAML config to specify the Hugging Face dataset arguments. For
+example:

 ```
 hf_dataset:
@@ -231,11 +287,13 @@ hf_dataset:
 - Arguments specified in `config` will be passed as keyword arguments to
  [`datasets.load_dataset`](https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset).

-In general, for the `chat` and `completions` formats, Hugging Face [chat
-templates](https://huggingface.co/blog/chat-templates) are used. This applies
-the model's chat template by default. If the model does not have a chat
-template, then Hugging Face will use a default. For example, the final text in
-the `chat` example above with Hugging Face's default template becomes:
+In general, for the `chat`, `tools` and `completions` formats, Hugging Face
+[chat
+templates](https://huggingface.co/docs/transformers/main/en/chat_templating)
+are used. This applies the model's chat template by default. If the model does
+not have a chat template, then Hugging Face will use a default. For example,
+the final text in the `chat` example above with Hugging Face's default template
+becomes:

 ```text
 <|im_start|>system
@@ -263,7 +321,7 @@ of memory. Here are some tips to reduce memory use should you need to do so:
   setting this to `2` or `1` will reduce memory consumption. This may slow
   things down a little, but will also reduce the memory use.

-3. Reduce the number of layers to fine-tune with `--lora-layers`. The default
+3. Reduce the number of layers to fine-tune with `--num-layers`. The default
   is `16`, so you can try `8` or `4`. This reduces the amount of memory
   needed for back propagation. It may also reduce the quality of the
   fine-tuned model if you are fine-tuning with a lot of data.
@@ -285,7 +343,7 @@ mlx_lm.lora \
    --model mistralai/Mistral-7B-v0.1 \
    --train \
    --batch-size 1 \
-    --lora-layers 4 \
+    --num-layers 4 \
    --data wikisql
 ```

@@ -295,4 +353,5 @@ tokens-per-second, using the MLX Example
 data set.

 [^lora]: Refer to the [arXiv paper](https://arxiv.org/abs/2106.09685) for more details on LoRA.
+
 [^qlora]: Refer to the paper [QLoRA: Efficient Finetuning of Quantized LLMs](https://arxiv.org/abs/2305.14314)
--- a/llms/mlx_lm/SERVER.md
+++ b/llms/mlx_lm/SERVER.md
@@ -50,7 +50,7 @@ curl localhost:8080/v1/chat/completions \
 - `role_mapping`: (Optional) A dictionary to customize the role prefixes in
  the generated prompt. If not provided, the default mappings are used.

- `stop`: (Optional) An array of strings or a single string. Thesse are
+- `stop`: (Optional) An array of strings or a single string. These are
  sequences of tokens on which the generation should stop.

 - `max_tokens`: (Optional) An integer specifying the maximum number of tokens
@@ -78,3 +78,54 @@ curl localhost:8080/v1/chat/completions \
 - `logprobs`: (Optional) An integer specifying the number of top tokens and
  corresponding log probabilities to return for each output in the generated
  sequence. If set, this can be any value between 1 and 10, inclusive.
+
+- `model`: (Optional) A string path to a local model or Hugging Face repo id.
+  If the path is local it must be relative to the directory the server was
+  started in.
+
+- `adapters`: (Optional) A string path to low-rank adapters. The path must be
+  relative to the directory the server was started in.
+
+### Response Fields
+
+- `id`: A unique identifier for the chat.
+
+- `system_fingerprint`: A unique identifier for the system.
+
+- `object`: Any of "chat.completions", "chat.completions.chunk" (for
+  streaming), or "text.completion".
+
+- `model`: The model repo or path (e.g. `"mlx-community/Llama-3.2-3B-Instruct-4bit"`).
+
+- `created`: A time-stamp for when the request was processed.
+
+- `choices`: A list of outputs. Each output is a dictionary containing the fields:
+    - `index`: The index in the list.
+    - `logprobs`: A dictionary containing the fields:
+        - `token_logprobs`: A list of the log probabilities for the generated
+          tokens.
+        - `tokens`: A list of the generated token ids.
+        - `top_logprobs`: A list of lists. Each list contains the `logprobs`
+          top tokens (if requested) with their corresponding probabilities.
+    - `finish_reason`: The reason the completion ended. This can be either of
+      `"stop"` or `"length"`.
+    - `message`: The text response from the model.
+
+- `usage`: A dictionary containing the fields:
+    - `prompt_tokens`: The number of prompt tokens processed.
+    - `completion_tokens`: The number of tokens generated.
+    - `total_tokens`: The total number of tokens, i.e. the sum of the above two fields.
+
+### List Models
+
+Use the `v1/models` endpoint to list available models:
+
+```shell
+curl localhost:8080/v1/models -H "Content-Type: application/json"
+```
+
+This will return a list of locally available models where each model in the
+list contains the following fields:
+
+- `id`: The Hugging Face repo id.
+- `created`: A time-stamp representing the model creation time.
--- a/llms/mlx_lm/__init__.py
+++ b/llms/mlx_lm/__init__.py
@@ -1,4 +1,9 @@
 # Copyright © 2023-2024 Apple Inc.

+import os
+
+from ._version import __version__
+
+os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
+
 from .utils import convert, generate, load, stream_generate
-from .version import __version__
--- a/llms/mlx_lm/_version.py
+++ b/llms/mlx_lm/_version.py
@@ -1,3 +1,3 @@
 # Copyright © 2023-2024 Apple Inc.

-__version__ = "0.16.0"
+__version__ = "0.19.3"
--- a/llms/mlx_lm/cache_prompt.py
+++ b/llms/mlx_lm/cache_prompt.py
@@ -0,0 +1,180 @@
+# Copyright © 2024 Apple Inc.
+
+import argparse
+import json
+import sys
+import time
+
+import mlx.core as mx
+
+from .models.cache import make_prompt_cache, save_prompt_cache
+from .utils import load, maybe_quantize_kv_cache
+
+DEFAULT_QUANTIZED_KV_START = 5000
+
+
+def setup_arg_parser():
+    """Set up and return the argument parser."""
+    parser = argparse.ArgumentParser(
+        description="Cache the state of a prompt to be reused with mlx_lm.generate"
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="mlx_model",
+        help="The path to the local model directory or Hugging Face repo.",
+    )
+    parser.add_argument(
+        "--adapter-path",
+        type=str,
+        help="Optional path for the trained adapter weights and config.",
+    )
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Enable trusting remote code for tokenizer",
+    )
+    parser.add_argument(
+        "--eos-token",
+        type=str,
+        default=None,
+        help="End of sequence token for tokenizer",
+    )
+    parser.add_argument(
+        "--ignore-chat-template",
+        action="store_true",
+        help="Use the raw prompt without the tokenizer's chat template.",
+    )
+    parser.add_argument(
+        "--use-default-chat-template",
+        action="store_true",
+        help="Use the default chat template",
+    )
+    parser.add_argument(
+        "--cache-limit-gb",
+        type=int,
+        default=None,
+        help="Set the MLX cache limit in GB",
+    )
+    parser.add_argument(
+        "--max-kv-size",
+        type=int,
+        default=None,
+        help="Set the maximum key-value cache size",
+    )
+    parser.add_argument(
+        "--prompt-cache-file",
+        help="The file to save the prompt cache in",
+        required=True,
+    )
+    parser.add_argument(
+        "--prompt",
+        required=True,
+        help="Message to be processed by the model ('-' reads from stdin)",
+    )
+    parser.add_argument(
+        "--kv-bits",
+        type=int,
+        help="Number of bits for KV cache quantization. "
+        "Defaults to no quantization.",
+        default=None,
+    )
+    parser.add_argument(
+        "--kv-group-size",
+        type=int,
+        help="Group size for KV cache quantization.",
+        default=64,
+    )
+    parser.add_argument(
+        "--quantized-kv-start",
+        help="When --kv-bits is set, start quantizing the KV cache "
+        "from this step onwards.",
+        type=int,
+        default=DEFAULT_QUANTIZED_KV_START,
+    )
+    return parser
+
+
+def main():
+    parser = setup_arg_parser()
+    args = parser.parse_args()
+
+    if args.cache_limit_gb is not None:
+        mx.metal.set_cache_limit(args.cache_limit_gb * 1024 * 1024 * 1024)
+
+    # Building tokenizer_config
+    tokenizer_config = {"trust_remote_code": True if args.trust_remote_code else None}
+    if args.eos_token is not None:
+        tokenizer_config["eos_token"] = args.eos_token
+
+    model, tokenizer = load(
+        args.model,
+        adapter_path=args.adapter_path,
+        tokenizer_config=tokenizer_config,
+    )
+
+    args.prompt = sys.stdin.read() if args.prompt == "-" else args.prompt
+
+    if args.use_default_chat_template:
+        if tokenizer.chat_template is None:
+            tokenizer.chat_template = tokenizer.default_chat_template
+
+    if not args.ignore_chat_template and (
+        hasattr(tokenizer, "apply_chat_template")
+        and tokenizer.chat_template is not None
+    ):
+        messages = [{"role": "user", "content": args.prompt}]
+        prompt = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+
+        # Treat the prompt as a prefix assuming that the suffix will be
+        # provided at generation time.
+        test_prompt = tokenizer.apply_chat_template(
+            [{"role": "user", "content": "<query>"}],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        n = len(test_prompt) - test_prompt.index("<query>") - len("<query>")
+        prompt = prompt[:-n]
+    else:
+        prompt = args.prompt
+
+    cache = make_prompt_cache(model, args.max_kv_size)
+    y = mx.array(tokenizer.encode(prompt))
+
+    # Process the prompt
+    processed = 0
+    step_size = 512
+    start = time.time()
+    max_msg_len = 0
+    while y.size > 0:
+
+        model(y[:step_size][None], cache=cache)
+        mx.eval([c.state for c in cache])
+        processed += min(y.size, step_size)
+        y = y[step_size:]
+        current = time.time()
+        speed = processed / (current - start)
+        msg = f"\rProcessed {processed:6d} tokens ({speed:6.2f} tok/s)"
+        max_msg_len = max(max_msg_len, len(msg))
+        print(msg + " " * (max_msg_len - len(msg)), end="", flush=True)
+
+        maybe_quantize_kv_cache(
+            cache, args.quantized_kv_start, args.kv_group_size, args.kv_bits
+        )
+
+    print()
+    print(f"Peak memory: {mx.metal.get_peak_memory() / 2**30:.3f} GB")
+
+    print("Saving...")
+    metadata = {}
+    metadata["model"] = args.model
+    metadata["chat_template"] = tokenizer.chat_template
+    metadata["tokenizer_config"] = json.dumps(tokenizer_config)
+    print(f"Peak memory: {mx.metal.get_peak_memory() / 2**30:.3f} GB")
+    save_prompt_cache(args.prompt_cache_file, cache, metadata)
+
+
+if __name__ == "__main__":
+    main()
--- a/llms/mlx_lm/chat.py
+++ b/llms/mlx_lm/chat.py
@@ -0,0 +1,82 @@
+# Copyright © 2023-2024 Apple Inc.
+
+import argparse
+import json
+
+import mlx.core as mx
+
+from .models.cache import load_prompt_cache, make_prompt_cache, save_prompt_cache
+from .utils import load, stream_generate
+
+DEFAULT_TEMP = 0.0
+DEFAULT_TOP_P = 1.0
+DEFAULT_SEED = 0
+DEFAULT_MODEL = "mlx-community/Llama-3.2-3B-Instruct-4bit"
+
+
+def setup_arg_parser():
+    """Set up and return the argument parser."""
+    parser = argparse.ArgumentParser(description="Chat with an LLM")
+    parser.add_argument(
+        "--model",
+        type=str,
+        help="The path to the local model directory or Hugging Face repo.",
+        default=DEFAULT_MODEL,
+    )
+    parser.add_argument(
+        "--adapter-path",
+        type=str,
+        help="Optional path for the trained adapter weights and config.",
+    )
+    parser.add_argument(
+        "--temp", type=float, default=DEFAULT_TEMP, help="Sampling temperature"
+    )
+    parser.add_argument(
+        "--top-p", type=float, default=DEFAULT_TOP_P, help="Sampling top-p"
+    )
+    parser.add_argument("--seed", type=int, default=DEFAULT_SEED, help="PRNG seed")
+    parser.add_argument(
+        "--max-kv-size",
+        type=int,
+        help="Set the maximum key-value cache size",
+        default=None,
+    )
+    return parser
+
+
+def main():
+    parser = setup_arg_parser()
+    args = parser.parse_args()
+
+    mx.random.seed(args.seed)
+
+    model, tokenizer = load(
+        args.model,
+        adapter_path=args.adapter_path,
+        tokenizer_config={"trust_remote_code": True},
+    )
+
+    print(f"[INFO] Starting chat session with {args.model}. To exit, enter 'q'.")
+    prompt_cache = make_prompt_cache(model, args.max_kv_size)
+    while True:
+        query = input(">> ")
+        if query == "q":
+            break
+        messages = [{"role": "user", "content": query}]
+        prompt = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        for response in stream_generate(
+            model,
+            tokenizer,
+            prompt,
+            temp=args.temp,
+            top_p=args.top_p,
+            prompt_cache=prompt_cache,
+        ):
+            print(response, flush=True, end="")
+        print()
+
+
+if __name__ == "__main__":
+    main()
--- a/llms/mlx_lm/convert.py
+++ b/llms/mlx_lm/convert.py
@@ -31,7 +31,7 @@ def configure_parser() -> argparse.ArgumentParser:
    )
    parser.add_argument(
        "--dtype",
-        help="Type to save the parameters, ignored if -q is given.",
+        help="Type to save the non-quantized parameters.",
        type=str,
        choices=["float16", "bfloat16", "float32"],
        default="float16",
--- a/llms/mlx_lm/examples/chat.py
+++ b/llms/mlx_lm/examples/chat.py
@@ -0,0 +1,53 @@
+# Copyright © 2024 Apple Inc.
+
+"""
+An example of a multi-turn chat with prompt caching.
+"""
+
+from mlx_lm import generate, load
+from mlx_lm.models.cache import load_prompt_cache, make_prompt_cache, save_prompt_cache
+
+model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")
+
+# Make the initial prompt cache for the model
+prompt_cache = make_prompt_cache(model)
+
+# User turn
+prompt = "Hi my name is <Name>."
+messages = [{"role": "user", "content": prompt}]
+prompt = tokenizer.apply_chat_template(
+    messages, tokenize=False, add_generation_prompt=True
+)
+
+# Assistant response
+response = generate(
+    model,
+    tokenizer,
+    prompt=prompt,
+    verbose=True,
+    temp=0.0,
+    prompt_cache=prompt_cache,
+)
+
+# User turn
+prompt = "What's my name?"
+messages = [{"role": "user", "content": prompt}]
+prompt = tokenizer.apply_chat_template(
+    messages, tokenize=False, add_generation_prompt=True
+)
+
+# Assistant response
+response = generate(
+    model,
+    tokenizer,
+    prompt=prompt,
+    verbose=True,
+    temp=0.0,
+    prompt_cache=prompt_cache,
+)
+
+# Save the prompt cache to disk to reuse it at a later time
+save_prompt_cache("mistral_prompt.safetensors", prompt_cache)
+
+# Load the prompt cache from disk
+prompt_cache = load_prompt_cache("mistral_prompt.safetensors")
--- a/llms/mlx_lm/examples/generate_response.py
+++ b/llms/mlx_lm/examples/generate_response.py
@@ -0,0 +1,42 @@
+# Copyright © 2024 Apple Inc.
+
+from mlx_lm import generate, load
+
+# Specify the checkpoint
+checkpoint = "mistralai/Mistral-7B-Instruct-v0.3"
+
+# Load the corresponding model and tokenizer
+model, tokenizer = load(path_or_hf_repo=checkpoint)
+
+# Specify the prompt and conversation history
+prompt = "Why is the sky blue?"
+conversation = [{"role": "user", "content": prompt}]
+
+# Transform the prompt into the chat template
+prompt = tokenizer.apply_chat_template(
+    conversation=conversation, tokenize=False, add_generation_prompt=True
+)
+
+# Specify the maximum number of tokens
+max_tokens = 1_000
+
+# Specify if tokens and timing information will be printed
+verbose = True
+
+# Some optional arguments for causal language model generation
+generation_args = {
+    "temp": 0.7,
+    "repetition_penalty": 1.2,
+    "repetition_context_size": 20,
+    "top_p": 0.95,
+}
+
+# Generate a response with the specified settings
+response = generate(
+    model=model,
+    tokenizer=tokenizer,
+    prompt=prompt,
+    max_tokens=max_tokens,
+    verbose=verbose,
+    **generation_args,
+)
--- a/llms/mlx_lm/examples/lora_config.yaml
+++ b/llms/mlx_lm/examples/lora_config.yaml
@@ -1,8 +1,12 @@
 # The path to the local model directory or Hugging Face repo.
 model: "mlx_model"
+
 # Whether or not to train (boolean)
 train: true

+# The fine-tuning method: "lora", "dora", or "full".
+fine_tune_type: lora
+
 # Directory with {train, valid, test}.jsonl files
 data: "/path/to/training/data"

@@ -10,7 +14,7 @@ data: "/path/to/training/data"
 seed: 0

 # Number of layers to fine-tune
-lora_layers: 16
+num_layers: 16

 # Minibatch size.
 batch_size: 4
@@ -51,9 +55,6 @@ max_seq_length: 2048
 # Use gradient checkpointing to reduce memory use.
 grad_checkpoint: false

-# Use DoRA instead of LoRA.
-use_dora: false
-
 # LoRA parameters can only be specified in a config file
 lora_parameters:
  # The layer keys to apply LoRA to.
--- a/llms/mlx_lm/fuse.py
+++ b/llms/mlx_lm/fuse.py
@@ -6,9 +6,9 @@ from pathlib import Path
 from mlx.utils import tree_flatten, tree_unflatten

 from .gguf import convert_to_gguf
-from .tuner.dora import DoRALinear
-from .tuner.lora import LoRALinear, LoRASwitchLinear
-from .tuner.utils import apply_lora_layers, dequantize
+from .tuner.dora import DoRAEmbedding, DoRALinear
+from .tuner.lora import LoRAEmbedding, LoRALinear, LoRASwitchLinear
+from .tuner.utils import dequantize, load_adapters
 from .utils import (
    fetch_from_hub,
    get_model_path,
@@ -29,7 +29,7 @@ def parse_arguments() -> argparse.Namespace:
    )
    parser.add_argument(
        "--save-path",
-        default="lora_fused_model",
+        default="fused_model",
        help="The path to save the fused model.",
    )
    parser.add_argument(
@@ -77,15 +77,14 @@ def main() -> None:
    model, config, tokenizer = fetch_from_hub(model_path)

    model.freeze()
-    model = apply_lora_layers(model, args.adapter_path)
+    model = load_adapters(model, args.adapter_path)

    fused_linears = [
-        (n, m.to_linear())
-        for n, m in model.named_modules()
-        if isinstance(m, (LoRASwitchLinear, LoRALinear, DoRALinear))
+        (n, m.fuse()) for n, m in model.named_modules() if hasattr(m, "fuse")
    ]

-    model.update_modules(tree_unflatten(fused_linears))
+    if fused_linears:
+        model.update_modules(tree_unflatten(fused_linears))

    if args.de_quantize:
        print("De-quantizing model")
--- a/llms/mlx_lm/generate.py
+++ b/llms/mlx_lm/generate.py
@@ -1,17 +1,25 @@
 # Copyright © 2023-2024 Apple Inc.

 import argparse
+import json
+import sys

 import mlx.core as mx

+from .models.cache import QuantizedKVCache, load_prompt_cache
 from .utils import generate, load

-DEFAULT_MODEL_PATH = "mlx_model"
 DEFAULT_PROMPT = "hello"
 DEFAULT_MAX_TOKENS = 100
-DEFAULT_TEMP = 0.6
+DEFAULT_TEMP = 0.0
 DEFAULT_TOP_P = 1.0
 DEFAULT_SEED = 0
+DEFAULT_MODEL = "mlx-community/Llama-3.2-3B-Instruct-4bit"
+DEFAULT_QUANTIZED_KV_START = 5000
+
+
+def str2bool(string):
+    return string.lower() not in ["false", "f"]


 def setup_arg_parser():
@@ -20,8 +28,11 @@ def setup_arg_parser():
    parser.add_argument(
        "--model",
        type=str,
-        default="mlx_model",
-        help="The path to the local model directory or Hugging Face repo.",
+        help=(
+            "The path to the local model directory or Hugging Face repo. "
+            f"If no model is specified, then {DEFAULT_MODEL} is used."
+        ),
+        default=None,
    )
    parser.add_argument(
        "--adapter-path",
@@ -40,7 +51,9 @@ def setup_arg_parser():
        help="End of sequence token for tokenizer",
    )
    parser.add_argument(
-        "--prompt", default=DEFAULT_PROMPT, help="Message to be processed by the model"
+        "--prompt",
+        default=DEFAULT_PROMPT,
+        help="Message to be processed by the model ('-' reads from stdin)",
    )
    parser.add_argument(
        "--max-tokens",
@@ -66,17 +79,48 @@ def setup_arg_parser():
        action="store_true",
        help="Use the default chat template",
    )
+    parser.add_argument(
+        "--verbose",
+        type=str2bool,
+        default=True,
+        help="Log verbose output when 'True' or 'T' or only print the response when 'False' or 'F'",
+    )
    parser.add_argument(
        "--colorize",
        action="store_true",
        help="Colorize output based on T[0] probability",
    )
    parser.add_argument(
-        "--cache-limit-gb",
+        "--max-kv-size",
        type=int,
+        help="Set the maximum key-value cache size",
        default=None,
-        help="Set the MLX cache limit in GB",
-        required=False,
+    )
+    parser.add_argument(
+        "--prompt-cache-file",
+        type=str,
+        default=None,
+        help="A file containing saved KV caches to avoid recomputing them",
+    )
+    parser.add_argument(
+        "--kv-bits",
+        type=int,
+        help="Number of bits for KV cache quantization. "
+        "Defaults to no quantization.",
+        default=None,
+    )
+    parser.add_argument(
+        "--kv-group-size",
+        type=int,
+        help="Group size for KV cache quantization.",
+        default=64,
+    )
+    parser.add_argument(
+        "--quantized-kv-start",
+        help="When --kv-bits is set, start quantizing the KV cache "
+        "from this step onwards.",
+        type=int,
+        default=DEFAULT_QUANTIZED_KV_START,
    )
    return parser

@@ -114,16 +158,46 @@ def main():

    mx.random.seed(args.seed)

-    if args.cache_limit_gb is not None:
-        mx.metal.set_cache_limit(args.cache_limit_gb * 1024 * 1024 * 1024)
+    # Load the prompt cache and metadata if a cache file is provided
+    using_cache = args.prompt_cache_file is not None
+    if using_cache:
+        prompt_cache, metadata = load_prompt_cache(
+            args.prompt_cache_file,
+            return_metadata=True,
+        )
+        if isinstance(prompt_cache[0], QuantizedKVCache):
+            if args.kv_bits is not None and args.kv_bits != prompt_cache[0].bits:
+                raise ValueError(
+                    "--kv-bits does not match the kv cache loaded from --prompt-cache-file."
+                )
+            if args.kv_group_size != prompt_cache[0].group_size:
+                raise ValueError(
+                    "--kv-group-size does not match the kv cache loaded from --prompt-cache-file."
+                )

    # Building tokenizer_config
-    tokenizer_config = {"trust_remote_code": True if args.trust_remote_code else None}
+    tokenizer_config = (
+        {} if not using_cache else json.loads(metadata["tokenizer_config"])
+    )
+    if args.trust_remote_code:
+        tokenizer_config["trust_remote_code"] = True
    if args.eos_token is not None:
        tokenizer_config["eos_token"] = args.eos_token

+    model_path = args.model
+    if using_cache:
+        if model_path is None:
+            model_path = metadata["model"]
+        elif model_path != metadata["model"]:
+            raise ValueError(
+                f"Providing a different model ({model_path}) than that "
+                f"used to create the prompt cache ({metadata['model']}) "
+                "is an error."
+            )
+    model_path = model_path or DEFAULT_MODEL
+
    model, tokenizer = load(
-        args.model,
+        model_path,
        adapter_path=args.adapter_path,
        tokenizer_config=tokenizer_config,
    )
@@ -131,30 +205,56 @@ def main():
    if args.use_default_chat_template:
        if tokenizer.chat_template is None:
            tokenizer.chat_template = tokenizer.default_chat_template
+    elif using_cache:
+        tokenizer.chat_template = metadata["chat_template"]

    if not args.ignore_chat_template and (
        hasattr(tokenizer, "apply_chat_template")
        and tokenizer.chat_template is not None
    ):
-        messages = [{"role": "user", "content": args.prompt}]
+        messages = [
+            {
+                "role": "user",
+                "content": sys.stdin.read() if args.prompt == "-" else args.prompt,
+            }
+        ]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
+
+        # Treat the prompt as a suffix assuming that the prefix is in the
+        # stored kv cache.
+        if using_cache:
+            test_prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": "<query>"}],
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+            prompt = prompt[test_prompt.index("<query>") :]
    else:
        prompt = args.prompt

+    if args.colorize and not args.verbose:
+        raise ValueError("Cannot use --colorize with --verbose=False")
    formatter = colorprint_by_t0 if args.colorize else None

-    generate(
+    response = generate(
        model,
        tokenizer,
        prompt,
        args.max_tokens,
-        verbose=True,
+        verbose=args.verbose,
        formatter=formatter,
        temp=args.temp,
        top_p=args.top_p,
+        max_kv_size=args.max_kv_size,
+        prompt_cache=prompt_cache if using_cache else None,
+        kv_bits=args.kv_bits,
+        kv_group_size=args.kv_group_size,
+        quantized_kv_start=args.quantized_kv_start,
    )
+    if not args.verbose:
+        print(response)


 if __name__ == "__main__":
--- a/llms/mlx_lm/gguf.py
+++ b/llms/mlx_lm/gguf.py
@@ -59,7 +59,7 @@ class HfVocab:
        for token_id in range(self.vocab_size_base):
            if token_id in self.added_tokens_ids:
                continue
-            token_text = reverse_vocab[token_id].encode("utf-8")
+            token_text = reverse_vocab[token_id]
            yield token_text, self.get_token_score(token_id), self.get_token_type(
                token_id, token_text, self.special_ids
            )
@@ -67,7 +67,7 @@ class HfVocab:
    def get_token_type(
        self, token_id: int, token_text: bytes, special_ids: Set[int]
    ) -> TokenType:
-        if re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text):
+        if re.fullmatch(r"<0x[0-9A-Fa-f]{2}>", token_text):
            return TokenType.BYTE
        return TokenType.CONTROL if token_id in special_ids else TokenType.NORMAL

@@ -77,14 +77,12 @@ class HfVocab:
    def added_tokens(self) -> Iterable[Tuple[bytes, float, TokenType]]:
        for text in self.added_tokens_list:
            if text in self.specials:
-                toktype = self.get_token_type(
-                    self.specials[text], b"", self.special_ids
-                )
+                toktype = self.get_token_type(self.specials[text], "", self.special_ids)
                score = self.get_token_score(self.specials[text])
            else:
                toktype = TokenType.USER_DEFINED
                score = -1000.0
-            yield text.encode("utf-8"), score, toktype
+            yield text, score, toktype

    def has_newline_token(self):
        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
@@ -243,15 +241,18 @@ def prepare_metadata(config, vocab):
    metadata["tokenizer.ggml.tokens"] = tokens
    metadata["tokenizer.ggml.scores"] = mx.array(scores, dtype=mx.float32)
    metadata["tokenizer.ggml.token_type"] = mx.array(toktypes, dtype=mx.uint32)
-    metadata["tokenizer.ggml.bos_token_id"] = mx.array(
-        vocab.tokenizer.bos_token_id, dtype=mx.uint32
-    )
-    metadata["tokenizer.ggml.eos_token_id"] = mx.array(
-        vocab.tokenizer.eos_token_id, dtype=mx.uint32
-    )
-    metadata["tokenizer.ggml.unknown_token_id"] = mx.array(
-        vocab.tokenizer.unk_token_id, dtype=mx.uint32
-    )
+    if vocab.tokenizer.bos_token_id is not None:
+        metadata["tokenizer.ggml.bos_token_id"] = mx.array(
+            vocab.tokenizer.bos_token_id, dtype=mx.uint32
+        )
+    if vocab.tokenizer.eos_token_id is not None:
+        metadata["tokenizer.ggml.eos_token_id"] = mx.array(
+            vocab.tokenizer.eos_token_id, dtype=mx.uint32
+        )
+    if vocab.tokenizer.unk_token_id is not None:
+        metadata["tokenizer.ggml.unknown_token_id"] = mx.array(
+            vocab.tokenizer.unk_token_id, dtype=mx.uint32
+        )

    metadata = {k: v for k, v in metadata.items() if v is not None}
    return metadata
--- a/llms/mlx_lm/lora.py
+++ b/llms/mlx_lm/lora.py
@@ -15,9 +15,9 @@ from .tokenizer_utils import TokenizerWrapper
 from .tuner.datasets import load_dataset
 from .tuner.trainer import TrainingArgs, TrainingCallback, evaluate, train
 from .tuner.utils import (
-    apply_lora_layers,
    build_schedule,
    linear_to_lora_layers,
+    load_adapters,
    print_trainable_parameters,
 )
 from .utils import load, save_config
@@ -41,9 +41,10 @@ yaml_loader.add_implicit_resolver(
 CONFIG_DEFAULTS = {
    "model": "mlx_model",
    "train": False,
+    "fine_tune_type": "lora",
    "data": "data/",
    "seed": 0,
-    "lora_layers": 16,
+    "num_layers": 16,
    "batch_size": 4,
    "iters": 1000,
    "val_batches": 25,
@@ -58,7 +59,6 @@ CONFIG_DEFAULTS = {
    "max_seq_length": 2048,
    "lr_schedule": None,
    "lora_parameters": {"rank": 8, "alpha": 16, "dropout": 0.0, "scale": 10.0},
-    "use_dora": False,
 }


@@ -79,10 +79,20 @@ def build_parser():
    parser.add_argument(
        "--data",
        type=str,
-        help="Directory with {train, valid, test}.jsonl files",
+        help=(
+            "Directory with {train, valid, test}.jsonl files or the name "
+            "of a Hugging Face dataset (e.g., 'mlx-community/wikisql')"
+        ),
    )
    parser.add_argument(
-        "--lora-layers",
+        "--fine-tune-type",
+        type=str,
+        choices=["lora", "dora", "full"],
+        default="lora",
+        help="Type of fine-tuning to perform: lora, dora, or full.",
+    )
+    parser.add_argument(
+        "--num-layers",
        type=int,
        help="Number of layers to fine-tune. Default is 16, use -1 for all.",
    )
@@ -107,12 +117,12 @@ def build_parser():
    parser.add_argument(
        "--resume-adapter-file",
        type=str,
-        help="Load path to resume training with the given adapters.",
+        help="Load path to resume training from the given fine-tuned weights.",
    )
    parser.add_argument(
        "--adapter-path",
        type=str,
-        help="Save/load path for the adapters.",
+        help="Save/load path for the fine-tuned weights.",
    )
    parser.add_argument(
        "--save-every",
@@ -148,9 +158,6 @@ def build_parser():
        default=None,
    )
    parser.add_argument("--seed", type=int, default=None, help="The PRNG seed")
-    parser.add_argument(
-        "--use-dora", action="store_true", default=None, help="Use DoRA to finetune."
-    )
    return parser


@@ -162,21 +169,31 @@ def train_model(
    valid_set,
    training_callback: TrainingCallback = None,
 ):
-    # Freeze all layers
    model.freeze()
+    if args.fine_tune_type == "full":
+        # Unfreeze only the last ``num_layers`` layers. A non-positive value
+        # (e.g. -1) is clamped to 0 and ``layers[-0:]`` selects every layer.
+        # ``max`` is required here: ``min(num_layers, 0)`` is 0 for any
+        # positive count, which would always unfreeze the whole model.
+        for layer in model.layers[-max(args.num_layers, 0) :]:
+            layer.unfreeze()
+    elif args.fine_tune_type in ["lora", "dora"]:
+        # Convert linear layers to lora/dora layers and unfreeze in the process
+        linear_to_lora_layers(
+            model,
+            args.num_layers,
+            args.lora_parameters,
+            use_dora=(args.fine_tune_type == "dora"),
+        )
+    else:
+        raise ValueError(f"Received unknown fine-tune-type {args.fine_tune_type}")

-    # Convert linear layers to lora layers and unfreeze in the process
-    linear_to_lora_layers(model, args.lora_layers, args.lora_parameters)
-
-    # Resume training the given adapters.
+    # Resume from weights if provided
    if args.resume_adapter_file is not None:
-        print(f"Loading pretrained adapters from {args.resume_adapter_file}")
+        print(f"Loading fine-tuned weights from {args.resume_adapter_file}")
        model.load_weights(args.resume_adapter_file, strict=False)

    print_trainable_parameters(model)

    adapter_path = Path(args.adapter_path)
    adapter_path.mkdir(parents=True, exist_ok=True)
+
    adapter_file = adapter_path / "adapters.safetensors"
    save_config(vars(args), adapter_path / "adapter_config.json")

@@ -240,7 +257,7 @@ def run(args, training_callback: TrainingCallback = None):
    if args.test and not args.train:
        # Allow testing without LoRA layers by providing empty path
        if args.adapter_path != "":
-            apply_lora_layers(model, args.adapter_path)
+            load_adapters(model, args.adapter_path)

    elif args.train:
        print("Training")
--- a/llms/mlx_lm/models/base.py
+++ b/llms/mlx_lm/models/base.py
@@ -1,46 +1,13 @@
+# Copyright © 2023-2024 Apple Inc.
+
 import inspect
 from dataclasses import dataclass
+from typing import Any, Optional

 import mlx.core as mx
+from mlx.utils import tree_map

-
-def create_additive_causal_mask(N: int, offset: int = 0):
-    rinds = mx.arange(offset + N)
-    linds = mx.arange(offset, offset + N) if offset else rinds
-    mask = linds[:, None] < rinds[None]
-    return mask * -1e9
-
-
-class KVCache:
-
-    def __init__(self, head_dim, n_kv_heads):
-        self.n_kv_heads = n_kv_heads
-        self.head_dim = head_dim
-        self.keys = None
-        self.values = None
-        self.offset = 0
-        self.step = 256
-
-    def update_and_fetch(self, keys, values):
-        prev = self.offset
-        if self.keys is None or (prev + keys.shape[2]) > self.keys.shape[2]:
-            n_steps = (self.step + keys.shape[2] - 1) // self.step
-            shape = (1, self.n_kv_heads, n_steps * self.step, self.head_dim)
-            new_k = mx.zeros(shape, keys.dtype)
-            new_v = mx.zeros(shape, values.dtype)
-            if self.keys is not None:
-                if prev % self.step != 0:
-                    self.keys = self.keys[..., :prev, :]
-                    self.values = self.values[..., :prev, :]
-                self.keys = mx.concatenate([self.keys, new_k], axis=2)
-                self.values = mx.concatenate([self.values, new_v], axis=2)
-            else:
-                self.keys, self.values = new_k, new_v
-
-        self.offset += keys.shape[2]
-        self.keys[..., prev : self.offset, :] = keys
-        self.values[..., prev : self.offset, :] = values
-        return self.keys[..., : self.offset, :], self.values[..., : self.offset, :]
+from .cache import QuantizedKVCache


@dataclass
@@ -54,3 +21,93 @@ class BaseModelArgs:
                if k in inspect.signature(cls).parameters
            }
        )
+
+
+def create_causal_mask(N: int, offset: int = 0, window_size: Optional[int] = None):
+    """
+    Build an additive causal attention mask.
+
+    Rows correspond to the ``N`` query positions ``offset .. offset + N - 1``
+    and columns to the key positions ``0 .. offset + N - 1``. Entries are
+    ``-1e9`` where attention is blocked and zero elsewhere.
+
+    Args:
+        N (int): Number of query positions.
+        offset (int): Position of the first query. Default: ``0``.
+        window_size (Optional[int]): If given, keys that lie more than
+            ``window_size`` positions before the query are blocked as well.
+    """
+    total = offset + N
+    key_pos = mx.arange(total)
+    if offset:
+        query_pos = mx.arange(offset, total)
+    else:
+        query_pos = key_pos
+
+    # Broadcast queries down the rows and keys across the columns.
+    query_col = query_pos[:, None]
+    key_row = key_pos[None]
+
+    # A query may not attend to keys that come after it.
+    blocked = query_col < key_row
+    if window_size is not None:
+        too_far_back = query_col > key_row + window_size
+        blocked = blocked | too_far_back
+    return blocked * -1e9
+
+
+def create_attention_mask(h: mx.array, cache: Optional[Any] = None):
+    """
+    Create the causal mask for the hidden states ``h``, or ``None``.
+
+    No mask is produced when ``h.shape[1]`` is at most one. Otherwise the
+    mask offset is taken from the first cache entry (if there is one); when
+    that entry has a ``max_size`` attribute the offset is capped at
+    ``max_size - 1`` and ``max_size`` is used as the attention window.
+
+    Args:
+        h (mx.array): Hidden states; axis 1 is used as the sequence length.
+        cache (Optional[Any]): Indexable collection of cache entries.
+
+    Returns:
+        The mask cast to ``h.dtype``, or ``None``.
+    """
+    seq_len = h.shape[1]
+    if seq_len <= 1:
+        return None
+
+    offset, window_size = 0, None
+    first = cache[0] if cache is not None else None
+    if first is not None:
+        if hasattr(first, "max_size"):
+            offset = min(first.max_size - 1, first.offset)
+            window_size = first.max_size
+        else:
+            offset = first.offset
+
+    causal = create_causal_mask(seq_len, offset, window_size=window_size)
+    return causal.astype(h.dtype)
+
+
+def quantized_scaled_dot_product_attention(
+    queries: mx.array,
+    q_keys: tuple[mx.array, mx.array, mx.array],
+    q_values: tuple[mx.array, mx.array, mx.array],
+    scale: float,
+    mask: Optional[mx.array],
+    group_size: int = 64,
+    bits: int = 8,
+) -> mx.array:
+    """
+    Scaled dot-product attention over quantized keys and values.
+
+    Args:
+        queries (mx.array): Unpacked as ``(B, n_q_heads, L, D)``.
+        q_keys: Tuple of arrays passed positionally to
+            ``mx.quantized_matmul`` (transposed) to compute the scores.
+        q_values: Tuple of arrays passed positionally to
+            ``mx.quantized_matmul`` to compute the output.
+        scale (float): Factor applied to the queries.
+        mask (Optional[mx.array]): Additive mask applied to the scores.
+        group_size (int): Quantization group size. Default: ``64``.
+        bits (int): Quantization bit width. Default: ``8``.
+
+    Returns:
+        mx.array: The attention output, reshaped to ``(B, n_q_heads, L, D)``
+        when several query heads share one key/value head.
+    """
+    B, n_q_heads, L, D = queries.shape
+    n_kv_heads = q_keys[0].shape[-3]
+    n_repeats = n_q_heads // n_kv_heads
+    grouped = n_repeats > 1
+    quant_kwargs = dict(group_size=group_size, bits=bits)
+
+    # NOTE(review): in-place multiply; this may also update the array object
+    # held by the caller -- confirm callers do not reuse ``queries``.
+    queries *= scale
+
+    if grouped:
+        # Fold the query heads that share a key/value head into an extra
+        # axis and give the keys/values a matching singleton axis.
+        def add_group_axis(x):
+            return mx.expand_dims(x, axis=-3)
+
+        queries = mx.reshape(queries, (B, n_kv_heads, n_repeats, L, D))
+        q_keys = tree_map(add_group_axis, q_keys)
+        q_values = tree_map(add_group_axis, q_values)
+
+    scores = mx.quantized_matmul(queries, *q_keys, transpose=True, **quant_kwargs)
+    if mask is not None:
+        scores += mask
+    weights = mx.softmax(scores, axis=-1, precise=True)
+    out = mx.quantized_matmul(weights, *q_values, transpose=False, **quant_kwargs)
+
+    if grouped:
+        out = mx.reshape(out, (B, n_q_heads, L, D))
+
+    return out
+
+
+def scaled_dot_product_attention(
+    queries,
+    keys,
+    values,
+    cache,
+    scale: float,
+    mask: Optional[mx.array],
+) -> mx.array:
+    """
+    Dispatch attention according to the type of ``cache``.
+
+    When ``cache`` is a ``QuantizedKVCache`` the quantized implementation is
+    used with the cache's ``group_size`` and ``bits``; in every other case
+    (including ``cache=None``) ``mx.fast.scaled_dot_product_attention`` is
+    called.
+    """
+    if not isinstance(cache, QuantizedKVCache):
+        return mx.fast.scaled_dot_product_attention(
+            queries, keys, values, scale=scale, mask=mask
+        )
+
+    return quantized_scaled_dot_product_attention(
+        queries,
+        keys,
+        values,
+        scale=scale,
+        mask=mask,
+        group_size=cache.group_size,
+        bits=cache.bits,
+    )
--- a/llms/mlx_lm/models/cache.py
+++ b/llms/mlx_lm/models/cache.py
@@ -0,0 +1,438 @@
+# Copyright © 2023-2024 Apple Inc.
+
+from typing import Any, Dict, List, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx.utils import tree_flatten, tree_map, tree_unflatten
+
+
+def make_prompt_cache(
+    model: nn.Module,
+    max_kv_size: Optional[int] = None,
+) -> List[Any]:
+    """
+    Construct the model's cache for use during generation.
+
+    If the model provides a ``make_cache`` method its result is returned
+    as-is. Otherwise one default cache is created per entry in
+    ``model.layers``.
+
+    Args:
+        model (nn.Module): The language model.
+        max_kv_size (Optional[int]): If provided and the model does not have a
+            ``make_cache`` method, a ``RotatingKVCache`` is used with a maximum
+            size of ``max_kv_size``
+    """
+    if not hasattr(model, "make_cache"):
+        layer_count = len(model.layers)
+        if max_kv_size is None:
+            return [KVCache() for _ in range(layer_count)]
+        return [
+            RotatingKVCache(max_size=max_kv_size, keep=4) for _ in range(layer_count)
+        ]
+
+    return model.make_cache()
+
+
+def save_prompt_cache(
+    file_name: str,
+    cache: List[Any],
+    metadata: Optional[Dict[str, str]] = None,
+):
+    """
+    Save a pre-computed prompt cache to a file.
+
+    The ``state`` of every cache entry is flattened into the saved arrays.
+    The entries' ``meta_state``, the user ``metadata`` and the cache class
+    names are flattened into the file's metadata.
+
+    Args:
+        file_name (str): The ``.safetensors`` file name.
+        cache (List[Any]): The model state.
+        metadata (Optional[Dict[str, str]]): Optional metadata to save along
+            with model state. ``None`` (the default) is treated as an empty
+            dictionary.
+    """
+    # Avoid a shared mutable default argument.
+    if metadata is None:
+        metadata = {}
+
+    cache_data = dict(tree_flatten([c.state for c in cache]))
+    cache_info = [c.meta_state for c in cache]
+    cache_classes = [type(c).__name__ for c in cache]
+    # The position in this list is the layout ``load_prompt_cache`` unpacks.
+    cache_metadata = dict(tree_flatten([cache_info, metadata, cache_classes]))
+    mx.save_safetensors(file_name, cache_data, cache_metadata)
+
+
+def load_prompt_cache(file_name, return_metadata=False):
+    """
+    Load a prompt cache from a file.
+
+    Args:
+        file_name (str): The ``.safetensors`` file name.
+        return_metadata (bool): Whether or not to return metadata.
+            Default: ``False``.
+
+    Returns:
+        List[Any] or Tuple[List[Any], Dict[str, str]]: The prompt cache and
+            the metadata if requested.
+
+    Raises:
+        ValueError: If the file names a cache class that is not a cache
+            type defined in this module.
+    """
+    arrays, cache_metadata = mx.load(file_name, return_metadata=True)
+    arrays = tree_unflatten(list(arrays.items()))
+    cache_metadata = tree_unflatten(list(cache_metadata.items()))
+    info, metadata, classes = cache_metadata
+
+    cache = []
+    for class_name in classes:
+        # The class name comes from the file, so only instantiate known
+        # cache types rather than an arbitrary module-level name.
+        cache_cls = globals().get(class_name)
+        if not (isinstance(cache_cls, type) and issubclass(cache_cls, _BaseCache)):
+            raise ValueError(
+                f"Unknown cache class '{class_name}' in prompt cache file "
+                f"'{file_name}'."
+            )
+        cache.append(cache_cls())
+
+    for c, state, meta_state in zip(cache, arrays, info):
+        c.state = state
+        c.meta_state = meta_state
+    if return_metadata:
+        return cache, metadata
+    return cache
+
+
+def can_trim_prompt_cache(cache: List[Any]) -> bool:
+    """
+    Report whether every entry of the model's cache is trimmable.
+
+    An empty cache list yields ``True``.
+    """
+    for entry in cache:
+        if not entry.is_trimmable():
+            return False
+    return True
+
+
+def trim_prompt_cache(cache: List[Any], num_tokens: int) -> int:
+    """
+    Trim the model's cache by the given number of tokens.
+
+    This function will trim the cache if possible (in-place) and return the
+    number of tokens that were trimmed.
+
+    Args:
+        cache (List[Any]): The model's cache.
+        num_tokens (int): The number of tokens to trim.
+
+    Returns:
+        (int): The number of tokens that were trimmed, as reported by the
+        first cache entry; ``0`` if the cache is empty or not trimmable.
+    """
+    if len(cache) == 0 or not can_trim_prompt_cache(cache):
+        return 0
+    # Every entry is trimmed; only the first entry's count is reported.
+    trimmed = [c.trim(num_tokens) for c in cache]
+    return trimmed[0]
+
+
+class _BaseCache:
+    """
+    Base class for caches with no array state and no meta state.
+
+    Both ``state`` and ``meta_state`` read as empty; assigning a non-empty
+    value to either raises ``ValueError``. The cache is not trimmable.
+    """
+
+    def is_trimmable(self):
+        return False
+
+    @property
+    def state(self):
+        return []
+
+    @state.setter
+    def state(self, v):
+        # ``None`` and empty values are accepted silently.
+        if not v:
+            return
+        raise ValueError("This cache has no state but a state was set.")
+
+    @property
+    def meta_state(self):
+        return ""
+
+    @meta_state.setter
+    def meta_state(self, v):
+        # ``None`` and empty values are accepted silently.
+        if not v:
+            return
+        raise ValueError("This cache has no meta_state but a meta_state was set.")
+
+
+class QuantizedKVCache(_BaseCache):
+    """
+    Key/value cache that stores its entries in quantized form.
+
+    Incoming keys and values are quantized with ``mx.quantize`` (using
+    ``group_size`` and ``bits``) and written into buffers that grow along the
+    sequence axis in multiples of ``step``. ``offset`` counts the valid
+    entries.
+    """
+
+    def __init__(self, group_size: int = 64, bits: int = 8):
+        # ``keys``/``values`` stay ``None`` until the first update; after that
+        # each holds a tuple of arrays (see ``init_quant`` below).
+        self.keys = None
+        self.values = None
+        self.offset = 0
+        self.step = 256
+        self.group_size = group_size
+        self.bits = bits
+
+    def update_and_fetch(self, keys, values):
+        """
+        Quantize and append ``keys``/``values``, then return the valid cache.
+
+        ``keys`` is unpacked as ``(B, n_kv_heads, num_steps, k_head_dim)``.
+
+        Returns:
+            A ``(keys, values)`` pair of quantized tuples, each sliced to the
+            first ``offset`` positions along the sequence axis.
+        """
+        B, n_kv_heads, num_steps, k_head_dim = keys.shape
+        v_head_dim = values.shape[-1]
+        prev = self.offset
+
+        # Allocate or grow when there is no buffer yet or the new entries
+        # would not fit in the current one.
+        if self.keys is None or (prev + num_steps) > self.keys[0].shape[-2]:
+            # Quantized elements packed per uint32 word (assumes
+            # ``mx.uint32.size`` is the byte width -- TODO confirm).
+            el_per_int = 8 * mx.uint32.size // self.bits
+            # Round the incoming length up to a multiple of ``step``.
+            new_steps = (self.step + num_steps - 1) // self.step * self.step
+            shape = (B, n_kv_heads, new_steps)
+
+            def init_quant(dim):
+                # Three zero-filled buffers: packed uint32 data plus two
+                # per-group arrays (presumably the scales and biases that
+                # ``mx.quantize`` returns -- confirm).
+                return (
+                    mx.zeros((*shape, dim // el_per_int), dtype=mx.uint32),
+                    mx.zeros((*shape, dim // self.group_size), dtype=keys.dtype),
+                    mx.zeros((*shape, dim // self.group_size), dtype=keys.dtype),
+                )
+
+            def expand_quant(x):
+                # Append ``new_steps`` zero positions along the sequence axis.
+                new_x = mx.zeros((*shape, x.shape[-1]), dtype=x.dtype)
+                return mx.concatenate([x, new_x], axis=-2)
+
+            if self.keys is not None:
+                # Drop the unused tail first so the new block starts right
+                # after the valid entries.
+                if prev % self.step != 0:
+                    self.keys, self.values = tree_map(
+                        lambda x: x[..., :prev, :], (self.keys, self.values)
+                    )
+
+                self.keys, self.values = tree_map(
+                    expand_quant, (self.keys, self.values)
+                )
+            else:
+                self.keys, self.values = init_quant(k_head_dim), init_quant(v_head_dim)
+
+        self.offset += num_steps
+
+        keys = mx.quantize(keys, group_size=self.group_size, bits=self.bits)
+        values = mx.quantize(values, group_size=self.group_size, bits=self.bits)
+        # Write every component of the quantized tuples into its buffer.
+        for i in range(len(self.keys)):
+            self.keys[i][..., prev : self.offset, :] = keys[i]
+            self.values[i][..., prev : self.offset, :] = values[i]
+
+        return tree_map(lambda x: x[..., : self.offset, :], (self.keys, self.values))
+
+    @property
+    def state(self):
+        """
+        The quantized keys and values restricted to the valid positions.
+
+        NOTE(review): reads ``self.keys[0]``, so this assumes the cache has
+        been populated -- confirm callers never save an empty cache.
+        """
+        if self.offset == self.keys[0].shape[2]:
+            return self.keys, self.values
+        else:
+            return tree_map(
+                lambda x: x[..., : self.offset, :], (self.keys, self.values)
+            )
+
+    @state.setter
+    def state(self, v):
+        # Only the arrays are restored here; ``offset`` and the other scalars
+        # come from ``meta_state``.
+        self.keys, self.values = v
+
+    @property
+    def meta_state(self):
+        """``(step, offset, group_size, bits)`` as a tuple of strings."""
+        return tuple(map(str, (self.step, self.offset, self.group_size, self.bits)))
+
+    @meta_state.setter
+    def meta_state(self, v):
+        self.step, self.offset, self.group_size, self.bits = map(int, v)
+
+    def is_trimmable(self):
+        return True
+
+    def trim(self, n):
+        """
+        Discard up to ``n`` of the most recent entries by lowering ``offset``.
+
+        The buffers themselves are left untouched. Returns the number of
+        entries actually trimmed.
+        """
+        n = min(self.offset, n)
+        self.offset -= n
+        return n
+
+
+class KVCache(_BaseCache):
+    """
+    Growable key/value cache.
+
+    Keys and values are written into zero-initialised buffers that grow along
+    axis 2 in multiples of ``step``; ``offset`` counts the valid entries.
+    """
+
+    def __init__(self):
+        # Buffers are allocated lazily on the first update.
+        self.keys = None
+        self.values = None
+        self.offset = 0
+        self.step = 256
+
+    def update_and_fetch(self, keys, values):
+        """
+        Append ``keys``/``values`` and return the valid part of the cache.
+
+        ``keys`` is unpacked as ``(B, n_kv_heads, _, k_head_dim)``; the new
+        entries are written along axis 2.
+
+        Returns:
+            The keys and values sliced to the first ``offset`` positions.
+        """
+        prev = self.offset
+        # Allocate or grow when there is no buffer yet or the new entries
+        # would not fit in the current one.
+        if self.keys is None or (prev + keys.shape[2]) > self.keys.shape[2]:
+            B, n_kv_heads, _, k_head_dim = keys.shape
+            v_head_dim = values.shape[3]
+            # Number of ``step``-sized chunks needed for the incoming entries.
+            n_steps = (self.step + keys.shape[2] - 1) // self.step
+            k_shape = (B, n_kv_heads, n_steps * self.step, k_head_dim)
+            v_shape = (B, n_kv_heads, n_steps * self.step, v_head_dim)
+            new_k = mx.zeros(k_shape, keys.dtype)
+            new_v = mx.zeros(v_shape, values.dtype)
+            if self.keys is not None:
+                # Drop the unused tail first so the new block starts right
+                # after the valid entries.
+                if prev % self.step != 0:
+                    self.keys = self.keys[..., :prev, :]
+                    self.values = self.values[..., :prev, :]
+                self.keys = mx.concatenate([self.keys, new_k], axis=2)
+                self.values = mx.concatenate([self.values, new_v], axis=2)
+            else:
+                self.keys, self.values = new_k, new_v
+
+        self.offset += keys.shape[2]
+        self.keys[..., prev : self.offset, :] = keys
+        self.values[..., prev : self.offset, :] = values
+        return self.keys[..., : self.offset, :], self.values[..., : self.offset, :]
+
+    @property
+    def state(self):
+        """
+        The keys and values restricted to the valid positions.
+
+        NOTE(review): reads ``self.keys.shape``, so this assumes the cache has
+        been populated -- confirm callers never save an empty cache.
+        """
+        if self.offset == self.keys.shape[2]:
+            return self.keys, self.values
+        else:
+            return (
+                self.keys[..., : self.offset, :],
+                self.values[..., : self.offset, :],
+            )
+
+    @state.setter
+    def state(self, v):
+        self.keys, self.values = v
+        # The restored arrays are treated as fully valid.
+        self.offset = self.keys.shape[2]
+
+    def is_trimmable(self):
+        return True
+
+    def trim(self, n):
+        """
+        Discard up to ``n`` of the most recent entries by lowering ``offset``.
+
+        The buffers themselves are left untouched. Returns the number of
+        entries actually trimmed.
+        """
+        n = min(self.offset, n)
+        self.offset -= n
+        return n
+
+    def to_quantized(self, group_size: int = 64, bits: int = 4) -> QuantizedKVCache:
+        """
+        Return a ``QuantizedKVCache`` holding a quantized copy of this cache.
+
+        The new cache gets the same ``offset``. The whole key/value buffers
+        are passed to ``mx.quantize``, not just the first ``offset`` entries.
+        """
+        quant_cache = QuantizedKVCache(group_size=group_size, bits=bits)
+        quant_cache.offset = self.offset
+        if self.keys is not None:
+            quant_cache.keys = mx.quantize(self.keys, group_size=group_size, bits=bits)
+            quant_cache.values = mx.quantize(
+                self.values, group_size=group_size, bits=bits
+            )
+        return quant_cache
+
+
+class RotatingKVCache(_BaseCache):
+    """
+    Key/value cache bounded by ``max_size`` that overwrites old entries.
+
+    The first ``keep`` positions are never overwritten. ``offset`` counts all
+    tokens added so far, while ``_idx`` is the position in the buffer where
+    the next single-token update is written.
+    """
+
+    def __init__(self, max_size=None, keep=0, step=256):
+        self.keep = keep
+        self.keys = None
+        self.values = None
+        self.offset = 0
+        self.max_size = max_size
+        self.step = step
+        self._idx = 0
+
+    def _trim(self, trim_size, v, append=None):
+        """
+        Remove ``trim_size`` entries that follow the first ``keep`` entries.
+
+        If ``trim_size`` is not positive ``v`` is kept whole. ``append``, when
+        given, is concatenated at the end along axis 2.
+        """
+        to_cat = []
+        if trim_size > 0:
+            to_cat = [v[..., : self.keep, :], v[..., trim_size + self.keep :, :]]
+        else:
+            to_cat = [v]
+        if append is not None:
+            to_cat.append(append)
+        return mx.concatenate(to_cat, axis=2)
+
+    def _temporal_order(self, v):
+        """
+        Rearrange the cache into temporal order, slicing off the end if unused.
+        """
+        if self._idx == v.shape[2]:
+            # The write position is at the end, so ``v`` is returned as-is.
+            return v
+        elif self._idx < self.offset:
+            # The buffer has wrapped: kept prefix, then the entries from the
+            # write position to the end, then those before the write position.
+            return mx.concatenate(
+                [
+                    v[..., : self.keep, :],
+                    v[..., self._idx :, :],
+                    v[..., self.keep : self._idx, :],
+                ],
+                axis=2,
+            )
+        else:
+            # Not wrapped yet: drop the unused tail.
+            return v[..., : self._idx, :]
+
+    def _update_concat(self, keys, values):
+        """
+        Append ``keys``/``values`` by concatenation and return the full cache.
+
+        ``update_and_fetch`` uses this path whenever ``keys.shape[2]`` is
+        not one.
+        """
+        if self.keys is None:
+            self.keys = keys
+            self.values = values
+        else:
+            # Put the keys/values in temporal order to
+            # preserve context
+            self.keys = self._temporal_order(self.keys)
+            self.values = self._temporal_order(self.values)
+
+            # The largest size is self.max_size + S - 1 to ensure
+            # every token gets at least self.max_size context
+            trim_size = self._idx - self.max_size + 1
+            self.keys = self._trim(trim_size, self.keys, keys)
+            self.values = self._trim(trim_size, self.values, values)
+        self.offset += keys.shape[2]
+        self._idx = self.keys.shape[2]
+        return self.keys, self.values
+
+    def _update_in_place(self, keys, values):
+        """
+        Write ``keys``/``values`` into the buffer at ``_idx``.
+
+        ``update_and_fetch`` uses this path when ``keys.shape[2]`` is one.
+        The buffer grows by at most ``step`` entries at a time up to
+        ``max_size``; after that the write position wraps back to ``keep``.
+        """
+        # May not have hit the max size yet, so potentially
+        # keep growing the cache
+        B, n_kv_heads, S, k_head_dim = keys.shape
+        prev = self.offset
+        if self.keys is None or (
+            prev >= self.keys.shape[2] and self.keys.shape[2] < self.max_size
+        ):
+            v_head_dim = values.shape[3]
+            # Never grow past ``max_size``.
+            new_size = min(self.step, self.max_size - prev)
+            k_shape = (B, n_kv_heads, new_size, k_head_dim)
+            v_shape = (B, n_kv_heads, new_size, v_head_dim)
+            new_k = mx.zeros(k_shape, keys.dtype)
+            new_v = mx.zeros(v_shape, values.dtype)
+            if self.keys is not None:
+                self.keys = mx.concatenate([self.keys, new_k], axis=2)
+                self.values = mx.concatenate([self.values, new_v], axis=2)
+            else:
+                self.keys, self.values = new_k, new_v
+            # Writing continues at the start of the newly added block.
+            self._idx = prev
+
+        # Trim if needed
+        trim_size = self.keys.shape[2] - self.max_size
+        if trim_size > 0:
+            self.keys = self._trim(trim_size, self.keys)
+            self.values = self._trim(trim_size, self.values)
+            self._idx = self.max_size
+
+        # Rotate
+        if self._idx == self.max_size:
+            # Wrap around, skipping the ``keep`` entries at the front.
+            self._idx = self.keep
+
+        # Assign
+        self.keys[..., self._idx : self._idx + S, :] = keys
+        self.values[..., self._idx : self._idx + S, :] = values
+        self.offset += S
+        self._idx += S
+
+        # If the buffer is not full, slice off the end
+        if self.offset < self.max_size:
+            return self.keys[..., : self.offset, :], self.values[..., : self.offset, :]
+        return self.keys, self.values
+
+    def update_and_fetch(self, keys, values):
+        """
+        Add ``keys``/``values`` to the cache and return the cached arrays.
+
+        Updates with a length of one along axis 2 are written in place; all
+        other updates go through the concatenation path.
+        """
+        if keys.shape[2] == 1:
+            return self._update_in_place(keys, values)
+        return self._update_concat(keys, values)
+
+    @property
+    def state(self):
+        """
+        The keys and values, sliced to ``offset`` if the buffer is longer.
+
+        NOTE(review): reads ``self.keys.shape``, so this assumes the cache has
+        been populated -- confirm callers never save an empty cache.
+        """
+        if self.offset < self.keys.shape[2]:
+            return self.keys[..., : self.offset, :], self.values[..., : self.offset, :]
+        else:
+            return self.keys, self.values
+
+    @state.setter
+    def state(self, v):
+        # Only the arrays are restored here; the scalars come from
+        # ``meta_state``.
+        self.keys, self.values = v
+
+    @property
+    def meta_state(self):
+        """``(keep, max_size, step, offset, _idx)`` as a tuple of strings."""
+        return tuple(
+            map(str, (self.keep, self.max_size, self.step, self.offset, self._idx))
+        )
+
+    @meta_state.setter
+    def meta_state(self, v):
+        self.keep, self.max_size, self.step, self.offset, self._idx = map(
+            int,
+            v,
+        )
+
+    def is_trimmable(self):
+        # Trimmable only while fewer than ``max_size`` tokens have been added.
+        return self.offset < self.max_size
+
+    def trim(self, n):
+        """
+        Discard up to ``n`` of the most recent entries.
+
+        Lowers both ``offset`` and the write position ``_idx``; the buffers
+        are left untouched. Returns the number of entries actually trimmed.
+        """
+        n = min(self.offset, n)
+        self.offset -= n
+        self._idx -= n
+        return n
+
+    def to_quantized(self, group_size: int = 64, bits: int = 4) -> QuantizedKVCache:
+        # Quantization is not implemented for the rotating cache.
+        raise NotImplementedError("RotatingKVCache Quantization NYI")
+
+
+class MambaCache(_BaseCache):
+    """
+    Two-slot cache addressed by index.
+
+    Both slots start as ``None`` and the ``state`` property exposes the
+    underlying two-element list directly.
+    """
+
+    def __init__(self):
+        self.cache = [None] * 2
+
+    def __getitem__(self, idx):
+        return self.cache[idx]
+
+    def __setitem__(self, idx, value):
+        self.cache[idx] = value
+
+    @property
+    def state(self):
+        return self.cache
+
+    @state.setter
+    def state(self, v):
+        self.cache = v
--- a/llms/mlx_lm/models/cohere.py
+++ b/llms/mlx_lm/models/cohere.py
@@ -1,10 +1,12 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -67,7 +69,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -91,8 +93,8 @@ class Attention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )

        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
@@ -127,7 +129,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        h = self.input_layernorm(x)
        attn_h = self.self_attn(h, mask, cache)
@@ -157,10 +159,7 @@ class CohereModel(nn.Module):
    ):
        h = self.embed_tokens(inputs)

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -191,11 +190,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/dbrx.py
+++ b/llms/mlx_lm/models/dbrx.py
@@ -1,11 +1,13 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple

 import mlx.core as mx
 import mlx.nn as nn
 import numpy as np

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -47,7 +49,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:

        qkv = self.Wqkv(x)
@@ -72,8 +74,8 @@ class Attention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.out_proj(output)
@@ -90,7 +92,7 @@ class NormAttnNorm(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        h = self.attn(self.norm_1(x), mask=mask, cache=cache)
        x = h + x
@@ -177,7 +179,7 @@ class DecoderLayer(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r, h = self.norm_attn_norm(x, mask, cache)
        out = self.ffn(h) + r
@@ -199,11 +201,7 @@ class DBRX(nn.Module):
    ):
        h = self.wte(inputs)

-        mask = None
-        T = h.shape[1]
-        if T > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(T)
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.blocks)
@@ -251,11 +249,3 @@ class Model(nn.Module):
                    experts = [(s, sv.T) for s, sv in experts]
                new_weights.update(experts)
        return new_weights
-
-    @property
-    def head_dim(self):
-        return self.args.d_model // self.args.n_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.attn_config["kv_n_heads"]
--- a/llms/mlx_lm/models/deepseek.py
+++ b/llms/mlx_lm/models/deepseek.py
@@ -0,0 +1,258 @@
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention
+from .switch_layers import SwitchGLU
+
+
+@dataclass
+class ModelArgs(BaseModelArgs):
+    """Configuration fields consumed by the ``deepseek`` model classes in this file."""
+
+    model_type: str = "deepseek"
+    vocab_size: int = 102400
+    hidden_size: int = 4096
+    # Width of the dense (non-MoE) feed-forward block.
+    intermediate_size: int = 11008
+    # Per-expert feed-forward width; also scaled by n_shared_experts for the
+    # shared-expert MLP.
+    moe_intermediate_size: int = 1407
+    num_hidden_layers: int = 30
+    num_attention_heads: int = 32
+    num_key_value_heads: int = 32
+    # None disables the shared-expert branch in DeepseekMoE.
+    n_shared_experts: Optional[int] = None
+    # None makes every decoder layer use the dense MLP.
+    n_routed_experts: Optional[int] = None
+    # Number of experts the gate keeps per token.
+    num_experts_per_tok: Optional[int] = None
+    # A layer is MoE only if layer_idx % moe_layer_freq == 0 ...
+    moe_layer_freq: int = 1
+    # ... and layer_idx >= first_k_dense_replace.
+    first_k_dense_replace: int = 0
+    max_position_embeddings: int = 2048
+    rms_norm_eps: float = 1e-6
+    rope_theta: float = 10000.0
+    # Only {"type": "linear", "factor": ...} is acted on by DeepseekAttention.
+    rope_scaling: Optional[Dict] = None
+    # Bias flag passed to the q/k/v/o projections.
+    attention_bias: bool = False
+
+
+class DeepseekAttention(nn.Module):
+    """Multi-head attention with rotary position embeddings.
+
+    Queries use ``num_attention_heads`` heads and keys/values use
+    ``num_key_value_heads`` heads; every head has width
+    ``hidden_size // num_attention_heads``.
+    """
+
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_attention_heads = config.num_attention_heads
+        self.num_kv_heads = config.num_key_value_heads
+        self.head_dim = config.hidden_size // config.num_attention_heads
+        self.scale = self.head_dim**-0.5
+
+        attention_bias = getattr(config, "attention_bias", False)
+
+        self.q_proj = nn.Linear(
+            self.hidden_size,
+            config.num_attention_heads * self.head_dim,
+            bias=attention_bias,
+        )
+        self.k_proj = nn.Linear(
+            self.hidden_size,
+            config.num_key_value_heads * self.head_dim,
+            bias=attention_bias,
+        )
+        self.v_proj = nn.Linear(
+            self.hidden_size,
+            config.num_key_value_heads * self.head_dim,
+            bias=attention_bias,
+        )
+        # The output projection maps the concatenated heads back to the
+        # hidden size, so the head width is the *input* dimension.
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim,
+            self.hidden_size,
+            bias=attention_bias,
+        )
+
+        rope_scale = 1.0
+        if config.rope_scaling and config.rope_scaling["type"] == "linear":
+            # Accept any numeric factor; configs may store it as an int.
+            rope_scale = 1 / float(config.rope_scaling["factor"])
+        self.rope = nn.RoPE(
+            self.head_dim,
+            base=config.rope_theta,
+            scale=rope_scale,
+        )
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+    ) -> mx.array:
+        """Apply attention to ``x`` (three axes: B, L, features).
+
+        Args:
+            x: Input activations.
+            mask: Optional mask forwarded to the attention kernel.
+            cache: Optional object exposing ``offset`` and
+                ``update_and_fetch``; when given, rotary positions start at
+                ``cache.offset`` and keys/values come back from the cache.
+
+        Returns:
+            The output-projected attention result with leading axes (B, L).
+        """
+        B, L, _ = x.shape
+
+        queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)
+
+        # Split the feature axis into heads and move heads ahead of sequence.
+        queries = queries.reshape(B, L, self.num_attention_heads, -1).transpose(
+            0, 2, 1, 3
+        )
+        keys = keys.reshape(B, L, self.num_kv_heads, -1).transpose(0, 2, 1, 3)
+        values = values.reshape(B, L, self.num_kv_heads, -1).transpose(0, 2, 1, 3)
+
+        if cache is not None:
+            queries = self.rope(queries, offset=cache.offset)
+            keys = self.rope(keys, offset=cache.offset)
+            keys, values = cache.update_and_fetch(keys, values)
+        else:
+            queries = self.rope(queries)
+            keys = self.rope(keys)
+
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
+        )
+        # Undo the head split before projecting back.
+        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
+        return self.o_proj(output)
+
+
+class DeepseekMLP(nn.Module):
+    """Gated feed-forward block: ``down(act(gate(x)) * up(x))`` with SiLU as ``act``."""
+
+    def __init__(
+        self,
+        config: ModelArgs,
+        hidden_size: Optional[int] = None,
+        intermediate_size: Optional[int] = None,
+    ):
+        super().__init__()
+        self.config = config
+        # A falsy override (None or 0) falls back to the config value.
+        in_dim = hidden_size or config.hidden_size
+        mid_dim = intermediate_size or config.intermediate_size
+        self.hidden_size = in_dim
+        self.intermediate_size = mid_dim
+        self.gate_proj = nn.Linear(in_dim, mid_dim, bias=False)
+        self.up_proj = nn.Linear(in_dim, mid_dim, bias=False)
+        self.down_proj = nn.Linear(mid_dim, in_dim, bias=False)
+        self.act_fn = nn.silu
+
+    def __call__(self, x: mx.array) -> mx.array:
+        gated = self.act_fn(self.gate_proj(x))
+        lifted = self.up_proj(x)
+        return self.down_proj(gated * lifted)
+
+
+class MoEGate(nn.Module):
+    """Router: softmax over per-expert logits, then keep ``top_k`` experts."""
+
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        self.config = config
+        self.top_k = config.num_experts_per_tok
+        self.n_routed_experts = config.n_routed_experts
+        self.weight = mx.zeros((self.n_routed_experts, config.hidden_size))
+
+    def __call__(self, x):
+        """Return ``(expert indices, their softmax scores)`` along the last axis."""
+        logits = x @ self.weight.T
+        probs = mx.softmax(logits, axis=-1, precise=True)
+        top_k = self.top_k
+        # Partitioning the negated scores places the top_k largest first.
+        partitioned = mx.argpartition(-probs, kth=top_k - 1, axis=-1)
+        chosen = mx.stop_gradient(partitioned[..., :top_k])
+        return chosen, mx.take_along_axis(probs, chosen, axis=-1)
+
+
+class DeepseekMoE(nn.Module):
+    """Routed experts combined by gate scores, plus optional shared experts."""
+
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        self.config = config
+        self.switch_mlp = SwitchGLU(
+            config.hidden_size, config.moe_intermediate_size, config.n_routed_experts
+        )
+
+        self.gate = MoEGate(config)
+        n_shared = config.n_shared_experts
+        if n_shared is not None:
+            # The shared branch is one MLP whose width scales with n_shared.
+            self.shared_experts = DeepseekMLP(
+                config=config,
+                intermediate_size=config.moe_intermediate_size * n_shared,
+            )
+
+    def __call__(self, x):
+        expert_ids, expert_weights = self.gate(x)
+        routed = self.switch_mlp(x, expert_ids)
+        # Weight each selected expert's output and sum over the expert axis.
+        mixed = (routed * expert_weights[..., None]).sum(axis=-2)
+        if self.config.n_shared_experts is None:
+            return mixed
+        return mixed + self.shared_experts(x)
+
+
+class DeepseekDecoderLayer(nn.Module):
+    """Pre-norm block: attention then feed-forward, each with a residual add."""
+
+    def __init__(self, config: ModelArgs, layer_idx: int):
+        super().__init__()
+        self.self_attn = DeepseekAttention(config)
+        # MoE only when experts are configured, the layer is past the dense
+        # prefix, and the layer index lands on the MoE stride.
+        use_moe = (
+            config.n_routed_experts is not None
+            and layer_idx >= config.first_k_dense_replace
+            and layer_idx % config.moe_layer_freq == 0
+        )
+        if use_moe:
+            self.mlp = DeepseekMoE(config)
+        else:
+            self.mlp = DeepseekMLP(config)
+        eps = config.rms_norm_eps
+        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=eps)
+        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=eps)
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+    ) -> mx.array:
+        attn_out = self.self_attn(self.input_layernorm(x), mask, cache)
+        hidden = x + attn_out
+        mlp_out = self.mlp(self.post_attention_layernorm(hidden))
+        return hidden + mlp_out
+
+
+class DeepseekModel(nn.Module):
+    """Token embedding, the stack of decoder layers, and a final RMSNorm."""
+
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        self.config = config
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        blocks = []
+        for idx in range(config.num_hidden_layers):
+            blocks.append(DeepseekDecoderLayer(config, idx))
+        self.layers = blocks
+        self.norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def __call__(
+        self,
+        x: mx.array,
+        cache: Optional[Any] = None,
+    ) -> mx.array:
+        h = self.embed_tokens(x)
+        # The mask is built from the caller's cache argument before defaulting.
+        mask = create_attention_mask(h, cache)
+
+        layer_caches = cache if cache is not None else [None] * len(self.layers)
+        for layer, layer_cache in zip(self.layers, layer_caches):
+            h = layer(h, mask, layer_cache)
+
+        return self.norm(h)
+
+
+class Model(nn.Module):
+    """Top-level wrapper: ``DeepseekModel`` followed by a linear LM head."""
+
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        self.args = config
+        self.model_type = config.model_type
+        self.model = DeepseekModel(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        cache: Optional[Any] = None,
+    ):
+        hidden = self.model(inputs, cache)
+        return self.lm_head(hidden)
+
+    def sanitize(self, weights):
+        """Stack per-expert tensors into single ``switch_mlp`` entries.
+
+        For every layer, projection and tensor kind whose expert-0 key is
+        present, the per-expert entries are popped from ``weights`` and
+        replaced by one stacked array. ``weights`` is modified in place and
+        returned.
+        """
+        for layer_idx in range(self.args.num_hidden_layers):
+            prefix = f"model.layers.{layer_idx}"
+            for proj in ("gate_proj", "down_proj", "up_proj"):
+                for kind in ("weight", "scales", "biases"):
+                    if f"{prefix}.mlp.experts.0.{proj}.{kind}" not in weights:
+                        continue
+                    per_expert = [
+                        weights.pop(f"{prefix}.mlp.experts.{e}.{proj}.{kind}")
+                        for e in range(self.args.n_routed_experts)
+                    ]
+                    target = f"{prefix}.mlp.switch_mlp.{proj}.{kind}"
+                    weights[target] = mx.stack(per_expert)
+        return weights
+
+    @property
+    def layers(self):
+        return self.model.layers
--- a/llms/mlx_lm/models/deepseek_v2.py
+++ b/llms/mlx_lm/models/deepseek_v2.py
@@ -0,0 +1,417 @@
+# Copyright © 2023-2024 Apple Inc.
+
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Tuple
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention
+from .switch_layers import SwitchGLU
+
+
+@dataclass
+class ModelArgs(BaseModelArgs):
+    """Configuration fields consumed by the ``deepseek_v2`` model classes in this file."""
+
+    model_type: str = "deepseek_v2"
+    vocab_size: int = 102400
+    hidden_size: int = 4096
+    # Width of the dense (non-MoE) feed-forward block.
+    intermediate_size: int = 11008
+    # Per-expert feed-forward width; also scaled by n_shared_experts for the
+    # shared-expert MLP.
+    moe_intermediate_size: int = 1407
+    num_hidden_layers: int = 30
+    num_attention_heads: int = 32
+    num_key_value_heads: int = 32
+    # None disables the shared-expert branch in DeepseekV2MoE.
+    n_shared_experts: Optional[int] = None
+    # None makes every decoder layer use the dense MLP.
+    n_routed_experts: Optional[int] = None
+    # Multiplier applied to the selected gate scores.
+    routed_scaling_factor: float = 1.0
+    # Rank of the compressed key/value projection.
+    kv_lora_rank: int = 512
+    # Rank of the low-rank query projection; None selects a single full
+    # q_proj in DeepseekV2Attention.
+    q_lora_rank: Optional[int] = 1536
+    # Per-head width of the rotary part of queries/keys.
+    qk_rope_head_dim: int = 64
+    v_head_dim: int = 128
+    # Per-head width of the non-rotary part of queries/keys.
+    qk_nope_head_dim: int = 128
+    # NOTE: the default is spelled "gready" here; MoEGate only special-cases
+    # the value "group_limited_greedy".
+    topk_method: str = "gready"
+    # Group count / groups kept, read only for "group_limited_greedy".
+    n_group: Optional[int] = None
+    topk_group: Optional[int] = None
+    # Number of experts the gate keeps per token.
+    num_experts_per_tok: Optional[int] = None
+    # A layer is MoE only if layer_idx % moe_layer_freq == 0 ...
+    moe_layer_freq: int = 1
+    # ... and layer_idx >= first_k_dense_replace.
+    first_k_dense_replace: int = 0
+    max_position_embeddings: int = 2048
+    rms_norm_eps: float = 1e-6
+    rope_theta: float = 10000.0
+    # Dict read by DeepseekV2Attention ("factor", optional "mscale_all_dim",
+    # "beta_fast", ...). NOTE(review): the attention constructor reads this
+    # unconditionally, so leaving it None currently fails there.
+    rope_scaling: Optional[Dict] = None
+    attention_bias: bool = False
+
+
+def yarn_find_correction_dim(
+    num_rotations, dim, base=10000, max_position_embeddings=2048
+):
+    """Return ``dim * ln(max_pos / (2*pi*num_rotations)) / (2 * ln(base))``.
+
+    The result is a float and is not rounded or clamped here.
+    """
+    log_ratio = math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
+    numerator = dim * log_ratio
+    denominator = 2 * math.log(base)
+    return numerator / denominator
+
+
+def yarn_find_correction_range(
+    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
+):
+    """Return integer ``(low, high)`` bounds clamped to ``[0, dim - 1]``.
+
+    The low bound is the floor of the correction dim for ``low_rot``; the
+    high bound is the ceiling of the correction dim for ``high_rot``.
+    """
+    lower = yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
+    upper = yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
+    first = max(math.floor(lower), 0)
+    last = min(math.ceil(upper), dim - 1)
+    return first, last
+
+
+def yarn_get_mscale(scale=1, mscale=1):
+    """Return 1.0 for ``scale <= 1``, else ``0.1 * mscale * ln(scale) + 1.0``."""
+    return 1.0 if scale <= 1 else 0.1 * mscale * math.log(scale) + 1.0
+
+
+def yarn_linear_ramp_mask(min_val, max_val, dim):
+    """Linear ramp over ``dim`` positions, clipped to ``[0, 1]``.
+
+    Position ``i`` maps to ``(i - min_val) / (max_val - min_val)`` before
+    clipping.
+    """
+    # Prevent singularity when both bounds coincide.
+    upper = max_val + 0.001 if min_val == max_val else max_val
+    positions = mx.arange(dim, dtype=mx.float32)
+    ramp = (positions - min_val) / (upper - min_val)
+    return mx.clip(ramp, 0, 1)
+
+
+class DeepseekV2YarnRotaryEmbedding(nn.Module):
+    """Rotary embedding with precomputed, ramp-blended frequencies.
+
+    The constructor blends the base frequencies with their
+    ``scaling_factor``-scaled counterparts using the ramp mask, and derives an
+    input multiplier ``mscale``. ``max_position_embeddings`` is accepted but
+    not used in the computation.
+    """
+
+    def __init__(
+        self,
+        dim,
+        max_position_embeddings=2048,
+        base=10000,
+        scaling_factor=1.0,
+        original_max_position_embeddings=4096,
+        beta_fast=32,
+        beta_slow=1,
+        mscale=1,
+        mscale_all_dim=0,
+    ):
+        super().__init__()
+        mscale_num = yarn_get_mscale(scaling_factor, mscale)
+        mscale_den = yarn_get_mscale(scaling_factor, mscale_all_dim)
+        self.mscale = mscale_num / mscale_den
+
+        exponents = mx.arange(0, dim, 2, dtype=mx.float32) / dim
+        freq_extra = base**exponents
+        freq_inter = scaling_factor * base**exponents
+
+        low, high = yarn_find_correction_range(
+            beta_fast,
+            beta_slow,
+            dim,
+            base,
+            original_max_position_embeddings,
+        )
+        ramp = yarn_linear_ramp_mask(low, high, dim // 2)
+        freq_mask = 1.0 - ramp
+        blended = freq_inter * freq_mask + freq_extra * (1 - freq_mask)
+        self._freqs = (freq_inter * freq_extra) / blended
+
+    def __call__(self, x, offset=0):
+        """Scale ``x`` by ``mscale`` (unless it is 1.0) and apply the rotation."""
+        scaled = x if self.mscale == 1.0 else self.mscale * x
+        return mx.fast.rope(
+            scaled,
+            scaled.shape[-1],
+            traditional=True,
+            base=None,
+            scale=1.0,
+            offset=offset,
+            freqs=self._freqs,
+        )
+
+
+class DeepseekV2Attention(nn.Module):
+    """Attention with low-rank key/value compression and a split rotary part.
+
+    Each query/key head is the concatenation of a non-rotary slice
+    (``qk_nope_head_dim``) and a rotary slice (``qk_rope_head_dim``). Keys
+    and values are expanded from a ``kv_lora_rank``-wide compressed
+    projection; the rotary key slice is produced once and repeated across
+    heads.
+    """
+
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.q_lora_rank = config.q_lora_rank
+        self.qk_rope_head_dim = config.qk_rope_head_dim
+        self.kv_lora_rank = config.kv_lora_rank
+        self.v_head_dim = config.v_head_dim
+        self.qk_nope_head_dim = config.qk_nope_head_dim
+        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim
+
+        self.scale = self.q_head_dim**-0.5
+
+        if self.q_lora_rank is None:
+            self.q_proj = nn.Linear(
+                self.hidden_size, self.num_heads * self.q_head_dim, bias=False
+            )
+        else:
+            # Low-rank query path: down-project, normalize, up-project.
+            self.q_a_proj = nn.Linear(
+                self.hidden_size, self.q_lora_rank, bias=config.attention_bias
+            )
+            self.q_a_layernorm = nn.RMSNorm(self.q_lora_rank)
+            self.q_b_proj = nn.Linear(
+                self.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
+            )
+
+        # One projection yields both the compressed KV and the rotary key slice.
+        self.kv_a_proj_with_mqa = nn.Linear(
+            self.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=config.attention_bias,
+        )
+        self.kv_a_layernorm = nn.RMSNorm(self.kv_lora_rank)
+        self.kv_b_proj = nn.Linear(
+            self.kv_lora_rank,
+            self.num_heads
+            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
+            bias=False,
+        )
+
+        self.o_proj = nn.Linear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=config.attention_bias,
+        )
+
+        rope_scaling = self.config.rope_scaling
+        if rope_scaling is None:
+            # ``rope_scaling`` defaults to None; fall back to an unscaled
+            # rotary embedding in the same (traditional) layout the scaled
+            # variant uses, instead of failing on ``None.get``.
+            self.rope = nn.RoPE(
+                self.qk_rope_head_dim,
+                traditional=True,
+                base=self.rope_theta,
+            )
+        else:
+            mscale_all_dim = rope_scaling.get("mscale_all_dim", 0)
+            scaling_factor = rope_scaling["factor"]
+            if mscale_all_dim:
+                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
+                self.scale = self.scale * mscale * mscale
+
+            # Forward only the optional keys that are actually present.
+            rope_kwargs = {
+                key: rope_scaling[key]
+                for key in [
+                    "original_max_position_embeddings",
+                    "beta_fast",
+                    "beta_slow",
+                    "mscale",
+                    "mscale_all_dim",
+                ]
+                if key in rope_scaling
+            }
+            self.rope = DeepseekV2YarnRotaryEmbedding(
+                dim=self.qk_rope_head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                scaling_factor=scaling_factor,
+                base=self.rope_theta,
+                **rope_kwargs,
+            )
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+    ) -> mx.array:
+        """Apply attention to ``x`` (three axes: B, L, features).
+
+        Args:
+            x: Input activations.
+            mask: Optional mask forwarded to the attention kernel.
+            cache: Optional object exposing ``offset`` and
+                ``update_and_fetch``; when given, rotary positions start at
+                ``cache.offset`` and keys/values come back from the cache.
+
+        Returns:
+            The output-projected attention result with leading axes (B, L).
+        """
+        B, L, D = x.shape
+
+        if self.q_lora_rank is None:
+            q = self.q_proj(x)
+        else:
+            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(x)))
+
+        q = q.reshape(B, L, self.num_heads, self.q_head_dim).transpose(0, 2, 1, 3)
+        q_nope, q_pe = mx.split(q, [self.qk_nope_head_dim], axis=-1)
+        compressed_kv = self.kv_a_proj_with_mqa(x)
+        compressed_kv, k_pe = mx.split(compressed_kv, [self.kv_lora_rank], axis=-1)
+        # The rotary key slice has a single head at this point.
+        k_pe = k_pe.reshape(B, L, 1, self.qk_rope_head_dim).transpose(0, 2, 1, 3)
+        kv = self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
+        kv = kv.reshape(B, L, self.num_heads, -1).transpose(0, 2, 1, 3)
+
+        k_nope, values = mx.split(kv, [self.qk_nope_head_dim], axis=-1)
+
+        if cache is not None:
+            q_pe = self.rope(q_pe, cache.offset)
+            k_pe = self.rope(k_pe, cache.offset)
+            # Repeat the single rotary key head across all heads.
+            k_pe = mx.repeat(k_pe, self.num_heads, axis=1)
+            keys, values = cache.update_and_fetch(
+                mx.concatenate([k_nope, k_pe], axis=-1), values
+            )
+        else:
+            q_pe = self.rope(q_pe)
+            k_pe = self.rope(k_pe)
+            k_pe = mx.repeat(k_pe, self.num_heads, axis=1)
+            keys = mx.concatenate([k_nope, k_pe], axis=-1)
+
+        queries = mx.concatenate([q_nope, q_pe], axis=-1)
+
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
+        )
+        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
+        return self.o_proj(output)
+
+
+class DeepseekV2MLP(nn.Module):
+    """Gated feed-forward block: ``down(silu(gate(x)) * up(x))``."""
+
+    def __init__(
+        self, config: ModelArgs, hidden_size: int = None, intermediate_size: int = None
+    ):
+        super().__init__()
+        self.config = config
+        # Only an explicit None falls back to the config value.
+        if hidden_size is None:
+            hidden_size = config.hidden_size
+        if intermediate_size is None:
+            intermediate_size = config.intermediate_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+
+        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
+
+    def __call__(self, x):
+        gated = nn.silu(self.gate_proj(x))
+        lifted = self.up_proj(x)
+        return self.down_proj(gated * lifted)
+
+
+class MoEGate(nn.Module):
+    """Router that scores experts with a softmax and keeps ``top_k`` of them.
+
+    With ``topk_method == "group_limited_greedy"`` the experts are viewed as
+    ``n_group`` groups and the scores of all but ``topk_group`` groups are
+    zeroed before the top-k selection. Any other ``topk_method`` value skips
+    that step.
+    """
+
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        self.config = config
+        self.top_k = config.num_experts_per_tok
+        self.n_routed_experts = config.n_routed_experts
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.topk_method = config.topk_method
+        self.n_group = config.n_group
+        self.topk_group = config.topk_group
+        # One row of routing weights per expert.
+        self.weight = mx.zeros((self.n_routed_experts, config.hidden_size))
+
+    def __call__(self, x):
+        """Return ``(expert indices, scaled scores)`` along the last axis."""
+        gates = x @ self.weight.T
+
+        scores = mx.softmax(gates, axis=-1, precise=True)
+
+        if self.topk_method == "group_limited_greedy":
+            # The first two axes of x are taken as (batch, sequence).
+            bsz, seq_len = x.shape[:2]
+            scores = scores.reshape(bsz, seq_len, self.n_group, -1)
+            # Each group is represented by its best expert score.
+            group_scores = scores.max(axis=-1)
+            # k = number of groups to drop (the lowest-scoring ones).
+            # NOTE(review): if topk_group == n_group this gives k == 0 and
+            # kth == -1 below -- confirm that case cannot occur.
+            k = self.n_group - self.topk_group
+            group_idx = mx.argpartition(group_scores, kth=k - 1, axis=-1)[..., :k]
+            batch_idx = mx.expand_dims(mx.arange(bsz), (1, 2))
+            seq_idx = mx.expand_dims(mx.arange(seq_len), (0, 2))
+            # Zero every expert score inside the dropped groups.
+            scores[batch_idx, seq_idx, group_idx] = 0.0
+            scores = scores.reshape(bsz, seq_len, -1)
+
+        k = self.top_k
+        # Partitioning the negated scores places the k largest first.
+        inds = mx.argpartition(-scores, kth=k - 1, axis=-1)[..., :k]
+        scores = mx.take_along_axis(scores, inds, axis=-1)
+        scores = scores * self.routed_scaling_factor
+
+        return inds, scores
+
+
+class DeepseekV2MoE(nn.Module):
+    """Routed experts combined by gate scores, plus optional shared experts."""
+
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        self.config = config
+        self.num_experts_per_tok = config.num_experts_per_tok
+        self.switch_mlp = SwitchGLU(
+            config.hidden_size, config.moe_intermediate_size, config.n_routed_experts
+        )
+
+        self.gate = MoEGate(config)
+        n_shared = config.n_shared_experts
+        if n_shared is not None:
+            # The shared branch is one MLP whose width scales with n_shared.
+            shared_width = config.moe_intermediate_size * n_shared
+            self.shared_experts = DeepseekV2MLP(
+                config=config, intermediate_size=shared_width
+            )
+
+    def __call__(self, x):
+        expert_ids, expert_weights = self.gate(x)
+        routed = self.switch_mlp(x, expert_ids)
+        # Weight each selected expert's output and sum over the expert axis.
+        combined = (routed * expert_weights[..., None]).sum(axis=-2)
+        if self.config.n_shared_experts is None:
+            return combined
+        return combined + self.shared_experts(x)
+
+
+class DeepseekV2DecoderLayer(nn.Module):
+    """Pre-norm block: attention then feed-forward, each with a residual add."""
+
+    def __init__(self, config: ModelArgs, layer_idx: int):
+        super().__init__()
+        self.self_attn = DeepseekV2Attention(config)
+        # MoE only when experts are configured, the layer is past the dense
+        # prefix, and the layer index lands on the MoE stride.
+        is_moe_layer = (
+            config.n_routed_experts is not None
+            and layer_idx >= config.first_k_dense_replace
+            and layer_idx % config.moe_layer_freq == 0
+        )
+        feed_forward_cls = DeepseekV2MoE if is_moe_layer else DeepseekV2MLP
+        self.mlp = feed_forward_cls(config)
+        norm_eps = config.rms_norm_eps
+        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=norm_eps)
+        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=norm_eps)
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+    ) -> mx.array:
+        normed = self.input_layernorm(x)
+        h = x + self.self_attn(normed, mask, cache)
+        return h + self.mlp(self.post_attention_layernorm(h))
+
+
+class DeepseekV2Model(nn.Module):
+    """Token embedding, the stack of decoder layers, and a final RMSNorm."""
+
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        n_layers = config.num_hidden_layers
+        self.layers = [DeepseekV2DecoderLayer(config, i) for i in range(n_layers)]
+        self.norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def __call__(
+        self,
+        x: mx.array,
+        cache: Optional[Any] = None,
+    ) -> mx.array:
+        hidden = self.embed_tokens(x)
+        # The mask is built from the caller's cache argument before defaulting.
+        mask = create_attention_mask(hidden, cache)
+
+        per_layer_cache = [None] * len(self.layers) if cache is None else cache
+        for block, block_cache in zip(self.layers, per_layer_cache):
+            hidden = block(hidden, mask, block_cache)
+
+        return self.norm(hidden)
+
+
+class Model(nn.Module):
+    """Top-level wrapper: ``DeepseekV2Model`` followed by a linear LM head."""
+
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        self.args = config
+        self.model_type = config.model_type
+        self.model = DeepseekV2Model(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        cache: Optional[Any] = None,
+    ):
+        out = self.model(inputs, cache)
+        return self.lm_head(out)
+
+    def sanitize(self, weights):
+        """Stack per-expert tensors into single ``switch_mlp`` entries.
+
+        For every layer, projection and tensor kind whose expert-0 key is
+        present, the per-expert entries are popped from ``weights`` and
+        replaced by one stacked array. ``weights`` is modified in place and
+        returned.
+        """
+        for l in range(self.args.num_hidden_layers):
+            prefix = f"model.layers.{l}"
+            # Checkpoint keys already use the *_proj names, so no alias
+            # mapping is needed here.
+            for m in ["gate_proj", "down_proj", "up_proj"]:
+                for k in ["weight", "scales", "biases"]:
+                    if f"{prefix}.mlp.experts.0.{m}.{k}" in weights:
+                        to_join = [
+                            weights.pop(f"{prefix}.mlp.experts.{e}.{m}.{k}")
+                            for e in range(self.args.n_routed_experts)
+                        ]
+                        weights[f"{prefix}.mlp.switch_mlp.{m}.{k}"] = mx.stack(to_join)
+        return weights
+
+    @property
+    def layers(self):
+        return self.model.layers
--- a/llms/mlx_lm/models/gemma.py
+++ b/llms/mlx_lm/models/gemma.py
@@ -1,10 +1,12 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -58,7 +60,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -77,8 +79,8 @@ class Attention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )

        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
@@ -111,7 +113,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.self_attn(self.input_layernorm(x), mask, cache)
        h = x + r
@@ -141,10 +143,7 @@ class GemmaModel(nn.Module):
        h = self.embed_tokens(inputs)
        h = h * (self.args.hidden_size**0.5)

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -174,11 +173,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.head_dim
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/gemma2.py
+++ b/llms/mlx_lm/models/gemma2.py
@@ -1,10 +1,12 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask


@dataclass
@@ -20,6 +22,9 @@ class ModelArgs(BaseModelArgs):
    num_key_value_heads: int
    rope_theta: float = 10000
    rope_traditional: bool = False
+    attn_logit_softcapping: float = 50.0
+    final_logit_softcapping: float = 30.0
+    query_pre_attn_scalar: float = 144.0


 class RMSNorm(nn.Module):
@@ -39,15 +44,16 @@ class Attention(nn.Module):
        dim = args.hidden_size
        self.n_heads = n_heads = args.num_attention_heads
        self.n_kv_heads = n_kv_heads = args.num_key_value_heads
+        self.repeats = n_heads // n_kv_heads
        self.head_dim = head_dim = args.head_dim

-        self.scale = head_dim**-0.5
+        self.scale = 1.0 / (args.query_pre_attn_scalar**0.5)

        self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=False)
        self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
        self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
        self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=False)
-
+        self.attn_logit_softcapping = args.attn_logit_softcapping
        self.rope = nn.RoPE(
            head_dim,
            traditional=args.rope_traditional,
@@ -58,13 +64,10 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape
-
        queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)
-
-        # Prepare the queries, keys and values for the attention computation
        queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)
        keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
        values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
@@ -77,10 +80,25 @@ class Attention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
-        )
+        queries = queries * self.scale

+        if self.repeats > 1:
+            queries = queries.reshape(
+                B, self.n_kv_heads, self.repeats, L, self.head_dim
+            )
+            keys = mx.expand_dims(keys, 2)
+            values = mx.expand_dims(values, 2)
+
+        scores = queries @ keys.swapaxes(-1, -2)
+        scores = mx.tanh(scores / self.attn_logit_softcapping)
+        scores *= self.attn_logit_softcapping
+
+        if mask is not None:
+            scores = scores + mask
+        scores = mx.softmax(scores, precise=True, axis=-1)
+        output = scores @ values
+        if self.repeats > 1:
+            output = output.reshape(B, self.n_heads, L, self.head_dim)
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.o_proj(output)

@@ -93,7 +111,7 @@ class MLP(nn.Module):
        self.up_proj = nn.Linear(dim, hidden_dim, bias=False)

    def __call__(self, x) -> mx.array:
-        return self.down_proj(nn.gelu(self.gate_proj(x)) * self.up_proj(x))
+        return self.down_proj(nn.gelu_approx(self.gate_proj(x)) * self.up_proj(x))


 class TransformerBlock(nn.Module):
@@ -117,7 +135,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.self_attn(self.input_layernorm(x), mask, cache)
        h = x + self.post_attention_layernorm(r)
@@ -147,10 +165,7 @@ class GemmaModel(nn.Module):
        h = self.embed_tokens(inputs)
        h = h * (self.args.hidden_size**0.5)

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -165,6 +180,7 @@ class Model(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.model_type = args.model_type
+        self.final_logit_softcapping = args.final_logit_softcapping
        self.model = GemmaModel(args)
        self.args = args

@@ -175,16 +191,10 @@ class Model(nn.Module):
    ):
        out = self.model(inputs, cache)
        out = self.model.embed_tokens.as_linear(out)
+        out = mx.tanh(out / self.final_logit_softcapping)
+        out = out * self.final_logit_softcapping
        return out

    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.head_dim
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/gpt2.py
+++ b/llms/mlx_lm/models/gpt2.py
@@ -1,11 +1,13 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Tuple, Union

 import mlx.core as mx
 import mlx.nn as nn
 import numpy as np

-from .base import BaseModelArgs, create_additive_causal_mask
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -44,7 +46,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -59,8 +61,8 @@ class Attention(nn.Module):
        if cache is not None:
            keys, values = cache.update_and_fetch(keys, values)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )

        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
@@ -98,7 +100,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.attn(self.ln_1(x), mask, cache)
        h = x + r
@@ -136,10 +138,7 @@ class GPT2Model(nn.Module):
            position_ids = mx.array(np.arange(L))
            hidden_states += self.wpe(position_ids)

-            mask = create_additive_causal_mask(
-                hidden_states.shape[1], cache[0].offset if cache is not None else 0
-            )
-            mask = mask.astype(hidden_states.dtype)
+            mask = create_attention_mask(hidden_states, cache)

        if cache is None:
            cache = [None] * len(self.h)
@@ -197,11 +196,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.model.h
-
-    @property
-    def head_dim(self):
-        return self.args.n_embd // self.args.n_head
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/gpt_bigcode.py
+++ b/llms/mlx_lm/models/gpt_bigcode.py
@@ -1,11 +1,13 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Tuple, Union

 import mlx.core as mx
 import mlx.nn as nn
 import numpy as np

-from .base import BaseModelArgs, create_additive_causal_mask
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -55,7 +57,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -72,8 +74,8 @@ class Attention(nn.Module):
        if cache is not None:
            keys, values = cache.update_and_fetch(keys, values)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.c_proj(output)
@@ -112,7 +114,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.attn(self.ln_1(x), mask, cache)
        h = x + r
@@ -147,10 +149,7 @@ class GPTBigCodeModel(nn.Module):
            position_ids = mx.array(np.arange(L))
            hidden_states += self.wpe(position_ids)

-            mask = create_additive_causal_mask(
-                hidden_states.shape[1], cache[0].offset if cache is not None else 0
-            )
-            mask = mask.astype(hidden_states.dtype)
+            mask = create_attention_mask(hidden_states, cache)

        if cache is None:
            cache = [None] * len(self.h)
@@ -185,11 +184,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.transformer.h
-
-    @property
-    def head_dim(self):
-        return self.args.n_embd // self.args.n_head
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/gpt_neox.py
+++ b/llms/mlx_lm/models/gpt_neox.py
@@ -0,0 +1,216 @@
+# Copyright © 2023-2024 Apple Inc.
+
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Tuple, Union
+
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention
+
+# Based on the transformers implementation at:
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py
+
+
+@dataclass
+class ModelArgs(BaseModelArgs):  # configuration values read by the GPT-NeoX modules
+    model_type: str
+    max_position_embeddings: int
+    hidden_size: int
+    num_attention_heads: int
+    num_hidden_layers: int
+    layer_norm_eps: float
+    vocab_size: int
+    rotary_emb_base: int  # passed to nn.RoPE as `base`
+    rotary_pct: float  # RoPE dims = int(head_dim * rotary_pct)
+    num_key_value_heads: Optional[int] = None  # None -> num_attention_heads
+
+    def __post_init__(self):
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
+
+
+class Attention(nn.Module):
+    """Self-attention with a fused QKV projection and rotary position embeddings."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        assert (
+            args.hidden_size % args.num_attention_heads == 0
+        ), "hidden_size must be divisible by num_attention_heads"
+
+        dim, n_heads = args.hidden_size, args.num_attention_heads
+        self.hidden_size = dim
+        self.num_attention_heads = n_heads
+        self.head_dim = dim // n_heads
+        self.scale = self.head_dim**-0.5
+
+        # RoPE is configured for `rotary_pct` of each head's features.
+        self.rope = nn.RoPE(
+            dims=int(self.head_dim * args.rotary_pct),
+            traditional=False,
+            base=args.rotary_emb_base,
+        )
+        # A single linear layer emits queries, keys and values together.
+        self.query_key_value = nn.Linear(dim, 3 * dim, bias=True)
+        self.dense = nn.Linear(dim, dim, bias=True)
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+    ) -> mx.array:
+        """Attend over ``x``; when ``cache`` is given it supplies the RoPE offset
+        and is updated with the new keys and values."""
+        B, L, D = x.shape
+
+        # View the fused projection per head, then cut it into three equal parts.
+        fused = self.query_key_value(x)
+        per_head = fused.shape[:-1] + (self.num_attention_heads, 3 * self.head_dim)
+        parts = fused.reshape(*per_head).split(3, -1)
+        queries, keys, values = (part.transpose(0, 2, 1, 3) for part in parts)
+
+        if cache is None:
+            queries, keys = self.rope(queries), self.rope(keys)
+        else:
+            position = cache.offset
+            queries = self.rope(queries, offset=position)
+            keys = self.rope(keys, offset=position)
+            keys, values = cache.update_and_fetch(keys, values)
+
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
+        )
+        # Merge the heads back into the feature axis before the output projection.
+        return self.dense(output.transpose(0, 2, 1, 3).reshape(B, L, -1))
+
+
+class MLP(nn.Module):
+    """Feed-forward block that expands to 4x the hidden size and back."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.hidden_size = width = args.hidden_size
+        self.dense_h_to_4h = nn.Linear(width, 4 * width)
+        self.dense_4h_to_h = nn.Linear(4 * width, width)
+
+    def __call__(self, x) -> mx.array:  # gelu_approx: FastGELUActivation in transformers
+        return self.dense_4h_to_h(nn.gelu_approx(self.dense_h_to_4h(x)))
+
+
+class TransformerBlock(nn.Module):
+    """One GPT-NeoX layer: attention and MLP outputs are added to the input."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        dim, eps = args.hidden_size, args.layer_norm_eps
+        self.hidden_size = dim
+        self.layer_norm_eps = eps
+        self.attention = Attention(args)
+        self.mlp = MLP(args)
+
+        # Each branch gets its own LayerNorm over the block input.
+        self.input_layernorm = nn.LayerNorm(dim, eps=eps)
+        self.post_attention_layernorm = nn.LayerNorm(dim, eps=eps)
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+    ) -> mx.array:
+        """Apply the layer to ``x``; ``mask`` and ``cache`` go to attention only."""
+        # NeoX runs attention and feedforward network in parallel: both read
+        # the block input instead of the MLP consuming the attention output.
+        attn_out = self.attention(self.input_layernorm(x), mask, cache)
+        mlp_out = self.mlp(self.post_attention_layernorm(x))
+
+        return attn_out + mlp_out + x
+
+
+class GPTNeoXModel(nn.Module):
+    """Embedding, transformer stack, final LayerNorm and output projection."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.hidden_size = args.hidden_size
+        self.vocab_size = args.vocab_size
+        self.num_hidden_layers = args.num_hidden_layers
+        self.layer_norm_eps = args.layer_norm_eps
+        assert self.vocab_size > 0
+
+        self.embed_in = nn.Embedding(self.vocab_size, self.hidden_size)
+        self.embed_out = nn.Linear(self.hidden_size, self.vocab_size, bias=False)
+        self.h = [TransformerBlock(args=args) for _ in range(self.num_hidden_layers)]
+        self.final_layer_norm = nn.LayerNorm(self.hidden_size, eps=self.layer_norm_eps)
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        cache=None,
+    ):
+        """Return the output projection of the normalized final hidden states."""
+        # Unpacking requires `inputs` to have exactly two axes.
+        _, _ = inputs.shape
+
+        hidden_states = self.embed_in(inputs)
+        mask = create_attention_mask(hidden_states, cache)
+
+        # Without a cache, hand every layer `None`.
+        layer_caches = [None] * len(self.h) if cache is None else cache
+        for block, layer_cache in zip(self.h, layer_caches):
+            hidden_states = block(hidden_states, mask, cache=layer_cache)
+
+        return self.embed_out(self.final_layer_norm(hidden_states))
+
+
+class Model(nn.Module):
+    """Top-level GPT-NeoX wrapper: runs the network and remaps checkpoint keys."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.args = args
+        self.model_type = args.model_type
+        self.model = GPTNeoXModel(args)
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        cache=None,
+    ):
+        """Forward ``inputs`` (and the optional ``cache``) to the wrapped model."""
+        return self.model(inputs, cache)
+
+    def sanitize(self, weights):
+        """Return a copy of ``weights`` with keys renamed to this module's layout.
+
+        Keys ending with an ignored suffix are dropped. Remaining keys get a
+        ``model.`` prefix unless already present; ``.gpt_neox.layers.`` becomes
+        ``.h.`` and any other ``.gpt_neox.`` collapses to ``.``. Values are
+        passed through unchanged.
+        """
+        # Created through register_buffer in Pytorch, not needed here.
+        # Built once as a tuple so `str.endswith` can test all suffixes at once.
+        ignored_suffixes = (
+            ".attention.bias",
+            ".attention.masked_bias",
+            ".attention.rotary_emb.inv_freq",
+        )
+
+        new_weights = {}
+        for w_key, w_value in weights.items():
+            if w_key.endswith(ignored_suffixes):
+                continue
+            if not w_key.startswith("model."):
+                w_key = f"model.{w_key}"
+            w_key = w_key.replace(".gpt_neox.layers.", ".h.")
+            w_key = w_key.replace(".gpt_neox.", ".")
+            new_weights[w_key] = w_value
+        return new_weights
+
+    @property
+    def layers(self):
+        """The list of transformer blocks (``self.model.h``)."""
+        return self.model.h
--- a/llms/mlx_lm/models/internlm2.py
+++ b/llms/mlx_lm/models/internlm2.py
@@ -1,10 +1,12 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Tuple, Union

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -17,6 +19,7 @@ class ModelArgs(BaseModelArgs):
    rms_norm_eps: float
    vocab_size: int
    bias: bool = True
+    max_position_embeddings: int = 32768
    num_key_value_heads: int = None
    rope_theta: float = 10000
    rope_traditional: bool = False
@@ -32,8 +35,50 @@ class ModelArgs(BaseModelArgs):
            if not all(key in self.rope_scaling for key in required_keys):
                raise ValueError(f"rope_scaling must contain keys {required_keys}")

-            if self.rope_scaling["type"] != "linear":
-                raise ValueError("rope_scaling 'type' currently only supports 'linear'")
+            if self.rope_scaling["type"] not in ["linear", "dynamic"]:
+                raise ValueError(
+                    "rope_scaling 'type' currently only supports 'linear' or 'dynamic"
+                )
+
+
+class DynamicNTKScalingRoPE(nn.Module):
+    """Implements the rotary positional encoding with Dynamic NTK scaling."""
+
+    def __init__(
+        self,
+        dims: int,
+        max_position_embeddings: int = 2048,
+        traditional: bool = False,
+        base: float = 10000,
+        scale: float = 1.0,
+    ):
+        super().__init__()
+        self.max_position_embeddings = max_position_embeddings
+        self.original_base = base
+        self.dims = dims
+        self.traditional = traditional
+        self.scale = scale
+
+    def extra_repr(self):
+        return f"{self.dims}, traditional={self.traditional}, max_position_embeddings={self.max_position_embeddings}, scaling_factor={self.scale}"
+
+    def __call__(self, x, offset: int = 0):
+        # NOTE(review): axis 1 of `x` is treated as the sequence length; confirm callers pass it there.
+        seq_len = x.shape[1] + offset
+        base = self.original_base
+        if seq_len > self.max_position_embeddings:
+            # Beyond the configured window the base is rescaled as a function of seq_len.
+            base *= (
+                (self.scale * seq_len / self.max_position_embeddings) - (self.scale - 1)
+            ) ** (self.dims / (self.dims - 2))
+        return mx.fast.rope(
+            x,
+            self.dims,
+            traditional=self.traditional,
+            base=base,
+            scale=self.scale,
+            offset=offset,
+        )


 class Attention(nn.Module):
@@ -56,10 +101,12 @@ class Attention(nn.Module):
        rope_scale = (
            1 / args.rope_scaling["factor"]
            if args.rope_scaling is not None and args.rope_scaling["type"] == "linear"
-            else 1
+            else 2.0
        )
-        self.rope = nn.RoPE(
+
+        self.rope = DynamicNTKScalingRoPE(
            head_dim,
+            max_position_embeddings=args.max_position_embeddings,
            traditional=args.rope_traditional,
            base=args.rope_theta,
            scale=rope_scale,
@@ -69,7 +116,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -94,8 +141,8 @@ class Attention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.wo(output)
@@ -124,7 +171,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.attention(self.attention_norm(x), mask, cache)
        h = x + r
@@ -150,10 +197,7 @@ class InternLM2Model(nn.Module):
    ):
        h = self.tok_embeddings(inputs)

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -185,14 +229,10 @@ class Model(nn.Module):
            out = self.output(out)
        return out

+    def sanitize(self, weights):
+        # Remove unused precomputed rotary freqs
+        return {k: v for k, v in weights.items() if "attention.rope.inv_freq" not in k}
+
    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/llama.py
+++ b/llms/mlx_lm/models/llama.py
@@ -1,10 +1,12 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Union

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs, KVCache, create_additive_causal_mask
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -16,6 +18,8 @@ class ModelArgs(BaseModelArgs):
    num_attention_heads: int
    rms_norm_eps: float
    vocab_size: int
+    head_dim: Optional[int] = None
+    max_position_embeddings: Optional[int] = None
    num_key_value_heads: Optional[int] = None
    attention_bias: bool = False
    mlp_bias: bool = False
@@ -29,12 +33,115 @@ class ModelArgs(BaseModelArgs):
            self.num_key_value_heads = self.num_attention_heads

        if self.rope_scaling:
-            required_keys = {"factor", "type"}
-            if not all(key in self.rope_scaling for key in required_keys):
-                raise ValueError(f"rope_scaling must contain keys {required_keys}")
+            if not "factor" in self.rope_scaling:
+                raise ValueError(f"rope_scaling must contain 'factor'")
+            rope_type = self.rope_scaling.get("type") or self.rope_scaling.get(
+                "rope_type"
+            )
+            if rope_type is None:
+                raise ValueError(
+                    f"rope_scaling must contain either 'type' or 'rope_type'"
+                )
+            if rope_type not in ["linear", "dynamic", "llama3"]:
+                raise ValueError(
+                    "rope_scaling 'type' currently only supports 'linear', 'dynamic' or 'llama3'"
+                )

-            if self.rope_scaling["type"] != "linear":
-                raise ValueError("rope_scaling 'type' currently only supports 'linear'")
+
+class DynamicNTKScalingRoPE(nn.Module):
+    """Implements the rotary positional encoding with Dynamic NTK scaling and Llama 3 RoPE."""
+
+    def __init__(
+        self,
+        dims: int,
+        max_position_embeddings: int = 2048,
+        traditional: bool = False,
+        base: float = 10000,
+        scale: float = 1.0,
+        rope_type: str = "default",
+        rope_scaling: dict = None,
+    ):
+        super().__init__()
+        self.dims = dims
+        self.max_position_embeddings = max_position_embeddings
+        self.traditional = traditional
+        self.scale = scale
+        self.rope_type = rope_type
+        self.rope_scaling = rope_scaling
+        self.base = base
+        # Must run last: it reads the attributes set above and may clear `base`.
+        self.compute_freqs()
+
+    def compute_freqs(self):
+        """Set ``self._freqs``: an array for the "llama3" rope type, else ``None``."""
+        if self.rope_type != "llama3":
+            self._freqs = None
+            return
+        cfg = self.rope_scaling
+        factor = cfg["factor"]
+        low_freq_factor = cfg.get("low_freq_factor", 1.0)
+        high_freq_factor = cfg.get("high_freq_factor", 4.0)
+        orig_len = cfg.get("original_max_position_embeddings", 8192)
+        low_freq_wavelen = orig_len / low_freq_factor
+        high_freq_wavelen = orig_len / high_freq_factor
+
+        unscaled = self.base ** (mx.arange(0, self.dims, 2) / self.dims)
+        wavelens = 2 * mx.pi * unscaled
+        # Entries with a wavelength above the low-frequency one are multiplied by `factor`.
+        scaled = mx.where(wavelens > low_freq_wavelen, unscaled * factor, unscaled)
+        # Entries strictly between the two thresholds take a blended value.
+        in_between = (wavelens > high_freq_wavelen) & (wavelens < low_freq_wavelen)
+        blend = (orig_len / wavelens - low_freq_factor) / (
+            high_freq_factor - low_freq_factor
+        )
+        self._freqs = mx.where(in_between, scaled / ((1 - blend) / factor + blend), scaled)
+        # `base` is cleared so __call__ passes base=None together with the freqs.
+        self.base = None
+
+    def extra_repr(self):
+        return (
+            f"{self.dims}, traditional={self.traditional}, "
+            f"max_position_embeddings={self.max_position_embeddings}, "
+            f"scaling_factor={self.scale}, rope_type={self.rope_type}"
+        )
+
+    def __call__(self, x, offset: int = 0):
+        return mx.fast.rope(
+            x,
+            self.dims,
+            traditional=self.traditional,
+            base=self.base,
+            scale=self.scale,
+            offset=offset,
+            freqs=self._freqs,
+        )
+
+
+def initialize_rope(args: ModelArgs):
+    """Build the rotary embedding module described by ``args``.
+
+    The rope type is read from ``args.rope_scaling`` under either "type" or
+    "rope_type" and defaults to "default"; only "linear" changes the scale,
+    to ``1 / factor``.
+    """
+    head_dim = args.head_dim or args.hidden_size // args.num_attention_heads
+    scaling = args.rope_scaling
+
+    kind = "default"
+    if scaling is not None:
+        kind = scaling.get("type") or scaling.get("rope_type") or "default"
+    # The scaling is handled internally for llama3, so it keeps a scale of 1.0.
+    scale = 1 / scaling["factor"] if kind == "linear" else 1.0
+
+    return DynamicNTKScalingRoPE(
+        dims=head_dim,
+        max_position_embeddings=args.max_position_embeddings,
+        traditional=args.rope_traditional,
+        base=args.rope_theta,
+        scale=scale,
+        rope_type=kind,
+        rope_scaling=scaling,
+    )


 class Attention(nn.Module):
@@ -45,7 +152,8 @@ class Attention(nn.Module):
        self.n_heads = n_heads = args.num_attention_heads
        self.n_kv_heads = n_kv_heads = args.num_key_value_heads

-        head_dim = args.hidden_size // n_heads
+        self.head_dim = head_dim = args.head_dim or args.hidden_size // n_heads
+
        self.scale = head_dim**-0.5
        if hasattr(args, "attention_bias"):
            attention_bias = args.attention_bias
@@ -57,23 +165,13 @@ class Attention(nn.Module):
        self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=attention_bias)
        self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=attention_bias)

-        rope_scale = (
-            1 / args.rope_scaling["factor"]
-            if args.rope_scaling is not None and args.rope_scaling["type"] == "linear"
-            else 1
-        )
-        self.rope = nn.RoPE(
-            head_dim,
-            traditional=args.rope_traditional,
-            base=args.rope_theta,
-            scale=rope_scale,
-        )
+        self.rope = initialize_rope(args)

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -92,9 +190,10 @@ class Attention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )
+
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.o_proj(output)

@@ -135,7 +234,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.self_attn(self.input_layernorm(x), mask, cache)
        h = x + r
@@ -164,12 +263,7 @@ class LlamaModel(nn.Module):
    ):
        h = self.embed_tokens(inputs)

-        mask = None
-        if h.shape[1] > 1:
-            mask = create_additive_causal_mask(
-                h.shape[1], cache[0].offset if cache is not None else 0
-            )
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -210,11 +304,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/mamba.py
+++ b/llms/mlx_lm/models/mamba.py
@@ -0,0 +1,217 @@
+# Copyright © 2024 Apple Inc.
+
+import math
+from dataclasses import dataclass
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from .base import BaseModelArgs
+from .cache import MambaCache
+
+
+@dataclass
+class ModelArgs(BaseModelArgs):
+    """Configuration fields for the Mamba model defined in this file."""
+
+    model_type: str
+    vocab_size: int
+    hidden_size: int
+    intermediate_size: int
+    state_size: int
+    num_hidden_layers: int
+    conv_kernel: int
+    use_bias: bool
+    use_conv_bias: bool
+    time_step_rank: int
+    tie_word_embeddings: bool = True
+
+    def __post_init__(self):
+        # Fill a field from its alternative attribute name, in the listed order.
+        # NOTE(review): all targets are required fields, so this may be dead code.
+        aliases = (
+            ("hidden_size", "d_model"),
+            ("intermediate_size", "d_inner"),
+            ("state_size", "d_state"),
+            ("num_hidden_layers", "n_layer"),
+            ("num_hidden_layers", "n_layers"),
+            ("conv_kernel", "d_conv"),
+            ("use_bias", "bias"),
+            ("use_conv_bias", "conv_bias"),
+        )
+        for name, alias in aliases:
+            if not hasattr(self, name) and hasattr(self, alias):
+                setattr(self, name, getattr(self, alias))
+        if self.time_step_rank == "auto":
+            self.time_step_rank = math.ceil(self.hidden_size / 16)
+
+
+class DepthWiseConv1d(nn.Module):
+    """Depth-wise 1D convolution that also returns its trailing input window."""
+
+    def __init__(self, channels, kernel_size, bias=True, padding=0):
+        super().__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.padding = padding  # stored, but not read by __call__
+        self.weight = mx.random.normal((self.channels, kernel_size, 1))
+        self.bias = mx.zeros((channels,)) if bias else None
+
+    def __call__(self, x, cache=None):
+        """Return (y, window): conv output and the last K - 1 input steps."""
+        groups, K, _ = self.weight.shape
+        # Prepend the cached window when given; otherwise pad K - 1 steps in front.
+        if cache is not None:
+            x = mx.concatenate([cache, x], axis=1)
+        else:
+            x = mx.pad(x, [(0, 0), (K - 1, 0), (0, 0)])
+        y = mx.conv_general(x, self.weight, groups=groups)
+        if self.bias is not None:
+            y = y + self.bias
+        # Explicit start index: with K == 1 a `-K + 1` slice would keep all of x.
+        return y, x[:, max(x.shape[1] - (K - 1), 0) :, :]
+
+
+class MambaBlock(nn.Module):
+    """Mamba mixer layer; `__call__` evaluates it one time step at a time."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.args = args
+        self.hidden_size = args.hidden_size
+        self.ssm_state_size = args.state_size
+        self.conv_kernel_size = args.conv_kernel
+        self.intermediate_size = args.intermediate_size
+        self.time_step_rank = int(args.time_step_rank)
+        self.use_conv_bias = args.use_conv_bias
+        # One projection yields both halves that __call__ splits into x_t and z_t.
+        self.in_proj = nn.Linear(
+            self.hidden_size, self.intermediate_size * 2, bias=args.use_bias
+        )
+        self.conv1d = DepthWiseConv1d(
+            channels=self.intermediate_size,
+            kernel_size=self.conv_kernel_size,
+            bias=self.use_conv_bias,
+            padding=self.conv_kernel_size - 1,
+        )
+        # Its output is split in ssm_step into delta, B and C (in that order).
+        self.x_proj = nn.Linear(
+            self.intermediate_size,
+            self.time_step_rank + 2 * self.ssm_state_size,
+            bias=False,
+        )
+        self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True)
+        # A is intermediate_size identical rows of 1..ssm_state_size; its log is kept.
+        A = mx.repeat(
+            mx.arange(1.0, self.ssm_state_size + 1.0).reshape([1, self.ssm_state_size]),
+            repeats=self.intermediate_size,
+            axis=0,
+        )
+        self.A_log = mx.log(A)
+        self.D = mx.ones([self.intermediate_size])
+        self.out_proj = nn.Linear(
+            self.intermediate_size, self.hidden_size, bias=args.use_bias
+        )
+
+    def ssm_step(self, x, state=None):
+        """Advance the state recurrence by one step and return (y, new_state)."""
+        A = -mx.exp(self.A_log)
+        D = self.D
+        deltaBC = self.x_proj(x)
+        delta, B, C = mx.split(
+            deltaBC,
+            indices_or_sections=[
+                self.time_step_rank,
+                self.time_step_rank + self.ssm_state_size,
+            ],
+            axis=-1,
+        )
+        delta = nn.softplus(self.dt_proj(delta))
+        new_state = mx.expand_dims(delta * x, -1) * mx.expand_dims(B, 1)
+        if state is not None:  # without a previous state only the input term remains
+            new_state += state * mx.exp(mx.expand_dims(delta, -1) * A)
+        y = (new_state @ mx.expand_dims(C, -1)).squeeze(2)
+        y = y + D * x
+        return y, new_state
+
+    def __call__(self, x, cache):
+        """Run every time step of `x`; cache[0] gets the conv window, cache[1] the state."""
+        B, T, D = x.shape
+        if cache is None:
+            cache = [None, None]
+        outputs = []
+        for t in range(T):
+            xt = x[:, t, :]
+            xz = self.in_proj(xt)
+            x_t, z_t = xz.split(indices_or_sections=2, axis=1)
+            conv_out, cache[0] = self.conv1d(mx.expand_dims(x_t, 1), cache[0])
+            x_t = conv_out.squeeze(1)
+            x_t = nn.silu(x_t)
+            y_t, cache[1] = self.ssm_step(x_t, cache[1])
+            z_t = nn.silu(z_t)
+            output_t = y_t * z_t
+            output_t = self.out_proj(output_t)
+            outputs.append(output_t)
+        output = mx.stack(outputs, axis=1)
+        return output
+
+
+class ResidualBlock(nn.Module):  # RMSNorm -> MambaBlock with the input added back
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.mixer = MambaBlock(args)
+        self.norm = nn.RMSNorm(args.hidden_size)
+
+    def __call__(self, x: mx.array, cache):
+        return self.mixer(self.norm(x), cache) + x  # normalize, mix, then add x
+
+
+class Mamba(nn.Module):
+    """Token embedding, a stack of ResidualBlocks, and a final RMSNorm."""
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.embeddings = nn.Embedding(args.vocab_size, args.hidden_size)
+        self.layers = [ResidualBlock(args) for _ in range(args.num_hidden_layers)]
+        self.norm_f = nn.RMSNorm(args.hidden_size)
+
+    def __call__(self, x: mx.array, cache):
+        hidden = self.embeddings(x)
+        layer_caches = [None] * len(self.layers) if cache is None else cache
+        for block, block_cache in zip(self.layers, layer_caches):
+            hidden = block(hidden, block_cache)
+        return self.norm_f(hidden)
+
+
+class Model(nn.Module):
+    """Mamba language model: backbone plus a tied or separate output projection."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.args = args
+        self.model_type = args.model_type
+        self.backbone = Mamba(args)
+        if not args.tie_word_embeddings:
+            self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
+
+    def __call__(self, inputs: mx.array, cache=None):
+        """Return logits for 2-D `inputs`; `cache` is handed to the backbone."""
+        B, T = inputs.shape  # unused, but rejects inputs that are not 2-D
+        hidden = self.backbone(inputs, cache)
+        if self.args.tie_word_embeddings:
+            return self.backbone.embeddings.as_linear(hidden)
+        return self.lm_head(hidden)
+
+    def sanitize(self, weights):
+        """Move axis 2 to position 1 for conv1d weights whose last axis is not 1."""
+        for name, tensor in weights.items():
+            if "conv1d.weight" in name and tensor.shape[-1] != 1:
+                weights[name] = tensor.moveaxis(2, 1)
+        return weights
+
+    def make_cache(self):
+        """Create one MambaCache per layer."""
+        return [MambaCache() for _ in self.layers]
+
+    @property
+    def layers(self):
+        return self.backbone.layers
--- a/llms/mlx_lm/models/minicpm.py
+++ b/llms/mlx_lm/models/minicpm.py
@@ -1,11 +1,13 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Tuple, Union

 import mlx.core as mx
 import mlx.nn as nn
 import numpy as np

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -83,7 +85,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ):
        B, L, _ = x.shape

@@ -103,8 +105,8 @@ class Attention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        attn_output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        attn_output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )

        attn_output = attn_output.transpose(0, 2, 1, 3).reshape(B, L, -1)
@@ -133,7 +135,7 @@ class DecoderLayer(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.self_attn(self.input_layernorm(x), mask, cache)
        h = x + r * (self.scale_depth / np.sqrt(self.num_hidden_layers))
@@ -160,10 +162,7 @@ class MiniCPMModel(nn.Module):
    ):
        h = self.embed_tokens(inputs) * self.args.scale_emb

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -206,11 +205,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/mixtral.py
+++ b/llms/mlx_lm/models/mixtral.py
@@ -1,11 +1,13 @@
+# Copyright © 2023-2024 Apple Inc.
+
 import math
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Tuple, Union

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention
 from .switch_layers import SwitchGLU


@@ -64,7 +66,7 @@ class MixtralAttention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -85,8 +87,8 @@ class MixtralAttention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.o_proj(output)
@@ -136,7 +138,7 @@ class MixtralDecoderLayer(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.self_attn(self.input_layernorm(x), mask, cache)
        h = x + r
@@ -164,11 +166,7 @@ class MixtralModel(nn.Module):
    ):
        h = self.embed_tokens(inputs)

-        mask = None
-        T = h.shape[1]
-        if T > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(T)
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -217,11 +215,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/nemotron.py
+++ b/llms/mlx_lm/models/nemotron.py
@@ -0,0 +1,217 @@
+# Copyright © 2024 Apple Inc.
+
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Dict, Optional, Union
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention
+
+
+@dataclass
+class ModelArgs(BaseModelArgs):
+    """Configuration fields for the Nemotron model defined in this file."""
+
+    model_type: str
+    hidden_size: int
+    hidden_act: str
+    num_hidden_layers: int
+    intermediate_size: int
+    num_attention_heads: int
+    norm_eps: float
+    vocab_size: int
+    num_key_value_heads: int
+    head_dim: Optional[int] = None
+    max_position_embeddings: Optional[int] = None
+    attention_bias: bool = False
+    mlp_bias: bool = False
+    partial_rotary_factor: float = 0.5
+    rope_theta: float = 10000.0
+    rope_traditional: bool = False
+    rope_scaling: Optional[Dict[str, Union[float, str]]] = None
+    tie_word_embeddings: bool = False
+
+    def __post_init__(self):
+        """Validate `rope_scaling` when set; only the 'linear' type is accepted."""
+        if not self.rope_scaling:
+            return
+        if "factor" not in self.rope_scaling:
+            raise ValueError("rope_scaling must contain 'factor'")
+        rope_type = self.rope_scaling.get("type") or self.rope_scaling.get("rope_type")
+        if rope_type is None:
+            raise ValueError("rope_scaling must contain either 'type' or 'rope_type'")
+        if rope_type not in ["linear"]:
+            raise ValueError("rope_scaling 'type' currently only supports 'linear'")
+
+
+@partial(mx.compile, shapeless=True)
+def relu_squared(x):  # element-wise square of relu(x)
+    return nn.relu(x).square()
+
+
+class NemotronLayerNorm1P(nn.LayerNorm):  # LayerNorm that adds 1 to the stored weight
+    def __call__(self, x):
+        weight = self.weight + 1 if "weight" in self else None  # None when absent
+        bias = self.bias if "bias" in self else None
+        return mx.fast.layer_norm(x, weight, bias, self.eps)
+
+
+class Attention(nn.Module):
+    """Attention with its own KV head count and RoPE on part of each head."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+
+        dim = args.hidden_size
+        self.n_heads = n_heads = args.num_attention_heads
+        self.n_kv_heads = n_kv_heads = args.num_key_value_heads
+
+        self.head_dim = head_dim = args.head_dim or args.hidden_size // n_heads
+        self.partial_rotary_factor = args.partial_rotary_factor
+
+        self.scale = head_dim**-0.5
+        attention_bias = getattr(args, "attention_bias", False)
+
+        self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=attention_bias)
+        self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=attention_bias)
+        self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=attention_bias)
+        self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=attention_bias)
+
+        rope_scale = 1.0
+        scaling = args.rope_scaling or {}
+        # ModelArgs accepts the type under 'type' or 'rope_type'; honor both here.
+        if (scaling.get("type") or scaling.get("rope_type")) == "linear":
+            assert isinstance(scaling["factor"], float)
+            rope_scale = 1 / scaling["factor"]
+        self.rope = nn.RoPE(
+            int(self.partial_rotary_factor * self.head_dim),
+            base=args.rope_theta,
+            scale=rope_scale,
+        )
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+    ) -> mx.array:
+        B, L, _ = x.shape
+        queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)
+
+        # Prepare the queries, keys and values for the attention computation
+        queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)
+        keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
+        values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
+
+        if cache is not None:
+            queries = self.rope(queries, offset=cache.offset)
+            keys = self.rope(keys, offset=cache.offset)
+            keys, values = cache.update_and_fetch(keys, values)
+        else:
+            queries = self.rope(queries)
+            keys = self.rope(keys)
+
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
+        )
+        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
+        return self.o_proj(output)
+
+
+class MLP(nn.Module):
+    """Two-layer feed-forward block using the squared-ReLU activation."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        width, inner = args.hidden_size, args.intermediate_size
+        self.down_proj = nn.Linear(inner, width, bias=args.mlp_bias)
+        self.up_proj = nn.Linear(width, inner, bias=args.mlp_bias)
+
+    def __call__(self, x) -> mx.array:
+        """Project up, apply relu_squared, then project back down."""
+        expanded = self.up_proj(x)
+        return self.down_proj(relu_squared(expanded))
+
+
+class TransformerBlock(nn.Module):
+    """Pre-norm block: attention then MLP, each added back to its input."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.num_attention_heads = args.num_attention_heads
+        self.hidden_size = args.hidden_size
+        self.self_attn = Attention(args)
+        self.mlp = MLP(args)
+        self.input_layernorm = NemotronLayerNorm1P(args.hidden_size, eps=args.norm_eps)
+        self.post_attention_layernorm = NemotronLayerNorm1P(
+            args.hidden_size, eps=args.norm_eps
+        )
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+    ) -> mx.array:
+        # Residual connection around attention, then around the MLP.
+        h = x + self.self_attn(self.input_layernorm(x), mask, cache)
+        return h + self.mlp(self.post_attention_layernorm(h))
+
+
+class NemotronModel(nn.Module):
+    """Token embedding, a stack of TransformerBlocks, and a final layer norm."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.args = args
+        self.vocab_size = args.vocab_size
+        self.num_hidden_layers = args.num_hidden_layers
+        assert self.vocab_size > 0
+        self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
+        self.layers = [
+            TransformerBlock(args=args) for _ in range(args.num_hidden_layers)
+        ]
+        self.norm = NemotronLayerNorm1P(args.hidden_size, eps=args.norm_eps)
+
+    def __call__(self, inputs: mx.array, cache=None):
+        """Return normalized hidden states for `inputs`.
+
+        `cache`, when given, is iterated in step with the layers (one entry each).
+        """
+        hidden = self.embed_tokens(inputs)
+        # The mask is built before `cache` is replaced by per-layer `None`s.
+        mask = create_attention_mask(hidden, cache)
+        layer_caches = [None] * len(self.layers) if cache is None else cache
+
+        for block, block_cache in zip(self.layers, layer_caches):
+            hidden = block(hidden, mask, cache=block_cache)
+
+        return self.norm(hidden)
+
+
+class Model(nn.Module):
+    """Nemotron language model: backbone plus a tied or separate output head."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.args = args
+        self.model_type = args.model_type
+        self.model = NemotronModel(args)
+        if not args.tie_word_embeddings:
+            self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
+
+    def __call__(self, inputs: mx.array, cache=None):
+        """Run the backbone and project its output to vocabulary logits."""
+        hidden = self.model(inputs, cache)
+        if self.args.tie_word_embeddings:
+            # Reuse the embedding matrix as the output projection.
+            return self.model.embed_tokens.as_linear(hidden)
+        # Otherwise use the separate output layer created in __init__.
+        return self.lm_head(hidden)
+
+    @property
+    def layers(self):
+        """The backbone's list of transformer blocks."""
+        return self.model.layers
--- a/llms/mlx_lm/models/olmo.py
+++ b/llms/mlx_lm/models/olmo.py
@@ -1,17 +1,19 @@
+# Copyright © 2023-2024 Apple Inc.
+
+import sys
 from dataclasses import dataclass
-from sys import exit
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask

 try:
    import hf_olmo
 except ImportError:
    print("To run olmo install ai2-olmo: pip install ai2-olmo")
-    exit(1)
+    sys.exit(1)


@dataclass
@@ -66,7 +68,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -96,7 +98,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.attend(self.att_norm(x), mask, cache)
        h = x + r
@@ -126,10 +128,7 @@ class Transformer(nn.Module):
    ):
        h = self.wte(inputs)

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.blocks)
@@ -175,11 +174,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.model.transformer.blocks
-
-    @property
-    def head_dim(self):
-        return self.args.d_model // self.args.n_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.n_heads
--- a/llms/mlx_lm/models/openelm.py
+++ b/llms/mlx_lm/models/openelm.py
@@ -1,10 +1,12 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -78,7 +80,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -105,8 +107,8 @@ class Attention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )

        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
@@ -150,7 +152,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.attn(self.attn_norm(x), mask, cache)
        h = x + r
@@ -180,10 +182,7 @@ class OpenELMModel(nn.Module):
    ):
        h = self.token_embeddings(inputs)

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -219,11 +218,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.transformer.layers
-
-    @property
-    def head_dim(self):
-        return self.args.head_dim
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_kv_heads
--- a/llms/mlx_lm/models/phi.py
+++ b/llms/mlx_lm/models/phi.py
@@ -1,3 +1,5 @@
+# Copyright © 2023-2024 Apple Inc.
+
 import math
 from dataclasses import dataclass
 from typing import Tuple
@@ -5,7 +7,7 @@ from typing import Tuple
 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -91,8 +93,13 @@ class PhiAttention(nn.Module):
            keys = self.rope(keys)

        scale = math.sqrt(1 / queries.shape[-1])
-        output = mx.fast.scaled_dot_product_attention(
-            queries.astype(mx.float32), keys, values, scale=scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries.astype(mx.float32),
+            keys,
+            values,
+            cache=cache,
+            scale=scale,
+            mask=mask,
        ).astype(values.dtype)

        output = output.moveaxis(2, 1).reshape(B, L, -1)
@@ -138,14 +145,12 @@ class PhiModel(nn.Module):

    def __call__(self, x, cache):
        x = self.embed_tokens(x)
+
+        mask = create_attention_mask(x, cache)
+
        if cache is None:
            cache = [None] * len(self.layers)

-        mask = None
-        if x.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(x.shape[1])
-            mask = mask.astype(x.dtype)
-
        for layer, c in zip(self.layers, cache):
            x = layer(x, mask, c)
        return self.final_layernorm(x)
@@ -162,19 +167,11 @@ class Model(nn.Module):
    def __call__(
        self,
        x: mx.array,
-        cache: mx.array = None,
-    ) -> Tuple[mx.array, mx.array]:
+        cache=None,
+    ) -> mx.array:
        y = self.model(x, cache)
        return self.lm_head(y)

    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/phi3.py
+++ b/llms/mlx_lm/models/phi3.py
@@ -1,10 +1,12 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs, KVCache
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention
 from .su_rope import SuScaledRotaryEmbedding


@@ -33,9 +35,9 @@ class ModelArgs(BaseModelArgs):
            if not all(key in self.rope_scaling for key in required_keys):
                raise ValueError(f"rope_scaling must contain keys {required_keys}")

-            if self.rope_scaling["type"] not in ["su", "linear"]:
+            if self.rope_scaling["type"] not in ["longrope", "su", "linear"]:
                print(
-                    "[WARNING] rope_scaling 'type' currently only supports 'linear' and 'su'; setting rope scaling to false."
+                    "[WARNING] rope_scaling 'type' currently only supports 'linear', 'su', and 'longrope'; setting rope scaling to false."
                )
                self.rope_scaling = None

@@ -57,19 +59,17 @@ class Attention(nn.Module):
        self.qkv_proj = nn.Linear(dim, op_size, bias=False)
        self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=False)

-        rope_scale = 1.0
-        if args.rope_scaling and args.rope_scaling["type"] == "su":
+        if args.rope_scaling and args.rope_scaling["type"] in ["longrope", "su"]:
            self.rope = SuScaledRotaryEmbedding(
                head_dim,
-                traditional=False,
                base=args.rope_theta,
-                scale=rope_scale,
                max_position_embeddings=args.max_position_embeddings,
                original_max_position_embeddings=args.original_max_position_embeddings,
                short_factor=args.rope_scaling["short_factor"],
                long_factor=args.rope_scaling["long_factor"],
            )
        else:
+            rope_scale = 1.0
            if args.rope_scaling and args.rope_scaling["type"] == "linear":
                assert isinstance(args.rope_scaling["factor"], float)
                rope_scale = 1 / args.rope_scaling["factor"]
@@ -84,7 +84,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -107,8 +107,8 @@ class Attention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.o_proj(output)
@@ -143,7 +143,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.self_attn(self.input_layernorm(x), mask, cache)
        h = x + r
@@ -172,10 +172,7 @@ class Phi3Model(nn.Module):
    ):
        h = self.embed_tokens(inputs)

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -205,11 +202,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/phi3small.py
+++ b/llms/mlx_lm/models/phi3small.py
@@ -1,12 +1,14 @@
+# Copyright © 2023-2024 Apple Inc.
+
 import math
 from dataclasses import dataclass
 from functools import partial
-from typing import Dict, Optional, Tuple, Union
+from typing import Any, Optional

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs, KVCache
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -20,14 +22,14 @@ class ModelArgs(BaseModelArgs):
    num_attention_heads: int
    layer_norm_epsilon: float
    vocab_size: int
-    num_key_value_heads: Optional[int] = None
+    num_key_value_heads: int
    mup_attn_multiplier: float = 1.0
    mup_use_scaling: bool = True
    mup_embedding_multiplier: float = 10.0
    mup_width_multiplier: float = 8.0
    rope_embedding_base: float = 1000000
    rope_position_scale: float = 1.0
-    blocksparse_block_size: Tuple[int] = (64,)
+    blocksparse_block_size: int = 64
    blocksparse_num_local_blocks: int = 16
    blocksparse_vert_stride: int = 8

@@ -59,7 +61,6 @@ class Attention(nn.Module):

        dim = args.hidden_size
        self.n_heads = n_heads = args.num_attention_heads
-        assert args.num_key_value_heads is not None
        self.n_kv_heads = n_kv_heads = args.num_key_value_heads
        self.n_q_per_kv = n_heads // n_kv_heads

@@ -159,7 +160,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -187,8 +188,8 @@ class Attention(nn.Module):
                queries, keys, values, scale=self.scale, mask=mask
            )
        else:
-            output = mx.fast.scaled_dot_product_attention(
-                queries, keys, values, scale=self.scale, mask=mask
+            output = scaled_dot_product_attention(
+                queries, keys, values, cache=cache, scale=self.scale, mask=mask
            )
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.dense(output)
@@ -228,7 +229,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.self_attn(self.input_layernorm(x), mask, cache)
        h = x + r
@@ -263,10 +264,7 @@ class Phi3Model(nn.Module):
        if self.mup_embedding_multiplier:
            h = self.mup_embedding_multiplier * h

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -305,16 +303,8 @@ class Model(nn.Module):
    def layers(self):
        return self.model.layers

-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
    def sanitize(self, weights):
        # Remove unused precomputed rotary freqs
        return {
            k: v for k, v in weights.items() if "self_attn.rotary_emb.inv_freq" not in k
        }
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/phimoe.py
+++ b/llms/mlx_lm/models/phimoe.py
@@ -0,0 +1,211 @@
+# Copyright © 2024 Apple Inc.
+import math
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Union
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention
+from .su_rope import SuScaledRotaryEmbedding
+from .switch_layers import SwitchGLU
+
+
+@dataclass
+class ModelArgs(BaseModelArgs):
+    """Configuration fields (with defaults) used to build the "phimoe" model."""
+
+    model_type: str = "phimoe"
+    vocab_size: int = 32064
+    hidden_size: int = 4096
+    intermediate_size: int = 6400
+    num_hidden_layers: int = 32
+    num_attention_heads: int = 32
+    num_key_value_heads: int = 8
+    max_position_embeddings: int = 131072
+    original_max_position_embeddings: int = 4096
+    # Passed as `eps` to the nn.LayerNorm layers of this model.
+    rms_norm_eps: float = 1e-6
+    # Defaults to None, but Attention indexes it for "short_factor",
+    # "long_factor", "short_mscale" and "long_mscale", so it must be provided
+    # before the model is built.
+    rope_scaling: Optional[Dict[str, Union[float, List[float]]]] = None
+    num_local_experts: int = 16
+    num_experts_per_tok: int = 2
+    rope_theta: float = 10000.0
+
+
+class Attention(nn.Module):
+    """Self-attention with separate query and key/value head counts.
+
+    Queries and keys are rotated with a `SuScaledRotaryEmbedding` configured
+    from `args.rope_scaling` before attention is computed.
+    """
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+
+        hidden = args.hidden_size
+        n_heads = args.num_attention_heads
+        n_kv_heads = args.num_key_value_heads
+        self.n_heads = n_heads
+        self.n_kv_heads = n_kv_heads
+
+        head_dim = args.hidden_size // n_heads
+        self.scale = head_dim**-0.5
+
+        q_width = n_heads * head_dim
+        kv_width = n_kv_heads * head_dim
+        self.q_proj = nn.Linear(hidden, q_width, bias=True)
+        self.k_proj = nn.Linear(hidden, kv_width, bias=True)
+        self.v_proj = nn.Linear(hidden, kv_width, bias=True)
+        self.o_proj = nn.Linear(q_width, hidden, bias=True)
+
+        # `rope_scaling` must be a mapping here; its four entries are required.
+        scaling = args.rope_scaling
+        self.rope = SuScaledRotaryEmbedding(
+            head_dim,
+            base=args.rope_theta,
+            max_position_embeddings=args.max_position_embeddings,
+            original_max_position_embeddings=args.original_max_position_embeddings,
+            short_factor=scaling["short_factor"],
+            long_factor=scaling["long_factor"],
+            short_mscale=scaling["short_mscale"],
+            long_mscale=scaling["long_mscale"],
+        )
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache=None,
+    ) -> mx.array:
+        """Attend over `x` (unpacked as `B, L, D`) and project the result.
+
+        When `cache` is given, its `offset` positions the rotary embedding and
+        `update_and_fetch` supplies the keys/values that are attended over.
+        """
+        B, L, _ = x.shape
+
+        def split_heads(t, num_heads):
+            # (B, L, num_heads * head_dim) -> (B, num_heads, L, head_dim)
+            return t.reshape(B, L, num_heads, -1).transpose(0, 2, 1, 3)
+
+        queries = split_heads(self.q_proj(x), self.n_heads)
+        keys = split_heads(self.k_proj(x), self.n_kv_heads)
+        values = split_heads(self.v_proj(x), self.n_kv_heads)
+
+        if cache is None:
+            queries = self.rope(queries)
+            keys = self.rope(keys)
+        else:
+            queries = self.rope(queries, offset=cache.offset)
+            keys = self.rope(keys, offset=cache.offset)
+            keys, values = cache.update_and_fetch(keys, values)
+
+        attended = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
+        )
+        # (B, n_heads, L, head_dim) -> (B, L, n_heads * head_dim)
+        merged = attended.transpose(0, 2, 1, 3).reshape(B, L, -1)
+        return self.o_proj(merged)
+
+
+class PhiMoESparseMoeBlock(nn.Module):
+    """Mixture-of-experts feed-forward block with top-k routing.
+
+    A bias-free linear gate scores every expert; the `top_k` highest scoring
+    experts are evaluated through `SwitchGLU` and mixed with softmax weights
+    computed over the selected scores only.
+    """
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.hidden_dim = args.hidden_size
+        self.ffn_dim = args.intermediate_size
+        self.num_experts = args.num_local_experts
+        self.top_k = args.num_experts_per_tok
+
+        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
+        self.switch_mlp = SwitchGLU(self.hidden_dim, self.ffn_dim, self.num_experts)
+
+    def __call__(self, x: mx.array) -> mx.array:
+        """Route `x` through its top-k experts and return the weighted sum."""
+        router_logits = self.gate(x)
+
+        top_k = self.top_k
+        # Partitioning the negated logits puts the k largest in the first k slots.
+        partitioned = mx.argpartition(-router_logits, kth=top_k - 1, axis=-1)
+        expert_ids = mx.stop_gradient(partitioned[..., :top_k])
+
+        selected = mx.take_along_axis(router_logits, expert_ids, axis=-1)
+        mix_weights = mx.softmax(selected, axis=-1, precise=True)
+
+        expert_out = self.switch_mlp(x, expert_ids)
+        return (expert_out * mix_weights[..., None]).sum(axis=-2)
+
+
+class PhiMoEDecoderLayer(nn.Module):
+    """One decoder layer: pre-norm attention followed by a pre-norm MoE block.
+
+    Both sub-layers are wrapped in residual connections.
+    """
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.hidden_size = args.hidden_size
+
+        self.self_attn = Attention(args)
+        self.block_sparse_moe = PhiMoESparseMoeBlock(args)
+        self.input_layernorm = nn.LayerNorm(args.hidden_size, eps=args.rms_norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(
+            args.hidden_size, eps=args.rms_norm_eps
+        )
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache=None,
+    ) -> mx.array:
+        """Apply the attention and MoE sub-layers to `x`, each with a residual."""
+        attn_out = self.self_attn(self.input_layernorm(x), mask=mask, cache=cache)
+        h = x + attn_out
+
+        moe_out = self.block_sparse_moe(self.post_attention_layernorm(h))
+        return h + moe_out
+
+
+class PhiMoEModel(nn.Module):
+    """Token embedding, a stack of `PhiMoEDecoderLayer`s and a final LayerNorm."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.args = args
+        self.vocab_size = args.vocab_size
+
+        self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
+        self.layers = [PhiMoEDecoderLayer(args) for _ in range(args.num_hidden_layers)]
+        self.norm = nn.LayerNorm(args.hidden_size, eps=args.rms_norm_eps)
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        cache=None,
+    ) -> mx.array:
+        """Embed `inputs`, run every layer and return the normalized hidden states.
+
+        `cache`, when given, is iterated in lockstep with the layers (one entry
+        per layer); when omitted every layer receives `None`.
+        """
+        h = self.embed_tokens(inputs)
+
+        # The mask is built from the caller's cache argument (possibly None).
+        mask = create_attention_mask(h, cache)
+
+        layer_caches = cache if cache is not None else [None] * len(self.layers)
+        for layer, layer_cache in zip(self.layers, layer_caches):
+            h = layer(h, mask, layer_cache)
+
+        return self.norm(h)
+
+
+class Model(nn.Module):
+    """PhiMoE language model: `PhiMoEModel` backbone plus a biased linear LM head."""
+
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.model_type = args.model_type
+        self.args = args
+        self.model = PhiMoEModel(args)
+        self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=True)
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        cache=None,
+    ):
+        """Return the LM head output for `inputs`."""
+        hidden = self.model(inputs, cache)
+        return self.lm_head(hidden)
+
+    def sanitize(self, weights):
+        """Merge per-expert weights into stacked `switch_mlp` entries.
+
+        If `weights` has no per-expert key for layer 0 / expert 0 / `w1`, it is
+        returned untouched. Otherwise, for every layer and every projection
+        (`w1` -> `gate_proj`, `w2` -> `down_proj`, `w3` -> `up_proj`) the
+        "weight", "scales" and "biases" entries of all experts are popped from
+        `weights` and replaced by a single `mx.stack` of them. `weights` is
+        modified in place and also returned.
+        """
+        if "model.layers.0.block_sparse_moe.experts.0.w1.weight" not in weights:
+            return weights
+
+        projections = (("w1", "gate_proj"), ("w2", "down_proj"), ("w3", "up_proj"))
+        suffixes = ("weight", "scales", "biases")
+        num_experts = self.args.num_local_experts
+
+        for layer_idx in range(self.args.num_hidden_layers):
+            prefix = f"model.layers.{layer_idx}"
+            for src, dst in projections:
+                for suffix in suffixes:
+                    # Expert 0 is the probe for whether this entry exists.
+                    probe = f"{prefix}.block_sparse_moe.experts.0.{src}.{suffix}"
+                    if probe not in weights:
+                        continue
+                    per_expert = [
+                        weights.pop(
+                            f"{prefix}.block_sparse_moe.experts.{e}.{src}.{suffix}"
+                        )
+                        for e in range(num_experts)
+                    ]
+                    weights[f"{prefix}.block_sparse_moe.switch_mlp.{dst}.{suffix}"] = (
+                        mx.stack(per_expert)
+                    )
+
+        return weights
+
+    @property
+    def layers(self):
+        """The decoder layers of the wrapped `PhiMoEModel`."""
+        return self.model.layers
--- a/llms/mlx_lm/models/phixtral.py
+++ b/llms/mlx_lm/models/phixtral.py
@@ -1,3 +1,5 @@
+# Copyright © 2023-2024 Apple Inc.
+
 import inspect
 import math
 from dataclasses import dataclass
@@ -6,6 +8,7 @@ from typing import Tuple
 import mlx.core as mx
 import mlx.nn as nn

+from .base import create_attention_mask, scaled_dot_product_attention
 from .switch_layers import SwitchMLP


@@ -68,8 +71,13 @@ class RoPEAttention(nn.Module):
        # Finally perform the attention computation
        scale = math.sqrt(1 / queries.shape[-1])

-        output = mx.fast.scaled_dot_product_attention(
-            queries.astype(mx.float32), keys, values, scale=scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries.astype(mx.float32),
+            keys,
+            values,
+            cache=cache,
+            scale=scale,
+            mask=mask,
        ).astype(values.dtype)
        output = output.moveaxis(2, 1).reshape(B, L, -1)

@@ -165,12 +173,9 @@ class Model(nn.Module):
        self,
        x: mx.array,
        mask: mx.array = None,
-        cache: mx.array = None,
-    ) -> Tuple[mx.array, mx.array]:
-        mask = None
-        if x.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(x.shape[1])
-            mask = mask.astype(x.dtype)
+        cache=None,
+    ) -> mx.array:
+        mask = create_attention_mask(x, cache)

        y = self.transformer(x, mask, cache)
        return self.lm_head(y)
@@ -193,11 +198,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.transformer.h
-
-    @property
-    def head_dim(self):
-        return self.args.model_dim // self.args.num_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_heads
--- a/llms/mlx_lm/models/plamo.py
+++ b/llms/mlx_lm/models/plamo.py
@@ -1,11 +1,13 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Any, List, Optional, Tuple, Union
+from typing import Any, Optional

 import mlx.core as mx
 import mlx.nn as nn
 import numpy as np

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -60,8 +62,8 @@ class Attention(nn.Module):
        self,
        hidden_states: mx.array,
        attention_mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
-    ) -> Tuple[mx.array, Tuple[mx.array, mx.array]]:
+        cache: Optional[Any] = None,
+    ) -> mx.array:
        bsz, q_len, _ = hidden_states.shape

        queries = self.q_proj(hidden_states)
@@ -87,10 +89,14 @@ class Attention(nn.Module):
            queries = self.rotary_emb(queries)
            keys = self.rotary_emb(keys)

-        output = mx.fast.scaled_dot_product_attention(
+        keys = mx.tile(keys, [1, self.config.n_shared_head, 1, 1])
+        values = mx.tile(values, [1, self.config.n_shared_head, 1, 1])
+
+        output = scaled_dot_product_attention(
            queries,
            keys,
            values,
+            cache=cache,
            scale=self.scale,
            mask=attention_mask,
        )
@@ -125,8 +131,8 @@ class PlamoDecoderLayer(nn.Module):
        self,
        hidden_states: mx.array,
        attention_mask: Optional[mx.array] = None,
-        cache: Optional[Tuple[mx.array, mx.array]] = None,
-    ) -> Tuple[Any, ...]:
+        cache: Optional[Any] = None,
+    ):
        # from LlamaDecoder
        residual = hidden_states

@@ -167,14 +173,11 @@ class PlamoModel(nn.Module):
    def __call__(
        self,
        inputs: mx.array,
-        cache: Optional[List[Union[Tuple[mx.array, mx.array], None]]] = None,
-    ) -> Tuple[mx.array, Optional[List[Union[Tuple[mx.array, mx.array], None]]]]:
+        cache: Optional[Any] = None,
+    ) -> mx.array:
        h = self.embed_tokens(inputs)

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(self.embed_tokens.weight.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None for _ in range(len(self.layers.layers))]
@@ -198,19 +201,11 @@ class Model(nn.Module):
    def __call__(
        self,
        inputs: mx.array,
-        cache: Optional[List[Tuple[mx.array, mx.array]]] = None,
-    ) -> Tuple[mx.array, mx.array]:
+        cache: Optional[Any] = None,
+    ) -> mx.array:
        out = self.model(inputs, cache)
        return self.lm_head(out)

    @property
    def layers(self):
        return self.model.layers.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_attention_heads // self.args.n_shared_head
--- a/llms/mlx_lm/models/qwen.py
+++ b/llms/mlx_lm/models/qwen.py
@@ -1,10 +1,11 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Tuple

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -63,8 +64,8 @@ class Attention(nn.Module):
            queries = self.rotary_emb(queries)
            keys = self.rotary_emb(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)

@@ -122,11 +123,7 @@ class QwenModel(nn.Module):
    def __call__(self, inputs, mask=None, cache=None):
        x = self.wte(inputs)

-        mask = None
-        T = x.shape[1]
-        if T > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(T)
-            mask = mask.astype(x.dtype)
+        mask = create_attention_mask(x, cache)

        if cache is None:
            cache = [None] * len(self.h)
@@ -151,19 +148,11 @@ class Model(nn.Module):
        self,
        x: mx.array,
        mask: mx.array = None,
-        cache: mx.array = None,
-    ) -> Tuple[mx.array, mx.array]:
+        cache=None,
+    ) -> mx.array:
        y = self.transformer(x, mask, cache)
        return self.lm_head(y)

    @property
    def layers(self):
        return self.transformer.h
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_attention_heads
--- a/llms/mlx_lm/models/qwen2.py
+++ b/llms/mlx_lm/models/qwen2.py
@@ -1,10 +1,12 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Union

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs, KVCache
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -68,7 +70,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -87,8 +89,8 @@ class Attention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.o_proj(output)
@@ -122,7 +124,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.self_attn(self.input_layernorm(x), mask, cache)
        h = x + r
@@ -151,10 +153,7 @@ class Qwen2Model(nn.Module):
    ):
        h = self.embed_tokens(inputs)

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -197,11 +196,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/qwen2_moe.py
+++ b/llms/mlx_lm/models/qwen2_moe.py
@@ -1,11 +1,13 @@
+# Copyright © 2023-2024 Apple Inc.
+
 import math
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Union

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs, KVCache
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention
 from .switch_layers import SwitchGLU


@@ -68,7 +70,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -87,8 +89,8 @@ class Attention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.o_proj(output)
@@ -160,7 +162,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.self_attn(self.input_layernorm(x), mask, cache)
        h = x + r
@@ -189,10 +191,7 @@ class Qwen2MoeModel(nn.Module):
    ):
        h = self.embed_tokens(inputs)

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -237,11 +236,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/recurrent_gemma.py
+++ b/llms/mlx_lm/models/recurrent_gemma.py
@@ -0,0 +1,456 @@
+# Copyright © 2023-2024 Apple Inc.
+
+import math
+from dataclasses import dataclass
+from typing import List, Literal, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention
+from .cache import MambaCache, RotatingKVCache
+
+
+@dataclass
+class ModelArgs(BaseModelArgs):
+    """Configuration fields used to build the recurrent Gemma (Griffin) model."""
+
+    model_type: str
+    attention_bias: bool
+    conv1d_width: int
+    hidden_size: int
+    intermediate_size: int
+    # Falsy values disable the tanh soft-capping applied to the logits in Model.
+    logits_soft_cap: float
+    num_attention_heads: int
+    num_hidden_layers: int
+    num_key_value_heads: int
+    rms_norm_eps: float
+    rope_theta: float
+    # Used as the RotatingKVCache max_size for attention layers in make_cache.
+    attention_window_size: int
+    vocab_size: int
+    embeddings_scale_by_sqrt_dim: bool = True
+    # Pattern of temporal block types; Griffin cycles through it per layer.
+    block_types: Optional[List[str]] = None
+    # Alternate name for `block_types`, used as a fallback in __post_init__.
+    _block_types: Optional[List[str]] = None
+
+    def __post_init__(self):
+        """Fall back to `_block_types` when `block_types` was not provided."""
+        # For some reason these have different names in 2B and 9B
+        if self.block_types is None:
+            self.block_types = self._block_types
+
+
+class RMSNorm(nn.Module):
+    """RMS normalization whose effective scale is ``1 + weight``."""
+
+    def __init__(self, dims: int, eps: float = 1e-5):
+        super().__init__()
+        self.weight = mx.ones((dims,))
+        self.eps = eps
+
+    def __call__(self, x):
+        """Normalize `x` with `mx.fast.rms_norm` using the shifted weight."""
+        shifted_weight = 1.0 + self.weight
+        return mx.fast.rms_norm(x, shifted_weight, self.eps)
+
+
+def rnn_scan(x, a, h0):
+    """Run the linear recurrence ``h_t = a_t * h_{t-1} + x_t`` along axis 1.
+
+    Args:
+      x: 3-D input array; axis 1 is the axis that is scanned.
+      a: Recurrence coefficients with the same dtype as `x` and a shape equal
+        to the trailing dimensions of `x`.
+      h0: Optional initial state. When `None`, the multi-step path starts from
+        zeros and the single-step path returns `x` unchanged.
+
+    Returns:
+      A tuple `(y, h_last)` with the per-step states and the final state.
+    """
+    assert x.ndim == 3
+    assert a.shape == x.shape[-a.ndim :]
+    assert a.dtype == x.dtype
+
+    num_steps = x.shape[1]
+
+    if num_steps == 1:
+        # Using scan in sampling mode.
+        if h0 is None:
+            return x, x[:, 0]
+        y = a * h0[:, None] + x
+        return y, y[:, -1]
+
+    # Using scan in linear mode.
+    if h0 is None:
+        batch, _, dim = x.shape
+        h_t = mx.zeros((batch, dim), dtype=x.dtype)
+    else:
+        h_t = h0
+
+    y = mx.zeros_like(x)
+    for t in range(num_steps):
+        h_t = a[:, t] * h_t + x[:, t]
+        y[:, t] = h_t
+
+    return y, h_t
+
+
+class Conv1d(nn.Module):
+    """Grouped 1-D convolution along axis 1 that also returns its trailing input.
+
+    The weight has shape `(channels, kernel_size, 1)` and its first dimension
+    is used as the number of groups.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        kernel_size: int,
+    ):
+        super().__init__()
+        self.weight = mx.zeros((channels, kernel_size, 1))
+        self.bias = mx.zeros((channels,))
+
+    def __call__(self, x, cache=None):
+        """Convolve `x` and return `(y, state)`.
+
+        `cache`, when given, is concatenated in front of `x` along axis 1;
+        otherwise `x` is left-padded with `K - 1` positions. `state` is the
+        `[-K + 1:]` slice (axis 1) of that extended input.
+        """
+        # Unpacking three values also enforces a 3-D input.
+        _batch, _length, _channels = x.shape
+        num_groups, kernel, _ = self.weight.shape
+
+        if cache is None:
+            extended = mx.pad(x, [(0, 0), (kernel - 1, 0), (0, 0)])
+        else:
+            extended = mx.concatenate([cache, x], axis=1)
+
+        out = mx.conv_general(extended, self.weight, groups=num_groups)
+        out = out + self.bias
+
+        return out, extended[:, -kernel + 1 :, :]
+
+
+class RGLRU(nn.Module):
+    """A Real-Gated Linear Recurrent Unit (RG-LRU) layer."""
+
+    def __init__(
+        self,
+        width: int,
+        num_heads: int,
+    ):
+        super().__init__()
+        self.width = width
+        self.num_heads = num_heads
+        self.head_dim = self.width // self.num_heads
+
+        gate_weight_shape = (self.num_heads, self.head_dim, self.head_dim)
+        gate_bias_shape = (self.num_heads, self.head_dim)
+
+        self.recurrent_param = mx.zeros((self.width,))
+
+        self.input_gate_weight = mx.zeros(gate_weight_shape)
+        self.input_gate_bias = mx.zeros(gate_bias_shape)
+
+        self.recurrent_gate_weight = mx.zeros(gate_weight_shape)
+        self.recurrent_gate_bias = mx.zeros(gate_bias_shape)
+
+    def __call__(
+        self,
+        x: mx.array,
+        cache=None,
+    ):
+        """Apply the gated recurrence to `x` and return `(y, last_state)`.
+
+        `cache` is forwarded to `rnn_scan` as the initial state `h0`.
+        """
+        B, L, _ = x.shape
+
+        def block_gate(inp, weight, bias):
+            # Per-head linear map followed by a sigmoid, flattened back to width.
+            per_head = inp.reshape((B, L, self.num_heads, self.head_dim))
+            mapped = (per_head.swapaxes(1, 2) @ weight).swapaxes(1, 2) + bias
+            return mx.sigmoid(mapped.flatten(2, 3))
+
+        # Gates for x and a.
+        input_gate = block_gate(x, self.input_gate_weight, self.input_gate_bias)
+        recurrent_gate = block_gate(
+            x, self.recurrent_gate_weight, self.recurrent_gate_bias
+        )
+
+        # Compute the parameter `A` of the recurrence.
+        log_a = -8.0 * recurrent_gate * nn.softplus(self.recurrent_param)
+        a = mx.exp(log_a)
+        a_square = mx.exp(2 * log_a)
+
+        # Apply gamma normalization to the gated input.
+        multiplier = mx.sqrt(1 - a_square)
+        if cache is None:
+            # No incoming state: the first position is not rescaled.
+            multiplier[:, 0, :] = 1.0
+        normalized_x = (x * input_gate) * multiplier.astype(x.dtype)
+
+        y, last_h = rnn_scan(x=normalized_x, a=a, h0=cache)
+        return y, last_h
+
+
+class RecurrentBlock(nn.Module):
+    """Temporal block combining a GELU-gated branch with a conv + RG-LRU branch."""
+
+    def __init__(
+        self,
+        width: int,
+        num_heads: int,
+        lru_width: int = None,
+        conv1d_temporal_width: int = 4,
+    ):
+        super().__init__()
+        self.width = width
+        self.num_heads = num_heads
+        # A falsy `lru_width` means "use the block width".
+        self.lru_width = lru_width or width
+        self.conv1d_temporal_width = conv1d_temporal_width
+
+        self.linear_y = nn.Linear(width, self.lru_width)
+        self.linear_x = nn.Linear(width, self.lru_width)
+        self.linear_out = nn.Linear(self.lru_width, width)
+        self.conv_1d = Conv1d(
+            channels=self.lru_width,
+            kernel_size=self.conv1d_temporal_width,
+        )
+        self.rg_lru = RGLRU(
+            width=self.lru_width,
+            num_heads=self.num_heads,
+        )
+
+    def __call__(
+        self,
+        x: mx.array,
+        cache=None,
+        mask=None,
+    ):
+        """Run both branches on `x` and project their product back to `width`.
+
+        `cache`, when given, is indexed at 0 (conv state) and 1 (RG-LRU state)
+        and both slots are overwritten in place. `mask` is accepted but unused.
+        """
+        # y branch.
+        gate_branch = nn.gelu_approx(self.linear_y(x))
+
+        # x branch.
+        state = [None, None] if cache is None else cache
+        h = self.linear_x(x)
+        h, state[0] = self.conv_1d(x=h, cache=state[0])
+        h, state[1] = self.rg_lru(x=h, cache=state[1])
+
+        return self.linear_out(h * gate_branch)
+
+
+class LocalAttentionBlock(nn.Module):
+    """Attention block with `num_heads` query heads and a single key/value head.
+
+    `window_size` is stored on the module; this class does not otherwise use it.
+    """
+
+    def __init__(
+        self,
+        width: int,
+        num_heads: int,
+        window_size: int,
+    ):
+        super().__init__()
+        self.width = width
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.scale = (width // num_heads) ** (-0.5)
+
+        self.head_dim = self.width // self.num_heads
+        # Keys and values project to one head's width.
+        self.q_proj = nn.Linear(self.width, self.width, bias=False)
+        self.k_proj = nn.Linear(self.width, self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.width, self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.width, self.width, bias=True)
+        # RoPE is configured with half of the head dimension.
+        self.rope = nn.RoPE(
+            self.head_dim // 2,
+            traditional=False,
+        )
+
+    def __call__(
+        self,
+        x: mx.array,
+        cache=None,
+        mask=None,
+    ):
+        """Attend over `x` (unpacked as `B, L, D`) and project the result.
+
+        When `cache` is given, its `offset` positions the rotary embedding and
+        `update_and_fetch` supplies the keys/values that are attended over.
+        """
+        B, L, _ = x.shape
+
+        def split_heads(t, n_heads):
+            # (B, L, n_heads * head_dim) -> (B, n_heads, L, head_dim)
+            return t.reshape(B, L, n_heads, -1).transpose(0, 2, 1, 3)
+
+        queries = split_heads(self.q_proj(x), self.num_heads)
+        keys = split_heads(self.k_proj(x), 1)
+        values = split_heads(self.v_proj(x), 1)
+
+        if cache is None:
+            queries = self.rope(queries)
+            keys = self.rope(keys)
+        else:
+            queries = self.rope(queries, offset=cache.offset)
+            keys = self.rope(keys, offset=cache.offset)
+            keys, values = cache.update_and_fetch(keys, values)
+
+        attended = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
+        )
+        merged = attended.transpose(0, 2, 1, 3).reshape(B, L, -1)
+        return self.o_proj(merged)
+
+
+class MLPBlock(nn.Module):
+    """Gated MLP whose inner width is half of `expanded_width`."""
+
+    def __init__(self, width: int, expanded_width: int):
+        super().__init__()
+        inner_width = expanded_width // 2
+        self.up_proj = nn.Linear(width, inner_width)
+        self.gate_proj = nn.Linear(width, inner_width)
+        self.down_proj = nn.Linear(inner_width, width)
+
+    def __call__(self, x: mx.array):
+        """Return `down_proj(gelu_approx(gate_proj(x)) * up_proj(x))`."""
+        gate = self.gate_proj(x)
+        up = self.up_proj(x)
+        return self.down_proj(nn.gelu_approx(gate) * up)
+
+
+class ResidualBlock(nn.Module):
+    """Pre-norm temporal block and pre-norm MLP, each with a residual connection."""
+
+    def __init__(
+        self,
+        width: int,
+        mlp_expanded_width: int,
+        num_heads: int,
+        attention_window_size: int,
+        temporal_block_type: str,
+        lru_width: Optional[int] = None,
+        conv1d_temporal_width: int = 4,
+    ):
+        """Initializes the residual block.
+
+        Args:
+          width: The width of the block.
+          mlp_expanded_width: The width of the expansion inside the MLP block.
+          num_heads: The number of heads for the Attention or the RG-LRU.
+          attention_window_size: The window size for the local attention block.
+          temporal_block_type: "recurrent" selects a `RecurrentBlock`; any
+            other value selects a `LocalAttentionBlock`.
+          lru_width: The width of the RG-LRU if different from `width`.
+          conv1d_temporal_width: The width of the temporal convolution.
+        """
+        super().__init__()
+        self.width = width
+        self.mlp_expanded_width = mlp_expanded_width
+        self.num_heads = num_heads
+        self.attention_window_size = attention_window_size
+        self.temporal_block_type = temporal_block_type
+        self.lru_width = lru_width
+        self.conv1d_temporal_width = conv1d_temporal_width
+
+        self.temporal_pre_norm = RMSNorm(width)
+        is_recurrent = self.temporal_block_type == "recurrent"
+        if not is_recurrent:
+            self.temporal_block = LocalAttentionBlock(
+                width=self.width,
+                num_heads=self.num_heads,
+                window_size=self.attention_window_size,
+            )
+        else:
+            self.temporal_block = RecurrentBlock(
+                width=self.width,
+                num_heads=self.num_heads,
+                lru_width=self.lru_width,
+                conv1d_temporal_width=self.conv1d_temporal_width,
+            )
+
+        self.channel_pre_norm = RMSNorm(width)
+        self.mlp_block = MLPBlock(
+            width=self.width,
+            expanded_width=self.mlp_expanded_width,
+        )
+
+    def __call__(
+        self,
+        x: mx.array,
+        cache=None,
+        mask=None,
+    ):
+        """Apply the temporal block and then the MLP block to `x`.
+
+        `cache` and `mask` are forwarded unchanged to the temporal block.
+        """
+        temporal_out = self.temporal_block(
+            self.temporal_pre_norm(x), cache=cache, mask=mask
+        )
+        residual = temporal_out + x
+
+        mlp_out = self.mlp_block(self.channel_pre_norm(residual))
+        return mlp_out + residual
+
+
+class Griffin(nn.Module):
+    """Token embedding, a stack of `ResidualBlock`s and a final RMSNorm.
+
+    The temporal block type of layer `i` is taken from `config.block_types`,
+    cycling through the list when there are more layers than entries.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+        self.embed_tokens = nn.Embedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+
+        self.scale_by_sqrt_dim = config.embeddings_scale_by_sqrt_dim
+        block_types = config.block_types
+
+        self.layers = [
+            ResidualBlock(
+                width=config.hidden_size,
+                mlp_expanded_width=config.intermediate_size,
+                num_heads=config.num_attention_heads,
+                attention_window_size=config.attention_window_size,
+                temporal_block_type=block_types[i % len(block_types)],
+                lru_width=None,
+            )
+            for i in range(config.num_hidden_layers)
+        ]
+        self.final_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def __call__(
+        self,
+        tokens,
+        cache=None,
+    ):
+        """Embed `tokens`, run every block and return the normalized output.
+
+        Args:
+          tokens: Input token ids passed to the embedding layer.
+          cache: Optional per-layer cache, indexed by layer position. When
+            omitted, every layer receives `None`.
+        """
+        x = self.embed_tokens(tokens)
+        if self.scale_by_sqrt_dim:
+            x = x * math.sqrt(x.shape[-1])
+
+        if cache is None:
+            cache = [None] * len(self.layers)
+
+        # The attention mask is derived from the cache of an attention layer
+        # (the last one found). Start from None so a model made only of
+        # recurrent blocks does not fail on an unbound name.
+        mask_cache = None
+        for i, block in enumerate(self.layers):
+            if block.temporal_block_type != "recurrent":
+                mask_cache = [cache[i]]
+
+        mask = create_attention_mask(x, mask_cache)
+
+        for i, block in enumerate(self.layers):
+            x = block(x, mask=mask, cache=cache[i])
+
+        return self.final_norm(x)
+
+
+class Model(nn.Module):
+    """Recurrent Gemma language model: `Griffin` backbone plus an output head.
+
+    The head is `lm_head` when present; otherwise the embedding matrix is
+    reused through `embed_tokens.as_linear`.
+    """
+
+    def __init__(self, config):
+        # Initialize the nn.Module base state before assigning sub-modules,
+        # as every other module in this file does.
+        super().__init__()
+        self.args = config
+        self.model = Griffin(config)
+        self.model_type = config.model_type
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+    def __call__(self, tokens: mx.array, cache=None) -> mx.array:
+        """
+        Args:
+          tokens: Sequence of input tokens.
+          cache: Optional per-layer cache forwarded to the backbone.
+
+        Returns:
+          The logits, passed through `tanh(logits / c) * c` when
+          `logits_soft_cap` (`c`) is truthy.
+        """
+        logits = self.model(tokens, cache=cache)
+        # `sanitize` removes `lm_head` when the checkpoint has no weight for it.
+        if "lm_head" in self:
+            logits = self.lm_head(logits)
+        else:
+            logits = self.model.embed_tokens.as_linear(logits)
+
+        c = self.args.logits_soft_cap
+        if c:
+            logits = mx.tanh(logits / c) * c
+        return logits
+
+    @property
+    def layers(self):
+        """The residual blocks of the wrapped `Griffin` model."""
+        return self.model.layers
+
+    def sanitize(self, weights):
+        """Adapt checkpoint weights to this implementation.
+
+        Any "conv_1d.weight" entry whose last dimension is not 1 has its axis 2
+        moved to position 1. If the checkpoint has no "lm_head.weight", the
+        `lm_head` module is removed from this model so the embedding is used
+        as the output head instead. `weights` is modified in place and returned.
+        """
+        # Only existing keys are reassigned, so the dict size is unchanged
+        # while it is being iterated.
+        for k, v in weights.items():
+            if "conv_1d.weight" in k and v.shape[-1] != 1:
+                weights[k] = v.moveaxis(2, 1)
+        if "lm_head.weight" not in weights:
+            self.pop("lm_head")
+        return weights
+
+    def make_cache(self):
+        """Build one cache per layer.
+
+        Recurrent layers get a `MambaCache`; all other layers get a
+        `RotatingKVCache` limited to `attention_window_size`.
+        """
+        cache = []
+        for layer in self.layers:
+            if layer.temporal_block_type == "recurrent":
+                cache.append(MambaCache())
+            else:
+                cache.append(RotatingKVCache(max_size=self.args.attention_window_size))
+        return cache
--- a/llms/mlx_lm/models/stablelm.py
+++ b/llms/mlx_lm/models/stablelm.py
@@ -1,11 +1,12 @@
+# Copyright © 2023-2024 Apple Inc.
+
 import math
 from dataclasses import dataclass
-from typing import Tuple

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -119,8 +120,8 @@ class Attention(nn.Module):

        # Finally perform the attention computation
        scale = math.sqrt(1 / queries.shape[-1])
-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=scale, mask=mask
        ).astype(values.dtype)
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.o_proj(output)
@@ -196,24 +197,12 @@ class Model(nn.Module):
        self,
        x: mx.array,
        mask: mx.array = None,
-        cache: mx.array = None,
-    ) -> Tuple[mx.array, mx.array]:
-        mask = None
-        if x.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(x.shape[1])
-            mask = mask.astype(x.dtype)
-
+        cache=None,
+    ) -> mx.array:
+        mask = create_attention_mask(x, cache)
        y = self.model(x, mask, cache)
        return self.lm_head(y)

    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/starcoder2.py
+++ b/llms/mlx_lm/models/starcoder2.py
@@ -1,10 +1,12 @@
+# Copyright © 2023-2024 Apple Inc.
+
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Any, Optional

 import mlx.core as mx
 import mlx.nn as nn

-from .base import BaseModelArgs, KVCache
+from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
@@ -43,7 +45,7 @@ class Attention(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

@@ -62,8 +64,8 @@ class Attention(nn.Module):
            queries = self.rope(queries)
            keys = self.rope(keys)

-        output = mx.fast.scaled_dot_product_attention(
-            queries, keys, values, scale=self.scale, mask=mask
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )

        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
@@ -98,7 +100,7 @@ class TransformerBlock(nn.Module):
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
-        cache: Optional[KVCache] = None,
+        cache: Optional[Any] = None,
    ) -> mx.array:
        r = self.self_attn(self.input_layernorm(x), mask, cache)
        h = x + r
@@ -127,10 +129,7 @@ class Starcoder2Model(nn.Module):
    ):
        h = self.embed_tokens(inputs)

-        mask = None
-        if h.shape[1] > 1:
-            mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
-            mask = mask.astype(h.dtype)
+        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.layers)
@@ -165,11 +164,3 @@ class Model(nn.Module):
    @property
    def layers(self):
        return self.model.layers
-
-    @property
-    def head_dim(self):
-        return self.args.hidden_size // self.args.num_attention_heads
-
-    @property
-    def n_kv_heads(self):
-        return self.args.num_key_value_heads
--- a/llms/mlx_lm/models/su_rope.py
+++ b/llms/mlx_lm/models/su_rope.py
@@ -1,30 +1,30 @@
+# Copyright © 2023-2024 Apple Inc.
+
 import math
 from typing import List, Union

 import mlx.core as mx
+import mlx.nn as nn


-class SuScaledRotaryEmbedding:
+class SuScaledRotaryEmbedding(nn.Module):
    def __init__(
        self,
        dims: int,
-        traditional: bool = False,
        base: float = 10000.0,
-        scale: float = 1.0,
        max_position_embeddings: int = 131072,
        original_max_position_embeddings: int = 4096,
        short_factor: Union[List[float], float] = 1.0,
        long_factor: Union[List[float], float] = 1.0,
+        short_mscale: float = None,
+        long_mscale: float = None,
    ):
        """
        Phi3Su Scaled Rotary Embedding layer for Phi-3 models.

        Args:
            dims (int): The feature dimensions to be rotated.
-            traditional (bool, optional): Unused. Default: ``False``.
            base (int, optional): Base for the exponential scaling.
-            scale (float, optional): The scale used to scale the positions.
-              Default: ``1.0``.
            max_position_embeddings (int, optional): The maximum sequence
              length that this model was trained with. This is used to determine
              the size of the original RoPE embeddings when using long scaling.
@@ -39,41 +39,26 @@ class SuScaledRotaryEmbedding:
            long_factor (float or list[float], optional): List of scaling
              factors for sequences of length greater than
              ``original_max_position_embeddings``.  Default: ``1.0``.
+            short_mscale (float, optional): Currently unused by this layer.
+            long_mscale (float, optional): Scale the input prior to embedding.
        """
-        self.inv_freq_short = 1.0 / (
-            mx.array(short_factor, dtype=mx.float32)
-            * base ** (mx.arange(0, dims, 2, dtype=mx.float32) / dims)
-        )
-        self.inv_freq_long = 1.0 / (
-            scale
-            * mx.array(long_factor, dtype=mx.float32)
-            * base ** (mx.arange(0, dims, 2, dtype=mx.float32) / dims)
-        )
+        super().__init__()
+        freqs = base ** (mx.arange(0, dims, 2, dtype=mx.float32) / dims)
+        self._freqs = mx.array(long_factor, dtype=mx.float32) * freqs
        self.original_max_position_embeddings = original_max_position_embeddings
-        self.scaling_factor = math.sqrt(
+        self.scale = long_mscale or math.sqrt(
            1
            + math.log(max_position_embeddings / original_max_position_embeddings)
            / math.log(original_max_position_embeddings)
        )

-    def _get_cos_sin(self, offset, L):
-        position_ids = mx.arange(offset, offset + L, dtype=mx.float32)
-        inv_freq = (
-            self.inv_freq_long
-            if (offset + L) > self.original_max_position_embeddings
-            else self.inv_freq_short
-        )
-        freqs = position_ids[:, None] * inv_freq[None, :]
-        emb = mx.concatenate([freqs, freqs], axis=-1)
-        cos = mx.cos(emb) * self.scaling_factor
-        sin = mx.sin(emb) * self.scaling_factor
-        return cos, sin
-
    def __call__(self, x, offset: int = 0):
-        def _rotate_half(_x):
-            midpoint = _x.shape[-1] // 2
-            x1, x2 = _x[..., :midpoint], _x[..., midpoint:]
-            return mx.concatenate([-x2, x1], axis=-1)
-
-        cos, sin = self._get_cos_sin(offset, x.shape[2])
-        return (x * cos) + (_rotate_half(x) * sin)
+        return mx.fast.rope(
+            self.scale * x,
+            x.shape[-1],
+            traditional=False,
+            base=None,
+            scale=1.0,
+            offset=offset,
+            freqs=self._freqs,
+        )
--- a/llms/mlx_lm/models/switch_layers.py
+++ b/llms/mlx_lm/models/switch_layers.py
@@ -1,3 +1,5 @@
+# Copyright © 2023-2024 Apple Inc.
+
 import math

 import mlx.core as mx
--- a/llms/mlx_lm/requirements.txt
+++ b/llms/mlx_lm/requirements.txt
@@ -1,6 +1,6 @@
-mlx>=0.14.1
+mlx>=0.19.2
 numpy
-transformers>=4.39.3
+transformers[sentencepiece]>=4.39.3
 protobuf
 pyyaml
 jinja2
--- a/llms/mlx_lm/sample_utils.py
+++ b/llms/mlx_lm/sample_utils.py
@@ -1,6 +1,69 @@
+# Copyright © 2023-2024 Apple Inc.
+
+from functools import partial
+
 import mlx.core as mx


+@partial(mx.compile, inputs=mx.random.state, outputs=mx.random.state)
+def min_p_sampling(
+    logits: mx.array,
+    min_p: float,
+    min_tokens_to_keep: int = 1,
+    temperature=1.0,
+) -> mx.array:
+    """
+    Apply min-p sampling to the logits.
+
+    Min-p keeps all tokens that are above a minimum probability, scaled by the
+    probability of the most likely token. As a result, the filter is more
+    aggressive given a very high-probability token.
+
+    Args:
+        logits: The logits from the model's output.
+        min_p (float): Minimum token probability. Typical values are in the
+            0.01-0.2 range, comparably selective as setting `top_p` in the
+            0.99-0.8 range.
+        min_tokens_to_keep (int, optional): Minimum number of tokens that cannot
+            be filtered. Default: ``1``.
+        temperature (float, optional): Softmax temperature. Default: ``1.0``.
+    """
+    if not (0 <= min_p <= 1.0):
+        raise ValueError(
+            f"`min_p` has to be a float in the [0, 1] interval, but is {min_p}"
+        )
+    if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
+        raise ValueError(
+            f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}"
+        )
+    # reference implementation: https://github.com/huggingface/transformers/blob/main/src/transformers/generation/logits_process.py#L531-L605
+
+    # Softmax probabilities
+    probs = mx.softmax(logits * (1 / temperature), axis=-1)
+
+    # Indices sorted in decreasing order
+    sorted_indices = mx.argsort(-logits).squeeze(0)
+    sorted_probs = probs[..., sorted_indices]
+
+    # Top probability
+    top_probs = probs[..., sorted_indices[0]]
+
+    # Calculate the min_p threshold
+    scaled_min_p = min_p * top_probs
+
+    # Mask tokens that have a probability less than the scaled min_p
+    tokens_to_remove = sorted_probs < scaled_min_p
+    tokens_to_remove[..., :min_tokens_to_keep] = False
+
+    # Zero out the probability of tokens below the scaled min_p threshold
+    selected_probs = mx.where(tokens_to_remove, 0, sorted_probs)
+
+    # Return sampled token
+    sorted_token = mx.random.categorical(mx.log(selected_probs))
+    return sorted_indices[sorted_token]
+
+
+@partial(mx.compile, inputs=mx.random.state, outputs=mx.random.state)
 def top_p_sampling(logits: mx.array, top_p: float, temperature: float) -> mx.array:
    """
    Apply top-p (nucleus) sampling to logits.
@@ -13,7 +76,7 @@ def top_p_sampling(logits: mx.array, top_p: float, temperature: float) -> mx.arr
        token selected based on the top-p criterion.
    """
    # referenced implementation from https://github.com/huggingface/transformers/blob/main/src/transformers/generation/logits_process.py#L449-L460
-    probs = mx.softmax(logits / temperature, axis=-1)
+    probs = mx.softmax(logits * (1 / temperature), axis=-1)

    # sort probs in ascending order
    sorted_indices = mx.argsort(probs, axis=-1)
@@ -25,10 +88,15 @@ def top_p_sampling(logits: mx.array, top_p: float, temperature: float) -> mx.arr
    top_probs = mx.where(
        cumulative_probs > 1 - top_p,
        sorted_probs,
-        mx.zeros_like(sorted_probs),
+        0,
    )

    sorted_token = mx.random.categorical(mx.log(top_probs))
    token = sorted_indices.squeeze(0)[sorted_token]

    return token
+
+
+@partial(mx.compile, inputs=mx.random.state, outputs=mx.random.state)
+def categorical_sampling(logits, temp):
+    return mx.random.categorical(logits * (1 / temp))
--- a/llms/mlx_lm/server.py
+++ b/llms/mlx_lm/server.py
@@ -3,22 +3,38 @@
 import argparse
 import json
 import logging
+import platform
 import time
 import uuid
 import warnings
-from functools import lru_cache
+from dataclasses import dataclass, field
 from http.server import BaseHTTPRequestHandler, HTTPServer
 from pathlib import Path
-from typing import Dict, List, Literal, NamedTuple, Optional, Tuple, Union
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    NamedTuple,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+)

 import mlx.core as mx
-import mlx.nn as nn
-from transformers import PreTrainedTokenizer
+from huggingface_hub import scan_cache_dir

-from .tokenizer_utils import TokenizerWrapper
+from ._version import __version__
+from .models.cache import make_prompt_cache
 from .utils import generate_step, load


+def get_system_fingerprint():
+    gpu_arch = mx.metal.device_info()["architecture"] if mx.metal.is_available() else ""
+    return f"{__version__}-{mx.__version__}-{platform.platform()}-{gpu_arch}"
+
+
 class StopCondition(NamedTuple):
    stop_met: bool
    trim_length: int
@@ -58,6 +74,21 @@ def stopping_criteria(
    return StopCondition(stop_met=False, trim_length=0)


+def sequence_overlap(s1: Sequence, s2: Sequence) -> bool:
+    """
+    Checks if a suffix of s1 has overlap with a prefix of s2
+
+    Args:
+        s1 (Sequence): The first sequence
+        s2 (Sequence): The second sequence
+
+    Returns:
+        bool: If the two sequences have overlap
+    """
+    max_overlap = min(len(s1), len(s2))
+    return any(s1[-i:] == s2[:i] for i in range(1, max_overlap + 1))
+
+
 def convert_chat(messages: List[dict], role_mapping: Optional[dict] = None):
    default_role_mapping = {
        "system_prompt": (
@@ -82,6 +113,13 @@ def convert_chat(messages: List[dict], role_mapping: Optional[dict] = None):
    return prompt.rstrip()


+@dataclass
+class PromptCache:
+    cache: List[Any] = field(default_factory=list)
+    model_key: Tuple[str, Optional[str]] = ("", None)
+    tokens: List[int] = field(default_factory=list)
+
+
 class ModelProvider:
    def __init__(self, cli_args: argparse.Namespace):
        """Load models on demand and persist them across the whole process."""
@@ -101,13 +139,15 @@ class ModelProvider:
                "Local models must be relative to the current working dir."
            )

-    def load(self, model_path):
-        if self.model_key == model_path:
+    # adapter_path lets each request pick the adapters to load dynamically
+    def load(self, model_path, adapter_path=None):
+        if self.model_key == (model_path, adapter_path):
            return self.model, self.tokenizer

        # Remove the old model if it exists.
        self.model = None
        self.tokenizer = None
+        self.model_key = None

        # Building tokenizer_config
        tokenizer_config = {
@@ -119,18 +159,22 @@ class ModelProvider:
        if model_path == "default_model" and self.cli_args.model is not None:
            model, tokenizer = load(
                self.cli_args.model,
-                adapter_path=self.cli_args.adapter_path,
+                adapter_path=(
+                    adapter_path if adapter_path else self.cli_args.adapter_path
+                ),  # if the user doesn't change the model but adds an adapter path
                tokenizer_config=tokenizer_config,
            )
        else:
            self._validate_model_path(model_path)
-            model, tokenizer = load(model_path, tokenizer_config=tokenizer_config)
+            model, tokenizer = load(
+                model_path, adapter_path=adapter_path, tokenizer_config=tokenizer_config
+            )

        if self.cli_args.use_default_chat_template:
            if tokenizer.chat_template is None:
                tokenizer.chat_template = tokenizer.default_chat_template

-        self.model_key = model_path
+        self.model_key = (model_path, adapter_path)
        self.model = model
        self.tokenizer = tokenizer

@@ -138,12 +182,21 @@ class ModelProvider:


 class APIHandler(BaseHTTPRequestHandler):
-    def __init__(self, model_provider: ModelProvider, *args, **kwargs):
+    def __init__(
+        self,
+        model_provider: ModelProvider,
+        *args,
+        prompt_cache: Optional[PromptCache] = None,
+        system_fingerprint: Optional[str] = None,
+        **kwargs,
+    ):
        """
        Create static request specific metadata
        """
        self.created = int(time.time())
        self.model_provider = model_provider
+        self.prompt_cache = prompt_cache or PromptCache()
+        self.system_fingerprint = system_fingerprint or get_system_fingerprint()
        super().__init__(*args, **kwargs)

    def _set_cors_headers(self):
@@ -173,6 +226,7 @@ class APIHandler(BaseHTTPRequestHandler):
        endpoints = {
            "/v1/completions": self.handle_text_completions,
            "/v1/chat/completions": self.handle_chat_completions,
+            "/chat/completions": self.handle_chat_completions,
        }

        if self.path not in endpoints:
@@ -193,8 +247,12 @@ class APIHandler(BaseHTTPRequestHandler):

        # Extract request parameters from the body
        self.stream = self.body.get("stream", False)
+        self.stream_options = self.body.get("stream_options", None)
        self.requested_model = self.body.get("model", "default_model")
-        self.max_tokens = self.body.get("max_tokens", 100)
+        self.adapter = self.body.get("adapters", None)
+        self.max_tokens = self.body.get("max_completion_tokens", None)
+        if self.max_tokens is None:
+            self.max_tokens = self.body.get("max_tokens", 512)
        self.temperature = self.body.get("temperature", 1.0)
        self.top_p = self.body.get("top_p", 1.0)
        self.repetition_penalty = self.body.get("repetition_penalty", 1.0)
@@ -205,7 +263,9 @@ class APIHandler(BaseHTTPRequestHandler):

        # Load the model if needed
        try:
-            self.model, self.tokenizer = self.model_provider.load(self.requested_model)
+            self.model, self.tokenizer = self.model_provider.load(
+                self.requested_model, self.adapter
+            )
        except:
            self._set_completion_headers(404)
            self.end_headers()
@@ -279,6 +339,8 @@ class APIHandler(BaseHTTPRequestHandler):

        if not isinstance(self.requested_model, str):
            raise ValueError("model must be a string")
+        if self.adapter is not None and not isinstance(self.adapter, str):
+            raise ValueError("adapter must be a string")

    def generate_response(
        self,
@@ -318,7 +380,7 @@ class APIHandler(BaseHTTPRequestHandler):
        # Static response
        response = {
            "id": self.request_id,
-            "system_fingerprint": f"fp_{uuid.uuid4()}",
+            "system_fingerprint": self.system_fingerprint,
            "object": self.object_type,
            "model": self.requested_model,
            "created": self.created,
@@ -363,16 +425,30 @@ class APIHandler(BaseHTTPRequestHandler):

        return response

+    def get_prompt_cache(self, prompt):
+        """Return the suffix of `prompt` that the prompt cache does not hold."""
+        cached = self.prompt_cache.tokens
+        model_key = self.model_provider.model_key
+        stale = self.prompt_cache.model_key != model_key
+        if stale or len(cached) >= len(prompt) or cached != prompt[: len(cached)]:
+            # Rebuild from scratch and forget the tokens of the discarded cache.
+            self.prompt_cache.model_key = model_key
+            self.prompt_cache.cache = make_prompt_cache(self.model_provider.model)
+            self.prompt_cache.tokens = cached = []
+        prompt = prompt[len(cached) :]
+        self.prompt_cache.tokens.extend(prompt)
+        return prompt
+
    def handle_completion(
        self,
-        prompt: mx.array,
+        prompt: List[int],
        stop_id_sequences: List[List[int]],
    ):
        """
        Generate a response to a prompt and send it to the client in a single batch.

        Args:
-            prompt (mx.array): The prompt, in token form inside of a mlx array
+            prompt (List[int]): The tokenized prompt.
            stop_id_sequences (List[List[int]]): A list of stop words passed
              to the stopping_criteria function
        """
@@ -384,17 +460,21 @@ class APIHandler(BaseHTTPRequestHandler):
        logging.debug(f"Starting completion:")
        token_logprobs = []
        top_tokens = []
-        for (token, logprobs), _ in zip(
+
+        prompt = self.get_prompt_cache(prompt)
+
+        for _, (token, logprobs) in zip(
+            range(self.max_tokens),
            generate_step(
-                prompt=prompt,
+                prompt=mx.array(prompt),
                model=self.model,
                temp=self.temperature,
                top_p=self.top_p,
                repetition_penalty=self.repetition_penalty,
                repetition_context_size=self.repetition_context_size,
                logit_bias=self.logit_bias,
+                prompt_cache=self.prompt_cache.cache,
            ),
-            range(self.max_tokens),
        ):
            detokenizer.add_token(token)
            logging.debug(detokenizer.text)
@@ -405,7 +485,7 @@ class APIHandler(BaseHTTPRequestHandler):
                top_indices = sorted_indices[: self.logprobs]
                top_logprobs = logprobs[top_indices]
                top_token_info = zip(top_indices.tolist(), top_logprobs.tolist())
-                top_tokens.append(dict(top_token_info))
+                top_tokens.append(tuple(top_token_info))

            token_logprobs.append(logprobs[token].item())

@@ -420,6 +500,7 @@ class APIHandler(BaseHTTPRequestHandler):
                    )
                break

+        self.prompt_cache.tokens.extend(tokens)
        detokenizer.finalize()
        text = (
            detokenizer.text
@@ -449,16 +530,17 @@ class APIHandler(BaseHTTPRequestHandler):

    def handle_stream(
        self,
-        prompt: mx.array,
+        prompt: List[int],
        stop_id_sequences: List[List[int]],
    ):
        """
-        Generate response to prompt and foward it to the client using a Server Sent Events (SSE) stream.
+        Generate response to prompt and forward it to the client using a Server
+        Sent Events (SSE) stream.

        Args:
-            prompt (mx.array): The prompt, in token form inside of a mlx array
-            stop_id_sequences (List[List[int]]):
-                A list of stop words passed to the stopping_criteria function
+            prompt (List[int]): The tokenized prompt.
+            stop_id_sequences (List[List[int]]): A list of stop words passed to
+              the stopping_criteria function
        """
        # No additional headers are needed, call end_headers
        self.end_headers()
@@ -467,31 +549,26 @@ class APIHandler(BaseHTTPRequestHandler):
        detokenizer.reset()
        tokens = []

-        max_stop_id_sequence_len = len(max(stop_id_sequences, default=[]))
-        # Buffer to store the last `max_stop_id_sequence_len` tokens
-        # to check for stop conditions before writing to the stream.
-        stop_sequence_buffer = []
        stop_sequence_suffix = None
        logging.debug(f"Starting stream:")
-        for (token, _), _ in zip(
+
+        prompt = self.get_prompt_cache(prompt)
+
+        for _, (token, _) in zip(
+            range(self.max_tokens),
            generate_step(
-                prompt=prompt,
+                prompt=mx.array(prompt),
                model=self.model,
                temp=self.temperature,
                top_p=self.top_p,
                repetition_penalty=self.repetition_penalty,
                repetition_context_size=self.repetition_context_size,
+                prompt_cache=self.prompt_cache.cache,
            ),
-            range(self.max_tokens),
        ):
            detokenizer.add_token(token)
            logging.debug(detokenizer.text)
            tokens.append(token)
-            stop_sequence_buffer.append(token)
-
-            # Continue generating tokens until buffer is as large as the longest stop_id_sequence
-            if len(stop_sequence_buffer) < max_stop_id_sequence_len:
-                continue

            stop_condition = stopping_criteria(
                tokens,
@@ -505,28 +582,59 @@ class APIHandler(BaseHTTPRequestHandler):
                    )
                break

+            # If the end of tokens overlaps with a stop sequence, generate new
+            # tokens until we know if the stop sequence is hit or not
+            if any(
+                (sequence_overlap(tokens, sequence) for sequence in stop_id_sequences)
+            ):
+                continue
+
            new_text = detokenizer.last_segment
-            response = self.generate_response(new_text, None)
-            self.wfile.write(f"data: {json.dumps(response)}\n\n".encode())
-            self.wfile.flush()
-            stop_sequence_buffer = []
+            if new_text:
+                response = self.generate_response(new_text, None)
+                self.wfile.write(f"data: {json.dumps(response)}\n\n".encode())
+                self.wfile.flush()
+
+        self.prompt_cache.tokens.extend(tokens)

        # check is there any remaining text to send
-        if stop_sequence_buffer:
-            next_chunk = (
-                detokenizer.last_segment
-                if stop_sequence_suffix is None
-                else detokenizer.last_segment[: -len(stop_sequence_suffix)]
-            )
-            response = self.generate_response(next_chunk, "length")
-
+        detokenizer.finalize()
+        last_segment = detokenizer.last_segment
+        if last_segment:
+            if stop_sequence_suffix is not None:
+                last_segment = last_segment[: -len(stop_sequence_suffix)]
+            response = self.generate_response(last_segment, "length")
            self.wfile.write(f"data: {json.dumps(response)}\n\n".encode())
            self.wfile.flush()

+        if self.stream_options is not None and self.stream_options["include_usage"]:
+            response = self.completion_usage_response(len(prompt), len(tokens))
+            self.wfile.write(f"data: {json.dumps(response)}\n\n".encode())
+
        self.wfile.write("data: [DONE]\n\n".encode())
        self.wfile.flush()

-    def handle_chat_completions(self) -> mx.array:
+    def completion_usage_response(
+        self,
+        prompt_token_count: Optional[int] = None,
+        completion_token_count: Optional[int] = None,
+    ):
+        response = {
+            "id": self.request_id,
+            "system_fingerprint": self.system_fingerprint,
+            "object": "chat.completion",
+            "model": self.requested_model,
+            "created": self.created,
+            "choices": [],
+            "usage": {
+                "prompt_tokens": prompt_token_count,
+                "completion_tokens": completion_token_count,
+                "total_tokens": prompt_token_count + completion_token_count,
+            },
+        }
+        return response
+
+    def handle_chat_completions(self) -> List[int]:
        """
        Handle a chat completion request.

@@ -541,13 +649,13 @@ class APIHandler(BaseHTTPRequestHandler):
        self.object_type = (
            "chat.completions.chunk" if self.stream else "chat.completions"
        )
-
        if (
            hasattr(self.tokenizer, "apply_chat_template")
            and self.tokenizer.chat_template
        ):
            prompt = self.tokenizer.apply_chat_template(
                body["messages"],
+                body.get("tools", None),
                tokenize=True,
                add_generation_prompt=True,
            )
@@ -555,9 +663,9 @@ class APIHandler(BaseHTTPRequestHandler):
            prompt = convert_chat(body["messages"], body.get("role_mapping"))
            prompt = self.tokenizer.encode(prompt)

-        return mx.array(prompt)
+        return prompt

-    def handle_text_completions(self) -> mx.array:
+    def handle_text_completions(self) -> List[int]:
        """
        Handle a text completion request.

@@ -567,11 +675,48 @@ class APIHandler(BaseHTTPRequestHandler):
        # Determine response type
        self.request_id = f"cmpl-{uuid.uuid4()}"
        self.object_type = "text_completion"
-
        assert "prompt" in self.body, "Request did not contain a prompt"
-        prompt_text = self.body["prompt"]
-        prompt = self.tokenizer.encode(prompt_text)
-        return mx.array(prompt)
+        return self.tokenizer.encode(self.body["prompt"])
+
+    def do_GET(self):
+        """
+        Respond to a GET request from a client.
+        """
+        if self.path == "/v1/models":
+            self.handle_models_request()
+        else:
+            self._set_completion_headers(404)
+            self.end_headers()
+            self.wfile.write(b"Not Found")
+
+    def handle_models_request(self):
+        """
+        Handle a GET request for the /v1/models endpoint.
+        """
+        self._set_completion_headers(200)
+        self.end_headers()
+
+        # Scan the cache directory for downloaded mlx models
+        hf_cache_info = scan_cache_dir()
+        downloaded_models = [
+            repo for repo in hf_cache_info.repos if "mlx" in repo.repo_id
+        ]
+
+        # Create a list of available models
+        models = [
+            {
+                "id": repo.repo_id,
+                "object": "model",
+                "created": self.created,
+            }
+            for repo in downloaded_models
+        ]
+
+        response = {"object": "list", "data": models}
+
+        response_json = json.dumps(response).encode()
+        self.wfile.write(response_json)
+        self.wfile.flush()


 def run(
@@ -582,9 +727,16 @@ def run(
    handler_class=APIHandler,
 ):
    server_address = (host, port)
+    prompt_cache = PromptCache()
    httpd = server_class(
        server_address,
-        lambda *args, **kwargs: handler_class(model_provider, *args, **kwargs),
+        lambda *args, **kwargs: handler_class(
+            model_provider,
+            prompt_cache=prompt_cache,
+            system_fingerprint=get_system_fingerprint(),
+            *args,
+            **kwargs,
+        ),
    )
    warnings.warn(
        "mlx_lm.server is not recommended for production as "
--- a/llms/mlx_lm/tokenizer_utils.py
+++ b/llms/mlx_lm/tokenizer_utils.py
@@ -97,6 +97,11 @@ class NaiveStreamingDetokenizer(StreamingDetokenizer):
    def text(self):
        if self._current_tokens:
            self._current_text = self._tokenizer.decode(self._current_tokens)
+            if (
+                self._tokenizer.clean_up_tokenization_spaces
+                and self._current_text[-1] == " "
+            ):
+                self._current_text = self._current_text[:-1]
        if self._current_text and self._current_text[-1] == "\n":
            self._tokens.extend(self._current_tokens)
            self._text += self._current_text
@@ -164,9 +169,11 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
    """

    _byte_decoder = None
+    _space_matches = (".", "?", "!", ",", "n't", "'m", "'s", "'ve", "'re")

-    def __init__(self, tokenizer, trim_space=False):
-        self.trim_space = trim_space
+    def __init__(self, tokenizer):
+
+        self.clean_spaces = tokenizer.clean_up_tokenization_spaces

        # Extract the tokens in a list from id to text
        self.tokenmap = [None] * len(tokenizer.vocab)
@@ -179,24 +186,38 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
        # https://github.com/openai/gpt-2/blob/master/src/encoder.py
        self.make_byte_decoder()

+        self._added_ids = set(tokenizer.added_tokens_decoder.keys())
+
    def reset(self):
        self.offset = 0
        self._unflushed = ""
        self.text = ""
        self.tokens = []

+    def _maybe_trim_space(self, current_text):
+        if len(current_text) == 0:
+            return current_text
+        elif current_text[0] != " ":
+            return current_text
+        elif not self.text:
+            return current_text[1:]
+        elif self.clean_spaces and current_text[1:].startswith(self._space_matches):
+            return current_text[1:]
+        return current_text
+
    def add_token(self, token):
        v = self.tokenmap[token]
-        # if the token starts with space
-        if self._byte_decoder[v[0]] == 32:
+        is_added = token in self._added_ids
+        if is_added or self._byte_decoder[v[0]] == 32:
            current_text = bytearray(
                self._byte_decoder[c] for c in self._unflushed
            ).decode("utf-8")
-            if self.text or not self.trim_space:
-                self.text += current_text
+            self.text += self._maybe_trim_space(current_text)
+            if is_added:
+                self.text += v
+                self._unflushed = ""
            else:
-                self.text += _remove_space(current_text)
-            self._unflushed = v
+                self._unflushed = v
        else:
            self._unflushed += v

@@ -204,10 +225,7 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
        current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
            "utf-8"
        )
-        if self.text or not self.trim_space:
-            self.text += current_text
-        else:
-            self.text += _remove_space(current_text)
+        self.text += self._maybe_trim_space(current_text)
        self._unflushed = ""

    @classmethod
@@ -252,9 +270,19 @@ class TokenizerWrapper:
    def __getattr__(self, attr):
        if attr == "detokenizer":
            return self._detokenizer
+        elif attr.startswith("_"):
+            return self.__getattribute__(attr)
        else:
            return getattr(self._tokenizer, attr)

+    def __setattr__(self, attr, value):
+        if attr == "detokenizer":
+            raise AttributeError("Cannot set the detokenizer.")
+        elif attr.startswith("_"):
+            super().__setattr__(attr, value)
+        else:
+            setattr(self._tokenizer, attr, value)
+

 def _match(a, b):
    if type(a) != type(b):
@@ -293,14 +321,7 @@ def _is_spm_decoder_no_space(decoder):


 def _is_bpe_decoder(decoder):
-    _target_description = {
-        "type": "ByteLevel",
-        "add_prefix_space": False,
-        "trim_offsets": False,
-        "use_regex": False,
-    }
-
-    return _match(_target_description, decoder)
+    return isinstance(decoder, dict) and decoder.get("type", None) == "ByteLevel"


 def load_tokenizer(model_path, tokenizer_config_extra={}):
--- a/llms/mlx_lm/tuner/datasets.py
+++ b/llms/mlx_lm/tuner/datasets.py
@@ -36,7 +36,10 @@ class ChatDataset(Dataset):
    def __getitem__(self, idx: int):
        messages = self._data[idx]["messages"]
        text = self._tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
+            messages,
+            tools=self._data[idx].get("tools", None),
+            tokenize=False,
+            add_generation_prompt=True,
        )
        return text

@@ -73,17 +76,14 @@ class CompletionsDataset(Dataset):
        return text


-def create_dataset(path: Path, tokenizer: PreTrainedTokenizer = None):
-    # Return empty dataset for non-existent paths
-    if not path.exists():
-        return []
-    with open(path, "r") as fid:
-        data = [json.loads(l) for l in fid]
-    if "messages" in data[0]:
+def create_dataset(data, tokenizer: PreTrainedTokenizer = None):
+    sample = data[0]
+
+    if "messages" in sample:
        return ChatDataset(data, tokenizer)
-    elif "prompt" in data[0] and "completion" in data[0]:
+    elif "prompt" in sample and "completion" in sample:
        return CompletionsDataset(data, tokenizer)
-    elif "text" in data[0]:
+    elif "text" in sample:
        return Dataset(data)
    else:
        raise ValueError(
@@ -92,54 +92,90 @@ def create_dataset(path: Path, tokenizer: PreTrainedTokenizer = None):
        )


-def load_dataset(args, tokenizer: PreTrainedTokenizer):
-    if getattr(args, "hf_dataset", None) is not None:
-        import datasets
+def load_local_dataset(data_path: Path, tokenizer: PreTrainedTokenizer):
+    def load_subset(path):
+        if not path.exists():
+            return []
+        with open(path, "r") as fid:
+            data = [json.loads(l) for l in fid]
+        return create_dataset(data, tokenizer)

-        hf_args = args.hf_dataset
-        dataset_name = hf_args["name"]
-        print(f"Loading Hugging Face dataset {dataset_name}.")
-        text_feature = hf_args.get("text_feature")
-        prompt_feature = hf_args.get("prompt_feature")
-        completion_feature = hf_args.get("completion_feature")
+    names = ("train", "valid", "test")
+    train, valid, test = [load_subset(data_path / f"{n}.jsonl") for n in names]
+    return train, valid, test

-        def create_hf_dataset(split: str = None):
-            ds = datasets.load_dataset(
-                dataset_name,
-                split=split,
-                **hf_args.get("config", {}),
-            )
-            if prompt_feature and completion_feature:
-                return CompletionsDataset(
-                    ds, tokenizer, prompt_feature, completion_feature
-                )
-            elif text_feature:
-                return Dataset(train_ds, text_key=text_feature)
-            else:
-                raise ValueError(
-                    "Specify either a prompt and completion feature or a text "
-                    "feature for the Hugging Face dataset."
-                )

-        if args.train:
-            train_split = hf_args.get("train_split", "train[:80%]")
-            valid_split = hf_args.get("valid_split", "train[-10%:]")
-            train = create_hf_dataset(split=train_split)
-            valid = create_hf_dataset(split=valid_split)
-        else:
-            train, valid = [], []
-        if args.test:
-            test = create_hf_dataset(split=hf_args.get("test_split"))
-        else:
-            test = []
+def load_hf_dataset(data_id: str, tokenizer: PreTrainedTokenizer):
+    from datasets import exceptions, load_dataset
+
+    try:
+        dataset = load_dataset(data_id)

-    else:
        names = ("train", "valid", "test")
-        data_path = Path(args.data)

        train, valid, test = [
-            create_dataset(data_path / f"{n}.jsonl", tokenizer) for n in names
+            create_dataset(dataset[n], tokenizer) if n in dataset.keys() else []
+            for n in names
        ]
+
+    except exceptions.DatasetNotFoundError:
+        raise ValueError(f"Not found Hugging Face dataset: {data_id} .")
+
+    return train, valid, test
+
+
+def load_custom_hf_dataset(args, tokenizer: PreTrainedTokenizer):
+    import datasets
+
+    hf_args = args.hf_dataset
+    dataset_name = hf_args["name"]
+    print(f"Loading Hugging Face dataset {dataset_name}.")
+    text_feature = hf_args.get("text_feature")
+    prompt_feature = hf_args.get("prompt_feature")
+    completion_feature = hf_args.get("completion_feature")
+
+    def create_hf_dataset(split: str = None):
+        ds = datasets.load_dataset(
+            dataset_name,
+            split=split,
+            **hf_args.get("config", {}),
+        )
+        if prompt_feature and completion_feature:
+            return CompletionsDataset(ds, tokenizer, prompt_feature, completion_feature)
+        elif text_feature:
+            return Dataset(train_ds, text_key=text_feature)
+        else:
+            raise ValueError(
+                "Specify either a prompt and completion feature or a text "
+                "feature for the Hugging Face dataset."
+            )
+
+    if args.train:
+        train_split = hf_args.get("train_split", "train[:80%]")
+        valid_split = hf_args.get("valid_split", "train[-10%:]")
+        train = create_hf_dataset(split=train_split)
+        valid = create_hf_dataset(split=valid_split)
+    else:
+        train, valid = [], []
+    if args.test:
+        test = create_hf_dataset(split=hf_args.get("test_split"))
+    else:
+        test = []
+
+    return train, valid, test
+
+
+def load_dataset(args, tokenizer: PreTrainedTokenizer):
+    if getattr(args, "hf_dataset", None) is not None:
+        train, valid, test = load_custom_hf_dataset(args, tokenizer)
+    else:
+        data_path = Path(args.data)
+        if data_path.exists():
+            train, valid, test = load_local_dataset(data_path, tokenizer)
+        else:
+            print(f"Loading Hugging Face dataset {args.data}.")
+            train, valid, test = load_hf_dataset(args.data, tokenizer)
+
    if args.train and len(train) == 0:
        raise ValueError(
            "Training set not found or empty. Must provide training set for fine-tuning."
--- a/llms/mlx_lm/tuner/dora.py
+++ b/llms/mlx_lm/tuner/dora.py
@@ -8,16 +8,17 @@ import mlx.nn as nn

 class DoRALinear(nn.Module):
    @staticmethod
-    def from_linear(
+    def from_base(
        linear: nn.Linear,
        r: int = 8,
        dropout: float = 0.0,
        scale: float = 20.0,
    ):
-        # TODO support quantized weights in DoRALinear
+        # TODO remove when input_dims and output_dims are attributes
+        # on linear and quantized linear
        output_dims, input_dims = linear.weight.shape
        if isinstance(linear, nn.QuantizedLinear):
-            raise ValueError("DoRALinear does not yet support quantization.")
+            input_dims *= 32 // linear.bits
        dora_lin = DoRALinear(
            input_dims=input_dims,
            output_dims=output_dims,
@@ -25,19 +26,19 @@ class DoRALinear(nn.Module):
            dropout=dropout,
            scale=scale,
        )
-        dora_lin.linear = linear
+        dora_lin.set_linear(linear)
        return dora_lin

-    def to_linear(self, de_quantize: bool = False):
+    def fuse(self, de_quantize: bool = False):
        linear = self.linear
        bias = "bias" in linear
-        weight = linear.weight
+        weight = self._dequantized_weight()

-        # Use the same type as the linear weight if not quantized
+        # Use the same type as the linear weight
        dtype = weight.dtype

        output_dims, input_dims = weight.shape
-        fused_linear = nn.Linear(input_dims, output_dims, bias=bias)
+        fused_linear = nn.Linear(input_dims, output_dims, bias=False)

        lora_b = (self.scale * self.lora_b.T).astype(dtype)
        lora_a = self.lora_a.T.astype(dtype)
@@ -47,6 +48,13 @@ class DoRALinear(nn.Module):

        if bias:
            fused_linear.bias = linear.bias
+
+        if self._is_quantized() and not de_quantize:
+            fused_linear = nn.QuantizedLinear.from_linear(
+                fused_linear,
+                linear.group_size,
+                linear.bits,
+            )
        return fused_linear

    def __init__(
@@ -61,7 +69,7 @@ class DoRALinear(nn.Module):
        super().__init__()

        # Regular linear layer weights
-        self.linear = nn.Linear(input_dims, output_dims, bias=bias)
+        self.set_linear(nn.Linear(input_dims, output_dims, bias=bias))
        self.dropout = nn.Dropout(p=dropout)

        # Scale for low-rank update
@@ -75,21 +83,146 @@ class DoRALinear(nn.Module):
            shape=(input_dims, r),
        )
        self.lora_b = mx.zeros(shape=(r, output_dims))
-        self.m = mx.linalg.norm(self.linear.weight, axis=1)
+
+    def set_linear(self, linear):
+        """
+        Set the self.linear layer and recompute self.m.
+        """
+        self.linear = linear
+        self.m = mx.linalg.norm(self._dequantized_weight().astype(mx.float32), axis=1)
+
+    def _dequantized_weight(self):
+        """
+        Return the weight of the linear layer, dequantizing it if it is quantized
+        """
+        weight = self.linear.weight
+        if self._is_quantized():
+            weight = mx.dequantize(
+                weight,
+                self.linear.scales,
+                self.linear.biases,
+                self.linear.group_size,
+                self.linear.bits,
+            )
+        return weight
+
+    def _is_quantized(self):
+        return isinstance(self.linear, nn.QuantizedLinear)

    def __call__(self, x):
        # Regular LoRA (without a bias)
-        y = x @ self.linear.weight.T
+        w = self._dequantized_weight()
+        y = x @ w.T
+
        z = (self.dropout(x) @ self.lora_a) @ self.lora_b
        out = y + (self.scale * z).astype(x.dtype)

        # Compute the norm of the adapted weights
-        adapted = self.linear.weight + (self.scale * self.lora_b.T) @ self.lora_a.T
+        adapted = w + (self.scale * self.lora_b.T) @ self.lora_a.T
+        denom = mx.stop_gradient(mx.linalg.norm(adapted, axis=1))
+
+        # Remove the norm and scale by the learned magnitude
+        out = (self.m / denom).astype(x.dtype) * out
+
+        if "bias" in self.linear:
+            out = out + self.linear.bias
+        return out
+
+
+class DoRAEmbedding(nn.Module):
+    def from_base(
+        embedding: nn.Embedding,
+        r: int = 8,
+        dropout: float = 0.0,
+        scale: float = 20.0,
+    ):
+        num_embeddings, dims = embedding.weight.shape
+
+        # TODO support quantized weights in DoRAEmbedding
+        if isinstance(embedding, nn.QuantizedLinear):
+            raise ValueError("DoRAEmbedding does not yet support quantization.")
+        dora_embedding = DoRAEmbedding(
+            num_embeddings=num_embeddings,
+            dims=dims,
+            r=r,
+            dropout=dropout,
+            scale=scale,
+        )
+        dora_embedding.set_embedding(embedding)
+        return dora_embedding
+
+    def fuse(self, de_quantize: bool = False):
+        embedding = self.embedding
+        weight = embedding.weight
+
+        # Use the same type as the embedding weight
+        dtype = weight.dtype
+
+        num_embeddings, dims = weight.shape
+        fused_embedding = nn.Embedding(num_embeddings, dims)
+
+        lora_a = (self.scale * self.lora_a).astype(dtype)
+        lora_b = self.lora_b.astype(dtype)
+        weight = weight + lora_a @ lora_b
+        norm_scale = self.m / mx.linalg.norm(weight, axis=1)
+        fused_embedding.weight = norm_scale[:, None] * weight
+
+        return fused_embedding
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        dims: int,
+        r: int = 8,
+        dropout: float = 0.0,
+        scale: float = 20.0,
+    ):
+        super().__init__()
+
+        # Regular embedding layer weights
+        self.set_embedding(nn.Embedding(num_embeddings, dims))
+        self.dropout = nn.Dropout(p=dropout)
+
+        # Scale for low-rank update
+        self.scale = scale
+
+        # Low rank lora weights
+        scale = 1 / math.sqrt(num_embeddings)
+        self.lora_a = mx.random.uniform(
+            low=-scale,
+            high=scale,
+            shape=(num_embeddings, r),
+        )
+        self.lora_b = mx.zeros(shape=(r, dims))
+
+    def set_embedding(self, embedding: nn.Module):
+        self.embedding = embedding
+        self.m = mx.linalg.norm(embedding.weight, axis=1)
+
+    def __call__(self, x):
+        y = self.embedding(x)
+        z = self.scale * self.lora_a[x] @ self.lora_b
+        out = y + self.dropout(z).astype(y.dtype)
+
+        # Compute the norm of the adapted weights for the individual embeddings
+        adapted = y + z
+        denom = mx.stop_gradient(mx.linalg.norm(adapted, axis=-1))
+
+        # Remove the norm and scale by the learned magnitude
+        out = (self.m[x] / denom)[..., None] * out
+
+        return out
+
+    def as_linear(self, x):
+        y = self.embedding.as_linear(x)
+        z = (self.dropout(x) @ self.lora_b.T) @ self.lora_a.T
+        out = y + (self.scale * z).astype(x.dtype)
+
+        # Compute the norm of the adapted weights
+        adapted = self.embedding.weight + (self.scale * self.lora_a) @ self.lora_b
        denom = mx.stop_gradient(mx.linalg.norm(adapted, axis=1))

        # Remove the norm and scale by the learned magnitude
        out = (self.m / denom) * out

-        if "bias" in self.linear:
-            out = out + self.linear.bias
        return out
--- a/llms/mlx_lm/tuner/lora.py
+++ b/llms/mlx_lm/tuner/lora.py
@@ -10,7 +10,7 @@ from ..models.switch_layers import QuantizedSwitchLinear, SwitchLinear

 class LoRALinear(nn.Module):
    @staticmethod
-    def from_linear(
+    def from_base(
        linear: nn.Linear,
        r: int = 8,
        dropout: float = 0.0,
@@ -31,7 +31,7 @@ class LoRALinear(nn.Module):
        lora_lin.linear = linear
        return lora_lin

-    def to_linear(self, de_quantize: bool = False):
+    def fuse(self, de_quantize: bool = False):
        linear = self.linear
        bias = "bias" in linear
        weight = linear.weight
@@ -41,7 +41,7 @@ class LoRALinear(nn.Module):
        dtype = weight.dtype

        if is_quantized:
-            dtype = mx.float16
+            dtype = linear.scales.dtype
            weight = mx.dequantize(
                weight,
                linear.scales,
@@ -103,7 +103,7 @@ class LoRALinear(nn.Module):

 class LoRASwitchLinear(nn.Module):
    @staticmethod
-    def from_linear(
+    def from_base(
        linear: nn.Module,
        r: int = 8,
        dropout: float = 0.0,
@@ -120,7 +120,7 @@ class LoRASwitchLinear(nn.Module):
        lora_lin.linear = linear
        return lora_lin

-    def to_linear(self, de_quantize: bool = False):
+    def fuse(self, de_quantize: bool = False):
        linear = self.linear
        bias = "bias" in linear
        weight = linear.weight
@@ -191,3 +191,95 @@ class LoRASwitchLinear(nn.Module):
        z = z[..., None, :] @ self.lora_b[indices].swapaxes(-2, -1)

        return y + (self.scale * z).astype(x.dtype)
+
+
+class LoRAEmbedding(nn.Module):
+    @staticmethod
+    def from_base(
+        embedding: nn.Embedding,
+        r: int = 8,
+        dropout: float = 0.0,
+        scale: float = 20.0,
+    ):
+        num_embeddings, dims = embedding.weight.shape
+        if isinstance(embedding, nn.QuantizedEmbedding):
+            dims *= 32 // embedding.bits
+        lora_embedding = LoRAEmbedding(
+            num_embeddings=num_embeddings,
+            dims=dims,
+            r=r,
+            dropout=dropout,
+            scale=scale,
+        )
+        lora_embedding.embedding = embedding
+        return lora_embedding
+
+    def fuse(self, de_quantize: bool = False):
+        embedding = self.embedding
+        weight = embedding.weight
+        is_quantized = isinstance(embedding, nn.QuantizedEmbedding)
+
+        # Use the same type as the embedding weight if not quantized
+        dtype = weight.dtype
+
+        if is_quantized:
+            dtype = embedding.scales.dtype
+            weight = mx.dequantize(
+                weight,
+                embedding.scales,
+                embedding.biases,
+                embedding.group_size,
+                embedding.bits,
+            )
+        num_embeddings, dims = weight.shape
+        fused_embedding = nn.Embedding(num_embeddings, dims)
+
+        lora_a = (self.scale * self.lora_a).astype(dtype)
+        lora_b = self.lora_b.astype(dtype)
+        fused_embedding.weight = weight + lora_a @ lora_b
+
+        if is_quantized and not de_quantize:
+            fused_embedding = nn.QuantizedEmbedding.from_embedding(
+                fused_embedding,
+                embedding.group_size,
+                embedding.bits,
+            )
+
+        return fused_embedding
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        dims: int,
+        r: int = 8,
+        dropout: float = 0.0,
+        scale: float = 20.0,
+    ):
+        super().__init__()
+
+        # Regular embedding layer
+        self.embedding = nn.Embedding(num_embeddings, dims)
+        self.dropout = nn.Dropout(p=dropout)
+
+        # Scale for low-rank update
+        self.scale = scale
+
+        # Low rank lora weights
+        scale = 1 / math.sqrt(num_embeddings)
+        self.lora_a = mx.random.uniform(
+            low=-scale,
+            high=scale,
+            shape=(num_embeddings, r),
+        )
+        self.lora_b = mx.zeros(shape=(r, dims))
+
+    def __call__(self, x):
+        y = self.embedding(x)
+        z = self.dropout(self.lora_a[x] @ self.lora_b)
+        out = y + (self.scale * z).astype(y.dtype)
+        return out
+
+    def as_linear(self, x):
+        y = self.embedding.as_linear(x)
+        z = (self.dropout(x) @ self.lora_b.T) @ self.lora_a.T
+        return y + (self.scale * z).astype(x.dtype)
--- a/llms/mlx_lm/tuner/trainer.py
+++ b/llms/mlx_lm/tuner/trainer.py
@@ -1,5 +1,7 @@
 # Copyright © 2024 Apple Inc.

+import glob
+import shutil
 import time
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -8,6 +10,7 @@ from typing import Union
 import mlx.core as mx
 import mlx.nn as nn
 import numpy as np
+from mlx.nn.utils import average_gradients
 from mlx.utils import tree_flatten


@@ -141,9 +144,16 @@ def iterate_batches(dataset, tokenizer, batch_size, max_seq_length, train=False)
            f" examples but only has {len(dataset)}."
        )

+    # If running in distributed mode (N machines) then each one should skip N-1
+    # samples
+    step = mx.distributed.init().size()
+    if batch_size % step != 0:
+        raise ValueError("The batch size must be divisible by the number of workers")
+
    # Make the batches:
    batch_idx = [
-        idx[i : i + batch_size] for i in range(0, len(idx) - batch_size + 1, batch_size)
+        idx[i : i + batch_size : step]
+        for i in range(0, len(idx) - batch_size + 1, batch_size)
    ]

    while True:
@@ -152,9 +162,7 @@ def iterate_batches(dataset, tokenizer, batch_size, max_seq_length, train=False)
            # Encode batch
            batch = [tokenizer.encode(dataset[j]) for j in batch_idx[i]]
            for b in batch:
-                if b[-1] == tokenizer.eos_token_id:
-                    print("[WARNING] Example already has an EOS token appended")
-                else:
+                if b[-1] != tokenizer.eos_token_id:
                    b.append(tokenizer.eos_token_id)

            lengths = [len(x) for x in batch]
@@ -171,9 +179,9 @@ def iterate_batches(dataset, tokenizer, batch_size, max_seq_length, train=False)
            max_length_in_batch = pad_to * ((max(lengths) + pad_to - 1) // pad_to)
            max_length_in_batch = min(max_length_in_batch, max_seq_length)

-            batch_arr = np.zeros((batch_size, max_length_in_batch), np.int32)
+            batch_arr = np.zeros((batch_size // step, max_length_in_batch), np.int32)

-            for j in range(batch_size):
+            for j in range(batch_size // step):
                truncated_length = min(lengths[j], max_seq_length)
                batch_arr[j, :truncated_length] = batch[j][:truncated_length]
                lengths[j] = (
@@ -197,7 +205,7 @@ def evaluate(
    loss: callable = default_loss,
    iterate_batches: callable = iterate_batches,
 ):
-    all_losses = []
+    all_losses = 0
    ntokens = 0

    index_iterator = iter(range(num_batches)) if num_batches != -1 else iter(int, 1)
@@ -212,10 +220,14 @@ def evaluate(
        ),
    ):
        losses, toks = loss(model, *batch)
-        all_losses.append((losses * toks).item())
-        ntokens += toks.item()
+        all_losses += losses * toks
+        ntokens += toks
+        mx.eval(all_losses, ntokens)

-    return np.sum(all_losses) / ntokens
+    all_losses = mx.distributed.all_sum(all_losses)
+    ntokens = mx.distributed.all_sum(ntokens)
+
+    return (all_losses / ntokens).item()


 class TrainingCallback:
@@ -241,6 +253,11 @@ def train(
    training_callback: TrainingCallback = None,
 ):
    print(f"Starting training..., iters: {args.iters}")
+    world = mx.distributed.init()
+    world_size = world.size()
+    rank = world.rank()
+    if world_size > 1:
+        print(f"Node {rank} of {world_size}")

    if args.grad_checkpoint:
        grad_checkpoint(model.layers[0])
@@ -251,6 +268,9 @@ def train(
        # Forward and backward pass
        (lvalue, toks), grad = loss_value_and_grad(model, *batch)

+        # All reduce the gradients if running in distributed mode
+        grad = average_gradients(grad)
+
        # Model update
        optimizer.update(model, grad)

@@ -258,8 +278,9 @@ def train(

    loss_value_and_grad = nn.value_and_grad(model, loss)

-    losses = []
+    losses = 0
    n_tokens = 0
+    steps = 0
    trained_tokens = 0
    # Main training loop
    start = time.perf_counter()
@@ -288,9 +309,13 @@ def train(
                iterate_batches=iterate_batches,
            )
            val_time = time.perf_counter() - stop
-            print(
-                f"Iter {it}: " f"Val loss {val_loss:.3f}, " f"Val took {val_time:.3f}s"
-            )
+            if rank == 0:
+                print(
+                    f"Iter {it}: "
+                    f"Val loss {val_loss:.3f}, "
+                    f"Val took {val_time:.3f}s",
+                    flush=True,
+                )

            if training_callback is not None:
                val_info = {
@@ -303,30 +328,33 @@ def train(
            start = time.perf_counter()

        lvalue, toks = step(batch)
-        mx.eval(state, lvalue, toks)
-
-        # Record loss
-        losses.append(lvalue.item())
-        n_tokens += toks.item()
+        losses += lvalue
+        n_tokens += toks
+        steps += 1
+        mx.eval(state, losses, n_tokens)

        # Report training loss if needed
        if it % args.steps_per_report == 0 or it == args.iters:
            stop = time.perf_counter()

-            train_loss = np.mean(losses)
+            train_loss = mx.distributed.all_sum(losses).item()
+            train_loss /= steps * mx.distributed.init().size()
+            n_tokens = mx.distributed.all_sum(n_tokens).item()
            learning_rate = optimizer.learning_rate.item()
            it_sec = args.steps_per_report / (stop - start)
            tokens_sec = float(n_tokens) / (stop - start)
            trained_tokens += n_tokens
            peak_mem = mx.metal.get_peak_memory() / 2**30
-            print(
-                f"Iter {it}: Train loss {train_loss:.3f}, "
-                f"Learning Rate {learning_rate:.3e}, "
-                f"It/sec {it_sec:.3f}, "
-                f"Tokens/sec {tokens_sec:.3f}, "
-                f"Trained Tokens {trained_tokens}, "
-                f"Peak mem {peak_mem:.3f} GB"
-            )
+            if rank == 0:
+                print(
+                    f"Iter {it}: Train loss {train_loss:.3f}, "
+                    f"Learning Rate {learning_rate:.3e}, "
+                    f"It/sec {it_sec:.3f}, "
+                    f"Tokens/sec {tokens_sec:.3f}, "
+                    f"Trained Tokens {trained_tokens}, "
+                    f"Peak mem {peak_mem:.3f} GB",
+                    flush=True,
+                )

            if training_callback is not None:
                train_info = {
@@ -340,30 +368,25 @@ def train(
                }
                training_callback.on_train_loss_report(train_info)

-            losses = []
+            losses = 0
            n_tokens = 0
+            steps = 0
            start = time.perf_counter()

        # Save adapter weights
        if it % args.steps_per_save == 0:
-            save_adapter(model, args.adapter_file)
+            adapter_weights = dict(tree_flatten(model.trainable_parameters()))
+            mx.save_safetensors(str(args.adapter_file), adapter_weights)
            checkpoint = (
                Path(args.adapter_file).parent / f"{it:07d}_adapters.safetensors"
            )
-            save_adapter(model, checkpoint)
+            mx.save_safetensors(str(checkpoint), adapter_weights)
            print(
                f"Iter {it}: Saved adapter weights to "
                f"{args.adapter_file} and {checkpoint}."
            )

-    # save final adapter weights
-    save_adapter(model, args.adapter_file)
-    print(f"Saved final adapter weights to {args.adapter_file}.")
-
-
-def save_adapter(
-    model: nn.Module,
-    adapter_file: Union[str, Path],
-):
-    flattened_tree = tree_flatten(model.trainable_parameters())
-    mx.save_safetensors(str(adapter_file), dict(flattened_tree))
+    # Save final weights
+    adapter_weights = dict(tree_flatten(model.trainable_parameters()))
+    mx.save_safetensors(str(args.adapter_file), adapter_weights)
+    print(f"Saved final weights to {args.adapter_file}.")
--- a/llms/mlx_lm/tuner/utils.py
+++ b/llms/mlx_lm/tuner/utils.py
@@ -10,8 +10,8 @@ import mlx.optimizers as opt
 from mlx.utils import tree_flatten, tree_unflatten

 from ..models.switch_layers import QuantizedSwitchLinear, SwitchLinear
-from .dora import DoRALinear
-from .lora import LoRALinear, LoRASwitchLinear
+from .dora import DoRAEmbedding, DoRALinear
+from .lora import LoRAEmbedding, LoRALinear, LoRASwitchLinear


 def build_schedule(schedule_config: Dict):
@@ -36,7 +36,7 @@ def build_schedule(schedule_config: Dict):

 def linear_to_lora_layers(
    model: nn.Module,
-    num_lora_layers: int,
+    num_layers: int,
    config: Dict,
    use_dora: bool = False,
 ):
@@ -45,23 +45,17 @@ def linear_to_lora_layers(

    Args:
        model (nn.Module): The neural network model.
-        num_lora_layers (int): The number of blocks to convert to lora layers
+        num_layers (int): The number of blocks to convert to lora layers
        starting from the last layer.
        config (dict): More configuration parameters for LoRA, including the
          rank, scale, and optional layer keys.
        use_dora (bool): If True, uses DoRA instead of LoRA.
          Default: ``False``
    """
-
-    num_layers = len(model.layers)
-
-    if num_lora_layers < 0:
-        num_lora_layers = num_layers
-
-    if num_lora_layers > num_layers:
+    if num_layers > len(model.layers):
        raise ValueError(
-            f"Requested {num_lora_layers} LoRA layers "
-            f"but the model only has {num_layers} layers."
+            f"Requested {num_layers} LoRA layers "
+            f"but the model only has {len(model.layers)} layers."
        )

    def to_lora(layer):
@@ -71,12 +65,14 @@ def linear_to_lora_layers(
            if use_dora:
                raise ValueError(f"{type(layer).__name__} doesn't support DoRA yet.")
            LoRALayer = LoRASwitchLinear
+        elif isinstance(layer, (nn.Embedding, nn.QuantizedEmbedding)):
+            LoRALayer = DoRAEmbedding if use_dora else LoRAEmbedding
        else:
            raise ValueError(
                f"Can't convert layer of type {type(layer).__name__} to LoRA"
            )

-        return LoRALayer.from_linear(
+        return LoRALayer.from_base(
            layer,
            r=config["rank"],
            scale=config["scale"],
@@ -91,17 +87,20 @@ def linear_to_lora_layers(
        "llama",
        "phi",
        "mixtral",
+        "nemotron",
        "stablelm",
        "qwen2",
        "qwen2_moe",
+        "phimoe",
        "gemma",
        "gemma2",
        "starcoder2",
        "cohere",
        "minicpm",
+        "deepseek",
    ]:
        keys = set(["self_attn.q_proj", "self_attn.v_proj"])
-        if model.model_type == "mixtral":
+        if model.model_type in ["mixtral", "phimoe"]:
            keys.add("block_sparse_moe.gate")
        if model.model_type == "qwen2_moe":
            keys.add("mlp.gate")
@@ -111,6 +110,8 @@ def linear_to_lora_layers(
        keys = set(["attn.c_attn"])
    elif model.model_type == "gpt2":
        keys = set(["attn.c_attn"])
+    elif model.model_type == "gpt_neox":
+        keys = set(["attention.query_key_value"])
    elif model.model_type == "olmo":
        keys = set(["att_proj"])
    elif model.model_type == "openelm":
@@ -123,17 +124,41 @@ def linear_to_lora_layers(
        keys = set(["norm_attn_norm.attn.Wqkv", "ffn.router.layer"])
    elif model.model_type == "internlm2":
        keys = set(["attention.wqkv", "attention.wo"])
+    elif model.model_type == "deepseek_v2":
+        keys = set(
+            [
+                "self_attn.q_proj",
+                "self_attn.q_a_proj",
+                "self_attn.q_b_proj",
+                "self_attn.kv_a_proj_with_mqa",
+                "self_attn.kv_b_proj",
+            ]
+        )
+    elif model.model_type == "mamba":
+        keys = set(
+            [
+                "mixer.in_proj",
+                "mixer.x_proj",
+                "mixer.dt_proj",
+                "mixer.out_proj",
+            ]
+        )
    else:
        raise ValueError(f"Lora does not support {model.model_type}")

-    for l in model.layers[num_layers - num_lora_layers :]:
+    for l in model.layers[-min(num_layers, 0) :]:
        lora_layers = [(k, to_lora(m)) for k, m in l.named_modules() if k in keys]
-        l.update_modules(tree_unflatten(lora_layers))
+        if lora_layers:
+            l.update_modules(tree_unflatten(lora_layers))
+
+    lora_modules = [(k, to_lora(m)) for k, m in model.named_modules() if k in keys]
+    if lora_modules:
+        model.update_modules(tree_unflatten(lora_modules))


-def apply_lora_layers(model: nn.Module, adapter_path: str) -> nn.Module:
+def load_adapters(model: nn.Module, adapter_path: str) -> nn.Module:
    """
-    Apply LoRA layers to the model.
+    Load any fine-tuned adapters / layers.

    Args:
        model (nn.Module): The neural network model.
@@ -147,12 +172,14 @@ def apply_lora_layers(model: nn.Module, adapter_path: str) -> nn.Module:
        raise FileNotFoundError(f"The adapter path does not exist: {adapter_path}")
    with open(adapter_path / "adapter_config.json", "r") as fid:
        config = types.SimpleNamespace(**json.load(fid))
-    linear_to_lora_layers(
-        model,
-        config.lora_layers,
-        config.lora_parameters,
-        getattr(config, "use_dora", False),
-    )
+    fine_tune_type = getattr(config, "fine_tune_type", "lora")
+    if fine_tune_type != "full":
+        linear_to_lora_layers(
+            model,
+            config.num_layers,
+            config.lora_parameters,
+            use_dora=(fine_tune_type == "dora"),
+        )
    model.load_weights(str(adapter_path / "adapters.safetensors"), strict=False)
    return model

--- a/llms/mlx_lm/utils.py
+++ b/llms/mlx_lm/utils.py
@@ -1,5 +1,6 @@
 # Copyright © 2023-2024 Apple Inc.

+import contextlib
 import copy
 import glob
 import importlib
@@ -9,21 +10,20 @@ import shutil
 import time
 from pathlib import Path
 from textwrap import dedent
-from typing import Any, Callable, Dict, Generator, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Type, Union

 import mlx.core as mx
 import mlx.nn as nn
 from huggingface_hub import snapshot_download
-from huggingface_hub.utils._errors import RepositoryNotFoundError
-from mlx.utils import tree_flatten
+from mlx.utils import tree_flatten, tree_reduce
 from transformers import PreTrainedTokenizer

 # Local imports
-from .models.base import KVCache
-from .sample_utils import top_p_sampling
+from .models import cache
+from .sample_utils import categorical_sampling, min_p_sampling, top_p_sampling
 from .tokenizer_utils import TokenizerWrapper, load_tokenizer
-from .tuner.utils import apply_lora_layers
 from .tuner.utils import dequantize as dequantize_model
+from .tuner.utils import load_adapters

 # Constants
 MODEL_REMAPPING = {
@@ -40,6 +40,40 @@ class ModelNotFoundError(Exception):
        super().__init__(self.message)


+@contextlib.contextmanager
+def wired_limit(model: nn.Module, streams: Optional[List[mx.Stream]] = None):
+    """
+    A context manager to temporarily change the wired limit.
+
+    Note, the wired limit should not be changed during an async eval.  If an
+    async eval could be running pass in the streams to synchronize with prior
+    to exiting the context manager.
+    """
+    model_bytes = tree_reduce(
+        lambda acc, x: acc + x.nbytes if isinstance(x, mx.array) else acc, model, 0
+    )
+    max_rec_size = mx.metal.device_info()["max_recommended_working_set_size"]
+    if model_bytes > 0.9 * max_rec_size:
+        model_mb = model_bytes // 2**20
+        max_rec_mb = max_rec_size // 2**20
+        print(
+            f"[WARNING] Generating with a model that requires {model_mb} MB "
+            f"which is close to the maximum recommended size of {max_rec_mb} "
+            "MB. This can be slow. See the documentation for possible work-arounds: "
+            "https://github.com/ml-explore/mlx-examples/tree/main/llms#large-models"
+        )
+    old_limit = mx.metal.set_wired_limit(max_rec_size)
+    try:
+        yield None
+    finally:
+        if streams is not None:
+            for s in streams:
+                mx.synchronize(s)
+        else:
+            mx.synchronize()
+        mx.metal.set_wired_limit(old_limit)
+
+
 def _get_classes(config: dict):
    """
    Retrieve the model and model args classes based on the configuration.
@@ -91,7 +125,7 @@ def get_model_path(path_or_hf_repo: str, revision: Optional[str] = None) -> Path
                    ],
                )
            )
-        except RepositoryNotFoundError:
+        except Exception:
            raise ModelNotFoundError(
                f"Model not found for path or HF repo: {path_or_hf_repo}.\n"
                "Please make sure you specified the local path or Hugging Face"
@@ -102,7 +136,7 @@ def get_model_path(path_or_hf_repo: str, revision: Optional[str] = None) -> Path
    return model_path


-def apply_repetition_penalty(logits: mx.array, generated_tokens: Any, penalty: float):
+def apply_repetition_penalty(logits: mx.array, tokens: mx.array, penalty: float):
    """
    Apply repetition penalty to specific logits based on the given context.

@@ -110,22 +144,33 @@ def apply_repetition_penalty(logits: mx.array, generated_tokens: Any, penalty: f

    Args:
        logits (mx.array): The logits produced by the language model.
-        generated_tokens (any): A list of N previous tokens.
+        tokens (mx.array): A list of N previous tokens.
        penalty (float): The repetition penalty factor to be applied.

    Returns:
        logits (mx.array): Logits with repetition penalty applied to generated tokens.
    """
-    if len(generated_tokens) > 0:
-        indices = mx.array([token for token in generated_tokens])
-        selected_logits = logits[:, indices]
+    if len(tokens) > 0:
+        selected_logits = logits[:, tokens]
        selected_logits = mx.where(
            selected_logits < 0, selected_logits * penalty, selected_logits / penalty
        )
-        logits[:, indices] = selected_logits
+        logits[:, tokens] = selected_logits
    return logits


+def maybe_quantize_kv_cache(prompt_cache, quantized_kv_start, kv_group_size, kv_bits):
+    if (
+        kv_bits is not None
+        and not isinstance(prompt_cache[0], cache.QuantizedKVCache)
+        and prompt_cache[0].offset > quantized_kv_start
+    ):
+        for i in range(len(prompt_cache)):
+            prompt_cache[i] = prompt_cache[i].to_quantized(
+                group_size=kv_group_size, bits=kv_bits
+            )
+
+
 def generate_step(
    prompt: mx.array,
    model: nn.Module,
@@ -133,7 +178,16 @@ def generate_step(
    repetition_penalty: Optional[float] = None,
    repetition_context_size: Optional[int] = 20,
    top_p: float = 1.0,
+    min_p: float = 0.0,
+    min_tokens_to_keep: int = 1,
+    prefill_step_size: int = 512,
+    max_kv_size: Optional[int] = None,
+    prompt_cache: Optional[Any] = None,
    logit_bias: Optional[Dict[int, float]] = None,
+    logits_processor: Optional[List[Callable[[mx.array, mx.array], mx.array]]] = None,
+    kv_bits: Optional[int] = None,
+    kv_group_size: int = 64,
+    quantized_kv_start: int = 0,
 ) -> Generator[Tuple[mx.array, mx.array], None, None]:
    """
    A generator producing token ids based on the given prompt from the model.
@@ -149,7 +203,24 @@ def generate_step(
          consider for repetition penalty. Default: ``20``.
        top_p (float, optional): Nulceus sampling, higher means model considers
          more less likely words.
+        min_p (float, optional): The minimum value (scaled by the top token's
+          probability) that a token probability must have to be considered.
+        min_tokens_to_keep (int, optional): Minimum number of tokens that cannot
+          be filtered by min_p sampling.
+        prefill_step_size (int): Step size for processing the prompt.
+        max_kv_size (int, optional): Maximum size of the key-value cache. Old
+          entries (except the first 4 tokens) will be overwritten.
+        prompt_cache (List[Any], optional): A pre-computed prompt cache. Note, if
+          provided, the cache will be updated in place.
        logit_bias (dictionary, optional): Additive logit bias.
+        logits_processor (List[Callable[[mx.array, mx.array], mx.array]], optional):
+            A list of functions that take tokens and logits and return the processed
+            logits. Default: ``None``.
+        kv_bits (int, optional): Number of bits to use for KV cache quantization.
+            None implies no cache quantization. Default: ``None``.
+        kv_group_size (int): Group size for KV cache quantization. Default: ``64``.
+        quantized_kv_start (int): Step to begin using a quantized KV cache
+            when ``kv_bits`` is non-None. Default: ``0``.

    Yields:
        Generator[Tuple[mx.array, mx.array], None, None]: A generator producing
@@ -157,10 +228,6 @@ def generate_step(
    """

    def sample(logits: mx.array) -> Tuple[mx.array, float]:
-        if logit_bias:
-            indices = mx.array(list(logit_bias.keys()))
-            values = mx.array(list(logit_bias.values()))
-            logits[:, indices] += values
        logprobs = logits - mx.logsumexp(logits)

        if temp == 0:
@@ -168,8 +235,10 @@ def generate_step(
        else:
            if top_p > 0 and top_p < 1.0:
                token = top_p_sampling(logits, top_p, temp)
+            elif min_p != 0.0:
+                token = min_p_sampling(logits, min_p, min_tokens_to_keep, temp)
            else:
-                token = mx.random.categorical(logits * (1 / temp))
+                token = categorical_sampling(logits, temp)

        return token, logprobs

@@ -180,45 +249,75 @@ def generate_step(
            f"repetition_penalty must be a non-negative float, got {repetition_penalty}"
        )

+    logits_processor = list(logits_processor or [])
+
+    if repetition_penalty:
+
+        def repetition_penalty_processor(tokens, logits):
+            return apply_repetition_penalty(
+                logits, tokens[-repetition_context_size:], repetition_penalty
+            )
+
+        logits_processor.append(repetition_penalty_processor)
+
+    if logit_bias:
+        indices = mx.array(list(logit_bias.keys()))
+        values = mx.array(list(logit_bias.values()))
+
+        def logit_bias_processor(_, logits):
+            logits[:, indices] += values
+            return logits
+
+        logits_processor.append(logit_bias_processor)
+
    y = prompt
-    kv_heads = (
-        [model.n_kv_heads] * len(model.layers)
-        if isinstance(model.n_kv_heads, int)
-        else model.n_kv_heads
-    )
-    cache = [KVCache(model.head_dim, n) for n in kv_heads]
+    tokens = None

-    repetition_context = prompt.tolist()
-
-    if repetition_context_size:
-        repetition_context = repetition_context[-repetition_context_size:]
+    # Create the KV cache for generation
+    if prompt_cache is None:
+        prompt_cache = cache.make_prompt_cache(
+            model,
+            max_kv_size=max_kv_size,
+        )
+    elif len(prompt_cache) != len(model.layers):
+        raise ValueError("Wrong number of layers in the prompt cache.")

    def _step(y):
-        nonlocal repetition_context
-        logits = model(y[None], cache=cache)
+
+        logits = model(y[None], cache=prompt_cache)
        logits = logits[:, -1, :]

-        if repetition_penalty:
-            logits = apply_repetition_penalty(
-                logits, repetition_context, repetition_penalty
-            )
-            y, logprobs = sample(logits)
-            repetition_context.append(y.item())
-        else:
-            y, logprobs = sample(logits)
+        if logits_processor:
+            nonlocal tokens
+            tokens = mx.concat([tokens, y]) if tokens is not None else y

-        if repetition_context_size:
-            if len(repetition_context) > repetition_context_size:
-                repetition_context = repetition_context[-repetition_context_size:]
+            for processor in logits_processor:
+                logits = processor(tokens, logits)
+
+        maybe_quantize_kv_cache(
+            prompt_cache, quantized_kv_start, kv_group_size, kv_bits
+        )
+
+        y, logprobs = sample(logits)
        return y, logprobs.squeeze(0)

+    while y.size > prefill_step_size:
+        model(y[:prefill_step_size][None], cache=prompt_cache)
+        mx.eval([c.state for c in prompt_cache])
+        y = y[prefill_step_size:]
+        mx.metal.clear_cache()
+
    y, logprobs = _step(y)

-    mx.async_eval(y)
+    mx.async_eval(y, logprobs)
+    n = 0
    while True:
        next_y, next_logprobs = _step(y)
-        mx.async_eval(next_y)
+        mx.async_eval(next_y, next_logprobs)
        yield y.item(), logprobs
+        if n % 256 == 0:
+            mx.metal.clear_cache()
+        n += 1
        y, logprobs = next_y, next_logprobs


@@ -249,9 +348,9 @@ def stream_generate(
    detokenizer = tokenizer.detokenizer

    detokenizer.reset()
-    for (token, _), n in zip(
-        generate_step(prompt_tokens, model, **kwargs),
+    for n, (token, _) in zip(
        range(max_tokens),
+        generate_step(prompt_tokens, model, **kwargs),
    ):
        if token == tokenizer.eos_token_id:
            break
@@ -298,44 +397,50 @@ def generate(
    prompt_tokens = mx.array(tokenizer.encode(prompt))
    detokenizer = tokenizer.detokenizer

-    tic = time.perf_counter()
-    detokenizer.reset()
+    with wired_limit(model):
+        tic = time.perf_counter()
+        detokenizer.reset()
+        for n, (token, logprobs) in zip(
+            range(max_tokens),
+            generate_step(prompt_tokens, model, **kwargs),
+        ):
+            if n == 0:
+                prompt_time = time.perf_counter() - tic
+                tic = time.perf_counter()
+            if token == tokenizer.eos_token_id:
+                break
+            detokenizer.add_token(token)

-    for (token, logprobs), n in zip(
-        generate_step(prompt_tokens, model, **kwargs),
-        range(max_tokens),
-    ):
-        if n == 0:
-            prompt_time = time.perf_counter() - tic
-            tic = time.perf_counter()
-        if token == tokenizer.eos_token_id:
-            break
-        detokenizer.add_token(token)
+            if verbose:
+                if formatter:
+                    # We have to finalize so that the prob corresponds to the last segment
+                    detokenizer.finalize()
+                    with mx.stream(mx.cpu):
+                        prob = mx.exp(logprobs[token]).item()
+                    formatter(detokenizer.last_segment, prob)
+                else:
+                    print(detokenizer.last_segment, end="", flush=True)
+
+        token_count = n + 1
+        detokenizer.finalize()

        if verbose:
-            if formatter:
-                # We have to finalize so that the prob corresponds to the last segment
-                detokenizer.finalize()
-                formatter(detokenizer.last_segment, mx.exp(logprobs[token]).item())
-            else:
-                print(detokenizer.last_segment, end="", flush=True)
+            gen_time = time.perf_counter() - tic
+            print(detokenizer.last_segment, flush=True)
+            print("=" * 10)
+            if token_count == 0:
+                print("No tokens generated for this prompt")
+                return
+            prompt_tps = prompt_tokens.size / prompt_time
+            gen_tps = (token_count - 1) / gen_time
+            print(
+                f"Prompt: {prompt_tokens.size} tokens, {prompt_tps:.3f} tokens-per-sec"
+            )
+            print(f"Generation: {token_count} tokens, {gen_tps:.3f} tokens-per-sec")
+            peak_mem = mx.metal.get_peak_memory() / 2**30
+            print(f"Peak memory: {peak_mem:.3f} GB")

-    token_count = n + 1
-    detokenizer.finalize()
-
-    if verbose:
-        gen_time = time.perf_counter() - tic
-        print(detokenizer.last_segment, flush=True)
-        print("=" * 10)
-        if token_count == 0:
-            print("No tokens generated for this prompt")
-            return
-        prompt_tps = prompt_tokens.size / prompt_time
-        gen_tps = (token_count - 1) / gen_time
-        print(f"Prompt: {prompt_tps:.3f} tokens-per-sec")
-        print(f"Generation: {gen_tps:.3f} tokens-per-sec")
-
-    return detokenizer.text
+        return detokenizer.text


 def load_config(model_path: Path) -> dict:
@@ -352,6 +457,7 @@ def load_model(
    model_path: Path,
    lazy: bool = False,
    model_config: dict = {},
+    get_model_classes: Callable[[dict], Tuple[Type[nn.Module], Type]] = _get_classes,
 ) -> nn.Module:
    """
    Load and initialize the model from a given path.
@@ -361,8 +467,11 @@ def load_model(
        lazy (bool): If False eval the model parameters to make sure they are
            loaded in memory before returning, otherwise they will be loaded
            when needed. Default: ``False``
-        model_config(dict, optional): Configuration parameters for the model.
+        model_config (dict, optional): Configuration parameters for the model.
            Defaults to an empty dictionary.
+        get_model_classes (Callable[[dict], Tuple[Type[nn.Module], Type]], optional):
+            A function that returns the model class and model args class given a config.
+            Defaults to the _get_classes function.

    Returns:
        nn.Module: The loaded and initialized model.
@@ -389,7 +498,7 @@ def load_model(
    for wf in weight_files:
        weights.update(mx.load(wf))

-    model_class, model_args_class = _get_classes(config=config)
+    model_class, model_args_class = get_model_classes(config=config)

    model_args = model_args_class.from_dict(config)
    model = model_class(model_args)
@@ -451,7 +560,7 @@ def load(

    model = load_model(model_path, lazy, model_config)
    if adapter_path is not None:
-        model = apply_lora_layers(model, adapter_path)
+        model = load_adapters(model, adapter_path)
        model.eval()
    tokenizer = load_tokenizer(model_path, tokenizer_config)

@@ -508,6 +617,7 @@ def upload_to_hub(path: str, upload_repo: str, hf_path: str):

    card = ModelCard.load(hf_path)
    card.data.tags = ["mlx"] if card.data.tags is None else card.data.tags + ["mlx"]
+    card.data.base_model = hf_path
    card.text = dedent(
        f"""
        # {upload_repo}
@@ -524,7 +634,16 @@ def upload_to_hub(path: str, upload_repo: str, hf_path: str):
        from mlx_lm import load, generate

        model, tokenizer = load("{upload_repo}")
-        response = generate(model, tokenizer, prompt="hello", verbose=True)
+
+        prompt="hello"
+
+        if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
+            messages = [{{"role": "user", "content": prompt}}]
+            prompt = tokenizer.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+
+        response = generate(model, tokenizer, prompt=prompt, verbose=True)
        ```
        """
    )
@@ -614,6 +733,8 @@ def quantize_model(
    quantized_config = copy.deepcopy(config)
    nn.quantize(model, q_group_size, q_bits)
    quantized_config["quantization"] = {"group_size": q_group_size, "bits": q_bits}
+    # support hf model tree #957
+    quantized_config["quantization_config"] = quantized_config["quantization"]
    quantized_weights = dict(tree_flatten(model.parameters()))

    return quantized_weights, quantized_config
@@ -653,12 +774,22 @@ def convert(
    revision: Optional[str] = None,
    dequantize: bool = False,
 ):
+    # Check the save path is empty
+    if isinstance(mlx_path, str):
+        mlx_path = Path(mlx_path)
+
+    if mlx_path.exists():
+        raise ValueError(
+            f"Cannot save to the path {mlx_path} as it already exists."
+            " Please delete the file/directory or specify a new path to save to."
+        )
+
    print("[INFO] Loading")
    model_path = get_model_path(hf_path, revision=revision)
    model, config, tokenizer = fetch_from_hub(model_path, lazy=True)

    weights = dict(tree_flatten(model.parameters()))
-    dtype = mx.float16 if quantize else getattr(mx, dtype)
+    dtype = getattr(mx, dtype)
    weights = {k: v.astype(dtype) for k, v in weights.items()}

    if quantize and dequantize:
@@ -674,9 +805,6 @@ def convert(
        model = dequantize_model(model)
        weights = dict(tree_flatten(model.parameters()))

-    if isinstance(mlx_path, str):
-        mlx_path = Path(mlx_path)
-
    del model
    save_weights(mlx_path, weights, donate_weights=True)

--- a/llms/setup.py
+++ b/llms/setup.py
@@ -10,7 +10,7 @@ with open(package_dir / "requirements.txt") as fid:
    requirements = [l.strip() for l in fid.readlines()]

 sys.path.append(str(package_dir))
-from version import __version__
+from _version import __version__

 setup(
    name="mlx-lm",
@@ -31,6 +31,8 @@ setup(
    },
    entry_points={
        "console_scripts": [
+            "mlx_lm.cache_prompt = mlx_lm.cache_prompt:main",
+            "mlx_lm.chat = mlx_lm.chat:main",
            "mlx_lm.convert = mlx_lm.convert:main",
            "mlx_lm.fuse = mlx_lm.fuse:main",
            "mlx_lm.generate = mlx_lm.generate:main",
--- a/llms/tests/test_finetune.py
+++ b/llms/tests/test_finetune.py
@@ -0,0 +1,447 @@
+# Copyright © 2024 Apple Inc.
+
+import math
+import sys
+import unittest
+from contextlib import contextmanager
+from io import StringIO
+from unittest.mock import MagicMock
+
+import mlx.core as mx
+import mlx.nn as nn
+import mlx.optimizers as opt
+from mlx.utils import tree_flatten
+from mlx_lm import lora, tuner
+from mlx_lm.tuner.dora import DoRAEmbedding, DoRALinear
+from mlx_lm.tuner.lora import LoRAEmbedding, LoRALinear
+from mlx_lm.tuner.trainer import evaluate
+from mlx_lm.tuner.utils import build_schedule
+
+
+@contextmanager
+def swapped_with_identity(obj, func):
+    old_func = getattr(obj, func)
+    setattr(obj, func, lambda x: x)
+    yield
+    setattr(obj, func, old_func)
+
+
+class TestLora(unittest.TestCase):
+    def setUp(self):
+        self.capturedOutput = StringIO()
+        sys.stdout = self.capturedOutput
+
+    def tearDown(self):
+        sys.stdout = sys.__stdout__
+
+    def test_llama(self):
+        from mlx_lm.models import llama
+
+        args = llama.ModelArgs(
+            model_type="llama",
+            hidden_size=1024,
+            num_hidden_layers=4,
+            intermediate_size=2048,
+            num_attention_heads=4,
+            rms_norm_eps=1e-5,
+            vocab_size=10_000,
+            tie_word_embeddings=False,
+        )
+
+        lora_layers = 4
+
+        def check_config(params, expected_trainable_parameters=None):
+            n_keys = 2
+            if "keys" in params:
+                n_keys = len(params["keys"])
+            model = llama.Model(args)
+            model.freeze()
+            tuner.utils.linear_to_lora_layers(model, lora_layers, params)
+            trainable_params = sum(
+                v.size for _, v in tree_flatten(model.trainable_parameters())
+            )
+
+            expected_trainable_parameters = expected_trainable_parameters or (
+                lora_layers * params["rank"] * args.hidden_size * 2 * n_keys
+            )
+            self.assertEqual(trainable_params, expected_trainable_parameters)
+
+        params = {"rank": 8, "alpha": 16, "dropout": 0.0, "scale": 10.0}
+        check_config(params)
+
+        params["rank"] = 1
+        check_config(params)
+
+        params["keys"] = ["self_attn.k_proj"]
+        check_config(params)
+
+        params["keys"] = ["lm_head"]
+        check_config(
+            params,
+            expected_trainable_parameters=(
+                params["rank"] * (args.hidden_size + args.vocab_size)
+            ),
+        )
+
+        params["keys"] = ["model.embed_tokens"]
+        check_config(
+            params,
+            expected_trainable_parameters=(
+                params["rank"] * (args.hidden_size + args.vocab_size)
+            ),
+        )
+
+    def test_gpt_neox(self):
+        from mlx_lm.models import gpt_neox
+
+        args = gpt_neox.ModelArgs(
+            model_type="gpt_neox",
+            max_position_embeddings=2048,
+            hidden_size=6144,
+            num_attention_heads=64,
+            num_hidden_layers=44,
+            layer_norm_eps=1e-5,
+            vocab_size=50432,
+            rotary_emb_base=10_000,
+            rotary_pct=0.25,
+        )
+
+        num_lora_layers = 4
+        params = {"rank": 8, "alpha": 16, "dropout": 0.0, "scale": 10.0}
+
+        model = gpt_neox.Model(args)
+        model.freeze()
+        tuner.utils.linear_to_lora_layers(model, num_lora_layers, params)
+
+    def test_lora_embedding(self):
+        num_embeddings = 256
+        dims = 512
+        tokens = mx.array([1, 2, 3])
+
+        embedding = nn.QuantizedEmbedding(num_embeddings, dims)
+        dequantized_weight = mx.dequantize(
+            embedding.weight,
+            embedding.scales,
+            embedding.biases,
+            embedding.group_size,
+            embedding.bits,
+        )
+        lora_emb = LoRAEmbedding.from_base(embedding, r=8, dropout=0, scale=10)
+        new_embedding = lora_emb.fuse(de_quantize=True)
+        self.assertTrue(mx.array_equal(dequantized_weight, new_embedding.weight))
+        self.assertTrue(mx.array_equal(embedding(tokens), lora_emb(tokens)))
+
+        # as_linear
+        attn_output = mx.random.uniform(shape=(dims,))
+        embedding_lin_out = lora_emb.as_linear(attn_output)
+        self.assertEqual(embedding_lin_out.shape, (num_embeddings,))
+        self.assertTrue(
+            mx.array_equal(embedding_lin_out, embedding.as_linear(attn_output))
+        )
+
+        # change the value of lora_b and the embeddings will no longer be equal
+        lora_emb.lora_b = mx.random.uniform(shape=lora_emb.lora_b.shape)
+        new_embedding = lora_emb.fuse(de_quantize=True)
+        self.assertFalse(mx.array_equal(dequantized_weight, new_embedding.weight))
+        self.assertFalse(mx.array_equal(embedding(tokens), lora_emb(tokens)))
+
+
+class TestDora(unittest.TestCase):
+    def test_dora_embedding(self):
+        num_embeddings = 256
+        dims = 512
+        tokens = mx.array([1, 2, 3])
+
+        embedding = nn.Embedding(num_embeddings, dims)
+
+        dora_emb = DoRAEmbedding.from_base(embedding, r=8, dropout=0, scale=10)
+        new_embedding = dora_emb.fuse()
+        self.assertTrue(mx.array_equal(embedding.weight, new_embedding.weight))
+        self.assertTrue(mx.array_equal(embedding(tokens), dora_emb(tokens)))
+
+        # as_linear
+        attn_output = mx.random.uniform(shape=(dims,))
+        embedding_lin_out = dora_emb.as_linear(attn_output)
+        self.assertEqual(embedding_lin_out.shape, (num_embeddings,))
+        self.assertTrue(
+            mx.array_equal(embedding_lin_out, embedding.as_linear(attn_output))
+        )
+
+        # change the value of lora_b and the embeddings will no longer be equal
+        dora_emb.lora_b = mx.random.uniform(shape=dora_emb.lora_b.shape)
+        new_embedding = dora_emb.fuse()
+        self.assertFalse(mx.array_equal(embedding.weight, new_embedding.weight))
+        self.assertFalse(mx.array_equal(embedding(tokens), dora_emb(tokens)))
+
+    def test_llama(self):
+        from mlx_lm.models import llama
+
+        hidden_size = 1024
+        intermediate_size = 2048
+        args = llama.ModelArgs(
+            model_type="llama",
+            hidden_size=hidden_size,
+            num_hidden_layers=4,
+            intermediate_size=intermediate_size,
+            num_attention_heads=4,
+            rms_norm_eps=1e-5,
+            vocab_size=10_000,
+        )
+
+        dora_layers = 4
+
+        def check_config(params):
+            n_keys = 2
+            if "keys" in params:
+                n_keys = len(params["keys"])
+            model = llama.Model(args)
+            model.freeze()
+            tuner.utils.linear_to_lora_layers(model, dora_layers, params, use_dora=True)
+            trainable_params = sum(
+                v.size for _, v in tree_flatten(model.trainable_parameters())
+            )
+            self.assertEqual(
+                trainable_params,
+                dora_layers
+                * (params["rank"] * hidden_size * 2 * n_keys + n_keys * hidden_size),
+            )
+
+        params = {"rank": 8, "alpha": 16, "dropout": 0.0, "scale": 10.0}
+        check_config(params)
+
+        params["rank"] = 1
+        check_config(params)
+
+        params["keys"] = ["self_attn.k_proj"]
+        check_config(params)
+
+    def test_dora_m_parameter(self):
+        dora_lin = DoRALinear(input_dims=100, output_dims=100)
+        self.assertTrue(
+            mx.allclose(dora_lin.m, mx.linalg.norm(dora_lin.linear.weight, axis=1))
+        )
+
+        # Recomputes m when changing Linear
+        inital_m = dora_lin.m
+        lin = nn.Linear(10, 10)
+        dora_lin.set_linear(lin)
+        self.assertTrue(mx.allclose(dora_lin.m, mx.linalg.norm(lin.weight, axis=1)))
+
+        # Works with quantized weights
+        quantized_linear = nn.QuantizedLinear(512, 512)
+        dora_lin.set_linear(quantized_linear)
+        dequantized_weight = mx.dequantize(
+            quantized_linear.weight,
+            quantized_linear.scales,
+            quantized_linear.biases,
+            quantized_linear.group_size,
+            quantized_linear.bits,
+        )
+        self.assertTrue(
+            mx.allclose(dora_lin.m, mx.linalg.norm(dequantized_weight, axis=1))
+        )
+
+    def test_dora_from_linear(self):
+        in_dims = 256
+        out_dims = 256
+        r = 4
+
+        linear = nn.Linear(in_dims, out_dims)
+        dora_lin = DoRALinear.from_base(linear, r)
+        self.assertTrue(mx.allclose(dora_lin.m, mx.linalg.norm(linear.weight, axis=1)))
+        self.assertEqual(dora_lin.lora_a.shape, (in_dims, r))
+        self.assertEqual(dora_lin.lora_b.shape, (r, out_dims))
+        self.assertEqual(dora_lin.m.shape, (out_dims,))
+
+        quantized_linear = nn.QuantizedLinear(in_dims, out_dims)
+        dequantized_weight = mx.dequantize(
+            quantized_linear.weight,
+            quantized_linear.scales,
+            quantized_linear.biases,
+            quantized_linear.group_size,
+            quantized_linear.bits,
+        )
+        dora_quant_lin = DoRALinear.from_base(quantized_linear, r)
+        self.assertTrue(
+            mx.allclose(dora_quant_lin.m, mx.linalg.norm(dequantized_weight, axis=1))
+        )
+        self.assertEqual(dora_quant_lin.lora_a.shape, (in_dims, r))
+        self.assertEqual(dora_quant_lin.lora_b.shape, (r, out_dims))
+        self.assertEqual(dora_quant_lin.m.shape, (out_dims,))
+
+    def test_dora_to_linear(self):
+        in_dims = 256
+        out_dims = 256
+        r = 4
+
+        linear = nn.Linear(in_dims, out_dims, bias=True)
+        dora_lin = DoRALinear.from_base(linear, r)
+        to_linear = dora_lin.fuse()
+        self.assertTrue(mx.allclose(linear.weight, to_linear.weight))
+        self.assertTrue(mx.allclose(linear.bias, to_linear.bias))
+
+        def dequantize_weight(quantized_linear):
+            return mx.dequantize(
+                quantized_linear.weight,
+                quantized_linear.scales,
+                quantized_linear.biases,
+                quantized_linear.group_size,
+                quantized_linear.bits,
+            )
+
+        quantized_linear = nn.QuantizedLinear(in_dims, out_dims, bias=True)
+        dora_quantized_linear = DoRALinear.from_base(quantized_linear, r)
+        # Dequantize
+        to_linear_from_quantized = dora_quantized_linear.fuse(de_quantize=True)
+        self.assertTrue(
+            mx.allclose(quantized_linear.bias, to_linear_from_quantized.bias)
+        )
+        self.assertTrue(
+            mx.allclose(
+                dequantize_weight(quantized_linear), to_linear_from_quantized.weight
+            )
+        )
+
+    def test_dora_dtype(self):
+        in_dims = 256
+        out_dims = 256
+        r = 4
+
+        linear = nn.Linear(in_dims, out_dims, bias=True)
+        linear.set_dtype(mx.float16)
+        dora_lin = DoRALinear.from_base(linear, r)
+
+        x = mx.random.uniform(shape=(2, 256)).astype(mx.float16)
+        self.assertEqual(dora_lin(x).dtype, mx.float16)
+
+
+class TestScheduleConfig(unittest.TestCase):
+    def test_join(self):
+        config = {"name": "cosine_decay", "warmup": 100, "arguments": [1e-5, 100]}
+        cos_with_warmup = build_schedule(config)
+        self.assertIsNotNone(cos_with_warmup)
+
+        self.assertEqual(cos_with_warmup(0), 0.0)
+        self.assertAlmostEqual(cos_with_warmup(101), 1e-5, delta=1e-1)
+        optimizer = opt.Adam(learning_rate=cos_with_warmup)
+        for _ in range(100):
+            optimizer.update({}, {})
+        self.assertAlmostEqual(optimizer.learning_rate.item(), 1e-5, delta=1e-1)
+        for _ in range(100):
+            optimizer.update({}, {})
+        expected_lr = 1e-5 * 0.5 * (1.0 + math.cos(math.pi * 200 / 10))
+        self.assertAlmostEqual(optimizer.learning_rate.item(), expected_lr, delta=1e-1)
+
+    def test_single_schedule(self):
+
+        config = {
+            "name": "cosine_decay",
+            "arguments": [0.1, 10],
+        }
+        lr_schedule = build_schedule(config)
+        lr = lr_schedule(4)
+        expected_lr = 0.1 * 0.5 * (1.0 + math.cos(math.pi * 4 / 10))
+        self.assertAlmostEqual(lr, expected_lr, delta=1e-7)
+
+    def test_non_zero_warmup(self):
+        config = {
+            "name": "cosine_decay",
+            "warmup": 10,
+            "warmup_init": 1e-6,
+            "arguments": [1e-5, 20],
+        }
+        lr_schedule = build_schedule(config)
+        lr = lr_schedule(0)
+        self.assertAlmostEqual(lr, 1e-6, delta=1e-7)
+
+    def test_malformed_config(self):
+        config = {"warmup": 100}
+        self.assertRaises(KeyError, build_schedule, config)
+
+        config = {"cosine_decay": None}
+        self.assertRaises(KeyError, build_schedule, config)
+
+    def test_evaluate_calls(self):
+        mock_model = MagicMock()
+        mock_dataset = MagicMock()
+        mock_tokenizer = MagicMock()
+        mock_default_loss = MagicMock()
+        mock_iterate_batches = MagicMock()
+
+        mock_iterate_batches.return_value = [
+            (MagicMock(), MagicMock()),
+            (MagicMock(), MagicMock()),
+            (MagicMock(), MagicMock()),
+            (MagicMock(), MagicMock()),
+            (MagicMock(), MagicMock()),
+        ]
+
+        mock_default_loss.side_effect = [
+            (MagicMock(return_value=0.5), MagicMock(return_value=100)),
+            (MagicMock(return_value=0.3), MagicMock(return_value=200)),
+            (MagicMock(return_value=0.2), MagicMock(return_value=150)),
+            (MagicMock(return_value=0.4), MagicMock(return_value=180)),
+            (MagicMock(return_value=0.6), MagicMock(return_value=120)),
+        ]
+        with swapped_with_identity(mx.distributed, "all_sum"):
+            evaluate(
+                model=mock_model,
+                dataset=mock_dataset,
+                tokenizer=mock_tokenizer,
+                batch_size=2,
+                num_batches=2,
+                max_seq_length=2048,
+                loss=mock_default_loss,
+                iterate_batches=mock_iterate_batches,
+            )
+
+        mock_iterate_batches.assert_called_once_with(
+            dataset=mock_dataset,
+            tokenizer=mock_tokenizer,
+            batch_size=2,
+            max_seq_length=2048,
+        )
+        self.assertEqual(mock_default_loss.call_count, 2)
+
+    def test_evaluate_infinite_batches(self):
+        mock_model = MagicMock()
+        mock_dataset = MagicMock()
+        mock_tokenizer = MagicMock()
+        mock_default_loss = MagicMock()
+        mock_iterate_batches = MagicMock()
+
+        mock_iterate_batches.return_value = [
+            (MagicMock(), MagicMock()),
+            (MagicMock(), MagicMock()),
+            (MagicMock(), MagicMock()),
+        ]
+
+        mock_default_loss.side_effect = [
+            (MagicMock(return_value=0.5), MagicMock(return_value=100)),
+            (MagicMock(return_value=0.3), MagicMock(return_value=200)),
+            (MagicMock(return_value=0.2), MagicMock(return_value=150)),
+        ]
+
+        with swapped_with_identity(mx.distributed, "all_sum"):
+            evaluate(
+                model=mock_model,
+                dataset=mock_dataset,
+                tokenizer=mock_tokenizer,
+                batch_size=2,
+                num_batches=-1,
+                max_seq_length=2048,
+                loss=mock_default_loss,
+                iterate_batches=mock_iterate_batches,
+            )
+
+        mock_iterate_batches.assert_called_once_with(
+            dataset=mock_dataset,
+            tokenizer=mock_tokenizer,
+            batch_size=2,
+            max_seq_length=2048,
+        )
+        self.assertEqual(mock_default_loss.call_count, 3)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/llms/tests/test_generate.py
+++ b/llms/tests/test_generate.py
@@ -0,0 +1,55 @@
+# Copyright © 2024 Apple Inc.
+
+import unittest
+
+from mlx_lm.utils import generate, load
+
+
+class TestGenerate(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        HF_MODEL_PATH = "mlx-community/Qwen1.5-0.5B-Chat-4bit"
+        cls.model, cls.tokenizer = load(HF_MODEL_PATH)
+
+    def test_generate(self):
+        """Smoke test: generation runs end to end and returns a string."""
+        args = (self.model, self.tokenizer, "hello")
+        text = generate(*args, max_tokens=5, verbose=False)
+        self.assertIsInstance(text, str)
+
+    def test_generate_with_logit_bias(self):
+        """With token 0 biased by +2000, five tokens must decode to '!!!!!'."""
+        output = generate(
+            self.model,
+            self.tokenizer,
+            "hello",
+            max_tokens=5,
+            verbose=False,
+            logit_bias={0: 2000.0, 1: -20.0},
+        )
+        self.assertEqual(output, "!!!!!")
+
+    def test_generate_with_processor(self):
+        """Check the token history handed to a custom logits processor."""
+        init_toks = self.tokenizer.encode("hello")
+        all_toks = None
+
+        def logits_processor(toks, logits):
+            nonlocal all_toks
+            all_toks = toks
+            return logits
+
+        generate(
+            self.model,
+            self.tokenizer,
+            "hello",
+            max_tokens=5,
+            verbose=False,
+            logits_processor=[logits_processor],
+        )
+        self.assertEqual(len(all_toks), len(init_toks) + 5)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/llms/tests/test_lora.py
+++ b/llms/tests/test_lora.py
@@ -1,191 +0,0 @@
-# Copyright © 2024 Apple Inc.
-
-import math
-import sys
-import unittest
-from io import StringIO
-from unittest.mock import MagicMock
-
-import mlx.optimizers as opt
-from mlx.utils import tree_flatten
-from mlx_lm import lora, tuner
-from mlx_lm.tuner.lora import LoRALinear
-from mlx_lm.tuner.trainer import evaluate
-from mlx_lm.tuner.utils import build_schedule
-
-
-class TestLora(unittest.TestCase):
-    def setUp(self):
-        self.capturedOutput = StringIO()
-        sys.stdout = self.capturedOutput
-
-    def tearDown(self):
-        sys.stdout = sys.__stdout__
-
-    def test_to_lora(self):
-        from mlx_lm.models import llama
-
-        args = llama.ModelArgs(
-            model_type="llama",
-            hidden_size=1024,
-            num_hidden_layers=4,
-            intermediate_size=2048,
-            num_attention_heads=4,
-            rms_norm_eps=1e-5,
-            vocab_size=10_000,
-        )
-
-        lora_layers = 4
-
-        def check_config(params):
-            n_keys = 2
-            if "keys" in params:
-                n_keys = len(params["keys"])
-            model = llama.Model(args)
-            model.freeze()
-            tuner.utils.linear_to_lora_layers(model, lora_layers, params)
-            trainable_params = sum(
-                v.size for _, v in tree_flatten(model.trainable_parameters())
-            )
-            self.assertEqual(
-                trainable_params, lora_layers * params["rank"] * 1024 * 2 * n_keys
-            )
-
-        params = {"rank": 8, "alpha": 16, "dropout": 0.0, "scale": 10.0}
-        check_config(params)
-
-        params["rank"] = 1
-        check_config(params)
-
-        params["keys"] = ["self_attn.k_proj"]
-        check_config(params)
-
-
-class TestScheduleConfig(unittest.TestCase):
-    def test_join(self):
-        config = {"name": "cosine_decay", "warmup": 100, "arguments": [1e-5, 100]}
-        cos_with_warmup = build_schedule(config)
-        self.assertIsNotNone(cos_with_warmup)
-
-        self.assertEqual(cos_with_warmup(0), 0.0)
-        self.assertAlmostEqual(cos_with_warmup(101), 1e-5, delta=1e-1)
-        optimizer = opt.Adam(learning_rate=cos_with_warmup)
-        for _ in range(100):
-            optimizer.update({}, {})
-        self.assertAlmostEqual(optimizer.learning_rate.item(), 1e-5, delta=1e-1)
-        for _ in range(100):
-            optimizer.update({}, {})
-        expected_lr = 1e-5 * 0.5 * (1.0 + math.cos(math.pi * 200 / 10))
-        self.assertAlmostEqual(optimizer.learning_rate.item(), expected_lr, delta=1e-1)
-
-    def test_single_schedule(self):
-
-        config = {
-            "name": "cosine_decay",
-            "arguments": [0.1, 10],
-        }
-        lr_schedule = build_schedule(config)
-        lr = lr_schedule(4)
-        expected_lr = 0.1 * 0.5 * (1.0 + math.cos(math.pi * 4 / 10))
-        self.assertAlmostEqual(lr, expected_lr, delta=1e-7)
-
-    def test_non_zero_warmup(self):
-        config = {
-            "name": "cosine_decay",
-            "warmup": 10,
-            "warmup_init": 1e-6,
-            "arguments": [1e-5, 20],
-        }
-        lr_schedule = build_schedule(config)
-        lr = lr_schedule(0)
-        self.assertAlmostEqual(lr, 1e-6, delta=1e-7)
-
-    def test_malformed_config(self):
-        config = {"warmup": 100}
-        self.assertRaises(KeyError, build_schedule, config)
-
-        config = {"cosine_decay": None}
-        self.assertRaises(KeyError, build_schedule, config)
-
-    def test_evaluate_calls(self):
-        mock_model = MagicMock()
-        mock_dataset = MagicMock()
-        mock_tokenizer = MagicMock()
-        mock_default_loss = MagicMock()
-        mock_iterate_batches = MagicMock()
-
-        mock_iterate_batches.return_value = [
-            (MagicMock(), MagicMock()),
-            (MagicMock(), MagicMock()),
-            (MagicMock(), MagicMock()),
-            (MagicMock(), MagicMock()),
-            (MagicMock(), MagicMock()),
-        ]
-
-        mock_default_loss.side_effect = [
-            (MagicMock(return_value=0.5), MagicMock(return_value=100)),
-            (MagicMock(return_value=0.3), MagicMock(return_value=200)),
-            (MagicMock(return_value=0.2), MagicMock(return_value=150)),
-            (MagicMock(return_value=0.4), MagicMock(return_value=180)),
-            (MagicMock(return_value=0.6), MagicMock(return_value=120)),
-        ]
-        evaluate(
-            model=mock_model,
-            dataset=mock_dataset,
-            tokenizer=mock_tokenizer,
-            batch_size=2,
-            num_batches=2,
-            max_seq_length=2048,
-            loss=mock_default_loss,
-            iterate_batches=mock_iterate_batches,
-        )
-
-        mock_iterate_batches.assert_called_once_with(
-            dataset=mock_dataset,
-            tokenizer=mock_tokenizer,
-            batch_size=2,
-            max_seq_length=2048,
-        )
-        self.assertEqual(mock_default_loss.call_count, 2)
-
-    def test_evaluate_infinite_batches(self):
-        mock_model = MagicMock()
-        mock_dataset = MagicMock()
-        mock_tokenizer = MagicMock()
-        mock_default_loss = MagicMock()
-        mock_iterate_batches = MagicMock()
-
-        mock_iterate_batches.return_value = [
-            (MagicMock(), MagicMock()),
-            (MagicMock(), MagicMock()),
-            (MagicMock(), MagicMock()),
-        ]
-
-        mock_default_loss.side_effect = [
-            (MagicMock(return_value=0.5), MagicMock(return_value=100)),
-            (MagicMock(return_value=0.3), MagicMock(return_value=200)),
-            (MagicMock(return_value=0.2), MagicMock(return_value=150)),
-        ]
-
-        evaluate(
-            model=mock_model,
-            dataset=mock_dataset,
-            tokenizer=mock_tokenizer,
-            batch_size=2,
-            num_batches=-1,
-            max_seq_length=2048,
-            loss=mock_default_loss,
-            iterate_batches=mock_iterate_batches,
-        )
-
-        mock_iterate_batches.assert_called_once_with(
-            dataset=mock_dataset,
-            tokenizer=mock_tokenizer,
-            batch_size=2,
-            max_seq_length=2048,
-        )
-        self.assertEqual(mock_default_loss.call_count, 3)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/llms/tests/test_models.py
+++ b/llms/tests/test_models.py
@@ -1,16 +1,15 @@
 # Copyright © 2024 Apple Inc.
-
 import unittest

 import mlx.core as mx
 from mlx.utils import tree_map
-from mlx_lm.models.base import KVCache
+from mlx_lm.models.cache import KVCache, RotatingKVCache, make_prompt_cache


 class TestModels(unittest.TestCase):

    def test_kv_cache(self):
-        cache = KVCache(32, 4)
+        cache = KVCache()

        k = mx.ones((1, 4, 1, 32), mx.float16)
        v = mx.ones((1, 4, 1, 32), mx.float16)
@@ -29,6 +28,104 @@ class TestModels(unittest.TestCase):
        self.assertTrue(mx.array_equal(v_up, expected))
        self.assertEqual(cache.offset, cache.step + 1)

+    def test_rotating_kv_cache(self):
+        b, h, d = 1, 2, 32
+        cache = RotatingKVCache(max_size=8, step=4)
+
+        k = mx.random.uniform(shape=(b, h, 2, d))
+        v = mx.random.uniform(shape=(b, h, 2, d))
+
+        k_up, v_up = cache.update_and_fetch(k, v)
+        self.assertTrue(mx.array_equal(k_up, k))
+        self.assertTrue(mx.array_equal(v_up, v))
+        self.assertEqual(cache.offset, 2)
+
+        k = mx.random.uniform(shape=(b, h, 5, d))
+        v = mx.random.uniform(shape=(b, h, 5, d))
+        k_up, v_up = cache.update_and_fetch(k, v)
+        self.assertTrue(mx.array_equal(k_up[..., 2:, :], k))
+        self.assertTrue(mx.array_equal(v_up[..., 2:, :], v))
+
+        k = mx.random.uniform(shape=(b, h, 4, d))
+        v = mx.random.uniform(shape=(b, h, 4, d))
+        k_up, v_up = cache.update_and_fetch(k, v)
+        self.assertTrue(mx.array_equal(k_up[..., -4:, :], k))
+        self.assertTrue(mx.array_equal(v_up[..., -4:, :], v))
+
+        idx = 0
+        for _ in range(10):
+            k = mx.random.uniform(shape=(b, h, 1, d))
+            v = mx.random.uniform(shape=(b, h, 1, d))
+            k_up, v_up = cache.update_and_fetch(k, v)
+            self.assertTrue(mx.array_equal(k_up[..., idx : idx + 1, :], k))
+            self.assertTrue(mx.array_equal(v_up[..., idx : idx + 1, :], v))
+            idx += 1
+            idx %= 8
+
+        # Try with nonzero keep
+        cache = RotatingKVCache(max_size=8, step=4, keep=2)
+
+        # Check a large update
+        k = mx.random.uniform(shape=(b, h, 20, d))
+        v = mx.random.uniform(shape=(b, h, 20, d))
+        k_up, v_up = cache.update_and_fetch(k, v)
+        self.assertTrue(mx.array_equal(k_up, k))
+        self.assertTrue(mx.array_equal(v_up, v))
+
+        # A bunch of small updates
+        self.assertEqual(cache.offset, 20)
+        idx = 2
+        for i in range(10):
+            k = mx.random.uniform(shape=(b, h, 1, d))
+            v = mx.random.uniform(shape=(b, h, 1, d))
+            k_up, v_up = cache.update_and_fetch(k, v)
+            self.assertTrue(mx.array_equal(k_up[..., idx : idx + 1, :], k))
+            self.assertTrue(mx.array_equal(v_up[..., idx : idx + 1, :], v))
+            self.assertEqual(cache.offset, 21 + i)
+            idx += 1
+            if idx >= 8:
+                idx = 2
+
+    def test_rotating_kv_cache_chat_mode(self):
+        # Test that the rotating kv cache can handle
+        # alternating prompt/prefill with generation
+        d = 4
+        h = 2
+        cache = RotatingKVCache(max_size=18, step=4)
+
+        x = mx.random.uniform(shape=(1, h, 8, d))
+        k, v = cache.update_and_fetch(x, x)
+        self.assertEqual(k.shape[2], 8)
+        self.assertEqual(cache.offset, 8)
+
+        x = mx.random.uniform(shape=(1, h, 1, d))
+        k, v = cache.update_and_fetch(x, x)
+        self.assertEqual(k.shape[2], 9)
+        self.assertEqual(cache.offset, 9)
+        self.assertTrue(mx.allclose(x, k[..., 8:9, :]))
+
+        x = mx.random.uniform(shape=(1, h, 2, d))
+        k, v = cache.update_and_fetch(x, x)
+        self.assertEqual(k.shape[2], 11)
+        self.assertEqual(cache.offset, 11)
+        self.assertTrue(mx.allclose(x, k[..., 9:11, :]))
+
+        x = mx.random.uniform(shape=(1, h, 3, d))
+        k, v = cache.update_and_fetch(x, x)
+        self.assertEqual(k.shape[2], 14)
+        self.assertEqual(cache.offset, 14)
+        self.assertTrue(mx.allclose(x, k[..., 11:14, :]))
+
+        x = mx.random.uniform(shape=(1, h, 6, d))
+        k, v = cache.update_and_fetch(x, x)
+        self.assertEqual(cache.offset, 20)
+        self.assertTrue(mx.allclose(x, k[..., -6:, :]))
+
+        x = mx.random.uniform(shape=(1, h, 2, d))
+        k, v = cache.update_and_fetch(x, x)
+        self.assertEqual(cache.offset, 22)
+        self.assertTrue(mx.allclose(x, k[..., -2:, :]))
+
    def model_test_runner(self, model, model_type, vocab_size, num_layers):

        self.assertEqual(len(model.layers), num_layers)
@@ -42,13 +139,7 @@ class TestModels(unittest.TestCase):
            self.assertEqual(outputs.shape, (1, 2, vocab_size))
            self.assertEqual(outputs.dtype, t)

-            kv_heads = (
-                [model.n_kv_heads] * len(model.layers)
-                if isinstance(model.n_kv_heads, int)
-                else model.n_kv_heads
-            )
-            cache = [KVCache(model.head_dim, n) for n in kv_heads]
-
+            cache = make_prompt_cache(model)
            outputs = model(inputs, cache)
            self.assertEqual(outputs.shape, (1, 2, vocab_size))
            self.assertEqual(outputs.dtype, t)
@@ -339,6 +430,26 @@ class TestModels(unittest.TestCase):
            model, args.model_type, args.vocab_size, args.num_hidden_layers
        )

+    def test_mamba(self):
+        from mlx_lm.models import mamba
+
+        args = mamba.ModelArgs(
+            model_type="mamba",
+            vocab_size=10000,
+            use_bias=False,
+            use_conv_bias=True,
+            conv_kernel=4,
+            hidden_size=768,
+            num_hidden_layers=24,
+            state_size=16,
+            intermediate_size=1536,
+            time_step_rank=48,
+        )
+        model = mamba.Model(args)
+        self.model_test_runner(
+            model, args.model_type, args.vocab_size, args.num_hidden_layers
+        )
+
    def test_gpt2(self):
        from mlx_lm.models import gpt2

@@ -355,6 +466,25 @@ class TestModels(unittest.TestCase):
        model = gpt2.Model(args)
        self.model_test_runner(model, args.model_type, args.vocab_size, args.n_layer)

+    def test_gpt_neox(self):
+        from mlx_lm.models import gpt_neox
+
+        args = gpt_neox.ModelArgs(
+            model_type="gpt_neox",
+            max_position_embeddings=2048,
+            hidden_size=6144,
+            num_attention_heads=64,
+            num_hidden_layers=44,
+            layer_norm_eps=1e-5,
+            vocab_size=50432,
+            rotary_emb_base=10_000,
+            rotary_pct=0.25,
+        )
+        model = gpt_neox.Model(args)
+        self.model_test_runner(
+            model, args.model_type, args.vocab_size, args.num_hidden_layers
+        )
+
    def test_openelm(self):
        from mlx_lm.models import openelm

@@ -430,6 +560,206 @@ class TestModels(unittest.TestCase):
            model, args.model_type, args.vocab_size, args.num_hidden_layers
        )

+    def test_llama3_1(self):
+        from mlx_lm.models import llama
+
+        args = llama.ModelArgs(
+            model_type="llama",
+            hidden_size=1024,
+            num_hidden_layers=4,
+            intermediate_size=2048,
+            num_attention_heads=4,
+            rms_norm_eps=1e-5,
+            vocab_size=10_000,
+            max_position_embeddings=128,
+            mlp_bias=False,
+            num_key_value_heads=2,
+            rope_scaling={
+                "factor": 8.0,
+                "low_freq_factor": 1.0,
+                "high_freq_factor": 4.0,
+                "original_max_position_embeddings": 8192,
+                "rope_type": "llama3",
+            },
+        )
+        model = llama.Model(args)
+        self.model_test_runner(
+            model, args.model_type, args.vocab_size, args.num_hidden_layers
+        )
+
+    def test_deepseek(self):
+        from mlx_lm.models import deepseek
+
+        args = deepseek.ModelArgs(
+            model_type="deepseek",
+            vocab_size=1024,
+            hidden_size=128,
+            intermediate_size=256,
+            moe_intermediate_size=256,
+            num_hidden_layers=4,
+            num_attention_heads=8,
+            num_key_value_heads=4,
+        )
+        model = deepseek.Model(args)
+        self.model_test_runner(
+            model, args.model_type, args.vocab_size, args.num_hidden_layers
+        )
+
+    def test_deepseek_v2(self):
+        from mlx_lm.models import deepseek_v2
+
+        args = deepseek_v2.ModelArgs(
+            model_type="deepseek_v2",
+            vocab_size=1024,
+            hidden_size=128,
+            intermediate_size=256,
+            moe_intermediate_size=256,
+            num_hidden_layers=4,
+            num_attention_heads=4,
+            num_key_value_heads=2,
+            kv_lora_rank=4,
+            q_lora_rank=4,
+            qk_rope_head_dim=32,
+            v_head_dim=16,
+            qk_nope_head_dim=32,
+            rope_scaling={
+                "beta_fast": 32,
+                "beta_slow": 1,
+                "factor": 40,
+                "mscale": 1.0,
+                "mscale_all_dim": 1.0,
+                "original_max_position_embeddings": 4096,
+                "type": "yarn",
+            },
+        )
+        model = deepseek_v2.Model(args)
+        self.model_test_runner(
+            model, args.model_type, args.vocab_size, args.num_hidden_layers
+        )
+
+    def test_gemma2(self):
+        from mlx_lm.models import gemma2
+
+        args = gemma2.ModelArgs(
+            model_type="gemma2",
+            hidden_size=128,
+            num_hidden_layers=4,
+            intermediate_size=256,
+            num_attention_heads=2,
+            head_dim=32,
+            rms_norm_eps=1e-4,
+            vocab_size=1024,
+            num_key_value_heads=2,
+        )
+        model = gemma2.Model(args)
+        self.model_test_runner(
+            model, args.model_type, args.vocab_size, args.num_hidden_layers
+        )
+
+    def test_gpt_bigcode(self):
+        from mlx_lm.models import gpt_bigcode
+
+        args = gpt_bigcode.ModelArgs(
+            model_type="gpt_bigcode",
+            n_embd=128,
+            n_layer=128,
+            n_inner=256,
+            n_head=4,
+            n_positions=1000,
+            layer_norm_epsilon=1e-5,
+            vocab_size=1024,
+        )
+        model = gpt_bigcode.Model(args)
+        self.model_test_runner(model, args.model_type, args.vocab_size, args.n_layer)
+
+    def test_nemotron(self):
+        from mlx_lm.models import nemotron
+
+        args = nemotron.ModelArgs(
+            model_type="nemotron",
+            hidden_size=128,
+            hidden_act="gelu",
+            num_hidden_layers=4,
+            intermediate_size=256,
+            num_attention_heads=4,
+            norm_eps=1e-5,
+            vocab_size=1024,
+            num_key_value_heads=2,
+        )
+        model = nemotron.Model(args)
+        self.model_test_runner(
+            model, args.model_type, args.vocab_size, args.num_hidden_layers
+        )
+
+    def test_phi3small(self):
+        from mlx_lm.models import phi3small
+
+        args = phi3small.ModelArgs(
+            model_type="phi3small",
+            hidden_size=128,
+            dense_attention_every_n_layers=2,
+            ff_intermediate_size=256,
+            gegelu_limit=1.0,
+            num_hidden_layers=4,
+            num_attention_heads=4,
+            num_key_value_heads=2,
+            layer_norm_epsilon=1e-4,
+            vocab_size=1000,
+        )
+        model = phi3small.Model(args)
+        self.model_test_runner(
+            model, args.model_type, args.vocab_size, args.num_hidden_layers
+        )
+
+    def test_phimoe(self):
+        from mlx_lm.models import phimoe
+
+        args = phimoe.ModelArgs(
+            model_type="phimoe",
+            vocab_size=320,
+            hidden_size=128,
+            intermediate_size=256,
+            num_hidden_layers=4,
+            num_attention_heads=4,
+            num_key_value_heads=4,
+            rope_scaling={
+                "long_factor": [1.0] * 16,
+                "long_mscale": 1.243163121016122,
+                "original_max_position_embeddings": 4096,
+                "short_factor": [1.0] * 16,
+                "short_mscale": 1.243163121016122,
+                "type": "longrope",
+            },
+        )
+        model = phimoe.Model(args)
+        self.model_test_runner(
+            model, args.model_type, args.vocab_size, args.num_hidden_layers
+        )
+
+    def test_recurrent_gemma(self):
+        from mlx_lm.models import recurrent_gemma
+
+        args = recurrent_gemma.ModelArgs(
+            model_type="recurrent_gemma",
+            hidden_size=128,
+            attention_bias=False,
+            conv1d_width=3,
+            intermediate_size=256,
+            logits_soft_cap=1.0,
+            num_attention_heads=4,
+            num_hidden_layers=4,
+            num_key_value_heads=2,
+            rms_norm_eps=1e-4,
+            rope_theta=1000,
+            attention_window_size=1024,
+            vocab_size=1000,
+            block_types=["recurrent", "recurrent", "attention"],
+        )
+        model = recurrent_gemma.Model(args)
+        self.model_test_runner(
+            model, args.model_type, args.vocab_size, args.num_hidden_layers
+        )
+

 if __name__ == "__main__":
    unittest.main()
--- a/llms/tests/test_prompt_cache.py
+++ b/llms/tests/test_prompt_cache.py
@@ -0,0 +1,306 @@
+# Copyright © 2024 Apple Inc.
+
+import copy
+import os
+import tempfile
+import unittest
+
+import mlx.core as mx
+from mlx_lm.models.cache import (
+    KVCache,
+    MambaCache,
+    QuantizedKVCache,
+    RotatingKVCache,
+    load_prompt_cache,
+    make_prompt_cache,
+    save_prompt_cache,
+    trim_prompt_cache,
+)
+from mlx_lm.utils import generate_step, load
+
+HF_MODEL_PATH = "mlx-community/Qwen1.5-0.5B-Chat-4bit"
+
+
+class TestPromptCache(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.test_dir_fid = tempfile.TemporaryDirectory()
+        cls.test_dir = cls.test_dir_fid.name
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.test_dir_fid.cleanup()
+
+    def test_save_load(self):
+        """Round-trip KV caches, then caches plus metadata, through a file."""
+        cache = [KVCache() for _ in range(4)]
+        for c in cache:
+            x = mx.random.uniform(shape=(1, 8, 10, 4))
+            c.update_and_fetch(x, x)
+        cache_file = os.path.join(self.test_dir, "prompt_cache.safetensors")
+        save_prompt_cache(cache_file, cache)
+        loaded_cache = load_prompt_cache(cache_file)
+        self.assertEqual(len(cache), len(loaded_cache))
+        for c, lc in zip(cache, loaded_cache):
+            self.assertEqual(c.offset, lc.offset)
+            self.assertTrue(mx.array_equal(c.state[0], lc.state[0]))
+            self.assertTrue(mx.array_equal(c.state[1], lc.state[1]))
+
+        # Save again to the same path, this time with metadata attached
+        metadata = {"a": "b", "c": "d"}
+        save_prompt_cache(cache_file, cache, metadata)
+        _, loaded_metadata = load_prompt_cache(cache_file, return_metadata=True)
+        self.assertEqual(metadata, loaded_metadata)
+
+    def test_save_load_rotating_cache(self):
+        """Round-trip rotating KV caches before and after further updates."""
+        cache_file = os.path.join(self.test_dir, "prompt_cache.safetensors")
+
+        cache = [RotatingKVCache(max_size=8, keep=2) for _ in range(4)]
+        for c in cache:
+            x = mx.random.uniform(shape=(1, 8, 10, 4))
+            c.update_and_fetch(x, x)
+
+        save_prompt_cache(cache_file, cache)
+        loaded_cache = load_prompt_cache(cache_file)
+        self.assertEqual(len(cache), len(loaded_cache))
+        for c, lc in zip(cache, loaded_cache):
+            self.assertEqual(c.offset, lc.offset)
+            self.assertEqual(c.keep, lc.keep)
+            self.assertEqual(c.max_size, lc.max_size)
+            self.assertEqual(c.step, lc.step)
+            self.assertTrue(mx.array_equal(c.state[0], lc.state[0]))
+            self.assertTrue(mx.array_equal(c.state[1], lc.state[1]))
+
+        # Do a couple single token updates to get a rotation
+        for _ in range(2):
+            for c in cache:
+                x = mx.random.uniform(shape=(1, 8, 1, 4))
+                c.update_and_fetch(x, x)
+
+        save_prompt_cache(cache_file, cache)
+        loaded_cache = load_prompt_cache(cache_file)
+
+        for c, lc in zip(cache, loaded_cache):
+            x = mx.random.uniform(shape=(1, 8, 1, 4))
+            k, v = c.update_and_fetch(x, x)
+            lk, lv = lc.update_and_fetch(x, x)
+            self.assertEqual(c.offset, lc.offset)
+            self.assertTrue(mx.array_equal(k, lk))
+            self.assertTrue(mx.array_equal(v, lv))
+
+    def test_save_load_mixed_cache(self):
+        cache_file = os.path.join(self.test_dir, "prompt_cache.safetensors")
+
+        cache = [MambaCache(), KVCache(), RotatingKVCache(8), MambaCache()]
+        for c in cache:
+            if isinstance(c, MambaCache):
+                c[0] = mx.random.uniform(shape=(4, 4, 4))
+                c[1] = mx.random.uniform(shape=(4, 4, 4))
+            else:
+                x = mx.random.uniform(shape=(4, 4, 7, 4))
+                y = mx.random.uniform(shape=(4, 4, 7, 4))
+                c.update_and_fetch(x, y)
+
+        save_prompt_cache(cache_file, cache)
+        loaded_cache = load_prompt_cache(cache_file)
+        for c, lc in zip(cache, loaded_cache):
+            if isinstance(c, MambaCache):
+                self.assertTrue(mx.array_equal(c[0], lc[0]))
+                self.assertTrue(mx.array_equal(c[1], lc[1]))
+            else:
+                x = mx.random.uniform(shape=(4, 4, 1, 4))
+                y = mx.random.uniform(shape=(4, 4, 1, 4))
+                k, v = c.update_and_fetch(x, y)
+                lk, lv = lc.update_and_fetch(x, y)
+                self.assertEqual(c.offset, lc.offset)
+                self.assertTrue(mx.array_equal(k, lk))
+                self.assertTrue(mx.array_equal(v, lv))
+
+    def test_cache_with_generate(self):
+        model, tokenizer = load(HF_MODEL_PATH)
+        prompt = tokenizer.encode("this is a prompt", return_tensors="mlx")[0]
+        results = zip(range(4), generate_step(prompt, model))
+        toks, all_logits = zip(*(r[1] for r in results))
+
+        prompt_cache = make_prompt_cache(model)
+        i = 0
+        for _, (tok, logits) in zip(
+            range(2), generate_step(prompt, model, prompt_cache=prompt_cache)
+        ):
+            self.assertEqual(tok, toks[i])
+            self.assertTrue(mx.allclose(logits, all_logits[i]))
+            i += 1
+
+        for _, (tok, logits) in zip(
+            range(1),
+            generate_step(mx.array([toks[i]]), model, prompt_cache=prompt_cache),
+        ):
+            i += 1
+            self.assertEqual(tok, toks[i])
+            self.assertTrue(mx.allclose(logits, all_logits[i]))
+
+    def test_trim_cache(self):
+        cache = [KVCache() for _ in range(2)]
+        for c in cache:
+            x = mx.random.uniform(shape=(1, 8, 10, 4))
+            c.update_and_fetch(x, x)
+
+        # Trim fewer tokens than are cached (7, then only 3 remain below)
+        num_trimmed = trim_prompt_cache(cache, 7)
+        self.assertEqual(num_trimmed, 7)
+
+        # Trim more tokens than remain
+        num_trimmed = trim_prompt_cache(cache, 4)
+        self.assertEqual(num_trimmed, 3)
+
+        # Can't trim mamba cache
+        cache = [MambaCache() for _ in range(2)]
+        for c in cache:
+            c.state = mx.zeros((5, 5))
+        num_trimmed = trim_prompt_cache(cache, 7)
+        self.assertEqual(num_trimmed, 0)
+
+        # All caches have to be trimmable
+        cache = [MambaCache(), KVCache()]
+        cache[0].state = mx.zeros((5, 5))
+        x = mx.random.uniform(shape=(1, 8, 10, 4))
+        cache[1].update_and_fetch(x, x)
+        num_trimmed = trim_prompt_cache(cache, 1)
+        self.assertEqual(num_trimmed, 0)
+        # Rotating cache updated with less (5) than max_size (6) can be trimmed
+        cache = [RotatingKVCache(max_size=6) for _ in range(2)]
+        for c in cache:
+            x = mx.random.uniform(shape=(1, 8, 5, 4))
+            c.update_and_fetch(x, x)
+
+        num_trimmed = trim_prompt_cache(cache, 4)
+        self.assertEqual(num_trimmed, 4)
+
+        # Can't trim fixed-size KV cache after processing
+        # more than max_kv_size tokens
+        for c in cache:
+            x = mx.random.uniform(shape=(1, 8, 10, 4))
+            c.update_and_fetch(x, x)
+
+        num_trimmed = trim_prompt_cache(cache, 4)
+        self.assertEqual(num_trimmed, 0)
+        # Quantized KV cache: same trim expectations as the plain KV cache above
+        cache = [QuantizedKVCache() for _ in range(2)]
+        for c in cache:
+            x = mx.random.uniform(shape=(1, 8, 10, 64))
+            c.update_and_fetch(x, x)
+
+        num_trimmed = trim_prompt_cache(cache, 7)
+        self.assertEqual(num_trimmed, 7)
+
+        # Trim more tokens than remain
+        num_trimmed = trim_prompt_cache(cache, 4)
+        self.assertEqual(num_trimmed, 3)
+
+    def test_trim_cache_with_generate(self):
+        model, tokenizer = load(HF_MODEL_PATH)
+        prompt = tokenizer.encode("this is a prompt", return_tensors="mlx")[0]
+        # Trimming the cache and regenerating must reproduce tokens and logits.
+        prompt_cache = make_prompt_cache(model)
+
+        # Generate one token so we process the full prompt
+        last_tok, _ = next(generate_step(prompt, model, prompt_cache=prompt_cache))
+        last_tok = mx.array([last_tok])
+
+        # Generate two more tokens
+        results = zip(
+            range(2), generate_step(last_tok, model, prompt_cache=prompt_cache)
+        )
+        toks, all_logits = zip(*(r[1] for r in results))
+
+        # To get back to the cache just after processing the prompt,
+        # trim by 3 tokens
+        trim_prompt_cache(prompt_cache, 3)
+
+        # Generate the same thing again
+        results = zip(
+            range(2), generate_step(last_tok, model, prompt_cache=prompt_cache)
+        )
+        second_toks, second_all_logits = zip(*(r[1] for r in results))
+        self.assertEqual(toks, second_toks)
+        self.assertTrue(
+            all(mx.allclose(l, l2) for l, l2 in zip(all_logits, second_all_logits))
+        )
+
+    def test_cache_copying(self):
+        cache = [KVCache()]
+        # Add 10 positions, then one more (y) that lands at index 10.
+        x = mx.random.uniform(shape=(1, 8, 10, 4))
+        cache[0].update_and_fetch(x, x)
+
+        y = mx.random.uniform(shape=(1, 8, 1, 4))
+        cache[0].update_and_fetch(y, y)
+        # Snapshot before trimming; later edits to `cache` must not reach it.
+        old_cache = copy.deepcopy(cache)
+
+        trim_prompt_cache(cache, 1)
+        # assertEqual, not assertTrue: assertTrue(a, b) treats b as the message.
+        self.assertEqual(old_cache[0].offset, 11)
+        self.assertEqual(cache[0].offset, 10)
+        # Write z at index 10 of the trimmed cache only.
+        z = mx.random.uniform(shape=(1, 8, 1, 4))
+        cache[0].update_and_fetch(z, z)
+
+        self.assertTrue(mx.allclose(old_cache[0].keys[..., 10:11, :], y))
+        self.assertTrue(mx.allclose(cache[0].keys[..., 10:11, :], z))
+
+    def test_save_load_quantized_cache(self):
+        cache = [QuantizedKVCache(bits=4, group_size=32) for _ in range(4)]
+        for c in cache:
+            x = mx.random.uniform(shape=(1, 8, 10, 32))
+            c.update_and_fetch(x, x)
+        cache_file = os.path.join(self.test_dir, "prompt_cache.safetensors")
+        save_prompt_cache(cache_file, cache)
+        loaded_cache = load_prompt_cache(cache_file)
+        self.assertEqual(loaded_cache[0].bits, cache[0].bits)
+        self.assertEqual(loaded_cache[0].group_size, cache[0].group_size)
+        self.assertEqual(len(cache), len(loaded_cache))  # zip below would truncate
+        for c, lc in zip(cache, loaded_cache):
+            self.assertEqual(c.offset, lc.offset)
+            # Loop over quantized tuple (3 arrays per state entry)
+            for i in range(3):
+                self.assertTrue(mx.array_equal(c.state[0][i], lc.state[0][i]))
+                self.assertTrue(mx.array_equal(c.state[1][i], lc.state[1][i]))
+
+        # Test with metadata
+        cache_file = os.path.join(self.test_dir, "prompt_cache.safetensors")
+        metadata = {"a": "b", "c": "d"}
+        save_prompt_cache(cache_file, cache, metadata)
+        _, loaded_metadata = load_prompt_cache(cache_file, return_metadata=True)
+        self.assertEqual(metadata, loaded_metadata)
+
+    def test_cache_to_quantized(self):
+        model, tokenizer = load(HF_MODEL_PATH)
+        prompt = tokenizer.encode("this is a prompt", return_tensors="mlx")[0]
+        results = zip(range(4), generate_step(prompt, model))  # reference: 4 steps
+        toks, all_logits = zip(*(r[1] for r in results))
+        # First two steps run on the cache as created by make_prompt_cache.
+        prompt_cache = make_prompt_cache(model)
+        i = 0
+        for _, (tok, logits) in zip(
+            range(2), generate_step(prompt, model, prompt_cache=prompt_cache)
+        ):
+            self.assertEqual(tok, toks[i])
+            self.assertTrue(mx.allclose(logits, all_logits[i]))
+            i += 1
+        # Swap every cache entry for its to_quantized() result, then continue.
+        prompt_cache = [c.to_quantized(bits=8, group_size=32) for c in prompt_cache]
+        # The step below compares logits with an explicit, looser rtol.
+        for _, (tok, logits) in zip(
+            range(1),
+            generate_step(mx.array([toks[i]]), model, prompt_cache=prompt_cache),
+        ):
+            i += 1
+            self.assertEqual(tok, toks[i])
+            self.assertTrue(mx.allclose(logits, all_logits[i], rtol=1e-2))
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this file's tests when executed as a script
--- a/llms/tests/test_sample_utils.py
+++ b/llms/tests/test_sample_utils.py
@@ -1,38 +1,32 @@
 import unittest
-from unittest.mock import patch

 import mlx.core as mx
 from mlx_lm.sample_utils import top_p_sampling


 class TestSamplingUtils(unittest.TestCase):
-    @patch("mlx.core.random.categorical")
-    def test_top_p_sampling(self, mock_categorical):
-        logits = mx.array([[1.0, 2.0, 3.0, 4.0]])
-        top_p = 0.3
+    def test_top_p_sampling(self):
+        probs = mx.array([0.9, 0.0, 0.0, 0.1])[None]
+        logits = mx.log(probs)
        temperature = 1.0
-        expected_token = mx.array([3])
-        mock_categorical.return_value = expected_token

-        token = top_p_sampling(logits, top_p, temperature)
-        expected_top_probs = mx.array([[0.0, 0.0, 0.0, 0.643914]])
-        self.assertTrue(mx.allclose(token, expected_token))
-        args, _ = mock_categorical.call_args
-        self.assertTrue(args[0].shape == expected_top_probs.shape)
-        self.assertTrue(mx.allclose(args[0], mx.log(expected_top_probs)))
+        token = top_p_sampling(logits, 0.3, temperature).item()
+        self.assertEqual(token, 0)

-        logits = mx.array([[1.0, 2.0, 3.0, 4.0]])
-        top_p = 0.9
-        temperature = 1.0
-        expected_token = mx.array([3])
-        mock_categorical.return_value = expected_token
+        token = top_p_sampling(logits, 0.95, temperature).item()
+        self.assertTrue(token in (0, 3))

-        token = top_p_sampling(logits, top_p, temperature)
-        expected_top_probs = mx.array([[0.0, 0.0871443, 0.236883, 0.643914]])
-        self.assertTrue(mx.allclose(token, expected_token))
-        args, _ = mock_categorical.call_args
-        self.assertTrue(args[0].shape == expected_top_probs.shape)
-        self.assertTrue(mx.allclose(args[0], mx.log(expected_top_probs)))
+        probs = mx.array([0.0, 0.5, 0.4, 0.1])[None]
+        logits = mx.log(probs)
+
+        token = top_p_sampling(logits, 0.4, temperature).item()
+        self.assertEqual(token, 1)
+
+        token = top_p_sampling(logits, 0.6, temperature).item()
+        self.assertTrue(token in (1, 2))
+
+        token = top_p_sampling(logits, 0.95, temperature).item()
+        self.assertTrue(token in (1, 2, 3))


 if __name__ == "__main__":
--- a/llms/tests/test_server.py
+++ b/llms/tests/test_server.py
@@ -1,4 +1,7 @@
+# Copyright © 2024 Apple Inc.
+
 import http
+import json
 import threading
 import unittest

@@ -11,8 +14,9 @@ class DummyModelProvider:
    def __init__(self):
        HF_MODEL_PATH = "mlx-community/Qwen1.5-0.5B-Chat-4bit"
        self.model, self.tokenizer = load(HF_MODEL_PATH)
+        self.model_key = (HF_MODEL_PATH, None)

-    def load(self, model):
+    def load(self, model, adapter=None):
        assert model in ["default_model", "chat_model"]
        return self.model, self.tokenizer

@@ -76,6 +80,31 @@ class TestServer(unittest.TestCase):
        self.assertIn("id", response_body)
        self.assertIn("choices", response_body)

+    def test_handle_models(self):  # GET /v1/models returns a non-empty list
+        url = f"http://localhost:{self.port}/v1/models"
+        response = requests.get(url)
+        self.assertEqual(response.status_code, 200)
+        response_body = json.loads(response.text)
+        self.assertEqual(response_body["object"], "list")
+        self.assertIsInstance(response_body["data"], list)
+        self.assertGreater(len(response_body["data"]), 0)
+        model = response_body["data"][0]  # spot-check the first entry's fields
+        self.assertIn("id", model)
+        self.assertEqual(model["object"], "model")
+        self.assertIn("created", model)
+
+    def test_sequence_overlap(self):
+        from mlx_lm.server import sequence_overlap
+        # Inputs where some suffix of the first list is a prefix of the second.
+        self.assertTrue(sequence_overlap([1], [1]))
+        self.assertTrue(sequence_overlap([1, 2], [1, 2]))
+        self.assertTrue(sequence_overlap([1, 3], [3, 4]))
+        self.assertTrue(sequence_overlap([1, 2, 3], [2, 3]))
+        # Inputs where no suffix of the first list is a prefix of the second.
+        self.assertFalse(sequence_overlap([1], [2]))
+        self.assertFalse(sequence_overlap([1, 2], [3, 4]))
+        self.assertFalse(sequence_overlap([1, 2, 3], [4, 1, 2, 3]))
+

 if __name__ == "__main__":
    unittest.main()
--- a/llms/tests/test_tokenizers.py
+++ b/llms/tests/test_tokenizers.py
@@ -0,0 +1,90 @@
+# Copyright © 2024 Apple Inc.
+
+import unittest
+from pathlib import Path
+
+from huggingface_hub import snapshot_download
+from mlx_lm.tokenizer_utils import (
+    BPEStreamingDetokenizer,
+    NaiveStreamingDetokenizer,
+    SPMStreamingDetokenizer,
+    load_tokenizer,
+)
+
+
+class TestTokenizers(unittest.TestCase):
+    # Compares streaming detokenizer output with tokenizer.decode().
+    def download_tokenizer(self, repo):  # fetch only the tokenizer files
+        path = Path(
+            snapshot_download(
+                repo_id=repo,
+                allow_patterns=[
+                    "tokenizer.json",
+                    "tokenizer_config.json",
+                    "special_tokens_map.json",
+                    "tokenizer.model",
+                ],
+            )
+        )
+        return load_tokenizer(path)
+
+    def check_tokenizer(self, tokenizer):  # streamed text must equal decode()
+        def check(tokens):
+            expected_text = tokenizer.decode(tokens)
+            detokenizer = tokenizer.detokenizer
+            detokenizer.reset()
+            text = ""
+            for t in tokens:
+                detokenizer.add_token(t)
+                seg = detokenizer.last_segment
+                text += seg
+            detokenizer.finalize()  # a last segment is read after finalizing
+            text += detokenizer.last_segment
+            self.assertEqual(text, expected_text)
+
+        tokens = tokenizer.encode("a ,b")
+        check(tokens)
+
+        tokens = tokenizer.encode('{"why_its_funny" :"a_joke_explainer" ,"rating":3.5}')
+        check(tokens)
+
+        tokens = tokenizer.encode("3 3")
+        check(tokens)
+
+        tokens = tokenizer.encode("import 'package:flutter/material.dart';")
+        check(tokens)
+
+    def test_tokenizers(self):
+        tokenizer_repos = [
+            ("mlx-community/Qwen1.5-0.5B-Chat-4bit", BPEStreamingDetokenizer),
+            ("mlx-community/Mistral-7B-v0.2-4bit", SPMStreamingDetokenizer),
+            ("mlx-community/Phi-3.5-mini-instruct-4bit", SPMStreamingDetokenizer),
+            ("mlx-community/Mistral-7B-Instruct-v0.3", SPMStreamingDetokenizer),
+            ("mlx-community/Llama-3.2-1B-Instruct-4bit", BPEStreamingDetokenizer),
+        ]
+        for tokenizer_repo, expected_detokenizer in tokenizer_repos:
+            with self.subTest(tokenizer=tokenizer_repo):
+                tokenizer = self.download_tokenizer(tokenizer_repo)
+                tokenizer.decode([0, 1, 2])  # NOTE(review): result unused; confirm
+                self.assertIsInstance(tokenizer.detokenizer, expected_detokenizer)
+                self.check_tokenizer(tokenizer)
+
+        # Try one with a naive detokenizer
+        tokenizer = self.download_tokenizer("mlx-community/Llama-3.2-1B-Instruct-4bit")
+        tokenizer._detokenizer = NaiveStreamingDetokenizer(tokenizer)
+        self.check_tokenizer(tokenizer)
+
+    def test_special_tokens(self):  # EOS token must come through as its text
+        tokenizer_repo = "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx"
+        tokenizer = self.download_tokenizer(tokenizer_repo)
+
+        detokenizer = tokenizer.detokenizer
+        detokenizer.reset()
+        detokenizer.add_token(tokenizer.eos_token_id)
+        detokenizer.finalize()
+
+        self.assertEqual(detokenizer.last_segment, tokenizer.eos_token)
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this file's tests when executed as a script
--- a/llms/tests/test_utils.py
+++ b/llms/tests/test_utils.py
@@ -82,6 +82,7 @@ class TestUtils(unittest.TestCase):
        self.assertTrue(isinstance(model.layers[-1].mlp.up_proj, nn.QuantizedLinear))

        # Check model weights have right type
+        mlx_path = os.path.join(self.test_dir, "mlx_model_bf16")
        utils.convert(HF_MODEL_PATH, mlx_path=mlx_path, dtype="bfloat16")
        model, _ = utils.load(mlx_path)

--- a/llms/tests/test_utils_load_model.py
+++ b/llms/tests/test_utils_load_model.py
@@ -0,0 +1,50 @@
+import unittest
+from pathlib import Path
+
+import mlx.nn as nn
+from mlx_lm.models.qwen2 import Model as Qwen2Model
+from mlx_lm.utils import get_model_path, load_model
+
+HF_MODEL_PATH = "mlx-community/Qwen1.5-0.5B-Chat-4bit"
+
+
+class TestLoadModelCustomGetClasses(unittest.TestCase):
+    # Exercises load_model with and without a get_model_classes override.
+    def test_load_model_with_custom_get_classes(self):
+        class CustomQwenModel(nn.Module):  # stub returned by the custom hook
+            def __init__(self, args):
+                super().__init__()
+                self.config = args
+                self.custom_attribute = "This is a custom model"
+
+            def load_weights(self, weights):
+                self.qwenWeights = weights  # recorded so the test can see the call
+
+        class CustomQwenConfig:  # copies every config key onto the instance
+            @classmethod
+            def from_dict(cls, config):
+                instance = cls()
+                for k, v in config.items():
+                    setattr(instance, k, v)
+                return instance
+
+        def custom_get_classes(config):  # ignores config; always the stubs
+            return CustomQwenModel, CustomQwenConfig
+
+        model_path = get_model_path(HF_MODEL_PATH)
+        model = load_model(model_path, get_model_classes=custom_get_classes)
+
+        self.assertIsInstance(model, CustomQwenModel)
+        self.assertTrue(hasattr(model, "custom_attribute"))
+        self.assertEqual(model.custom_attribute, "This is a custom model")
+        self.assertTrue(hasattr(model, "qwenWeights"))  # set only in load_weights
+
+    def test_load_model_with_default_get_classes(self):
+        model_path = get_model_path(HF_MODEL_PATH)
+        model = load_model(model_path)
+
+        self.assertIsInstance(model, Qwen2Model)
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this file's tests when executed as a script