Merge branch 'ml-explore:main' into fix-unsupported-scalartype

2025-08-10 11:16:40 +08:00 · 2023-12-07 17:04:01 +01:00 · 2023-12-07 17:04:01 +01:00 · 85345d42cb
commit 85345d42cb
parent 71aff8c346 1289f0bd9c
8 changed files with 75 additions and 51 deletions
--- a/.gitignore
+++ b/.gitignore
@ -127,3 +127,5 @@ dmypy.json

 # Pyre type checker
 .pyre/
+.idea/
+.vscode/
--- a/llama/README.md
+++ b/llama/README.md
@ -17,6 +17,9 @@ weights you will need to [request
 access](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform)
 from Meta.

+
+Alternatively, you can also download select converted checkpoints from the [mlx-llama](https://huggingface.co/mlx-llama) community organisation on Hugging Face and skip the conversion step.
+
 Convert the weights with:

 ```
--- a/mistral/requirements.txt
+++ b/mistral/requirements.txt
@ -1,3 +1,4 @@
 mlx
 sentencepiece
 torch
+numpy
--- a/stable_diffusion/README.md
+++ b/stable_diffusion/README.md
@ -65,8 +65,9 @@ Performance
 -----------

 The following table compares the performance of the UNet in stable diffusion.
-We report throughput in images per second for the provided `txt2image.py`
-script and the `diffusers` library using the MPS PyTorch backend.
+We report throughput in images per second **processed by the UNet** for the
+provided `txt2image.py` script and the `diffusers` library using the MPS
+PyTorch backend.

 At the time of writing this comparison convolutions are still some of the least
 optimized operations in MLX. Despite that, MLX still achieves **~40% higher
@ -93,3 +94,7 @@ The above experiments were made on an M2 Ultra with PyTorch version 2.1,
 diffusers version 0.21.4 and transformers version 4.33.3. For the generation we
 used classifier free guidance which means that the above batch sizes result
 double the images processed by the UNet.
+
+Note that the above table means that it takes about 90 seconds to fully
+generate 16 images with MLX, using 50 diffusion steps with classifier free
+guidance, and about 120 seconds with PyTorch.
--- a/stable_diffusion/requirements.txt
+++ b/stable_diffusion/requirements.txt
@ -1,3 +1,4 @@
+mlx
 safetensors
 huggingface-hub
 regex
--- a/transformer_lm/README.md
+++ b/transformer_lm/README.md
@ -11,4 +11,4 @@ python main.py --gpu

 By default the dataset is the [PTB corpus](https://paperswithcode.com/dataset/penn-treebank). Choose a different dataset with the `--dataset` option.

-To run the PyTorch, Jax or TensorFlowexamples install the respective framework.
+To run the PyTorch, Jax or TensorFlow examples install the respective framework.
--- a/transformer_lm/main.py
+++ b/transformer_lm/main.py
@ -81,13 +81,13 @@ def main(args):
    optimizer = optim.SGD(learning_rate=args.learning_rate)
    loss_and_grad_fn = nn.value_and_grad(model, model.loss)

-    def eval_fn(params, dataset):
+    def eval_fn(model, dataset):
        inputs, targets = map(mx.array, to_samples(context_size, dataset))
        loss = 0
        for s in range(0, targets.shape[0], batch_size):
            bx, by = inputs[s : s + batch_size], targets[s : s + batch_size]
            bx, by = map(mx.array, (bx, by))
-            losses = self.loss(bx, by, reduce=False)
+            losses = model.loss(bx, by, reduce=False)
            loss += mx.sum(losses).item()
        return loss / len(targets)

@ -110,9 +110,8 @@ def main(args):
            )
            losses = []
            tic = time.perf_counter()
-
        if (it + 1) % steps_per_eval == 0:
-            val_loss = eval_fn(params, valid)
+            val_loss = eval_fn(model, valid)
            toc = time.perf_counter()
            print(
                f"Iter {it + 1}: "
@ -123,7 +122,7 @@ def main(args):
            tic = time.perf_counter()

    if args.eval_test:
-        test_loss = eval_fn(params, test)
+        test_loss = eval_fn(model, test)
        test_ppl = math.exp(test_loss)
        print(f"Test loss {test_loss:.3f}, Test ppl {test_ppl:.3f}.")

--- a/whisper/benchmark.py
+++ b/whisper/benchmark.py
@ -1,5 +1,6 @@
 # Copyright © 2023 Apple Inc.

+import sys
 import time

 import mlx.core as mx
@ -48,46 +49,58 @@ def everything():


 if __name__ == "__main__":
-    feat_time = timer(feats)
-    print(f"Feature time {feat_time:.3f}")
-    mels = feats()[None]
-    tokens = mx.array(
-        [
-            50364,
-            1396,
-            264,
-            665,
-            5133,
-            23109,
-            25462,
-            264,
-            6582,
-            293,
-            750,
-            632,
-            42841,
-            292,
-            370,
-            938,
-            294,
-            4054,
-            293,
-            12653,
-            356,
-            50620,
-            50620,
-            23563,
-            322,
-            3312,
-            13,
-            50680,
-        ],
-        mx.int32,
-    )[None]
-    model = load_models.load_model("tiny")
-    model_forward_time = timer(model_forward, model, mels, tokens)
-    print(f"Model forward time {model_forward_time:.3f}")
-    decode_time = timer(decode, model, mels)
-    print(f"Decode time {decode_time:.3f}")
-    everything_time = timer(everything)
-    print(f"Everything time {everything_time:.3f}")
+
+    # get command line arguments without 3rd party libraries
+    # the command line argument to benchmark all models is "--all"
+    models = ["tiny"]
+    if len(sys.argv) > 1:
+        if sys.argv[1] == "--all":
+            models = ["tiny", "small", "medium", "large"]
+
+    for model_name in models:
+        feat_time = timer(feats)
+
+        print(f"\nModel: {model_name.upper()}")
+        print(f"\nFeature time {feat_time:.3f}")
+        mels = feats()[None]
+        tokens = mx.array(
+            [
+                50364,
+                1396,
+                264,
+                665,
+                5133,
+                23109,
+                25462,
+                264,
+                6582,
+                293,
+                750,
+                632,
+                42841,
+                292,
+                370,
+                938,
+                294,
+                4054,
+                293,
+                12653,
+                356,
+                50620,
+                50620,
+                23563,
+                322,
+                3312,
+                13,
+                50680,
+            ],
+            mx.int32,
+        )[None]
+        model = load_models.load_model(f"{model_name}")
+        model_forward_time = timer(model_forward, model, mels, tokens)
+        print(f"Model forward time {model_forward_time:.3f}")
+        decode_time = timer(decode, model, mels)
+        print(f"Decode time {decode_time:.3f}")
+        everything_time = timer(everything)
+        print(f"Everything time {everything_time:.3f}")
+        print(f"\n{'-----' * 10}\n")