From f37e777243029872d5783f8db1183771d818d59e Mon Sep 17 00:00:00 2001 From: Sarthak Yadav Date: Tue, 12 Dec 2023 19:01:06 +0100 Subject: [PATCH 1/6] added CIFAR10 + ResNet example --- cifar/README.md | 31 ++++++++++ cifar/dataset.py | 39 +++++++++++++ cifar/main.py | 108 ++++++++++++++++++++++++++++++++++ cifar/requirements.txt | 3 + cifar/resnet.py | 129 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 310 insertions(+) create mode 100644 cifar/README.md create mode 100644 cifar/dataset.py create mode 100644 cifar/main.py create mode 100644 cifar/requirements.txt create mode 100644 cifar/resnet.py diff --git a/cifar/README.md b/cifar/README.md new file mode 100644 index 00000000..0d793853 --- /dev/null +++ b/cifar/README.md @@ -0,0 +1,31 @@ +# CIFAR and ResNets + +* This example shows how to run ResNets on CIFAR10 dataset, in accordance with the original [paper](https://arxiv.org/abs/1512.03385). +* Also illustrates how to use `mlx-data` to download and load the dataset. + + +## Pre-requisites +* Install the dependencies: + +``` +pip install -r requirements.txt +``` + +## Running the example +Run the example with: + +``` +python main.py +``` + +By default the example runs on the GPU. To run on the CPU, use: + +``` +python main.py --cpu_only +``` + +For all available options, run: + +``` +python main.py --help +``` diff --git a/cifar/dataset.py b/cifar/dataset.py new file mode 100644 index 00000000..f4a3cd63 --- /dev/null +++ b/cifar/dataset.py @@ -0,0 +1,39 @@ +import mlx.core as mx +from mlx.data.datasets import load_cifar10 +import math + + +def get_cifar10(batch_size, root=None): + + tr = load_cifar10(root=root) + num_tr_samples = tr.size() + + mean = mx.array([0.485, 0.456, 0.406]).reshape((1, 1, 3)) + std = mx.array([0.229, 0.224, 0.225]).reshape((1, 1, 3)) + + tr_iter = ( + tr.shuffle() + .to_stream() + .image_random_h_flip("image", prob=0.5) + .pad("image", 0, 4, 4, 0.0) + .pad("image", 1, 4, 4, 0.0) + .image_random_crop("image", 32, 32) + .key_transform("image", lambda x: (x.astype("float32") / 255.0)) + .key_transform("image", lambda x: (x - mean) / std) + .batch(batch_size) + ) + + test = load_cifar10(root=root, train=False) + num_test_samples = test.size() + + test_iter = ( + test.to_stream() + .key_transform("image", lambda x: (x.astype("float32") / 255.0)) + .key_transform("image", lambda x: (x - mean) / std) + .batch(batch_size) + ) + + num_tr_steps_per_epoch = num_tr_samples // batch_size + num_test_steps_per_epoch = num_test_samples // batch_size + + return tr_iter, test_iter, num_tr_steps_per_epoch, num_test_steps_per_epoch diff --git a/cifar/main.py b/cifar/main.py new file mode 100644 index 00000000..5272733a --- /dev/null +++ b/cifar/main.py @@ -0,0 +1,108 @@ +import argparse +import resnet +import numpy as np +import mlx.nn as nn +import mlx.core as mx +import mlx.optimizers as optim +from dataset import get_cifar10 + + +parser = argparse.ArgumentParser(add_help=True) +parser.add_argument( + "--arch", + type=str, + default="resnet20", + help="model architecture [resnet20, resnet32, resnet44, resnet56, resnet110, resnet1202]", +) +parser.add_argument("--batch_size", type=int, default=128, help="batch size") +parser.add_argument("--epochs", type=int, default=100, help="number of epochs") +parser.add_argument("--lr", type=float, default=1e-3, help="learning rate") +parser.add_argument("--seed", type=int, default=0, help="random seed") +parser.add_argument("--cpu_only", action="store_true", help="use cpu only") + + +def loss_fn(model, inp, tgt): + return 
mx.mean(nn.losses.cross_entropy(model(inp), tgt)) + + +def eval_fn(model, inp, tgt): + return mx.mean(mx.argmax(model(inp), axis=1) == tgt) + + +def train_epoch(model, train_iter, optimizer, epoch): + def train_step(model, inp, tgt): + output = model(inp) + loss = mx.mean(nn.losses.cross_entropy(output, tgt)) + acc = mx.mean(mx.argmax(output, axis=1) == tgt) + return loss, acc + + train_step_fn = nn.value_and_grad(model, train_step) + + losses = [] + accs = [] + + for batch_counter, batch in enumerate(train_iter): + x = mx.array(batch["image"]) + y = mx.array(batch["label"]) + (loss, acc), grads = train_step_fn(model, x, y) + optimizer.update(model, grads) + mx.eval(model.parameters(), optimizer.state) + + loss_value = loss.item() + acc_value = acc.item() + losses.append(loss_value) + accs.append(acc_value) + + if batch_counter % 10 == 0: + print( + f"Epoch {epoch:02d}[{batch_counter:03d}]: tr_loss {loss_value:.3f}, tr_acc {acc_value:.3f}" + ) + + mean_tr_loss = np.mean(np.array(losses)) + mean_tr_acc = np.mean(np.array(accs)) + return mean_tr_loss, mean_tr_acc + + +def test_epoch(model, test_iter, epoch): + accs = [] + for batch_counter, batch in enumerate(test_iter): + x = mx.array(batch["image"]) + y = mx.array(batch["label"]) + acc = eval_fn(model, x, y) + acc_value = acc.item() + accs.append(acc_value) + mean_acc = np.mean(np.array(accs)) + + return mean_acc + + +def main(args): + np.random.seed(args.seed) + mx.random.seed(args.seed) + + model = resnet.__dict__[args.arch]() + + print("num_params: {:0.04f} M".format(model.num_params() / 1e6)) + mx.eval(model.parameters()) + + optimizer = optim.Adam(learning_rate=args.lr) + + for epoch in range(args.epochs): + # get data every epoch + # or set .repeat() on the data stream appropriately + train_data, test_data, tr_batches, _ = get_cifar10(args.batch_size) + + epoch_tr_loss, epoch_tr_acc = train_epoch(model, train_data, optimizer, epoch) + print( + f"Epoch {epoch}: avg. tr_loss {epoch_tr_loss:.3f}, avg. tr_acc {epoch_tr_acc:.3f}" + ) + + epoch_test_acc = test_epoch(model, test_data, epoch) + print(f"Epoch {epoch}: Test_acc {epoch_test_acc:.3f}") + + +if __name__ == "__main__": + args = parser.parse_args() + if args.cpu_only: + mx.set_default_device(mx.cpu) + main(args) diff --git a/cifar/requirements.txt b/cifar/requirements.txt new file mode 100644 index 00000000..c4c2e575 --- /dev/null +++ b/cifar/requirements.txt @@ -0,0 +1,3 @@ +mlx +mlx-data +numpy \ No newline at end of file diff --git a/cifar/resnet.py b/cifar/resnet.py new file mode 100644 index 00000000..3d88397b --- /dev/null +++ b/cifar/resnet.py @@ -0,0 +1,129 @@ +""" +Implementation of ResNets for CIFAR-10 as per the original paper [https://arxiv.org/abs/1512.03385]. +Configurations include ResNet-20, ResNet-32, ResNet-44, ResNet-56, ResNet-110, ResNet-1202. + +There's no BatchNorm is mlx==0.0.4, using LayerNorm instead. 
+ +Authors: + Sarthak Yadav, 2023 +""" + +from typing import Any +import mlx.core as mx +import mlx.nn as nn +from mlx.utils import tree_flatten + + +__all__ = [ + "ResNet", + "resnet20", + "resnet32", + "resnet44", + "resnet56", + "resnet110", + "resnet1202", +] + + +class ShortcutA(nn.Module): + def __init__(self, dims): + super().__init__() + self.dims = dims + + def __call__(self, x): + return mx.pad( + x[:, ::2, ::2, :], + pad_width=[(0, 0), (0, 0), (0, 0), (self.dims // 4, self.dims // 4)], + ) + + +class Block(nn.Module): + expansion = 1 + + def __init__(self, in_dims, dims, stride=1): + super().__init__() + + self.conv1 = nn.Conv2d( + in_dims, dims, kernel_size=3, stride=stride, padding=1, bias=False + ) + self.bn1 = nn.LayerNorm(dims) + + self.conv2 = nn.Conv2d( + dims, dims, kernel_size=3, stride=1, padding=1, bias=False + ) + self.bn2 = nn.LayerNorm(dims) + + if stride != 1 or in_dims != dims: + self.shortcut = ShortcutA(dims) + else: + self.shortcut = None + + def __call__(self, x): + + out = nn.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + if self.shortcut is None: + out += x + else: + out += self.shortcut(x) + out = nn.relu(out) + return out + + +class ResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super().__init__() + self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.LayerNorm(16) + self.in_dims = 16 + + self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2) + + self.linear = nn.Linear(64, num_classes) + + def _make_layer(self, block, dims, num_blocks, stride): + strides = [stride] + [1] * (num_blocks - 1) + layers = [] + for stride in strides: + layers.append(block(self.in_dims, dims, stride)) + self.in_dims = dims * block.expansion + return nn.Sequential(*layers) + + def num_params(self): + nparams = sum(x.size for k, x in tree_flatten(self.parameters())) + return nparams + + def __call__(self, x): + x = nn.relu(self.bn1(self.conv1(x))) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = mx.mean(x, axis=[1, 2]).reshape(x.shape[0], -1) + x = self.linear(x) + return x + + +def resnet20(**kwargs): + return ResNet(Block, [3, 3, 3], **kwargs) + + +def resnet32(**kwargs): + return ResNet(Block, [5, 5, 5], **kwargs) + + +def resnet44(**kwargs): + return ResNet(Block, [7, 7, 7], **kwargs) + + +def resnet56(**kwargs): + return ResNet(Block, [9, 9, 9], **kwargs) + + +def resnet110(**kwargs): + return ResNet(Block, [18, 18, 18], **kwargs) + + +def resnet1202(**kwargs): + return ResNet(Block, [200, 200, 200], **kwargs) From 2439333a57812cb407ca86c85e8a2d77f0eb9231 Mon Sep 17 00:00:00 2001 From: Sarthak Yadav Date: Tue, 12 Dec 2023 19:07:39 +0100 Subject: [PATCH 2/6] fixed doc for ResNet --- cifar/resnet.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cifar/resnet.py b/cifar/resnet.py index 3d88397b..b89a612b 100644 --- a/cifar/resnet.py +++ b/cifar/resnet.py @@ -39,6 +39,10 @@ class ShortcutA(nn.Module): class Block(nn.Module): expansion = 1 + """ + Implements a ResNet block with two convolutional layers and a skip connection. + As per the paper, CIFAR-10 uses Shortcut type-A skip connections. 
(See paper for details) + """ def __init__(self, in_dims, dims, stride=1): super().__init__() @@ -71,6 +75,10 @@ class Block(nn.Module): class ResNet(nn.Module): + """ + Creates a ResNet model for CIFAR-10, as specified in the original paper. + """ + def __init__(self, block, num_blocks, num_classes=10): super().__init__() self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False) From 15a6c155a815266c937d14402e8cf2608796aa76 Mon Sep 17 00:00:00 2001 From: Sarthak Yadav Date: Thu, 14 Dec 2023 09:05:04 +0100 Subject: [PATCH 3/6] simplified ResNet, expanded README with throughput and performance --- cifar/README.md | 28 +++++++++++++++++++++++---- cifar/dataset.py | 2 +- cifar/main.py | 43 ++++++++++++++++++++++-------------------- cifar/requirements.txt | 3 +-- cifar/resnet.py | 16 +++++++--------- 5 files changed, 56 insertions(+), 36 deletions(-) diff --git a/cifar/README.md b/cifar/README.md index 0d793853..abb2c0f5 100644 --- a/cifar/README.md +++ b/cifar/README.md @@ -1,11 +1,10 @@ # CIFAR and ResNets -* This example shows how to run ResNets on CIFAR10 dataset, in accordance with the original [paper](https://arxiv.org/abs/1512.03385). -* Also illustrates how to use `mlx-data` to download and load the dataset. +An example of training a ResNet on CIFAR-10 with MLX. Several ResNet configurations in accordance with the original [paper](https://arxiv.org/abs/1512.03385) are available. Also illustrates how to use `mlx-data` to download and load the dataset. ## Pre-requisites -* Install the dependencies: +Install the dependencies: ``` pip install -r requirements.txt @@ -21,7 +20,7 @@ python main.py By default the example runs on the GPU. To run on the CPU, use: ``` -python main.py --cpu_only +python main.py --cpu ``` For all available options, run: @@ -29,3 +28,24 @@ For all available options, run: ``` python main.py --help ``` + + +## Throughput + +On the tested device (M1 Macbook Pro, 16GB RAM), I get the following throughput with a `batch_size=256`: +``` +Epoch: 0 | avg. tr_loss 2.074 | avg. tr_acc 0.216 | Train Throughput: 415.39 images/sec +``` + +When training on just the CPU (with the `--cpu` argument), the throughput is significantly lower (almost 30x!): +``` +Epoch: 0 | avg. tr_loss 2.074 | avg. tr_acc 0.216 | Train Throughput: 13.5 images/sec +``` + +## Results +After training for 100 epochs, the following results were observed: +``` +Epoch: 99 | avg. tr_loss 0.320 | avg. tr_acc 0.888 | Train Throughput: 416.77 images/sec +Epoch: 99 | test_acc 0.807 +``` +At the time of writing, `mlx` doesn't have in-built `schedulers`, nor a `BatchNorm` layer. We'll revisit this example for exact reproduction once these features are added. 
\ No newline at end of file diff --git a/cifar/dataset.py b/cifar/dataset.py index f4a3cd63..29f558d1 100644 --- a/cifar/dataset.py +++ b/cifar/dataset.py @@ -36,4 +36,4 @@ def get_cifar10(batch_size, root=None): num_tr_steps_per_epoch = num_tr_samples // batch_size num_test_steps_per_epoch = num_test_samples // batch_size - return tr_iter, test_iter, num_tr_steps_per_epoch, num_test_steps_per_epoch + return tr_iter, test_iter diff --git a/cifar/main.py b/cifar/main.py index 5272733a..29b0cbc7 100644 --- a/cifar/main.py +++ b/cifar/main.py @@ -1,6 +1,6 @@ import argparse +import time import resnet -import numpy as np import mlx.nn as nn import mlx.core as mx import mlx.optimizers as optim @@ -14,11 +14,11 @@ parser.add_argument( default="resnet20", help="model architecture [resnet20, resnet32, resnet44, resnet56, resnet110, resnet1202]", ) -parser.add_argument("--batch_size", type=int, default=128, help="batch size") +parser.add_argument("--batch_size", type=int, default=256, help="batch size") parser.add_argument("--epochs", type=int, default=100, help="number of epochs") parser.add_argument("--lr", type=float, default=1e-3, help="learning rate") parser.add_argument("--seed", type=int, default=0, help="random seed") -parser.add_argument("--cpu_only", action="store_true", help="use cpu only") +parser.add_argument("--cpu", action="store_true", help="use cpu only") def loss_fn(model, inp, tgt): @@ -40,27 +40,30 @@ def train_epoch(model, train_iter, optimizer, epoch): losses = [] accs = [] + samples_per_sec = [] for batch_counter, batch in enumerate(train_iter): x = mx.array(batch["image"]) y = mx.array(batch["label"]) + tic = time.perf_counter() (loss, acc), grads = train_step_fn(model, x, y) optimizer.update(model, grads) mx.eval(model.parameters(), optimizer.state) - + toc = time.perf_counter() loss_value = loss.item() acc_value = acc.item() losses.append(loss_value) accs.append(acc_value) - + samples_per_sec.append(x.shape[0] / (toc - tic)) if batch_counter % 10 == 0: print( - f"Epoch {epoch:02d}[{batch_counter:03d}]: tr_loss {loss_value:.3f}, tr_acc {acc_value:.3f}" + f"Epoch {epoch:02d} [{batch_counter:03d}] | tr_loss {loss_value:.3f} | tr_acc {acc_value:.3f} | Throughput: {x.shape[0] / (toc - tic):.2f} images/second" ) - mean_tr_loss = np.mean(np.array(losses)) - mean_tr_acc = np.mean(np.array(accs)) - return mean_tr_loss, mean_tr_acc + mean_tr_loss = mx.mean(mx.array(losses)) + mean_tr_acc = mx.mean(mx.array(accs)) + samples_per_sec = mx.mean(mx.array(samples_per_sec)) + return mean_tr_loss, mean_tr_acc, samples_per_sec def test_epoch(model, test_iter, epoch): @@ -71,13 +74,11 @@ def test_epoch(model, test_iter, epoch): acc = eval_fn(model, x, y) acc_value = acc.item() accs.append(acc_value) - mean_acc = np.mean(np.array(accs)) - + mean_acc = mx.mean(mx.array(accs)) return mean_acc def main(args): - np.random.seed(args.seed) mx.random.seed(args.seed) model = resnet.__dict__[args.arch]() @@ -87,22 +88,24 @@ def main(args): optimizer = optim.Adam(learning_rate=args.lr) + train_data, test_data = get_cifar10(args.batch_size) for epoch in range(args.epochs): - # get data every epoch - # or set .repeat() on the data stream appropriately - train_data, test_data, tr_batches, _ = get_cifar10(args.batch_size) - - epoch_tr_loss, epoch_tr_acc = train_epoch(model, train_data, optimizer, epoch) + epoch_tr_loss, epoch_tr_acc, train_throughput = train_epoch( + model, train_data, optimizer, epoch + ) print( - f"Epoch {epoch}: avg. tr_loss {epoch_tr_loss:.3f}, avg. 
tr_acc {epoch_tr_acc:.3f}" + f"Epoch: {epoch} | avg. tr_loss {epoch_tr_loss.item():.3f} | avg. tr_acc {epoch_tr_acc.item():.3f} | Train Throughput: {train_throughput.item():.2f} images/sec" ) epoch_test_acc = test_epoch(model, test_data, epoch) - print(f"Epoch {epoch}: Test_acc {epoch_test_acc:.3f}") + print(f"Epoch: {epoch} | test_acc {epoch_test_acc.item():.3f}") + + train_data.reset() + test_data.reset() if __name__ == "__main__": args = parser.parse_args() - if args.cpu_only: + if args.cpu: mx.set_default_device(mx.cpu) main(args) diff --git a/cifar/requirements.txt b/cifar/requirements.txt index c4c2e575..6ff78a64 100644 --- a/cifar/requirements.txt +++ b/cifar/requirements.txt @@ -1,3 +1,2 @@ mlx -mlx-data -numpy \ No newline at end of file +mlx-data \ No newline at end of file diff --git a/cifar/resnet.py b/cifar/resnet.py index b89a612b..6eeadda6 100644 --- a/cifar/resnet.py +++ b/cifar/resnet.py @@ -38,7 +38,6 @@ class ShortcutA(nn.Module): class Block(nn.Module): - expansion = 1 """ Implements a ResNet block with two convolutional layers and a skip connection. As per the paper, CIFAR-10 uses Shortcut type-A skip connections. (See paper for details) @@ -57,7 +56,7 @@ class Block(nn.Module): ) self.bn2 = nn.LayerNorm(dims) - if stride != 1 or in_dims != dims: + if stride != 1: self.shortcut = ShortcutA(dims) else: self.shortcut = None @@ -83,20 +82,19 @@ class ResNet(nn.Module): super().__init__() self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False) self.bn1 = nn.LayerNorm(16) - self.in_dims = 16 - self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1) - self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2) - self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2) + self.layer1 = self._make_layer(block, 16, 16, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 16, 32, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 32, 64, num_blocks[2], stride=2) self.linear = nn.Linear(64, num_classes) - def _make_layer(self, block, dims, num_blocks, stride): + def _make_layer(self, block, in_dims, dims, num_blocks, stride): strides = [stride] + [1] * (num_blocks - 1) layers = [] for stride in strides: - layers.append(block(self.in_dims, dims, stride)) - self.in_dims = dims * block.expansion + layers.append(block(in_dims, dims, stride)) + in_dims = dims return nn.Sequential(*layers) def num_params(self): From 29b7a973421222e52d182c24564c611955dcdfe4 Mon Sep 17 00:00:00 2001 From: Sarthak Yadav Date: Thu, 14 Dec 2023 16:28:00 +0100 Subject: [PATCH 4/6] updated header --- cifar/resnet.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cifar/resnet.py b/cifar/resnet.py index 6eeadda6..22b8a31a 100644 --- a/cifar/resnet.py +++ b/cifar/resnet.py @@ -3,9 +3,6 @@ Implementation of ResNets for CIFAR-10 as per the original paper [https://arxiv. Configurations include ResNet-20, ResNet-32, ResNet-44, ResNet-56, ResNet-110, ResNet-1202. There's no BatchNorm is mlx==0.0.4, using LayerNorm instead. 
- -Authors: - Sarthak Yadav, 2023 """ from typing import Any From b1b9b11801e4d86f36ac569e199d70b39f00bfe2 Mon Sep 17 00:00:00 2001 From: Awni Hannun Date: Thu, 14 Dec 2023 12:09:10 -0800 Subject: [PATCH 5/6] updates + format --- cifar/README.md | 38 ++++++++++++++++++------------------ cifar/dataset.py | 21 ++++++-------------- cifar/main.py | 51 ++++++++++++++++++++++++++++-------------------- cifar/resnet.py | 1 - 4 files changed, 55 insertions(+), 56 deletions(-) diff --git a/cifar/README.md b/cifar/README.md index abb2c0f5..118aef9e 100644 --- a/cifar/README.md +++ b/cifar/README.md @@ -1,9 +1,13 @@ # CIFAR and ResNets -An example of training a ResNet on CIFAR-10 with MLX. Several ResNet configurations in accordance with the original [paper](https://arxiv.org/abs/1512.03385) are available. Also illustrates how to use `mlx-data` to download and load the dataset. - +An example of training a ResNet on CIFAR-10 with MLX. Several ResNet +configurations in accordance with the original +[paper](https://arxiv.org/abs/1512.03385) are available. The example also +illustrates how to use [MLX Data](https://github.com/ml-explore/mlx-data) to +load the dataset. ## Pre-requisites + Install the dependencies: ``` @@ -11,6 +15,7 @@ pip install -r requirements.txt ``` ## Running the example + Run the example with: ``` @@ -29,23 +34,18 @@ For all available options, run: python main.py --help ``` - -## Throughput - -On the tested device (M1 Macbook Pro, 16GB RAM), I get the following throughput with a `batch_size=256`: -``` -Epoch: 0 | avg. tr_loss 2.074 | avg. tr_acc 0.216 | Train Throughput: 415.39 images/sec -``` - -When training on just the CPU (with the `--cpu` argument), the throughput is significantly lower (almost 30x!): -``` -Epoch: 0 | avg. tr_loss 2.074 | avg. tr_acc 0.216 | Train Throughput: 13.5 images/sec -``` - ## Results -After training for 100 epochs, the following results were observed: + +After training with the default `resnet20` architecture for 100 epochs, you +should see the following results: + ``` -Epoch: 99 | avg. tr_loss 0.320 | avg. tr_acc 0.888 | Train Throughput: 416.77 images/sec -Epoch: 99 | test_acc 0.807 +Epoch: 99 | avg. Train loss 0.320 | avg. Train acc 0.888 | Throughput: 416.77 images/sec +Epoch: 99 | Test acc 0.807 ``` -At the time of writing, `mlx` doesn't have in-built `schedulers`, nor a `BatchNorm` layer. We'll revisit this example for exact reproduction once these features are added. \ No newline at end of file + +Note this was run on an M1 Macbook Pro with 16GB RAM. + +At the time of writing, `mlx` doesn't have built-in learning rate schedules, +nor a `BatchNorm` layer. We intend to update this example once these features +are added. 
diff --git a/cifar/dataset.py b/cifar/dataset.py index 29f558d1..89b10136 100644 --- a/cifar/dataset.py +++ b/cifar/dataset.py @@ -4,13 +4,15 @@ import math def get_cifar10(batch_size, root=None): - tr = load_cifar10(root=root) - num_tr_samples = tr.size() mean = mx.array([0.485, 0.456, 0.406]).reshape((1, 1, 3)) std = mx.array([0.229, 0.224, 0.225]).reshape((1, 1, 3)) + def normalize(x): + x = x.astype("float32") / 255.0 + return (x - mean) / std + tr_iter = ( tr.shuffle() .to_stream() @@ -18,22 +20,11 @@ def get_cifar10(batch_size, root=None): .pad("image", 0, 4, 4, 0.0) .pad("image", 1, 4, 4, 0.0) .image_random_crop("image", 32, 32) - .key_transform("image", lambda x: (x.astype("float32") / 255.0)) - .key_transform("image", lambda x: (x - mean) / std) + .key_transform("image", normalize) .batch(batch_size) ) test = load_cifar10(root=root, train=False) - num_test_samples = test.size() - - test_iter = ( - test.to_stream() - .key_transform("image", lambda x: (x.astype("float32") / 255.0)) - .key_transform("image", lambda x: (x - mean) / std) - .batch(batch_size) - ) - - num_tr_steps_per_epoch = num_tr_samples // batch_size - num_test_steps_per_epoch = num_test_samples // batch_size + test_iter = test.to_stream().key_transform("image", normalize).batch(batch_size) return tr_iter, test_iter diff --git a/cifar/main.py b/cifar/main.py index 29b0cbc7..26d06a6a 100644 --- a/cifar/main.py +++ b/cifar/main.py @@ -12,7 +12,8 @@ parser.add_argument( "--arch", type=str, default="resnet20", - help="model architecture [resnet20, resnet32, resnet44, resnet56, resnet110, resnet1202]", + choices=[f"resnet{d}" for d in [20, 32, 44, 56, 110, 1202]], + help="model architecture", ) parser.add_argument("--batch_size", type=int, default=256, help="batch size") parser.add_argument("--epochs", type=int, default=100, help="number of epochs") @@ -21,10 +22,6 @@ parser.add_argument("--seed", type=int, default=0, help="random seed") parser.add_argument("--cpu", action="store_true", help="use cpu only") -def loss_fn(model, inp, tgt): - return mx.mean(nn.losses.cross_entropy(model(inp), tgt)) - - def eval_fn(model, inp, tgt): return mx.mean(mx.argmax(model(inp), axis=1) == tgt) @@ -50,17 +47,25 @@ def train_epoch(model, train_iter, optimizer, epoch): optimizer.update(model, grads) mx.eval(model.parameters(), optimizer.state) toc = time.perf_counter() - loss_value = loss.item() - acc_value = acc.item() - losses.append(loss_value) - accs.append(acc_value) - samples_per_sec.append(x.shape[0] / (toc - tic)) + loss = loss.item() + acc = acc.item() + losses.append(loss) + accs.append(acc) + throughput = x.shape[0] / (toc - tic) + samples_per_sec.append(throughput) if batch_counter % 10 == 0: print( - f"Epoch {epoch:02d} [{batch_counter:03d}] | tr_loss {loss_value:.3f} | tr_acc {acc_value:.3f} | Throughput: {x.shape[0] / (toc - tic):.2f} images/second" + " | ".join( + ( + f"Epoch {epoch:02d} [{batch_counter:03d}]", + f"Train loss {loss:.3f}", + f"Train acc {acc:.3f}", + f"Throughput: {throughput:.2f} images/second", + ) + ) ) - mean_tr_loss = mx.mean(mx.array(losses)) + eean_tr_loss = mx.mean(mx.array(losses)) mean_tr_acc = mx.mean(mx.array(accs)) samples_per_sec = mx.mean(mx.array(samples_per_sec)) return mean_tr_loss, mean_tr_acc, samples_per_sec @@ -81,24 +86,28 @@ def test_epoch(model, test_iter, epoch): def main(args): mx.random.seed(args.seed) - model = resnet.__dict__[args.arch]() + model = getattr(resnet, args.arch)() - print("num_params: {:0.04f} M".format(model.num_params() / 1e6)) - mx.eval(model.parameters()) + 
print("Number of params: {:0.04f} M".format(model.num_params() / 1e6)) optimizer = optim.Adam(learning_rate=args.lr) train_data, test_data = get_cifar10(args.batch_size) for epoch in range(args.epochs): - epoch_tr_loss, epoch_tr_acc, train_throughput = train_epoch( - model, train_data, optimizer, epoch - ) + tr_loss, tr_acc, throughput = train_epoch(model, train_data, optimizer, epoch) print( - f"Epoch: {epoch} | avg. tr_loss {epoch_tr_loss.item():.3f} | avg. tr_acc {epoch_tr_acc.item():.3f} | Train Throughput: {train_throughput.item():.2f} images/sec" + " | ".join( + ( + f"Epoch: {epoch}", + f"avg. Train loss {tr_loss.item():.3f}", + f"avg. Train acc {tr_acc.item():.3f}", + f"Throughput: {throughput.item():.2f} images/sec", + ) + ) ) - epoch_test_acc = test_epoch(model, test_data, epoch) - print(f"Epoch: {epoch} | test_acc {epoch_test_acc.item():.3f}") + test_acc = test_epoch(model, test_data, epoch) + print(f"Epoch: {epoch} | Test acc {test_acc.item():.3f}") train_data.reset() test_data.reset() diff --git a/cifar/resnet.py b/cifar/resnet.py index 22b8a31a..758ee3de 100644 --- a/cifar/resnet.py +++ b/cifar/resnet.py @@ -59,7 +59,6 @@ class Block(nn.Module): self.shortcut = None def __call__(self, x): - out = nn.relu(self.bn1(self.conv1(x))) out = self.bn2(self.conv2(out)) if self.shortcut is None: From b9439ce74e3040ad7e91f49327720f4a0b0aa912 Mon Sep 17 00:00:00 2001 From: Awni Hannun Date: Thu, 14 Dec 2023 12:14:01 -0800 Subject: [PATCH 6/6] typo / nits --- cifar/README.md | 2 +- cifar/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cifar/README.md b/cifar/README.md index 118aef9e..d6bdaf9a 100644 --- a/cifar/README.md +++ b/cifar/README.md @@ -47,5 +47,5 @@ Epoch: 99 | Test acc 0.807 Note this was run on an M1 Macbook Pro with 16GB RAM. At the time of writing, `mlx` doesn't have built-in learning rate schedules, -nor a `BatchNorm` layer. We intend to update this example once these features +or a `BatchNorm` layer. We intend to update this example once these features are added. diff --git a/cifar/main.py b/cifar/main.py index 26d06a6a..829417b1 100644 --- a/cifar/main.py +++ b/cifar/main.py @@ -65,7 +65,7 @@ def train_epoch(model, train_iter, optimizer, epoch): ) ) - eean_tr_loss = mx.mean(mx.array(losses)) + mean_tr_loss = mx.mean(mx.array(losses)) mean_tr_acc = mx.mean(mx.array(accs)) samples_per_sec = mx.mean(mx.array(samples_per_sec)) return mean_tr_loss, mean_tr_acc, samples_per_sec