# mlx-examples/cifar/main.py

import argparse
import time
from functools import partial

import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
import resnet
from dataset import get_cifar10

parser = argparse.ArgumentParser(add_help=True)
parser.add_argument(
    "--arch",
    type=str,
    default="resnet20",
    choices=[f"resnet{d}" for d in [20, 32, 44, 56, 110, 1202]],
    help="model architecture",
)
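# The depths (20 through 1202) are the CIFAR-sized ResNet variants defined in
# resnet.py and looked up by name via getattr() in main().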
parser.add_argument("--batch_size", type=int, default=256, help="batch size")
parser.add_argument("--epochs", type=int, default=30, help="number of epochs")
parser.add_argument("--lr", type=float, default=1e-3, help="learning rate")
parser.add_argument("--seed", type=int, default=0, help="random seed")
parser.add_argument("--cpu", action="store_true", help="use cpu only")


def print_zero(group, *args, **kwargs):
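    # Print only from rank 0 so multi-process (distributed) runs do not emit a
    # copy of every log line from each process.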
    if group.rank() != 0:
        return
    flush = kwargs.pop("flush", True)
    print(*args, **kwargs, flush=flush)


def eval_fn(model, inp, tgt):
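    # Top-1 accuracy for a batch: compare the argmax of the logits against the
    # integer class targets.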
    return mx.mean(mx.argmax(model(inp), axis=1) == tgt)


def train_epoch(model, train_iter, optimizer, epoch):
    def train_step(model, inp, tgt):
        output = model(inp)
        loss = mx.mean(nn.losses.cross_entropy(output, tgt))
        acc = mx.mean(mx.argmax(output, axis=1) == tgt)
        return loss, acc

    world = mx.distributed.init()
    losses = 0
    accuracies = 0
    samples_per_sec = 0
    count = 0

    def average_stats(stats, count):
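        # With a single process just divide by the local count; otherwise
        # all_sum the per-rank sums and counts on the CPU stream and divide
        # to obtain global averages.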
        if world.size() == 1:
            return [s / count for s in stats]
        with mx.stream(mx.cpu):
            stats = mx.distributed.all_sum(mx.array(stats))
            count = mx.distributed.all_sum(count)
            return (stats / count).tolist()
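
    # Compiling the step with the model/optimizer state as inputs and outputs
    # lets parameter and optimizer updates made inside the compiled function
    # persist outside of it; gradients are averaged across ranks before the
    # optimizer update.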
    state = [model.state, optimizer.state]

    @partial(mx.compile, inputs=state, outputs=state)
    def step(inp, tgt):
        train_step_fn = nn.value_and_grad(model, train_step)
        (loss, acc), grads = train_step_fn(model, inp, tgt)
        grads = nn.utils.average_gradients(grads)
        optimizer.update(model, grads)
        return loss, acc

    for batch_counter, batch in enumerate(train_iter):
        x = mx.array(batch["image"])
        y = mx.array(batch["label"])
        tic = time.perf_counter()
        loss, acc = step(x, y)
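        # MLX evaluates lazily; mx.eval forces the compiled step (and the
        # model/optimizer state update) to run so the timing reflects the
        # actual computation.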
        mx.eval(loss, acc, state)
        toc = time.perf_counter()
        losses += loss.item()
        accuracies += acc.item()
        samples_per_sec += x.shape[0] / (toc - tic)
        count += 1
        if batch_counter % 10 == 0:
            l, a, s = average_stats(
                [losses, accuracies, world.size() * samples_per_sec],
                count,
            )
            print_zero(
                world,
                " | ".join(
                    (
                        f"Epoch {epoch:02d} [{batch_counter:03d}]",
                        f"Train loss {l:.3f}",
                        f"Train acc {a:.3f}",
                        f"Throughput: {s:.2f} images/second",
                    )
                ),
            )

    return average_stats([losses, accuracies, world.size() * samples_per_sec], count)


def test_epoch(model, test_iter, epoch):
    accuracies = 0
    count = 0
    for batch_counter, batch in enumerate(test_iter):
        x = mx.array(batch["image"])
        y = mx.array(batch["label"])
        acc = eval_fn(model, x, y)
        accuracies += acc.item()
        count += 1
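
    # Combine the per-rank sums with all_sum and divide to get the global
    # average test accuracy (with a single process the reduction leaves the
    # values unchanged).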
    with mx.stream(mx.cpu):
        accuracies = mx.distributed.all_sum(accuracies)
        count = mx.distributed.all_sum(count)
        return (accuracies / count).item()


def main(args):
    mx.random.seed(args.seed)

    # Initialize the distributed group and report the nodes that showed up
    world = mx.distributed.init()
    if world.size() > 1:
        print(f"Starting rank {world.rank()} of {world.size()}", flush=True)
    model = getattr(resnet, args.arch)()
    print_zero(world, f"Number of params: {model.num_params() / 1e6:0.04f} M")

    optimizer = optim.Adam(learning_rate=args.lr)
    train_data, test_data = get_cifar10(args.batch_size)

    for epoch in range(args.epochs):
        tr_loss, tr_acc, throughput = train_epoch(model, train_data, optimizer, epoch)
        print_zero(
            world,
            " | ".join(
                (
                    f"Epoch: {epoch}",
                    f"avg. Train loss {tr_loss:.3f}",
                    f"avg. Train acc {tr_acc:.3f}",
                    f"Throughput: {throughput:.2f} images/sec",
                )
            ),
        )

        test_acc = test_epoch(model, test_data, epoch)
        print_zero(world, f"Epoch: {epoch} | Test acc {test_acc:.3f}")
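
        # Reset the data streams so the next epoch iterates from the start.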
        train_data.reset()
        test_data.reset()


if __name__ == "__main__":
    args = parser.parse_args()
    if args.cpu:
        mx.set_default_device(mx.cpu)
    main(args)