Added Keyword Transformer + SpeechCommands

This commit is contained in:
  parent 08e862336a
  commit 3e24277ba3

speechcommands/README.md (new file, 60 lines)

@@ -0,0 +1,60 @@

# Training a Keyword Transformer on SpeechCommands

An example of training the [Keyword Spotting Transformer](https://www.isca-speech.org/archive/interspeech_2021/berg21_interspeech.html), a variant of the Vision Transformer, on the [Speech Commands](https://arxiv.org/abs/1804.03209) (v0.02) dataset with MLX. All supervised-only configurations from the paper are available. The example also illustrates how to use [MLX Data](https://github.com/ml-explore/mlx-data) to load and process an audio dataset.

## Pre-requisites

Install the `mlx` package:

```
pip install mlx==0.0.5
```

At the time of writing, the SpeechCommands dataset is not yet part of an `mlx-data` release. Install `mlx-data` from source using this [commit](https://github.com/ml-explore/mlx-data/commit/ae3431648b8e1594d63175a8f121d9873aeb9daa).

## Running the example

Run the example with:

```
python main.py
```

By default the example runs on the GPU. To run on the CPU, use:

```
python main.py --cpu
```

For all available options, run:

```
python main.py --help
```

## Results

After training with the `kwt1` architecture for 100 epochs, you should see the following results:

```
Epoch: 99 | avg. Train loss 0.581 | avg. Train acc 0.826 | Throughput: 677.37 samples/sec
Epoch: 99 | Val acc 0.710
Testing best model from Epoch 98
Test acc -> 0.687
```

For the `kwt2` model, you should see:

```
Epoch: 99 | avg. Train loss 0.137 | avg. Train acc 0.956 | Throughput: 401.47 samples/sec
Epoch: 99 | Val acc 0.739
Testing best model from Epoch 97
Test acc -> 0.718
```

Note that these results were obtained on an M1 MacBook Pro with 16GB of RAM.

At the time of writing, `mlx` does not yet have built-in cosine learning rate schedules, which are used along with the AdamW optimizer in the official implementation. We intend to update this example once these features are added, as well as to add appropriate data augmentations.

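In the meantime, a cosine schedule can be applied by hand. Below is a minimal sketch, assuming the optimizer's `learning_rate` attribute can be reassigned between epochs; the `cosine_lr` helper is illustrative, not an `mlx` API:

```
import math

def cosine_lr(base_lr, epoch, total_epochs, min_lr=0.0):
    # cosine decay: base_lr at epoch 0 down to min_lr at the final epoch
    t = epoch / max(1, total_epochs - 1)
    return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(math.pi * t))

# e.g., at the top of the epoch loop in main.py:
# optimizer.learning_rate = cosine_lr(args.lr, epoch, args.epochs)
```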

speechcommands/kwt.py (new file, 231 lines)

@@ -0,0 +1,231 @@

from typing import Any

import mlx.core as mx
import mlx.nn as nn
from mlx.utils import tree_flatten


__all__ = ["KWT", "kwt1", "kwt2", "kwt3"]

STD = 0.02


class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout=0.0):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout) if dropout != 0.0 else Identity(),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout) if dropout != 0.0 else Identity(),
        )

    def __call__(self, x):
        return self.net(x)


class Identity(nn.Module):
    def __init__(self):
        super().__init__()

    def __call__(self, x):
        return x


class Attention(nn.Module):
    def __init__(self, dim, heads, dropout=0.0):
        super().__init__()
        self.heads = heads
        self.scale = dim**-0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=False)
        self.out = nn.Sequential(
            nn.Linear(dim, dim), nn.Dropout(dropout) if dropout != 0.0 else Identity()
        )

    def __call__(self, x):
        b, n, _, h = *x.shape, self.heads
        qkv = self.qkv(x)
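        # split the fused projection into per-head q, k, v, each of shape
        # (batch, heads, seq, head_dim), then apply scaled dot-product attention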
        qkv = qkv.reshape(b, n, 3, h, -1).transpose((2, 0, 3, 1, 4))
        q, k, v = qkv
        attn = (q @ k.transpose(0, 1, 3, 2)) * self.scale
        attn = mx.softmax(attn, axis=-1)
        x = (attn @ v).transpose((0, 2, 1, 3)).reshape(b, n, -1)
        x = self.out(x)
        return x


class Block(nn.Module):
    def __init__(self, dim, heads, mlp_dim, dropout=0.0):
        super().__init__()
        # self.attn = nn.MultiHeadAttention(dim, heads)
        self.attn = Attention(dim, heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(dim)
        self.ff = FeedForward(dim, mlp_dim, dropout=dropout)
        self.norm2 = nn.LayerNorm(dim)

    def __call__(self, x):
        # PostNorm: residual connection followed by LayerNorm
        x = self.norm1(self.attn(x) + x)
        x = self.norm2(self.ff(x) + x)
        return x


class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, mlp_dim, dropout=0.0):
        super().__init__()

        self.layers = []
        for _ in range(depth):
            self.layers.append(Block(dim, heads, mlp_dim, dropout=dropout))

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x


class KWT(nn.Module):
    """
    Implements the Keyword Transformer (KWT) [1] model.

    KWT is essentially a vision transformer [2] with minor modifications:
    - Instead of square patches, KWT uses rectangular patches: one patch
      spanning all frequencies at each timestep
    - KWT blocks apply LayerNormalization after the attention/feedforward
      layers (with residual connections), also referred to as PostNorm

    [1] https://arxiv.org/abs/2104.11178
    [2] https://arxiv.org/abs/2010.11929

    Parameters
    ----------
    input_res: tuple of ints
        Input resolution (time, frequency)
    patch_res: tuple of ints
        Patch resolution (time, frequency)
    num_classes: int
        Number of classes
    dim: int
        Model embedding dimension
    depth: int
        Number of transformer layers
    heads: int
        Number of attention heads
    mlp_dim: int
        Feedforward hidden dimension
    pool: str
        Pooling type, either "cls" or "mean"
    in_channels: int, optional
        Number of input channels
    dropout: float, optional
        Dropout rate
    emb_dropout: float, optional
        Embedding dropout rate
    """

    def __init__(
        self,
        input_res,
        patch_res,
        num_classes,
        dim,
        depth,
        heads,
        mlp_dim,
        pool="mean",
        in_channels=1,
        dropout=0.0,
        emb_dropout=0.0,
    ):
        super().__init__()
        self.num_patches = int(
            (input_res[0] / patch_res[0]) * (input_res[1] / patch_res[1])
        )
        self.dim = dim

        self.patch_embedding = nn.Conv2d(
            in_channels, dim, kernel_size=patch_res, stride=patch_res
        )
        self.pos_embedding = mx.random.truncated_normal(
            -1 * STD / 2, STD / 2, (1, self.num_patches + 1, dim)
        )
        self.cls_token = mx.random.truncated_normal(-1 * STD / 2, STD / 2, (1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout) if emb_dropout != 0.0 else Identity()
        self.transformer = Transformer(dim, depth, heads, mlp_dim, dropout)
        self.pool = pool
        self.mlp_head = nn.Sequential(nn.LayerNorm(dim), nn.Linear(dim, num_classes))

    def num_params(self):
        nparams = sum(x.size for k, x in tree_flatten(self.parameters()))
        return nparams

    def __call__(self, x):
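        # treat the (batch, time, freq) features as a single-channel,
        # channels-last image: (B, T, F) -> (B, T, F, 1)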
        if x.ndim != 4:
            x = mx.expand_dims(x, axis=-1)
        x = self.patch_embedding(x)
        x = x.reshape(x.shape[0], -1, self.dim)
        assert x.shape[1] == self.num_patches

        # x = x + self.pos_embedding[:, 1:, :]
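
        # prepend a learnable class token, then add learned positional
        # embeddings (index 0 corresponds to the class token)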
        cls_tokens = mx.broadcast_to(self.cls_token, (x.shape[0], 1, self.dim))
        x = mx.concatenate((cls_tokens, x), axis=1)

        x = x + self.pos_embedding

        x = self.dropout(x)
        x = self.transformer(x)
        x = x.mean(axis=1) if self.pool == "mean" else x[:, 0]
        x = self.mlp_head(x)
        return x

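
# Defaults: 98 x 40 inputs (one 1 x 40 patch per timestep across all mel
# bins) and the 35 Speech Commands classes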
def parse_kwt_args(**kwargs):
    input_res = kwargs.pop("input_res", [98, 40])
    patch_res = kwargs.pop("patch_res", [1, 40])
    num_classes = kwargs.pop("num_classes", 35)
    emb_dropout = kwargs.pop("emb_dropout", 0.1)
    return input_res, patch_res, num_classes, emb_dropout, kwargs


def kwt1(**kwargs):
    input_res, patch_res, num_classes, emb_dropout, kwargs = parse_kwt_args(**kwargs)
    return KWT(
        input_res,
        patch_res,
        num_classes,
        dim=64,
        depth=12,
        heads=1,
        mlp_dim=256,
        emb_dropout=emb_dropout,
        **kwargs
    )


def kwt2(**kwargs):
    input_res, patch_res, num_classes, emb_dropout, kwargs = parse_kwt_args(**kwargs)
    return KWT(
        input_res,
        patch_res,
        num_classes,
        dim=128,
        depth=12,
        heads=2,
        mlp_dim=512,
        emb_dropout=emb_dropout,
        **kwargs
    )


def kwt3(**kwargs):
    input_res, patch_res, num_classes, emb_dropout, kwargs = parse_kwt_args(**kwargs)
    return KWT(
        input_res,
        patch_res,
        num_classes,
        dim=192,
        depth=12,
        heads=3,
        mlp_dim=768,
        emb_dropout=emb_dropout,
        **kwargs
    )

speechcommands/main.py (new file, 160 lines)

@@ -0,0 +1,160 @@

import argparse
import time

import mlx.core as mx
import mlx.data as dx
import mlx.nn as nn
import mlx.optimizers as optim
from mlx.data.datasets import load_speechcommands
from mlx.data.features import mfsc

import kwt


parser = argparse.ArgumentParser(add_help=True)
parser.add_argument(
    "--arch",
    type=str,
    default="kwt1",
    choices=[f"kwt{d}" for d in [1, 2, 3]],
    help="model architecture",
)
parser.add_argument("--batch_size", type=int, default=256, help="batch size")
parser.add_argument("--epochs", type=int, default=100, help="number of epochs")
parser.add_argument("--lr", type=float, default=1e-3, help="learning rate")
parser.add_argument("--seed", type=int, default=0, help="random seed")
parser.add_argument("--cpu", action="store_true", help="use cpu only")


def prepare_dataset(batch_size, split, root=None):
    def normalize(x):
        return (x - x.mean()) / x.std()

    data = load_speechcommands(split=split, root=root)
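
    # 40-band mel filterbank features over 30 ms windows with a 10 ms stride:
    # 1 s of 16 kHz audio becomes a (98, 40) array, matching the model's
    # default input_res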
    data_iter = (
        data.squeeze("audio")
        .key_transform(
            "audio",
            mfsc(
                40,
                16000,
                frame_size_ms=30,
                frame_stride_ms=10,
                high_freq=7600,
                low_freq=20,
            ),
        )
        .key_transform("audio", normalize)
        .shuffle()
        .batch(batch_size)
    )
    return data_iter


def eval_fn(model, inp, tgt):
    return mx.mean(mx.argmax(model(inp), axis=1) == tgt)


def train_epoch(model, train_iter, optimizer, epoch):
    def train_step(model, inp, tgt):
        output = model(inp)
        loss = mx.mean(nn.losses.cross_entropy(output, tgt))
        acc = mx.mean(mx.argmax(output, axis=1) == tgt)
        return loss, acc
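
    # differentiate train_step with respect to the model's trainable
    # parameters; returns the (loss, acc) outputs together with the gradients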
    train_step_fn = nn.value_and_grad(model, train_step)

    losses = []
    accs = []
    samples_per_sec = []

    for batch_counter, batch in enumerate(train_iter):
        x = mx.array(batch["audio"])
        y = mx.array(batch["label"])
        tic = time.perf_counter()
        (loss, acc), grads = train_step_fn(model, x, y)
        optimizer.update(model, grads)
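        # computation in mlx is lazy: force evaluation so the parameter and
        # optimizer-state updates are materialized (and timed) here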
        mx.eval(model.parameters(), optimizer.state)
        toc = time.perf_counter()
        loss = loss.item()
        acc = acc.item()
        losses.append(loss)
        accs.append(acc)
        throughput = x.shape[0] / (toc - tic)
        samples_per_sec.append(throughput)
        if batch_counter % 25 == 0:
            print(
                " | ".join(
                    (
                        f"Epoch {epoch:02d} [{batch_counter:03d}]",
                        f"Train loss {loss:.3f}",
                        f"Train acc {acc:.3f}",
                        f"Throughput: {throughput:.2f} samples/second",
                    )
                )
            )

    mean_tr_loss = mx.mean(mx.array(losses))
    mean_tr_acc = mx.mean(mx.array(accs))
    samples_per_sec = mx.mean(mx.array(samples_per_sec))
    return mean_tr_loss, mean_tr_acc, samples_per_sec


def test_epoch(model, test_iter):
    accs = []
    for batch_counter, batch in enumerate(test_iter):
        x = mx.array(batch["audio"])
        y = mx.array(batch["label"])
        acc = eval_fn(model, x, y)
        acc_value = acc.item()
        accs.append(acc_value)
    mean_acc = mx.mean(mx.array(accs))
    return mean_acc


def main(args):
    mx.random.seed(args.seed)

    model = getattr(kwt, args.arch)()

    print("Number of params: {:0.04f} M".format(model.num_params() / 1e6))
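
    # note: the official implementation uses AdamW with a cosine learning
    # rate schedule (see README); plain SGD with momentum is used here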
    optimizer = optim.SGD(learning_rate=args.lr, momentum=0.9, weight_decay=1e-4)

    train_data = prepare_dataset(args.batch_size, "train")
    val_data = prepare_dataset(args.batch_size, "validation")

    best_params = None
    best_acc = 0.0
    best_epoch = 0
    for epoch in range(args.epochs):
        tr_loss, tr_acc, throughput = train_epoch(model, train_data, optimizer, epoch)
        print(
            " | ".join(
                (
                    f"Epoch: {epoch}",
                    f"avg. Train loss {tr_loss.item():.3f}",
                    f"avg. Train acc {tr_acc.item():.3f}",
                    f"Throughput: {throughput.item():.2f} samples/sec",
                )
            )
        )

        val_acc = test_epoch(model, val_data)
        print(f"Epoch: {epoch} | Val acc {val_acc.item():.3f}")

        if val_acc >= best_acc:
            best_acc = val_acc
            best_epoch = epoch
            best_params = model.parameters()
    print(f"Testing best model from Epoch {best_epoch}")
    model.update(best_params)
    test_data = prepare_dataset(args.batch_size, "test")
    test_acc = test_epoch(model, test_data)
    print(f"Test acc -> {test_acc.item():.3f}")


if __name__ == "__main__":
    args = parser.parse_args()
    if args.cpu:
        mx.set_default_device(mx.cpu)
    main(args)

speechcommands/requirements.txt (new file, 2 lines)

@@ -0,0 +1,2 @@

mlx==0.0.5
mlx-data