# mlx/benchmarks/python/packed_qmm_bench.py

import argparse
import math

import mlx.core as mx
from time_utils import time_fn

# Problem sizes used below: the input is (B, D), the first weight matrix is
# (M, D) and the second is (D, M).
B = 1024
D = 1024
M = 4 * D
# Quantization parameters forwarded to mx.quantize and mx.quantized_matmul.
# ``bits`` is only the initial value: the __main__ block rebinds it per run.
group_size = 64
bits = 4
# Element type the random input and weights are cast to before quantizing.
dtype = mx.float16
# Number of times qmm_ repeats its pair of quantized matmuls per call.
loops = 10


def qmm_(x, wq1, wq2, q_type):
    """Chain quantized matmuls through both weight sets, ``loops`` times.

    Each iteration multiplies the running result by ``wq1`` and then by
    ``wq2``; the output of one call is the input of the next.

    Args:
        x: Input array for the first multiplication.
        wq1: Sequence unpacked as the positional arguments that follow ``x``
            in the first ``mx.quantized_matmul`` call (this file passes the
            result of ``mx.quantize``).
        wq2: Same as ``wq1``, for the second multiplication.
        q_type: Value forwarded as ``quantization_type``.

    Returns:
        The result of the final multiplication.
    """
    # Module-level settings shared by every call in the chain.
    quant_kwargs = {
        "group_size": group_size,
        "bits": bits,
        "quantization_type": q_type,
    }
    out = x
    for _ in range(loops):
        for packed_weights in (wq1, wq2):
            out = mx.quantized_matmul(out, *packed_weights, **quant_kwargs)
    return out


def affine_qmm(x, wq1, wq2):
    """Run the ``qmm_`` chain with the "affine" quantization type."""
    result = qmm_(x, wq1, wq2, q_type="affine")
    return result


def affine_packed_qmm(x, wq1, wq2):
    """Run the ``qmm_`` chain with the "affine-packed" quantization type."""
    result = qmm_(x, wq1, wq2, q_type="affine-packed")
    return result


def time_qmm():
    """Time ``affine_qmm`` on seeded random data.

    Draws a (B, D) input plus (M, D) and (D, M) weights, quantizes the
    weights with the "affine" quantization type, evaluates everything up
    front, and hands the result to ``time_fn``.
    """
    mx.random.seed(3)
    # Draw order (input, then w1, then w2) is kept fixed after seeding.
    activations = mx.random.normal(shape=(B, D)).astype(dtype)
    weights = [
        mx.random.normal(shape=shape).astype(dtype) for shape in ((M, D), (D, M))
    ]
    wq1, wq2 = (
        mx.quantize(w, group_size=group_size, bits=bits, quantization_type="affine")
        for w in weights
    )
    # Evaluate the inputs before handing them to the timer.
    mx.eval(activations, wq1, wq2)
    time_fn(affine_qmm, activations, wq1, wq2)


def time_packed_qmm():
    """Time ``affine_packed_qmm`` on seeded random data.

    Draws a (B, D) input plus (M, D) and (D, M) weights, quantizes the
    weights with the "affine-packed" quantization type, evaluates
    everything up front, and hands the result to ``time_fn``.
    """
    mx.random.seed(3)
    # Draw order (input, then w1, then w2) is kept fixed after seeding.
    activations = mx.random.normal(shape=(B, D)).astype(dtype)
    weights = [
        mx.random.normal(shape=shape).astype(dtype) for shape in ((M, D), (D, M))
    ]
    wq1, wq2 = (
        mx.quantize(
            w, group_size=group_size, bits=bits, quantization_type="affine-packed"
        )
        for w in weights
    )
    # Evaluate the inputs before handing them to the timer.
    mx.eval(activations, wq1, wq2)
    time_fn(affine_packed_qmm, activations, wq1, wq2)


if __name__ == "__main__":
    # Run both benchmarks once per bit width. ``bits`` is rebound at module
    # level here, which is where the benchmark functions read it from.
    for b in (2, 4, 8):
        bits = b
        print(f"Bits {bits}:")
        for run_benchmark in (time_qmm, time_packed_qmm):
            run_benchmark()
Add a small benchmark 2024-12-14 08:27:27 +08:00			`import argparse`
			`import math`

			`import mlx.core as mx`
			`from time_utils import time_fn`

Add packed_affine_qmm_t 2024-12-17 13:49:14 +08:00			`B = 1024`
Improve the benchmark 2024-12-15 15:04:29 +08:00			`D = 1024`
			`M = 4 * D`
Add a small benchmark 2024-12-14 08:27:27 +08:00			`group_size = 64`
Revert "Attempt different packing" This reverts commit e4b587819c3c75b0fb453274f193db0717e10946. 2024-12-14 15:23:21 +08:00			`bits = 4`
Add a small benchmark 2024-12-14 08:27:27 +08:00			`dtype = mx.float16`
Add packed_affine_qmm_t 2024-12-17 13:49:14 +08:00			`loops = 10`
Add a small benchmark 2024-12-14 08:27:27 +08:00

Add packed_affine_qmm_t 2024-12-17 13:49:14 +08:00			`def qmm_(x, wq1, wq2, q_type):`
Add a small benchmark 2024-12-14 08:27:27 +08:00			`for i in range(loops):`
			`x = mx.quantized_matmul(`
			`x,`
Improve the benchmark 2024-12-15 15:04:29 +08:00			`*wq1,`
			`group_size=group_size,`
			`bits=bits,`
Change the argument name to quantization_type 2024-12-17 05:31:34 +08:00			`quantization_type=q_type,`
Improve the benchmark 2024-12-15 15:04:29 +08:00			`)`
			`x = mx.quantized_matmul(`
			`x,`
			`*wq2,`
Add a small benchmark 2024-12-14 08:27:27 +08:00			`group_size=group_size,`
			`bits=bits,`
Change the argument name to quantization_type 2024-12-17 05:31:34 +08:00			`quantization_type=q_type,`
Add a small benchmark 2024-12-14 08:27:27 +08:00			`)`
			`return x`


Add packed_affine_qmm_t 2024-12-17 13:49:14 +08:00			`def affine_qmm(x, wq1, wq2):`
			`return qmm_(x, wq1, wq2, "affine")`
Add a small benchmark 2024-12-14 08:27:27 +08:00

Add packed_affine_qmm_t 2024-12-17 13:49:14 +08:00			`def affine_packed_qmm(x, wq1, wq2):`
			`return qmm_(x, wq1, wq2, "affine-packed")`
Add a small benchmark 2024-12-14 08:27:27 +08:00

Add packed_affine_qmm_t 2024-12-17 13:49:14 +08:00			`def time_qmm():`
Add a small benchmark 2024-12-14 08:27:27 +08:00			`mx.random.seed(3)`
Add packed_affine_qmm_t 2024-12-17 13:49:14 +08:00			`x = mx.random.normal(shape=(B, D)).astype(dtype)`
Improve the benchmark 2024-12-15 15:04:29 +08:00			`w1 = mx.random.normal(shape=(M, D)).astype(dtype)`
Change the argument name to quantization_type 2024-12-17 05:31:34 +08:00			`wq1 = mx.quantize(w1, group_size=group_size, bits=bits, quantization_type="affine")`
Improve the benchmark 2024-12-15 15:04:29 +08:00			`w2 = mx.random.normal(shape=(D, M)).astype(dtype)`
Change the argument name to quantization_type 2024-12-17 05:31:34 +08:00			`wq2 = mx.quantize(w2, group_size=group_size, bits=bits, quantization_type="affine")`
Improve the benchmark 2024-12-15 15:04:29 +08:00			`mx.eval(x, wq1, wq2)`
Add packed_affine_qmm_t 2024-12-17 13:49:14 +08:00			`time_fn(affine_qmm, x, wq1, wq2)`
Add a small benchmark 2024-12-14 08:27:27 +08:00

Add packed_affine_qmm_t 2024-12-17 13:49:14 +08:00			`def time_packed_qmm():`
Add a small benchmark 2024-12-14 08:27:27 +08:00			`mx.random.seed(3)`
Add packed_affine_qmm_t 2024-12-17 13:49:14 +08:00			`x = mx.random.normal(shape=(B, D)).astype(dtype)`
Improve the benchmark 2024-12-15 15:04:29 +08:00			`w1 = mx.random.normal(shape=(M, D)).astype(dtype)`
Change the argument name to quantization_type 2024-12-17 05:31:34 +08:00			`wq1 = mx.quantize(`
			`w1, group_size=group_size, bits=bits, quantization_type="affine-packed"`
			`)`
Improve the benchmark 2024-12-15 15:04:29 +08:00			`w2 = mx.random.normal(shape=(D, M)).astype(dtype)`
Change the argument name to quantization_type 2024-12-17 05:31:34 +08:00			`wq2 = mx.quantize(`
			`w2, group_size=group_size, bits=bits, quantization_type="affine-packed"`
			`)`
Improve the benchmark 2024-12-15 15:04:29 +08:00			`mx.eval(x, wq1, wq2)`
Add packed_affine_qmm_t 2024-12-17 13:49:14 +08:00			`time_fn(affine_packed_qmm, x, wq1, wq2)`
Add a small benchmark 2024-12-14 08:27:27 +08:00

			`if __name__ == "__main__":`
Change the argument name to quantization_type 2024-12-17 05:31:34 +08:00			`for b in [2, 4, 8]:`
Improve the benchmark 2024-12-15 15:04:29 +08:00			`bits = b`
			`print(f"Bits {bits}:")`
Add packed_affine_qmm_t 2024-12-17 13:49:14 +08:00			`time_qmm()`
			`time_packed_qmm()`