awni's commit files

This commit is contained in:
Awni Hannun
2023-11-29 10:30:41 -08:00
parent e411fcae68
commit 8ca7f9e8e9
130 changed files with 30159 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
Microbenchmarks comparing MLX to PyTorch
========================================
Implement the same microbenchmarks in MLX and PyTorch to compare and make a
list of the biggest possible performance improvements and/or regressions.
For instance, run `python bench_mlx.py sum_axis --size 8x1024x128 --axis 2 --cpu`
to measure the time it takes to sum across the 3rd axis of an 8x1024x128 tensor
on the CPU.
`compare.py` runs several benchmarks and reports the relative speed-up (or
slow-down) of MLX compared to PyTorch.
Each bench script can be run with `--print-pid` to print the PID and wait for a
key in order to ease attaching a debugger.

View File

@@ -0,0 +1,313 @@
import argparse
import math
import os
import time
import mlx.core as mx
def int_or_list(x):
    """Parse ``x`` as a single int, or as a comma-separated list of ints.

    ``"2"`` gives ``2`` and ``"0,1"`` gives ``[0, 1]``. Used as the argparse
    ``type`` of ``--axis``.
    """
    try:
        parsed = int(x)
    except ValueError:
        # Not a plain integer: interpret it as "a,b,c".
        parsed = list(map(int, x.split(",")))
    return parsed
def none_or_list(x):
    """Return ``None`` for an empty string, else the comma-separated ints in ``x``.

    Used as the argparse ``type`` of ``--transpose``, where an empty value
    (``--transpose=``) yields ``None``.
    """
    if x != "":
        return [int(item) for item in x.split(",")]
    return None
def bench(f, *args):
    """Time 100 calls of ``f(*args)`` after 10 untimed calls.

    Args:
        f: Callable to benchmark.
        *args: Positional arguments forwarded to every call of ``f``.

    Returns:
        Total elapsed time in seconds (float) for the 100 timed calls.
    """
    # Untimed calls first, so one-off start-up costs are not measured.
    for _ in range(10):
        f(*args)

    # perf_counter() is monotonic and uses the highest-resolution clock
    # available, whereas time.time() can jump when the system clock changes.
    start = time.perf_counter()
    for _ in range(100):
        f(*args)
    return time.perf_counter() - start
def matmul_square(x):
    """Chain 10 matrix products ``acc = acc @ x`` starting from ``x``.

    The final product is passed to ``mx.eval`` and returned.
    """
    product = x
    for _ in range(10):
        product = product @ x
    mx.eval(product)
    return product
def matmul(x, y):
    """Build 10 independent ``x @ y`` products and evaluate them together."""
    products = [x @ y for _ in range(10)]
    mx.eval(products)
def conv1d(x, y):
    """Build 10 independent ``mx.conv1d(x, y)`` results and evaluate them together."""
    outputs = [mx.conv1d(x, y) for _ in range(10)]
    mx.eval(outputs)
def conv2d(x, y):
    """Build 10 independent ``mx.conv2d(x, y)`` results and evaluate them together."""
    outputs = [mx.conv2d(x, y) for _ in range(10)]
    mx.eval(outputs)
def binary(op, x, y):
    """Apply the binary op ``mx.<op>`` 100 times in a chain and evaluate the result.

    Each step computes ``mx.<op>(x, acc)``, where ``acc`` starts as ``y`` and is
    replaced by the previous step's result.
    """
    acc = y
    for _ in range(100):
        acc = getattr(mx, op)(x, acc)
    mx.eval(acc)
def reduction(op, axis, x):
    """Build 100 independent ``mx.<op>(x, axis=axis)`` reductions and evaluate them."""
    reduced = [getattr(mx, op)(x, axis=axis) for _ in range(100)]
    mx.eval(reduced)
def softmax(axis, x):
    """Build 100 softmaxes of ``x`` along ``axis`` from primitive ops and evaluate them.

    Each softmax is ``exp(x - max(x)) / sum(exp(x - max(x)))`` along ``axis``.
    """
    results = []
    for _ in range(100):
        shifted = x - mx.max(x, axis=axis, keepdims=True)
        exps = mx.exp(shifted)
        results.append(exps / mx.sum(exps, axis=axis, keepdims=True))
    mx.eval(results)
def softmax_fused(axis, x):
    """Build 100 independent ``mx.softmax(x, axis=axis)`` results and evaluate them."""
    results = [mx.softmax(x, axis=axis) for _ in range(100)]
    mx.eval(results)
def relu(x):
    """Chain ``mx.maximum(acc, 0)`` 100 times starting from ``x`` and evaluate it."""
    activated = x
    for _ in range(100):
        activated = mx.maximum(activated, 0)
    mx.eval(activated)
def scalar_mult(x):
    """Chain 100 scalar multiplications starting from ``x`` and evaluate the result.

    Step ``i`` (0-based) multiplies the running value by ``1.0 / (1 + i)``.
    """
    scaled = x
    for step in range(100):
        factor = 1.0 / (1 + step)
        scaled = scaled * factor
    mx.eval(scaled)
def cross_entropy(targets, x):
    """Build 100 mean cross-entropy values from primitive ops and evaluate them.

    Each value is the mean of ``logsumexp(x, axis=-1)`` minus the entry of
    ``x`` selected along the last axis by ``targets``.
    """
    losses = []
    for _ in range(100):
        log_norm = mx.logsumexp(x, axis=-1, keepdims=True)
        target_index = mx.reshape(targets, (-1, 1))
        picked = mx.take_along_axis(x, target_index, axis=-1)
        losses.append(mx.mean(log_norm - picked))
    mx.eval(losses)
def logsumexp(axis, x):
    """Build 100 independent ``mx.logsumexp(x, axis=axis)`` results and evaluate them."""
    results = [mx.logsumexp(x, axis=axis) for _ in range(100)]
    mx.eval(results)
def linear(w, b, x):
    """Build 10 independent ``x @ w.T + b`` results and evaluate them together."""
    outputs = [x @ mx.transpose(w, (1, 0)) + b for _ in range(10)]
    mx.eval(outputs)
def rope(x):
    """Benchmark a hand-written rotary-embedding-style computation on ``x``.

    The last two dimensions of ``x`` are read as ``(N, D)``. Ten times over,
    ``x`` is reshaped to ``(-1, N, D)``, its even and odd entries along the
    last axis are rotated by position-dependent angles, and the rotated pair
    is reassembled to shape ``(-1, N, D)``. All ten results are evaluated
    together.
    """
    *_, N, D = x.shape
    ys = []
    for _ in range(10):
        x = mx.reshape(x, (-1, N, D))
        positions = mx.arange(N)
        # NOTE(review): this frequency formula differs from the one used by
        # the PyTorch script's rope (10000 ** linspace(0, 1, D // 2)); confirm
        # whether matching values matter for this benchmark.
        freqs = mx.exp(mx.arange(0.0, D // 2) / math.log(10000 / (D // 2 - 1)))
        # Outer product: one angle per (position, frequency) pair.
        theta = mx.reshape(positions, (-1, 1)) * mx.reshape(freqs, (1, -1))
        costheta = mx.cos(theta)
        sintheta = mx.sin(theta)
        # Even / odd entries of the last axis form the pairs being rotated.
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
        rx1 = x1 * costheta - x2 * sintheta
        rx2 = x1 * sintheta + x2 * costheta
        # Interleave the rotated halves again before flattening back to D.
        y = mx.concatenate([rx1[..., None], rx2[..., None]], axis=-1)
        y = mx.reshape(y, (-1, N, D))
        ys.append(y)
    mx.eval(ys)
def concatenate(axis, x, y):
    """Build 10 independent concatenations of ``x`` and ``y`` along ``axis`` and evaluate them."""
    joined = [mx.concatenate([x, y], axis=axis) for _ in range(10)]
    mx.eval(joined)
def cumsum(axis, x):
    """Build 10 independent ``mx.cumsum(x, axis)`` results and evaluate them together."""
    sums = [mx.cumsum(x, axis) for _ in range(10)]
    mx.eval(sums)
def sort(axis, x):
    """Build 10 independent ``mx.sort(x, axis)`` results and evaluate them together."""
    ordered = [mx.sort(x, axis) for _ in range(10)]
    mx.eval(ordered)
def topk(axis, x):
    """Build 10 independent top-k selections along ``axis`` and evaluate them.

    ``k`` is a third (integer division) of the length of ``x`` along ``axis``.
    """
    k = x.shape[axis] // 3
    selections = [mx.topk(x, k, axis) for _ in range(10)]
    mx.eval(selections)
if __name__ == "__main__":
    # Command-line entry point: build the inputs described by the flags, run
    # the selected benchmark and print the elapsed time returned by bench().
    parser = argparse.ArgumentParser()
    parser.add_argument("benchmark", help="Choose the benchmark to run")
    parser.add_argument(
        "--size",
        default=[(1024, 1024)],
        type=lambda x: list(map(int, x.split("x"))),
        help="Set the matrix size",
        action="append",
    )
    parser.add_argument(
        "--axis",
        default=[1],
        type=int_or_list,
        help="Set a reduction axis",
        action="append",
    )
    parser.add_argument(
        "--transpose",
        type=none_or_list,
        default=[],
        help="Permute the matrix",
        action="append",
    )
    parser.add_argument(
        "--print-pid", action="store_true", help="Print the PID and pause"
    )
    parser.add_argument("--cpu", action="store_true", help="Use the CPU")
    parser.add_argument(
        "--fused", action="store_true", help="Use fused functions where possible"
    )
    parser.add_argument(
        "--dtype", choices=["float32", "float16", "bfloat16"], default="float32"
    )
    args = parser.parse_args()

    # action="append" adds user values after the default entry, so drop the
    # default whenever at least one value was supplied explicitly.
    if len(args.size) > 1:
        args.size.pop(0)
    if len(args.axis) > 1:
        args.axis.pop(0)

    if args.print_pid:
        # Give the user time to attach a debugger to this process.
        print(os.getpid())
        input("Press enter to run")

    if args.cpu:
        mx.set_default_device(mx.cpu)
    else:
        mx.set_default_device(mx.gpu)

    dtype = dict(float32=mx.float32, float16=mx.float16, bfloat16=mx.bfloat16)[
        args.dtype
    ]

    # One random input per --size; the i-th --transpose (if not empty)
    # permutes the i-th input.
    xs = []
    for size in args.size:
        xs.append(mx.random.normal(size).astype(dtype))
    for i, t in enumerate(args.transpose):
        if t is None:
            continue
        xs[i] = mx.transpose(xs[i], t)
    mx.eval(xs)

    x = xs[0]
    axis = args.axis[0]

    if args.benchmark == "matmul_square":
        print(bench(matmul_square, x))
    elif args.benchmark == "matmul":
        print(bench(matmul, *xs))
    elif args.benchmark == "linear":
        print(bench(linear, *xs))
    elif args.benchmark == "sum_axis":
        print(bench(reduction, "sum", axis, x))
    elif args.benchmark == "sum_all":
        print(bench(reduction, "sum", None, x))
    elif args.benchmark == "argmax":
        print(bench(reduction, "argmax", axis, x))
    elif args.benchmark == "add":
        print(bench(binary, "add", *xs))
    elif args.benchmark == "mul":
        print(bench(binary, "multiply", *xs))
    elif args.benchmark == "softmax":
        if args.fused:
            print(bench(softmax_fused, axis, x))
        else:
            print(bench(softmax, axis, x))
    elif args.benchmark == "relu":
        print(bench(relu, x))
    elif args.benchmark == "scalar_mul":
        print(bench(scalar_mult, x))
    elif args.benchmark == "cross_entropy":
        # Validate the array that is actually benchmarked. The previous check
        # used the leaked loop variable `size`, i.e. the *last* --size given.
        if len(x.shape) != 2:
            raise ValueError("Error: [cross_entropy] benchmark requires a 2 dim size")
        targets = mx.zeros((len(x),), dtype=mx.uint32)
        print(bench(cross_entropy, targets, x))
    elif args.benchmark == "logsumexp":
        print(bench(logsumexp, axis, x))
    elif args.benchmark == "rope":
        print(bench(rope, x))
    elif args.benchmark == "concatenate":
        print(bench(concatenate, axis, *xs))
    elif args.benchmark == "cumsum":
        print(bench(cumsum, axis, *xs))
    elif args.benchmark == "conv1d":
        print(bench(conv1d, *xs))
    elif args.benchmark == "conv2d":
        print(bench(conv2d, *xs))
    elif args.benchmark == "sort":
        print(bench(sort, axis, x))
    elif args.benchmark == "topk":
        print(bench(topk, axis, x))
    else:
        raise ValueError("Unknown benchmark")

View File

@@ -0,0 +1,338 @@
import argparse
import os
import time
import torch
import torch.mps
def int_or_list(x):
    """Parse ``x`` as a single int, or as a comma-separated list of ints.

    ``"2"`` gives ``2`` and ``"0,1"`` gives ``[0, 1]``. Used as the argparse
    ``type`` of ``--axis``.
    """
    try:
        parsed = int(x)
    except ValueError:
        # Not a plain integer: interpret it as "a,b,c".
        parsed = list(map(int, x.split(",")))
    return parsed
def none_or_list(x):
    """Return ``None`` for an empty string, else the comma-separated ints in ``x``.

    Used as the argparse ``type`` of ``--transpose``, where an empty value
    (``--transpose=``) yields ``None``.
    """
    if x != "":
        return [int(item) for item in x.split(",")]
    return None
def bench(f, *args):
    """Time 100 calls of ``f(*args)`` after 10 untimed calls.

    Args:
        f: Callable to benchmark.
        *args: Positional arguments forwarded to every call of ``f``.

    Returns:
        Total elapsed time in seconds (float) for the 100 timed calls.
    """
    # Untimed calls first, so one-off start-up costs are not measured.
    for _ in range(10):
        f(*args)

    # perf_counter() is monotonic and uses the highest-resolution clock
    # available, whereas time.time() can jump when the system clock changes.
    start = time.perf_counter()
    for _ in range(100):
        f(*args)
    return time.perf_counter() - start
def sync_if_needed(x):
    """Call ``torch.mps.synchronize()`` unless ``x`` lives on the CPU device."""
    on_cpu = x.device == torch.device("cpu")
    if on_cpu:
        return
    torch.mps.synchronize()
@torch.no_grad()
def matmul_square(x):
    """Chain 10 matrix products ``acc = acc @ x`` starting from ``x``, then sync."""
    product = x
    for _ in range(10):
        product = product @ x
    sync_if_needed(x)
@torch.no_grad()
def matmul(x, y):
    """Compute 10 independent ``x @ y`` products, then sync."""
    products = [x @ y for _ in range(10)]
    sync_if_needed(x)
@torch.no_grad()
def conv1d(x, y):
    """Run ``torch.nn.functional.conv1d`` 10 times on transposed inputs, then sync.

    The last two axes of both ``x`` and ``y`` are swapped first — presumably to
    go from a channels-last layout to the channels-first one torch expects.
    """
    inputs = torch.transpose(x, -1, -2)
    weights = torch.transpose(y, -1, -2)
    outputs = [torch.nn.functional.conv1d(inputs, weights) for _ in range(10)]
    sync_if_needed(inputs)
@torch.no_grad()
def conv2d(x, y):
    """Run ``torch.nn.functional.conv2d`` 10 times on permuted inputs, then sync.

    Both ``x`` and ``y`` are permuted with ``(0, 3, 1, 2)`` first — presumably
    to go from a channels-last layout to the channels-first one torch expects.
    """
    inputs = torch.permute(x, (0, 3, 1, 2))
    weights = torch.permute(y, (0, 3, 1, 2))
    outputs = [torch.nn.functional.conv2d(inputs, weights) for _ in range(10)]
    sync_if_needed(inputs)
@torch.no_grad()
def binary(op, x, y):
    """Apply the binary op ``torch.<op>`` 100 times in a chain, then sync.

    Each step computes ``torch.<op>(x, acc)``, where ``acc`` starts as ``y``
    and is replaced by the previous step's result.
    """
    acc = y
    for _ in range(100):
        acc = getattr(torch, op)(x, acc)
    sync_if_needed(x)
@torch.no_grad()
def reduction(op, axis, x):
    """Call the tensor method ``x.<op>(axis)`` 100 times, then sync."""
    reduced = [getattr(x, op)(axis) for _ in range(100)]
    sync_if_needed(x)
@torch.no_grad()
def softmax(axis, x):
    """Compute 100 softmaxes of ``x`` along ``axis`` from primitive ops, then sync.

    Each softmax is ``exp(x - max(x)) / sum(exp(x - max(x)))`` along ``axis``.
    """
    results = []
    for _ in range(100):
        shifted = x - torch.max(x, dim=axis, keepdims=True).values
        exps = torch.exp(shifted)
        results.append(exps / torch.sum(exps, dim=axis, keepdims=True))
    sync_if_needed(x)
@torch.no_grad()
def softmax_fused(axis, x):
    """Call ``torch.nn.functional.softmax(x, dim=axis)`` 100 times, then sync."""
    results = [torch.nn.functional.softmax(x, dim=axis) for _ in range(100)]
    sync_if_needed(x)
@torch.no_grad()
def relu(x):
    """Chain ``torch.nn.functional.relu`` 100 times starting from ``x``, then sync."""
    activated = x
    for _ in range(100):
        activated = torch.nn.functional.relu(activated)
    sync_if_needed(x)
@torch.no_grad()
def scalar_mult(x):
    """Chain 100 scalar multiplications starting from ``x``, then sync.

    Step ``i`` (0-based) multiplies the running value by ``1.0 / (1 + i)``.
    """
    scaled = x
    for step in range(100):
        factor = 1.0 / (1 + step)
        scaled = scaled * factor
    sync_if_needed(x)
@torch.no_grad()
def cross_entropy(targets, x):
    """Call ``torch.nn.functional.cross_entropy(x, targets)`` 100 times, then sync."""
    losses = [torch.nn.functional.cross_entropy(x, targets) for _ in range(100)]
    sync_if_needed(x)
@torch.no_grad()
def logsumexp(axis, x):
    """Call ``torch.logsumexp(x, dim=axis)`` 100 times, then sync."""
    results = [torch.logsumexp(x, dim=axis) for _ in range(100)]
    sync_if_needed(x)
@torch.no_grad()
def linear_fused(w, b, x):
    """Call ``torch.nn.functional.linear(x, w, b)`` 10 times, then sync."""
    outputs = [torch.nn.functional.linear(x, w, b) for _ in range(10)]
    sync_if_needed(x)
@torch.no_grad()
def linear(w, b, x):
    """Compute ``x @ w.T + b`` from primitive ops 10 times, then sync."""
    outputs = [(x @ torch.transpose(w, -2, -1)) + b for _ in range(10)]
    sync_if_needed(x)
@torch.no_grad()
def rope(x):
    """Benchmark a hand-written rotary-embedding-style computation on ``x``.

    The last two dimensions of ``x`` are read as ``(N, D)``. Ten times over,
    ``x`` is viewed as ``(-1, N, D)``, its even and odd entries along the last
    axis are rotated by position-dependent angles, and the rotated pair is
    reassembled to shape ``(-1, N, D)``. A sync follows the loop.
    """
    *_, N, D = x.shape
    rotated = []
    for _ in range(10):
        x = x.view(-1, N, D)
        positions = torch.arange(N, device=x.device)
        freqs = 10000 ** torch.linspace(0, 1, D // 2, device=x.device)
        # Outer product: one angle per (position, frequency) pair.
        theta = positions[:, None] * freqs[None]
        cos_t = torch.cos(theta)
        sin_t = torch.sin(theta)
        # Even / odd entries of the last axis form the pairs being rotated.
        even = x[..., ::2]
        odd = x[..., 1::2]
        rot_even = even * cos_t - odd * sin_t
        rot_odd = even * sin_t + odd * cos_t
        # Interleave the rotated halves again before flattening back to D.
        paired = torch.cat([rot_even[..., None], rot_odd[..., None]], dim=-1)
        rotated.append(paired.reshape(-1, N, D))
    sync_if_needed(x)
@torch.no_grad()
def concatenate(axis, x, y):
    """Concatenate ``x`` and ``y`` along ``axis`` 10 times, then sync."""
    joined = [torch.cat([x, y], dim=axis) for _ in range(10)]
    sync_if_needed(x)
@torch.no_grad()
def cumsum(axis, x):
    """Call ``x.cumsum(axis)`` 10 times, then sync."""
    sums = [x.cumsum(axis) for _ in range(10)]
    sync_if_needed(x)
@torch.no_grad()
def sort(axis, x):
    """Sort ``x`` along ``axis`` 10 times, keeping the sorted values, then sync."""
    ordered = [torch.sort(x, dim=axis)[0] for _ in range(10)]
    sync_if_needed(x)
@torch.no_grad()
def topk(axis, x):
    """Take the top-k values of ``x`` along ``axis`` 10 times, then sync.

    ``k`` is a third (integer division) of the length of ``x`` along ``axis``.
    """
    k = x.shape[axis] // 3
    selections = [torch.topk(x, k, dim=axis)[0] for _ in range(10)]
    sync_if_needed(x)
if __name__ == "__main__":
    # Command-line entry point: build the inputs described by the flags, run
    # the selected benchmark and print the elapsed time returned by bench().
    parser = argparse.ArgumentParser()
    parser.add_argument("benchmark", help="Choose the benchmark to run")
    parser.add_argument(
        "--size",
        default=[(1024, 1024)],
        type=lambda x: list(map(int, x.split("x"))),
        help="Set the matrix size",
        action="append",
    )
    parser.add_argument(
        "--axis",
        default=[1],
        type=int_or_list,
        help="Set a reduction axis",
        action="append",
    )
    parser.add_argument(
        "--transpose",
        type=none_or_list,
        default=[],
        help="Permute the matrix",
        action="append",
    )
    parser.add_argument(
        "--print-pid", action="store_true", help="Print the PID and pause"
    )
    parser.add_argument("--cpu", action="store_true", help="Use the CPU")
    parser.add_argument(
        "--fused", action="store_true", help="Use fused functions where possible"
    )
    parser.add_argument("--dtype", choices=["float32", "float16"], default="float32")
    args = parser.parse_args()

    # action="append" adds user values after the default entry, so drop the
    # default whenever at least one value was supplied explicitly.
    if len(args.size) > 1:
        args.size.pop(0)
    if len(args.axis) > 1:
        args.axis.pop(0)

    if args.print_pid:
        # Give the user time to attach a debugger to this process.
        print(os.getpid())
        input("Press enter to run")

    torch.set_num_threads(1)
    device = "cpu" if args.cpu else "mps"
    dtype = dict(float32=torch.float32, float16=torch.float16)[args.dtype]

    # One random input per --size; the i-th --transpose (if not empty)
    # permutes the i-th input.
    xs = []
    for size in args.size:
        xs.append(torch.randn(*size).to(device).to(dtype))
    for i, t in enumerate(args.transpose):
        if t is None:
            continue
        xs[i] = xs[i].permute(*t)

    x = xs[0]
    axis = args.axis[0]

    if args.benchmark == "matmul_square":
        print(bench(matmul_square, x))
    elif args.benchmark == "matmul":
        print(bench(matmul, *xs))
    elif args.benchmark == "linear":
        if args.fused:
            print(bench(linear_fused, *xs))
        else:
            print(bench(linear, *xs))
    elif args.benchmark == "sum_axis":
        print(bench(reduction, "sum", axis, x))
    elif args.benchmark == "sum_all":
        print(bench(reduction, "sum", None, x))
    elif args.benchmark == "argmax":
        print(bench(reduction, "argmax", axis, x))
    elif args.benchmark == "add":
        print(bench(binary, "add", *xs))
    elif args.benchmark == "mul":
        print(bench(binary, "mul", *xs))
    elif args.benchmark == "softmax":
        if args.fused:
            print(bench(softmax_fused, axis, x))
        else:
            print(bench(softmax, axis, x))
    elif args.benchmark == "relu":
        print(bench(relu, x))
    elif args.benchmark == "scalar_mul":
        print(bench(scalar_mult, x))
    elif args.benchmark == "cross_entropy":
        # Validate the tensor that is actually benchmarked. The previous check
        # used the leaked loop variable `size`, i.e. the *last* --size given.
        if len(x.shape) != 2:
            raise ValueError("Error: [cross_entropy] benchmark requires a 2 dim size")
        targets = torch.zeros(len(x), dtype=torch.long).to(x.device)
        print(bench(cross_entropy, targets, x))
    elif args.benchmark == "logsumexp":
        print(bench(logsumexp, axis, x))
    elif args.benchmark == "rope":
        print(bench(rope, x))
    elif args.benchmark == "concatenate":
        print(bench(concatenate, axis, *xs))
    elif args.benchmark == "cumsum":
        print(bench(cumsum, axis, *xs))
    elif args.benchmark == "conv1d":
        print(bench(conv1d, *xs))
    elif args.benchmark == "conv2d":
        print(bench(conv2d, *xs))
    elif args.benchmark == "sort":
        print(bench(sort, axis, x))
    elif args.benchmark == "topk":
        print(bench(topk, axis, x))
    else:
        raise ValueError("Unknown benchmark")

View File

@@ -0,0 +1,253 @@
#!/usr/bin/env python
import argparse
import re
from pathlib import Path
from subprocess import run
# Paths of the per-framework benchmark scripts, located next to this file.
BENCH_MLX = Path(__file__).parent / "bench_mlx.py"
BENCH_TORCH = Path(__file__).parent / "bench_torch.py"
def run_or_raise(*args, **kwargs):
    """Run a subprocess and return its standard output parsed as a float.

    Args:
        *args: Positional arguments forwarded to ``subprocess.run``.
        **kwargs: Keyword arguments forwarded to ``subprocess.run``; output
            capturing is always enabled.

    Returns:
        The float parsed from the captured standard output.

    Raises:
        ValueError: If the output is not a valid float. The message contains
            the captured stdout and stderr, and the parse error is chained as
            the cause.
    """
    result = run(*args, capture_output=True, **kwargs)
    # Only the parse is guarded: errors from launching the process propagate
    # unchanged instead of being mistaken for bad output.
    try:
        return float(result.stdout)
    except ValueError as err:
        raise ValueError(
            f"stdout: {result.stdout}\nstderr: {result.stderr}"
        ) from err
def compare(args):
    """Run one benchmark under MLX and then PyTorch and print the relative gain.

    Prints ``(t_torch - t_mlx) / t_torch`` followed by a tab and the
    space-joined benchmark arguments.
    """
    t_mlx, t_torch = (
        run_or_raise(["python", script] + args) for script in (BENCH_MLX, BENCH_TORCH)
    )
    relative_gain = (t_torch - t_mlx) / t_torch
    print(relative_gain, " ".join(args), sep="\t")
def compare_mlx_dtypes(args, dt1, dt2):
    """Run one MLX benchmark with two dtypes and print the relative gain of ``dt1``.

    Prints ``(t_dt2 - t_dt1) / t_dt2`` followed by a tab and the space-joined
    benchmark arguments.
    """
    command = ["python", BENCH_MLX] + args
    t_mlx_dt1, t_mlx_dt2 = (
        run_or_raise(command + ["--dtype", dt]) for dt in (dt1, dt2)
    )
    relative_gain = (t_mlx_dt2 - t_mlx_dt1) / t_mlx_dt2
    print(relative_gain, " ".join(args), sep="\t")
def make_regex_search(regexes):
    """Compile ``regexes`` and return a function that searches a string with each.

    The returned function takes a string and yields, lazily and in order, one
    boolean per pattern telling whether that pattern matches anywhere in it.
    """
    patterns = [re.compile(regex) for regex in regexes]

    def search(x):
        for pattern in patterns:
            yield pattern.search(x) is not None

    return search
def make_predicate(positive_filter, negative_filter):
    """Build a predicate that selects benchmark descriptions by regex.

    A string is accepted when every pattern in ``positive_filter`` matches it
    and no pattern in ``negative_filter`` does. Passing ``None`` for either
    argument disables that filter.
    """
    if positive_filter is None:

        def accepted(x):
            return True

    else:
        positive_search = make_regex_search(positive_filter)

        def accepted(x):
            return all(positive_search(x))

    if negative_filter is None:

        def not_rejected(x):
            return True

    else:
        negative_search = make_regex_search(negative_filter)

        def not_rejected(x):
            return not any(negative_search(x))

    def predicate(x):
        return accepted(x) and not_rejected(x)

    return predicate
if __name__ == "__main__":
    # Command-line entry point: run every benchmark configuration listed below
    # that passes the regex filters. Unrecognised arguments are appended to
    # each benchmark's own command line.
    parser = argparse.ArgumentParser(description="Run comparisons against PyTorch")
    parser.add_argument(
        "--filter", "-f", help="Regex filter to select benchmarks", nargs="+"
    )
    parser.add_argument(
        "--negative_filter", "-n", help="Regex filter to remove benchmarks", nargs="+"
    )
    parser.add_argument(
        "--mlx_dtypes",
        "-d",
        help="Compare mlx benchmarks between the 2 provided data types",
        nargs=2,
    )
    args, rest = parser.parse_known_args()

    _filter = make_predicate(args.filter, args.negative_filter)

    if args.mlx_dtypes:
        dt1, dt2 = args.mlx_dtypes

        def run_comparison(bench_args):
            # MLX-only mode: compare the two requested dtypes.
            compare_mlx_dtypes(bench_args, dt1, dt2)

    else:
        run_comparison = compare

    def compare_filtered(x):
        """Run the benchmark described by ``x`` unless the filters exclude it."""
        if _filter(x):
            run_comparison(x.split() + rest)

    # Binary ops
    compare_filtered("add --size 10x1024x128 --size 1x1024x128 --cpu")
    compare_filtered("add --size 10x1024x128 --size 1x1024x128")
    compare_filtered("add --size 1024x128 --size 1x128 --cpu")
    compare_filtered("add --size 1024x128 --size 1x128")
    compare_filtered("add --size 1024x4096 --size 1x4096 --cpu")
    compare_filtered("add --size 1024x4096 --size 1x4096")
    compare_filtered("add --size 1024x4096 --size 1x1024 --transpose 1,0 --cpu")
    compare_filtered("add --size 1024x4096 --size 1x1024 --transpose 1,0")
    compare_filtered("add --size 1024x1024 --size 1024x1024 --cpu")
    compare_filtered("add --size 1024x1024 --size 1024x1024")
    compare_filtered("add --size 1024x1024 --size 1024x1024 --transpose 1,0 --cpu")
    compare_filtered("add --size 1024x1024 --size 1024x1024 --transpose 1,0")
    compare_filtered(
        "add --size 1024x1024 --size 1024x1024 --transpose 1,0 --transpose 1,0 --cpu"
    )
    compare_filtered(
        "add --size 1024x1024 --size 1024x1024 --transpose 1,0 --transpose 1,0"
    )

    # Reduction ops
    compare_filtered("sum_all --size 10x1024x128 --cpu")
    compare_filtered("sum_all --size 10x1024x128")
    compare_filtered("sum_axis --size 16x1024x128 --axis 2 --cpu")
    compare_filtered("sum_axis --size 16x1024x128 --axis 2")
    compare_filtered("sum_axis --size 16x128x1024 --axis 2 --cpu")
    compare_filtered("sum_axis --size 16x128x1024 --axis 2")
    compare_filtered("sum_axis --size 1024x1024 --axis 1 --cpu")
    compare_filtered("sum_axis --size 1024x1024 --axis 1")
    compare_filtered("sum_axis --size 1024x1024 --axis 0 --cpu")
    compare_filtered("sum_axis --size 1024x1024 --axis 0")
    compare_filtered("sum_axis --size 16x128x1024 --axis 1 --cpu")
    compare_filtered("sum_axis --size 16x128x1024 --axis 1")
    compare_filtered("sum_axis --size 16x128x1024 --axis 0 --cpu")
    compare_filtered("sum_axis --size 16x128x1024 --axis 0")
    compare_filtered("argmax --size 10x1024x128 --axis 1 --cpu")
    compare_filtered("argmax --size 10x1024x128 --axis 1")
    compare_filtered("argmax --size 10x1024x128 --axis 2 --cpu")
    compare_filtered("argmax --size 10x1024x128 --axis 2")
    compare_filtered("argmax --size 1024x1024 --axis 1 --cpu")
    compare_filtered("argmax --size 1024x1024 --axis 1")

    # Matmul ops
    compare_filtered("matmul_square --size 1024x1024")
    compare_filtered("matmul_square --size 1024x1024 --cpu")
    compare_filtered("matmul_square --size 16x1024x1024")
    compare_filtered("matmul_square --size 16x1024x1024 --cpu")
    compare_filtered(
        "matmul --size 16x768x768 --size 16x768x768 --transpose= --transpose 0,2,1"
    )
    compare_filtered(
        "matmul --size 16x768x768 --size 16x768x768 --transpose= --transpose 0,2,1 --cpu"
    )
    compare_filtered(
        "matmul --size 16x768x128 --size 16x768x128 --transpose= --transpose 0,2,1"
    )
    compare_filtered(
        "matmul --size 16x768x128 --size 16x768x128 --transpose= --transpose 0,2,1 --cpu"
    )
    compare_filtered("matmul --size 512x8192 --size 8192x512")
    compare_filtered("matmul --size 512x8192 --size 8192x512 --cpu")
    # compare_filtered("matmul --size 512x131072 --size 131072x512")
    # compare_filtered("matmul --size 512x131072 --size 131072x512 --cpu")
    compare_filtered("matmul --size 8192x512 --size 512x8192")
    compare_filtered("matmul --size 8192x512 --size 512x8192 --cpu")
    # compare_filtered("matmul --size 131072x512 --size 512x512")
    # compare_filtered("matmul --size 131072x512 --size 512x512 --cpu")
    compare_filtered("linear --size 1024x1024 --size 1024 --size 128x1024")
    compare_filtered("linear --size 1024x1024 --size 1024 --size 128x1024 --cpu")
    compare_filtered("linear --size 1024x1024 --size 1024 --size 128x1024 --fused")
    compare_filtered(
        "linear --size 1024x1024 --size 1024 --size 128x1024 --fused --cpu"
    )

    # Matvec ops
    compare_filtered("matmul --size 1x1x4096 --size 4096x4096 --cpu")
    compare_filtered("matmul --size 1x1x4096 --size 4096x4096")
    compare_filtered(
        "matmul --size 1x1x4096 --size 4096x4096 --transpose= --transpose 1,0 --cpu"
    )
    compare_filtered(
        "matmul --size 1x1x4096 --size 4096x4096 --transpose= --transpose 1,0"
    )
    compare_filtered("matmul --size 32x1x1000 --size 32x1000x128 --cpu")
    compare_filtered("matmul --size 32x1x1000 --size 32x1000x128")
    compare_filtered(
        "matmul --size 32x1x1000 --size 32x128x1000 --transpose= --transpose 0,2,1 --cpu"
    )
    compare_filtered(
        "matmul --size 32x1x1000 --size 32x128x1000 --transpose= --transpose 0,2,1"
    )

    # Various ops
    compare_filtered("softmax --size 32x16x1024 --axis 2")
    compare_filtered("softmax --size 32x16x1024 --axis 2 --cpu")
    compare_filtered("softmax --size 32x16x1024 --axis 2 --fused")
    compare_filtered("softmax --size 32x16x1024 --axis 2 --fused --cpu")
    compare_filtered("softmax --size 2x1024x1024 --axis 1")
    compare_filtered("softmax --size 2x1024x1024 --axis 1 --cpu")
    compare_filtered("softmax --size 2x1024x1024 --axis 1 --fused")
    compare_filtered("softmax --size 2x1024x1024 --axis 1 --fused --cpu")
    compare_filtered("relu --size 32x16x1024")
    compare_filtered("relu --size 32x16x1024 --cpu")
    compare_filtered("scalar_mul --size 32x16x1024")
    compare_filtered("scalar_mul --size 32x16x1024 --cpu")
    compare_filtered("cross_entropy --size 256x1024")
    compare_filtered("cross_entropy --size 256x1024 --cpu")
    compare_filtered("logsumexp --size 1024x1024 --axis 1")
    compare_filtered("logsumexp --size 1024x1024 --axis 1 --cpu")
    compare_filtered("logsumexp --size 1024x1024 --axis 0")
    compare_filtered("logsumexp --size 1024x1024 --axis 0 --cpu")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1024x128 --axis 2")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1024x128 --axis 2 --cpu")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1024x128 --axis 1")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1024x128 --axis 1 --cpu")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1024x128 --axis 0")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1024x128 --axis 0 --cpu")
    compare_filtered("concatenate --size 32x1024x128 --size 32x16x128 --axis 1")
    compare_filtered("concatenate --size 32x1024x128 --size 32x16x128 --axis 1 --cpu")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1x128 --axis 1")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1x128 --axis 1 --cpu")
    compare_filtered("concatenate --size 1x32x1024x128 --size 1x32x1x128 --axis 2")
    compare_filtered(
        "concatenate --size 1x32x1024x128 --size 1x32x1x128 --axis 2 --cpu"
    )
    compare_filtered("conv1d --size 1x1000x80 --size 128x11x80")
    compare_filtered("conv1d --size 1x1000x80 --size 128x11x80 --cpu")
    compare_filtered("conv1d --size 16x1000x80 --size 128x11x80")
    compare_filtered("conv1d --size 4x1000x80 --size 128x11x80 --cpu")
    compare_filtered("conv2d --size 1x256x256x3 --size 8x3x3x3")
    compare_filtered("conv2d --size 1x256x256x3 --size 8x3x3x3 --cpu")
    compare_filtered("conv2d --size 16x256x256x3 --size 8x3x3x3")
    compare_filtered("conv2d --size 4x256x256x3 --size 8x3x3x3 --cpu")
    compare_filtered("cumsum --size 1024x1024 --axis 1 --cpu")
    compare_filtered("cumsum --size 1024x1024 --axis 0 --cpu")
    compare_filtered("cumsum --size 1024x1024 --axis 1")
    compare_filtered("cumsum --size 1024x1024 --axis 0")
    compare_filtered("cumsum --size 128x1024 --axis 1")
    compare_filtered("cumsum --size 128x1024 --axis 0")
    compare_filtered("cumsum --size 1024x4096 --axis 1")
    compare_filtered("cumsum --size 1024x4096 --axis 0")
    compare_filtered("cumsum --size 128x4096 --axis 1")
    compare_filtered("cumsum --size 128x4096 --axis 0")
    compare_filtered("cumsum --size 1024x7777 --axis 1")
    compare_filtered("cumsum --size 1024x7777 --axis 0")
    compare_filtered("cumsum --size 128x7777 --axis 1")
    compare_filtered("cumsum --size 128x7777 --axis 0")
    compare_filtered("cumsum --size 32768x128 --axis 1")
    compare_filtered("cumsum --size 32768x128 --axis 0")
    compare_filtered("sort --size 1024x1024 --axis 0")
    compare_filtered("sort --size 1024x1024 --axis 1")
    compare_filtered("sort --size 32768x128 --axis 0")
    compare_filtered("sort --size 32768x128 --axis 1")
    compare_filtered("sort --size 128x128 --axis 0 --cpu")
    compare_filtered("sort --size 128x128 --axis 1 --cpu")
    compare_filtered("topk --size 1024x1024 --axis 0")
    compare_filtered("topk --size 1024x1024 --axis 1")
    compare_filtered("topk --size 32768x128 --axis 0")
    compare_filtered("topk --size 32768x128 --axis 1")
    compare_filtered("topk --size 128x128 --axis 0 --cpu")
    compare_filtered("topk --size 128x128 --axis 1 --cpu")