#!/usr/bin/env python3
"""
Benchmark script for SVD operations comparing CPU vs Metal performance.

This benchmark should be run before and after the Metal SVD implementation
to measure performance improvements.
"""

import time
from typing import Dict, List, Tuple

import mlx.core as mx
import numpy as np


def benchmark_svd_sizes() -> List[Tuple[int, int]]:
    """Return list of matrix sizes to benchmark."""
    return [
        (32, 32),
        (64, 64),
        (128, 128),
        (256, 256),
        (512, 512),
        (1024, 1024),
        (64, 128),
        (128, 256),
        (256, 512),
        (512, 1024),
    ]
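
# The last four sizes above are rectangular (m < n), so the benchmark also
# exercises wide, non-square inputs alongside the square ones.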


def create_test_matrix(m: int, n: int, dtype=mx.float32) -> mx.array:
    """Create a test matrix with known properties for SVD."""
    # Fixed seed so every run benchmarks identical, reproducible matrices
    np.random.seed(42)

    # Build A = U @ S @ V with decreasing values on the diagonal of S. U and V
    # are Gaussian rather than orthonormal, so s controls the spectrum only
    # approximately, which is sufficient for consistent benchmarking.
    k = min(m, n)
    U = np.random.randn(m, k).astype(np.float32)
    V = np.random.randn(k, n).astype(np.float32)
    s = np.logspace(0, -3, k).astype(np.float32)
    S = np.diag(s)

    # (m, k) @ (k, k) @ (k, n) yields an (m, n) matrix for any m and n,
    # so no extra shape handling is needed
    A = U @ S @ V

    return mx.array(A, dtype=dtype)
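

# Optional sanity check, an illustrative sketch that the benchmark driver
# below never calls: with the fixed seed above, create_test_matrix must be
# deterministic across calls.
def _check_test_matrix() -> None:
    A = create_test_matrix(64, 128)
    assert A.shape == (64, 128)
    B = create_test_matrix(64, 128)
    assert mx.allclose(A, B).item()  # same seed => identical matrices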


def benchmark_svd_operation(
    matrix: mx.array,
    compute_uv: bool = True,
    device: str = "gpu",
    warmup_runs: int = 3,
    benchmark_runs: int = 10,
) -> Dict[str, float]:
    """Benchmark SVD operation with proper warmup and timing."""

    # Set device
    if device == "cpu":
        mx.set_default_device(mx.cpu)
    else:
        mx.set_default_device(mx.gpu)

    # Copy the matrix so work happens under the target default device
    # (constructing an mx.array from an existing array makes a copy)
    matrix = mx.array(matrix)
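
    # MLX is lazy: mx.linalg.svd only records the computation, and mx.eval
    # forces it to run, so every timed region below evaluates its outputs
    # before the clock stops.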

    # Warmup runs
    for _ in range(warmup_runs):
        if compute_uv:
            u, s, vt = mx.linalg.svd(matrix, compute_uv=True)
            mx.eval(u, s, vt)
        else:
            s = mx.linalg.svd(matrix, compute_uv=False)
            mx.eval(s)

    # Benchmark runs
    times = []
    for _ in range(benchmark_runs):
        start_time = time.perf_counter()

        if compute_uv:
            u, s, vt = mx.linalg.svd(matrix, compute_uv=True)
            mx.eval(u, s, vt)
        else:
            s = mx.linalg.svd(matrix, compute_uv=False)
            mx.eval(s)

        end_time = time.perf_counter()
        times.append(end_time - start_time)

    return {
        "mean_time": np.mean(times),
        "std_time": np.std(times),
        "min_time": np.min(times),
        "max_time": np.max(times),
    }
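

# Illustrative usage, not executed by the driver below:
#
#   stats = benchmark_svd_operation(
#       create_test_matrix(256, 256), compute_uv=True, device="cpu"
#   )
#   print(f"mean: {stats['mean_time'] * 1000:.2f} ms")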


def run_comprehensive_benchmark():
    """Run comprehensive SVD benchmark comparing CPU and GPU performance."""

    print("MLX SVD Performance Benchmark")
    print("=" * 50)
    print(f"Device: {mx.default_device()}")
    print(f"MLX Version: {mx.__version__ if hasattr(mx, '__version__') else 'Unknown'}")
    print()

    sizes = benchmark_svd_sizes()
    results = []

    # Test both singular values only and full SVD
    for compute_uv in [False, True]:
        mode = "Full SVD" if compute_uv else "Singular Values Only"
        print(f"\n{mode}")
        print("-" * 30)
        print(
            f"{'Size':<12} {'CPU (ms)':<12} {'GPU (ms)':<12} {'Speedup':<10} {'Status'}"
        )
        print("-" * 60)

        for m, n in sizes:
            matrix = create_test_matrix(m, n)

            try:
                # CPU benchmark
                cpu_stats = benchmark_svd_operation(matrix, compute_uv, "cpu")
                cpu_time = cpu_stats["mean_time"] * 1000  # Convert to ms

                # GPU benchmark
                try:
                    gpu_stats = benchmark_svd_operation(matrix, compute_uv, "gpu")
                    gpu_time = gpu_stats["mean_time"] * 1000  # Convert to ms
                    speedup = cpu_time / gpu_time
                    status = "✓"
                except Exception as e:
                    gpu_time = float("inf")
                    speedup = 0.0
                    status = f"✗ ({str(e)[:20]}...)"

                # Pad the full "MxN" label to 12 characters to line up with
                # the 12-wide Size column in the header
                print(
                    f"{f'{m}x{n}':<12} {cpu_time:<12.2f} {gpu_time:<12.2f} {speedup:<10.2f} {status}"
                )

                results.append(
                    {
                        "size": (m, n),
                        "compute_uv": compute_uv,
                        "cpu_time": cpu_time,
                        "gpu_time": gpu_time,
                        "speedup": speedup,
                        "status": status,
                    }
                )

            except Exception as e:
                print(
                    f"{f'{m}x{n}':<12} {'ERROR':<12} {'ERROR':<12} {'N/A':<10} ✗ {str(e)[:30]}..."
                )

    # Summary statistics
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)

    successful_results = [r for r in results if r["speedup"] > 0]
    if successful_results:
        speedups = [r["speedup"] for r in successful_results]
        print(f"Average Speedup: {np.mean(speedups):.2f}x")
        print(f"Max Speedup: {np.max(speedups):.2f}x")
        print(f"Min Speedup: {np.min(speedups):.2f}x")
        print(f"Successful Tests: {len(successful_results)}/{len(results)}")
    else:
        print("No successful GPU tests completed.")

    return results
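

# Optional helper, a sketch the driver below does not call: persist results
# from run_comprehensive_benchmark so pre- and post-Metal runs can be diffed
# later (the filename is just an example).
def save_results(results, path="svd_benchmark_results.json"):
    import json

    with open(path, "w") as f:
        json.dump(results, f, indent=2)  # tuples serialize as JSON lists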


def benchmark_batch_processing():
    """Benchmark batch processing capabilities."""
    print("\n" + "=" * 50)
    print("BATCH PROCESSING BENCHMARK")
    print("=" * 50)

    matrix_size = (128, 128)
    batch_sizes = [1, 2, 4, 8, 16, 32]

    print(f"{'Batch Size':<12} {'CPU (ms)':<12} {'GPU (ms)':<12} {'Speedup':<10}")
    print("-" * 50)

    for batch_size in batch_sizes:
        # Create batch of matrices
        matrices = []
        for _ in range(batch_size):
            matrices.append(create_test_matrix(*matrix_size))

        batch_matrix = mx.stack(matrices, axis=0)
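
        # batch_matrix has shape (batch_size, 128, 128); mx.linalg.svd applies
        # to the last two axes, so a single call decomposes the whole batch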

        try:
            cpu_stats = benchmark_svd_operation(
                batch_matrix, True, "cpu", warmup_runs=2, benchmark_runs=5
            )
            gpu_stats = benchmark_svd_operation(
                batch_matrix, True, "gpu", warmup_runs=2, benchmark_runs=5
            )

            cpu_time = cpu_stats["mean_time"] * 1000
            gpu_time = gpu_stats["mean_time"] * 1000
            speedup = cpu_time / gpu_time

            print(
                f"{batch_size:<12} {cpu_time:<12.2f} {gpu_time:<12.2f} {speedup:<10.2f}"
            )

        except Exception:
            print(f"{batch_size:<12} {'ERROR':<12} {'ERROR':<12} {'N/A':<10}")


def verify_correctness():
    """Verify that GPU results match CPU results."""
    print("\n" + "=" * 50)
    print("CORRECTNESS VERIFICATION")
    print("=" * 50)

    test_sizes = [(64, 64), (128, 128), (100, 150)]

    for m, n in test_sizes:
        matrix = create_test_matrix(m, n)

        # CPU computation
        mx.set_default_device(mx.cpu)
        cpu_matrix = mx.array(matrix)
        u_cpu, s_cpu, vt_cpu = mx.linalg.svd(cpu_matrix, compute_uv=True)
        mx.eval(u_cpu, s_cpu, vt_cpu)

        # GPU computation
        try:
            mx.set_default_device(mx.gpu)
            gpu_matrix = mx.array(matrix)
            u_gpu, s_gpu, vt_gpu = mx.linalg.svd(gpu_matrix, compute_uv=True)
            mx.eval(u_gpu, s_gpu, vt_gpu)
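
            # U and Vt are only unique up to per-column sign flips (and
            # rotations within repeated singular values), so the backends are
            # compared on singular values and on the reconstruction
            # A ≈ U @ diag(S) @ Vt rather than entrywise on U and Vt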

            # Compare singular values (most important)
            s_diff = mx.abs(s_cpu - s_gpu)
            max_s_diff = mx.max(s_diff).item()

            # Reconstruction test
            reconstructed_cpu = u_cpu @ mx.diag(s_cpu) @ vt_cpu
            reconstructed_gpu = u_gpu @ mx.diag(s_gpu) @ vt_gpu

            recon_diff = mx.abs(cpu_matrix - reconstructed_cpu)
            max_recon_diff_cpu = mx.max(recon_diff).item()

            recon_diff = mx.abs(gpu_matrix - reconstructed_gpu)
            max_recon_diff_gpu = mx.max(recon_diff).item()

            print(f"Size {m}x{n}:")
            print(f" Max singular value difference: {max_s_diff:.2e}")
            print(f" Max reconstruction error (CPU): {max_recon_diff_cpu:.2e}")
            print(f" Max reconstruction error (GPU): {max_recon_diff_gpu:.2e}")

            if max_s_diff < 1e-5 and max_recon_diff_gpu < 1e-5:
                print(" Status: ✓ PASS")
            else:
                print(" Status: ✗ FAIL")

        except Exception as e:
            print(f"Size {m}x{n}: ✗ ERROR - {str(e)}")


if __name__ == "__main__":
    print("Starting MLX SVD Benchmark...")
    print("This benchmark compares CPU vs GPU performance for SVD operations.")
    print("Run this before and after implementing Metal SVD to measure improvements.\n")

    # Run all benchmarks
    results = run_comprehensive_benchmark()
    benchmark_batch_processing()
    verify_correctness()

    print("\nBenchmark completed!")
    print("Save these results to compare with post-implementation performance.")