#!/usr/bin/env python3
"""
Benchmark script for SVD operations comparing CPU vs Metal performance.
This benchmark should be run before and after the Metal SVD implementation
to measure performance improvements.
"""

import time
from typing import Dict, List, Tuple

import mlx.core as mx
import numpy as np


def benchmark_svd_sizes() -> List[Tuple[int, int]]:
    """Return the ``(rows, cols)`` matrix shapes exercised by the benchmark.

    Square shapes from 32 up to 1024 come first, followed by wide shapes
    whose column count is twice the row count.
    """
    square_dims = (32, 64, 128, 256, 512, 1024)
    wide_rows = (64, 128, 256, 512)

    shapes = [(dim, dim) for dim in square_dims]
    shapes.extend((rows, 2 * rows) for rows in wide_rows)
    return shapes


def create_test_matrix(m: int, n: int, dtype=mx.float32) -> mx.array:
    """Create a deterministic ``m x n`` test matrix for SVD benchmarking.

    The matrix is built as ``U @ diag(s) @ V`` where ``U`` (``m x k``) and
    ``V`` (``k x n``) are Gaussian random factors, ``k = min(m, n)``, and
    ``s`` holds ``k`` log-spaced weights decreasing from 1 to 1e-3.

    ``U`` and ``V`` are *not* orthonormal, so ``s`` only sets the relative
    scale of the ``k`` rank-one terms; the singular values of the returned
    matrix are not exactly ``s``.

    Args:
        m: Number of rows.
        n: Number of columns.
        dtype: MLX dtype of the returned array.

    Returns:
        An ``mx.array`` of shape ``(m, n)``. Repeated calls with the same
        arguments produce the same values.
    """
    k = min(m, n)

    # A private generator seeded with 42 yields the same stream as
    # ``np.random.seed(42)`` followed by ``np.random.randn``, but it leaves
    # NumPy's global RNG state untouched for the caller.
    rng = np.random.RandomState(42)
    U = rng.randn(m, k).astype(np.float32)
    V = rng.randn(k, n).astype(np.float32)

    # Log-spaced weights from 1 down to 1e-3.
    s = np.logspace(0, -3, k).astype(np.float32)

    # Scaling the columns of U by s is U @ diag(s) without materialising the
    # k x k diagonal matrix. V already has exactly k rows for every (m, n),
    # so no m < n special case is required.
    A = (U * s) @ V

    return mx.array(A, dtype=dtype)


def benchmark_svd_operation(
    matrix: mx.array,
    compute_uv: bool = True,
    device: str = "gpu",
    warmup_runs: int = 3,
    benchmark_runs: int = 10,
) -> Dict[str, float]:
    """Time ``mx.linalg.svd`` on ``matrix`` with warmup and repeated runs.

    The default MLX device is switched for the duration of the measurement
    and restored afterwards, so calling this function does not leak a device
    change into the caller.

    Args:
        matrix: Input array passed to ``mx.linalg.svd``.
        compute_uv: When True time the full decomposition (U, S, Vt);
            otherwise time the singular-values-only variant.
        device: ``"cpu"`` selects ``mx.cpu``; any other value selects
            ``mx.gpu``.
        warmup_runs: Number of untimed runs executed before measuring.
        benchmark_runs: Number of timed runs; must be at least 1.

    Returns:
        Dict with ``mean_time``, ``std_time``, ``min_time`` and ``max_time``
        in seconds, computed over the timed runs.

    Raises:
        ValueError: If ``benchmark_runs`` is smaller than 1.
    """
    if benchmark_runs < 1:
        # Statistics over zero samples are undefined; fail with a clear
        # message instead of NumPy's empty-reduction error.
        raise ValueError("benchmark_runs must be at least 1")

    previous_device = mx.default_device()
    mx.set_default_device(mx.cpu if device == "cpu" else mx.gpu)
    try:
        # Fresh array created after the device switch.
        # NOTE(review): confirm that the installed mx.array accepts `copy=`.
        work_matrix = mx.array(matrix, copy=True)

        def run_once() -> None:
            # mx.eval is part of the run so the timed region covers the
            # evaluation of every requested output.
            if compute_uv:
                u, s, vt = mx.linalg.svd(work_matrix, compute_uv=True)
                mx.eval(u, s, vt)
            else:
                s = mx.linalg.svd(work_matrix, compute_uv=False)
                mx.eval(s)

        # Warmup runs (untimed)
        for _ in range(warmup_runs):
            run_once()

        # Benchmark runs
        times = []
        for _ in range(benchmark_runs):
            start_time = time.perf_counter()
            run_once()
            times.append(time.perf_counter() - start_time)
    finally:
        # Undo the global device switch even when the SVD raises.
        mx.set_default_device(previous_device)

    return {
        "mean_time": float(np.mean(times)),
        "std_time": float(np.std(times)),
        "min_time": float(np.min(times)),
        "max_time": float(np.max(times)),
    }


def run_comprehensive_benchmark():
    """Run the SVD benchmark over all sizes on CPU and GPU and print a report.

    For every shape from ``benchmark_svd_sizes()`` and for both modes
    (singular values only, then full SVD) the CPU and GPU timings are
    measured with ``benchmark_svd_operation`` and printed as a table row,
    followed by a speedup summary.

    Returns:
        A list with one dict per row whose CPU benchmark succeeded, with keys
        ``size``, ``compute_uv``, ``cpu_time`` (ms), ``gpu_time`` (ms),
        ``speedup`` and ``status``. A failed GPU run is recorded with
        ``gpu_time=inf`` and ``speedup=0.0``; a failed CPU run is only
        printed and not recorded.
    """

    print("MLX SVD Performance Benchmark")
    print("=" * 50)
    print(f"Device: {mx.default_device()}")
    print(f"MLX Version: {mx.__version__ if hasattr(mx, '__version__') else 'Unknown'}")
    print()

    sizes = benchmark_svd_sizes()
    results = []

    # Test both singular values only and full SVD
    for compute_uv in [False, True]:
        mode = "Full SVD" if compute_uv else "Singular Values Only"
        print(f"\n{mode}")
        print("-" * 30)
        print(
            f"{'Size':<12} {'CPU (ms)':<12} {'GPU (ms)':<12} {'Speedup':<10} {'Status'}"
        )
        print("-" * 60)

        for m, n in sizes:
            # Pad the complete "MxN" label, not just n, so every row lines
            # up with the 12-character "Size" header column.
            size_label = f"{m}x{n}"
            matrix = create_test_matrix(m, n)

            try:
                # CPU benchmark
                cpu_stats = benchmark_svd_operation(matrix, compute_uv, "cpu")
                cpu_time = cpu_stats["mean_time"] * 1000  # Convert to ms

                # GPU benchmark; a failure here still reports the CPU timing
                try:
                    gpu_stats = benchmark_svd_operation(matrix, compute_uv, "gpu")
                    gpu_time = gpu_stats["mean_time"] * 1000  # Convert to ms
                    speedup = cpu_time / gpu_time
                    status = "✓"
                except Exception as e:
                    gpu_time = float("inf")
                    speedup = 0.0
                    status = f"✗ ({str(e)[:20]}...)"

                print(
                    f"{size_label:<12} {cpu_time:<12.2f} {gpu_time:<12.2f} {speedup:<10.2f} {status}"
                )

                results.append(
                    {
                        "size": (m, n),
                        "compute_uv": compute_uv,
                        "cpu_time": cpu_time,
                        "gpu_time": gpu_time,
                        "speedup": speedup,
                        "status": status,
                    }
                )

            except Exception as e:
                print(
                    f"{size_label:<12} {'ERROR':<12} {'ERROR':<12} {'N/A':<10} ✗ {str(e)[:30]}..."
                )

    # Summary statistics
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)

    # A speedup of 0.0 marks a failed GPU run, so it is left out of the stats
    successful_results = [r for r in results if r["speedup"] > 0]
    if successful_results:
        speedups = [r["speedup"] for r in successful_results]
        print(f"Average Speedup: {np.mean(speedups):.2f}x")
        print(f"Max Speedup: {np.max(speedups):.2f}x")
        print(f"Min Speedup: {np.min(speedups):.2f}x")
        print(f"Successful Tests: {len(successful_results)}/{len(results)}")
    else:
        print("No successful GPU tests completed.")

    return results


def benchmark_batch_processing():
    """Benchmark full SVD on stacked batches of 128x128 matrices.

    For each batch size a batch is built by stacking the test matrix along a
    new leading axis, the full SVD is timed on CPU and GPU with
    ``benchmark_svd_operation`` (2 warmup runs, 5 timed runs), and one table
    row is printed. A failure for a batch size is reported in its row,
    including the start of the error message, and the loop continues.
    """
    print("\n" + "=" * 50)
    print("BATCH PROCESSING BENCHMARK")
    print("=" * 50)

    matrix_size = (128, 128)
    batch_sizes = [1, 2, 4, 8, 16, 32]

    print(f"{'Batch Size':<12} {'CPU (ms)':<12} {'GPU (ms)':<12} {'Speedup':<10}")
    print("-" * 50)

    # create_test_matrix is deterministic, so every batch entry is the same
    # matrix; build it once instead of once per batch element.
    base_matrix = create_test_matrix(*matrix_size)

    for batch_size in batch_sizes:
        batch_matrix = mx.stack([base_matrix] * batch_size, axis=0)

        try:
            cpu_stats = benchmark_svd_operation(
                batch_matrix, True, "cpu", warmup_runs=2, benchmark_runs=5
            )
            gpu_stats = benchmark_svd_operation(
                batch_matrix, True, "gpu", warmup_runs=2, benchmark_runs=5
            )

            cpu_time = cpu_stats["mean_time"] * 1000
            gpu_time = gpu_stats["mean_time"] * 1000
            speedup = cpu_time / gpu_time

            print(
                f"{batch_size:<12} {cpu_time:<12.2f} {gpu_time:<12.2f} {speedup:<10.2f}"
            )

        except Exception as e:
            # Show why the row failed rather than dropping the error silently
            print(
                f"{batch_size:<12} {'ERROR':<12} {'ERROR':<12} {'N/A':<10} ✗ {str(e)[:30]}..."
            )


def _reconstruct_from_svd(u, s, vt):
    """Return ``U @ diag(S) @ Vt`` using only the leading ``len(S)`` factors."""
    # Slicing to k = len(S) makes the product well-formed for non-square
    # inputs when U / Vt come back as full square matrices, and is a no-op
    # when they are already in reduced form.
    k = s.shape[-1]
    return u[:, :k] @ mx.diag(s) @ vt[:k, :]


def verify_correctness():
    """Compare the GPU SVD against the CPU SVD on a few test matrices.

    For each test shape the full SVD is computed on CPU and on GPU, and the
    function prints the largest absolute difference between the two sets of
    singular values plus the largest absolute reconstruction error of each
    decomposition. A shape passes when the singular value difference and the
    GPU reconstruction error are both below 1e-5. GPU-side failures are
    reported per shape and do not stop the remaining checks.

    The default MLX device active on entry is restored on exit.
    """
    print("\n" + "=" * 50)
    print("CORRECTNESS VERIFICATION")
    print("=" * 50)

    test_sizes = [(64, 64), (128, 128), (100, 150)]

    previous_device = mx.default_device()
    try:
        for m, n in test_sizes:
            matrix = create_test_matrix(m, n)

            # CPU computation (reference)
            mx.set_default_device(mx.cpu)
            cpu_matrix = mx.array(matrix, copy=True)
            u_cpu, s_cpu, vt_cpu = mx.linalg.svd(cpu_matrix, compute_uv=True)
            mx.eval(u_cpu, s_cpu, vt_cpu)

            # GPU computation
            try:
                mx.set_default_device(mx.gpu)
                gpu_matrix = mx.array(matrix, copy=True)
                u_gpu, s_gpu, vt_gpu = mx.linalg.svd(gpu_matrix, compute_uv=True)
                mx.eval(u_gpu, s_gpu, vt_gpu)

                # Compare singular values (most important)
                max_s_diff = mx.max(mx.abs(s_cpu - s_gpu)).item()

                # Reconstruction test
                reconstructed_cpu = _reconstruct_from_svd(u_cpu, s_cpu, vt_cpu)
                reconstructed_gpu = _reconstruct_from_svd(u_gpu, s_gpu, vt_gpu)

                max_recon_diff_cpu = mx.max(
                    mx.abs(cpu_matrix - reconstructed_cpu)
                ).item()
                max_recon_diff_gpu = mx.max(
                    mx.abs(gpu_matrix - reconstructed_gpu)
                ).item()

                print(f"Size {m}x{n}:")
                print(f"  Max singular value difference: {max_s_diff:.2e}")
                print(f"  Max reconstruction error (CPU): {max_recon_diff_cpu:.2e}")
                print(f"  Max reconstruction error (GPU): {max_recon_diff_gpu:.2e}")

                if max_s_diff < 1e-5 and max_recon_diff_gpu < 1e-5:
                    print("  Status: ✓ PASS")
                else:
                    print("  Status: ✗ FAIL")

            except Exception as e:
                print(f"Size {m}x{n}: ✗ ERROR - {str(e)}")
    finally:
        # Do not leak the device switches above into the caller.
        mx.set_default_device(previous_device)


if __name__ == "__main__":
    # Intro banner; the last entry ends with a newline so that a blank line
    # separates the banner from the first report.
    print(
        "Starting MLX SVD Benchmark...",
        "This benchmark compares CPU vs GPU performance for SVD operations.",
        "Run this before and after implementing Metal SVD to measure improvements.\n",
        sep="\n",
    )

    # Timing tables first, then the batch table, then the CPU/GPU agreement
    # check.
    results = run_comprehensive_benchmark()
    benchmark_batch_processing()
    verify_correctness()

    print(
        "\nBenchmark completed!",
        "Save these results to compare with post-implementation performance.",
        sep="\n",
    )