# Copyright © 2024 Apple Inc.

import matplotlib
import mlx.core as mx
import numpy as np
import sympy
import torch
from time_utils import measure_runtime

# Select the non-interactive "Agg" backend before pyplot is imported, so the
# figures produced below can be written to PNG files without a display.
matplotlib.use("Agg")
import matplotlib.pyplot as plt


def bandwidth_gb(runtime_ms, system_size):
    """Convert a measured FFT runtime into an effective bandwidth in GB/s.

    Args:
        runtime_ms: Runtime of the transform, in milliseconds.
        system_size: Number of complex64 elements processed.

    Returns:
        Bytes moved per second divided by 1e9, where each element is
        counted twice (presumably one read plus one write -- confirm).
    """
    # Each complex64 element is counted twice in the traffic estimate.
    element_traffic_bytes = np.dtype("complex64").itemsize * 2
    total_bytes = system_size * element_traffic_bytes
    bytes_per_ms = total_bytes / runtime_ms
    bytes_per_s = bytes_per_ms * 1e3
    return bytes_per_s / 1e9


def run_bench(system_size, fft_sizes, backend="mlx", dim=1):
    """Benchmark batched FFTs of several sizes and report their bandwidth.

    For every ``n`` in ``fft_sizes`` a random complex64 input of shape
    ``[system_size // n**dim] + [n] * dim`` is created, the transform is
    timed with ``measure_runtime`` and the result is converted to GB/s
    with ``bandwidth_gb``. Each ``(n, bandwidth)`` pair is printed.

    Args:
        system_size: Target number of elements per benchmarked array; the
            batch dimension is chosen so the array holds at most this many.
        fft_sizes: Iterable of FFT lengths to benchmark.
        backend: ``"mlx"`` to use ``mx.fft`` or ``"mps"`` to use
            ``torch.fft`` on the ``"mps"`` device.
        dim: Number of transformed dimensions; 1 uses ``fft``, 2 uses
            ``fft2``.

    Returns:
        A NumPy array with one bandwidth value per entry of ``fft_sizes``.

    Raises:
        ValueError: If ``dim`` is not 1 or 2.
        NotImplementedError: If ``backend`` is not ``"mlx"`` or ``"mps"``.
    """
    # Fail fast: previously an unsupported dim left `out` unbound inside the
    # helpers and surfaced as an UnboundLocalError in the middle of a run.
    if dim not in (1, 2):
        raise ValueError(f"dim must be 1 or 2, got {dim!r}")
    if backend not in ("mlx", "mps"):
        raise NotImplementedError(f"Unsupported backend: {backend!r}")

    def fft_mlx(x):
        out = mx.fft.fft(x) if dim == 1 else mx.fft.fft2(x)
        # Evaluate the result inside the timed function.
        mx.eval(out)
        return out

    def fft_mps(x):
        out = torch.fft.fft(x) if dim == 1 else torch.fft.fft2(x)
        # Synchronize with the MPS device inside the timed function.
        torch.mps.synchronize()
        return out

    bandwidths = []
    for n in fft_sizes:
        batch_size = system_size // n**dim
        shape = [batch_size] + [n] * dim
        # Build the input with the same shape that is used for the bandwidth
        # computation, so 2-D runs actually transform (batch, n, n) arrays.
        x_np = np.random.uniform(size=shape).astype(np.complex64)
        if backend == "mlx":
            x = mx.array(x_np)
            mx.eval(x)
            fft = fft_mlx
        else:
            x = torch.tensor(x_np, device="mps")
            torch.mps.synchronize()
            fft = fft_mps
        # NOTE(review): measure_runtime comes from time_utils; it is assumed
        # to return milliseconds, as the variable name suggests -- confirm.
        runtime_ms = measure_runtime(fft, x=x)
        bandwidth = bandwidth_gb(runtime_ms, np.prod(shape))
        print(n, bandwidth)
        bandwidths.append(bandwidth)

    return np.array(bandwidths)


def time_fft():
    """Run the FFT benchmarks, save scatter plots and print summary stats.

    FFT sizes 2..511 are benchmarked with MLX on the GPU stream, with the
    "mps" backend, and with MLX on the CPU stream (using a smaller system
    size). One PNG is written per size category, then the mean bandwidths
    and the percentage of sizes where MLX GPU beats MPS are printed.
    """
    sizes = np.array(range(2, 512))
    gpu_system_size = int(2**26)
    cpu_system_size = int(2**20)

    print("MLX GPU")
    with mx.stream(mx.gpu):
        gpu_bandwidths = run_bench(system_size=gpu_system_size, fft_sizes=sizes)

    print("MPS GPU")
    mps_bandwidths = run_bench(
        system_size=gpu_system_size, fft_sizes=sizes, backend="mps"
    )

    print("CPU")
    with mx.stream(mx.cpu):
        cpu_bandwidths = run_bench(system_size=cpu_system_size, fft_sizes=sizes)

    first_size = sizes[0]

    def positions_where(accepts):
        # Sizes are consecutive, so (size - first_size) is the array index.
        chosen = [n for n in sizes if accepts(sympy.primefactors(n))]
        return np.array(chosen) - first_size

    categories = [
        (sizes - first_size, "All"),
        (positions_where(lambda primes: all(p <= 13 for p in primes)), "Radix 2-13"),
        (positions_where(lambda primes: any(p > 13 for p in primes)), "Bluestein's"),
    ]
    series = [
        (gpu_bandwidths, "green", "GPU"),
        (mps_bandwidths, "blue", "MPS"),
        (cpu_bandwidths, "red", "CPU"),
    ]

    for positions, name in categories:
        # plot bandwidths
        print(name)
        for values, color, label in series:
            plt.scatter(sizes[positions], values[positions], color=color, label=label)
        plt.title(f"MLX FFT Benchmark -- {name}")
        plt.xlabel("N")
        plt.ylabel("Bandwidth (GB/s)")
        plt.legend()
        plt.savefig(f"{name}.png")
        plt.clf()

    print("Average bandwidths:")
    print("GPU:", np.mean(gpu_bandwidths))
    print("MPS:", np.mean(mps_bandwidths))
    print("CPU:", np.mean(cpu_bandwidths))

    mlx_wins = np.count_nonzero(gpu_bandwidths > mps_bandwidths)
    portion_faster = mlx_wins / len(sizes)
    print("Percent MLX faster than MPS: ", portion_faster * 100)


# Run the full benchmark only when this file is executed as a script.
if __name__ == "__main__":
    time_fft()