CMakeLists.txt update

WIP (python)
WIP
2025-12-16 01:49:05 +08:00 · 2025-10-31 16:55:04 -07:00 · 2025-10-31 16:24:51 -07:00 · 2025-10-31 16:24:35 -07:00 · 2025-10-31 16:24:21 -07:00 · 2025-10-31 16:24:09 -07:00
346 changed files with 13765 additions and 5042 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -18,16 +18,17 @@ jobs:
        type: boolean
        default: false
    macos:
-      xcode: "16.2.0"
-    resource_class: m2pro.medium
+      xcode: "26.0.0"
+    resource_class: m4pro.medium
    steps:
      - checkout
      - run:
          name: Install
          command: |
-            brew install python@3.9
+            xcodebuild -downloadComponent MetalToolchain
+            brew install python@3.10
            brew install doxygen
-            python3.9 -m venv env
+            python3.10 -m venv env
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
@@ -89,7 +90,8 @@ jobs:
          command: |
            uv venv
            uv pip install cmake
-            uv pip install -e ".[dev]" -v
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+              uv pip install -e ".[dev]" -v
      - run:
          name: Generate package stubs
          command: |
@@ -118,7 +120,7 @@ jobs:
    parameters:
      xcode_version:
        type: string
-        default: "16.2.0"
+        default: "26.0.0"
      macosx_deployment_target:
        type: string
        default: ""
@@ -126,18 +128,19 @@ jobs:
      xcode: << parameters.xcode_version >>
    environment:
      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
-    resource_class: m2pro.medium
+    resource_class: m4pro.medium
    steps:
      - checkout
      - run:
          name: Install dependencies
          command: |
+            xcodebuild -downloadComponent MetalToolchain
            HOMEBREW_NO_AUTO_UPDATE=1 HOMEBREW_NO_INSTALL_CLEANUP=1 \
              brew install openmpi uv
      - run:
          name: Install Python package
          command: |
-            uv venv --python 3.9
+            uv venv --python 3.10
            uv pip install \
              nanobind==2.4.0 \
              cmake \
@@ -196,7 +199,7 @@ jobs:
          name: Run Python tests with JIT
          command: |
            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
-              uv pip install -e .
+              uv pip install -e . -v
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
              METAL_DEBUG_ERROR_MODE=0 \
              uv run --no-project python -m xmlrunner discover \
@@ -222,15 +225,20 @@ jobs:
            sudo apt-get update
            sudo apt-get install libcudnn9-dev-cuda-12
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install libnccl2 libnccl-dev
            curl -sL https://github.com/ccache/ccache/releases/download/v4.11.3/ccache-4.11.3-linux-x86_64.tar.xz | tar xJf -
            sudo mv ccache-4.11.3-linux-x86_64/ccache /usr/bin/ccache
            rm -rf ccache-4.11.3-linux-x86_64
            curl -LsSf https://astral.sh/uv/install.sh | sh
+      - run:
+          name: Set CCache size
+          command: ccache --max-size 1G
      - run:
          name: Install Python package
          command: |
            uv venv
-            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+            uv pip install cmake
+            DEBUG=1 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
              uv pip install -e ".[dev]" -v
      - run:
          name: Run Python tests
@@ -238,12 +246,23 @@ jobs:
            source .venv/bin/activate
            LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
            LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
+      - run:
+          name: Build CPP only
+          command: |
+            source .venv/bin/activate
+            cmake . -B build \
+              -DMLX_BUILD_CUDA=ON \
+              -DCMAKE_CUDA_COMPILER=`which nvcc` \
+              -DCMAKE_BUILD_TYPE=DEBUG
+            cmake --build build -j `nproc`
+      - run:
+          name: Run CPP tests
+          command: ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
      - run:
          name: CCache report
          command: |
            ccache --show-stats
            ccache --zero-stats
-            ccache --max-size 400MB
            ccache --cleanup
      - save_cache:
          key: cuda-<< parameters.image_date >>-{{ arch }}-{{ epoch }}
@@ -254,10 +273,10 @@ jobs:
    parameters:
      python_version:
        type: string
-        default: "3.9"
+        default: "3.10"
      xcode_version:
        type: string
-        default: "16.2.0"
+        default: "26.0.0"
      build_env:
        type: string
        default: ""
@@ -266,7 +285,7 @@ jobs:
        default: ""
    macos:
      xcode: << parameters.xcode_version >>
-    resource_class: m2pro.medium
+    resource_class: m4pro.medium
    environment:
      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
    steps:
@@ -274,11 +293,15 @@ jobs:
      - run:
          name: Install dependencies
          command: |
-            brew install python@<< parameters.python_version >>
-            brew install openmpi
-            python<< parameters.python_version >> -m venv env
-            source env/bin/activate
-            pip install --upgrade pip
+            xcodebuild -downloadComponent MetalToolchain
+            mkdir -p ~/miniconda3
+            curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o ~/miniconda3/miniconda.sh
+            bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
+            rm ~/miniconda3/miniconda.sh
+            source ~/miniconda3/bin/activate
+            conda init --all
+            conda create -n env python=<< parameters.python_version >> -y
+            conda activate env
            pip install --upgrade cmake
            pip install nanobind==2.4.0
            pip install --upgrade setuptools
@@ -288,29 +311,29 @@ jobs:
      - run:
          name: Install Python package
          command: |
-            source env/bin/activate
+            conda activate env
            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
              pip install . -v
      - run:
          name: Generate package stubs
          command: |
-            source env/bin/activate
+            conda activate env
            pip install typing_extensions
            python setup.py generate_stubs
      - run:
          name: Build Python package
          command: |
-            source env/bin/activate
+            conda activate env
            python setup.py clean --all
            << parameters.build_env >> MLX_BUILD_STAGE=1 python -m build -w
      - when:
          condition:
-            equal: ["3.9", << parameters.python_version >>]
+            equal: ["3.10", << parameters.python_version >>]
          steps:
            - run:
                name: Build common package
                command: |
-                  source env/bin/activate
+                  conda activate env
                  python setup.py clean --all
                  << parameters.build_env >> MLX_BUILD_STAGE=2 python -m build -w
      - when:
@@ -319,7 +342,7 @@ jobs:
            - run:
                name: Upload package
                command: |
-                  source env/bin/activate
+                  conda activate env
                  twine upload dist/*
      - store_artifacts:
          path: dist/
@@ -328,7 +351,7 @@ jobs:
    parameters:
      python_version:
        type: string
-        default: "3.9"
+        default: "3.10"
      build_env:
        type: string
        default: ""
@@ -364,7 +387,7 @@ jobs:
            bash python/scripts/repair_linux.sh
      - when:
          condition:
-            equal: ["3.9", << parameters.python_version >>]
+            equal: ["3.10", << parameters.python_version >>]
          steps:
            - run:
                name: Build common package
@@ -392,7 +415,7 @@ jobs:
        default: ""
    machine:
      image: ubuntu-2204:current
-      resource_class: large
+      resource_class: xlarge
    steps:
      - checkout
      - run:
@@ -439,7 +462,7 @@ workflows:
      - mac_build_and_test:
          matrix:
            parameters:
-              macosx_deployment_target: ["13.5", "14.0"]
+              macosx_deployment_target: ["13.5", "15.0"]
      - linux_build_and_test
      - cuda_build_and_test:
          matrix:
@@ -461,71 +484,10 @@ workflows:
              ignore: /.*/
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              macosx_deployment_target: ["13.5", "14.0", "15.0"]
              build_env: ["PYPI_RELEASE=1"]
-              xcode_version: ["16.2.0", "15.0.0"]
-            exclude:
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.9"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.10"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.11"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.12"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.13"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-                build_env: "PYPI_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-                build_env: "PYPI_RELEASE=1"
+              xcode_version: ["26.0.0"]
      - build_documentation:
          filters:
            tags:
@@ -541,7 +503,7 @@ workflows:
              ignore: /.*/
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              build_env: ["PYPI_RELEASE=1"]
      - build_cuda_release:
          filters:
@@ -567,7 +529,7 @@ workflows:
          requires: [ hold ]
          matrix:
            parameters:
-              macosx_deployment_target: ["13.5", "14.0"]
+              macosx_deployment_target: ["13.5", "15.0"]
      - linux_build_and_test:
          requires: [ hold ]
      - cuda_build_and_test:
@@ -584,59 +546,13 @@ workflows:
      - build_release:
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              macosx_deployment_target: ["13.5", "14.0", "15.0"]
-              xcode_version: ["16.2.0", "15.0.0"]
-            exclude:
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.9"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.10"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.11"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.12"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.13"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
+              xcode_version: ["26.0.0"]
      - build_linux_release:
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
      - build_cuda_release

  build_dev_release:
@@ -648,75 +564,14 @@ workflows:
      - build_release:
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              macosx_deployment_target: ["13.5", "14.0", "15.0"]
              build_env: ["DEV_RELEASE=1"]
-              xcode_version: ["16.2.0", "15.0.0"]
-            exclude:
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.9"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.10"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.11"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.12"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.13"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-                build_env: "DEV_RELEASE=1"
+              xcode_version: ["26.0.0"]
      - build_linux_release:
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              build_env: ["DEV_RELEASE=1"]
      - build_cuda_release:
          matrix:
--- a/ACKNOWLEDGMENTS.md
+++ b/ACKNOWLEDGMENTS.md
@@ -19,12 +19,17 @@ MLX was developed with contributions from the following individuals:
 - Gleb Pobudzey: Added the `where` primitive, and groups in 1D and 2D convolutions.
 - Paul Paczuski: Improved stability of BCE loss calculation
 - Max-Heinrich Laves: Added `conv_transpose1d`, `conv_transpose2d`, and `conv_transpose3d` ops.
-- Gökdeniz Gülmez: Added the `Muon (MomentUm Orthogonalized by Newton-schulz)` optimizer.
+- Gökdeniz Gülmez: Added the `Muon (MomentUm Orthogonalized by Newton-schulz)` optimizer, and the `ReLU²` activation function.

 <a href="https://github.com/ml-explore/mlx/graphs/contributors">
  <img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />
 </a>

+# Organizations
+
+MLX has received contributions from the following companies:
+- NVIDIA Corporation & Affiliates
+
 # Third-Party Software

 MLX leverages several third-party software, listed here together with
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,12 +20,17 @@ project(
  LANGUAGES C CXX
  VERSION ${MLX_PROJECT_VERSION})

+if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+  add_compile_options(-Wall -Wextra)
+endif()
+
 # ----------------------------- Setup -----------------------------
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_INSTALL_MESSAGE NEVER)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

 # ----------------------------- Configuration -----------------------------
 option(MLX_BUILD_TESTS "Build tests for mlx" ON)
@@ -87,22 +92,21 @@ cmake_policy(SET CMP0135 NEW)

 add_library(mlx)

-if(MLX_BUILD_METAL)
-  set(METAL_LIB "-framework Metal")
-  set(FOUNDATION_LIB "-framework Foundation")
-  set(QUARTZ_LIB "-framework QuartzCore")
-endif()
-
 if(MLX_BUILD_CUDA)
  enable_language(CUDA)
 endif()

-if(MLX_BUILD_METAL AND NOT METAL_LIB)
-  message(STATUS "Metal not found. Unable to build GPU")
-  set(MLX_BUILD_METAL OFF)
-  set(MLX_METAL_DEBUG OFF)
-elseif(MLX_BUILD_METAL)
-  message(STATUS "Building METAL sources")
+if(MLX_BUILD_METAL)
+  find_library(METAL_LIB Metal)
+  find_library(FOUNDATION_LIB Foundation)
+  find_library(QUARTZ_LIB QuartzCore)
+  if(METAL_LIB)
+    message(STATUS "Metal found ${METAL_LIB}")
+  else()
+    message(
+      FATAL_ERROR
+        "Metal not found. Set MLX_BUILD_METAL=OFF to build without GPU")
+  endif()

  if(MLX_METAL_DEBUG)
    add_compile_definitions(MLX_METAL_DEBUG)
@@ -111,7 +115,8 @@ elseif(MLX_BUILD_METAL)
  # Throw an error if xcrun not found
  execute_process(
    COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
-    OUTPUT_VARIABLE MACOS_SDK_VERSION COMMAND_ERROR_IS_FATAL ANY)
+    OUTPUT_VARIABLE MACOS_SDK_VERSION
+    OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ERROR_IS_FATAL ANY)

  if(${MACOS_SDK_VERSION} LESS 14.0)
    message(
@@ -140,6 +145,12 @@ elseif(MLX_BUILD_METAL)
  target_link_libraries(mlx PUBLIC ${METAL_LIB} ${FOUNDATION_LIB} ${QUARTZ_LIB})
 endif()

+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+  # With newer clang/gcc versions the following libs are implicitly linked, but
+  # when building on old distributions they need to be explicitly listed.
+  target_link_libraries(mlx PRIVATE dl pthread)
+endif()
+
 if(WIN32)
  if(MSVC)
    # GGUF does not build with MSVC.
@@ -167,7 +178,7 @@ if(MLX_BUILD_CPU)
    message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
    set(MLX_BUILD_ACCELERATE ON)
  else()
-    message(STATUS "Accelerate or arm neon not found, using default backend.")
+    message(STATUS "Accelerate not found, using default backend.")
    set(MLX_BUILD_ACCELERATE OFF)
  endif()

--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 [**Quickstart**](#quickstart) | [**Installation**](#installation) |
 [**Documentation**](https://ml-explore.github.io/mlx/build/html/index.html) |
-[**Examples**](#examples) 
+[**Examples**](#examples)

 [![CircleCI](https://circleci.com/gh/ml-explore/mlx.svg?style=svg)](https://circleci.com/gh/ml-explore/mlx)

@@ -11,37 +11,37 @@ brought to you by Apple machine learning research.

 Some key features of MLX include:

- - **Familiar APIs**: MLX has a Python API that closely follows NumPy. MLX
+- **Familiar APIs**: MLX has a Python API that closely follows NumPy. MLX
   also has fully featured C++, [C](https://github.com/ml-explore/mlx-c), and
   [Swift](https://github.com/ml-explore/mlx-swift/) APIs, which closely mirror
   the Python API. MLX has higher-level packages like `mlx.nn` and
   `mlx.optimizers` with APIs that closely follow PyTorch to simplify building
   more complex models.

- - **Composable function transformations**: MLX supports composable function
-   transformations for automatic differentiation, automatic vectorization,
-   and computation graph optimization.
+- **Composable function transformations**: MLX supports composable function
+  transformations for automatic differentiation, automatic vectorization,
+  and computation graph optimization.

- - **Lazy computation**: Computations in MLX are lazy. Arrays are only
-   materialized when needed.
+- **Lazy computation**: Computations in MLX are lazy. Arrays are only
+  materialized when needed.

- - **Dynamic graph construction**: Computation graphs in MLX are constructed
-   dynamically. Changing the shapes of function arguments does not trigger
-   slow compilations, and debugging is simple and intuitive.
+- **Dynamic graph construction**: Computation graphs in MLX are constructed
+  dynamically. Changing the shapes of function arguments does not trigger
+  slow compilations, and debugging is simple and intuitive.

- - **Multi-device**: Operations can run on any of the supported devices
-   (currently the CPU and the GPU).
+- **Multi-device**: Operations can run on any of the supported devices
+  (currently the CPU and the GPU).

- - **Unified memory**: A notable difference from MLX and other frameworks
-   is the *unified memory model*. Arrays in MLX live in shared memory.
-   Operations on MLX arrays can be performed on any of the supported
-   device types without transferring data.
+- **Unified memory**: A notable difference from MLX and other frameworks
+  is the *unified memory model*. Arrays in MLX live in shared memory.
+  Operations on MLX arrays can be performed on any of the supported
+  device types without transferring data.

 MLX is designed by machine learning researchers for machine learning
 researchers. The framework is intended to be user-friendly, but still efficient
 to train and deploy models. The design of the framework itself is also
 conceptually simple. We intend to make it easy for researchers to extend and
-improve MLX with the goal of quickly exploring new ideas. 
+improve MLX with the goal of quickly exploring new ideas.

 The design of MLX is inspired by frameworks like
 [NumPy](https://numpy.org/doc/stable/index.html),
@@ -91,7 +91,7 @@ Checkout the
 [documentation](https://ml-explore.github.io/mlx/build/html/install.html#)
 for more information on building the C++ and Python APIs from source.

-## Contributing 
+## Contributing

 Check out the [contribution guidelines](https://github.com/ml-explore/mlx/tree/main/CONTRIBUTING.md) for more information
 on contributing to MLX. See the
@@ -110,7 +110,7 @@ Hannun, Jagrit Digani, Angelos Katharopoulos, and Ronan Collobert. If you find
 MLX useful in your research and wish to cite it, please use the following
 BibTex entry:

-```
+```text
@software{mlx2023,
  author = {Awni Hannun and Jagrit Digani and Angelos Katharopoulos and Ronan Collobert},
  title = {{MLX}: Efficient and flexible machine learning on Apple silicon},
--- a/benchmarks/python/blas/bench_gemm.py
+++ b/benchmarks/python/blas/bench_gemm.py
@@ -142,9 +142,7 @@ def bench_shape(B, M, N, K, np_dtype, transpose="nn"):
    t_b = (0, 1, 2) if transpose[1] == "n" else (0, 2, 1)

    c_mlx = a_mx.transpose(t_a) @ b_mx.transpose(t_b)
-    c_npy = a_np.transpose(t_a).astype(np.float32) @ b_np.transpose(t_b).astype(
-        np.float32
-    )
+    c_npy = a_np.transpose(t_a).astype(np_dtype) @ b_np.transpose(t_b).astype(np_dtype)

    atol = 1e-5 if np_dtype == np.float32 else 1e-4

@@ -163,7 +161,7 @@ def get_gflop_count(B, M, N, K):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run gemm benchmarks")

-    dtypes = ("float32", "float16")
+    dtypes = ("float32", "float16", "complex64")
    transposes = ("nn", "nt", "tn")
    shapes = (
        (16, 234, 768, 3072),
@@ -187,7 +185,7 @@ if __name__ == "__main__":
                diff = gflops_mx / gflops_pt - 1.0

                print(
-                    f"{B:3d}, {M:4d}, {N:4d}, {K:4d}, {dtype}, {transpose}, {gflops_pt:05.3f}, {gflops_mx:05.3f}, {100. * diff:+5.2f}%"
+                    f"{B:3d}, {M:4d}, {N:4d}, {K:4d}, {dtype}, {transpose}, {gflops_pt:05.3f}, {gflops_mx:05.3f}, {100.0 * diff:+5.2f}%"
                )
                if gflops_pt >= 2.0 * gflops_mx:
                    print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/blas/bench_gemv.py
+++ b/benchmarks/python/blas/bench_gemv.py
@@ -196,7 +196,7 @@ def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, transpose):


 for transpose in (False, True):
-    for dtype in ("float32", "float16"):
+    for dtype in ("float32", "float16", "complex64"):
        fig, axs = plt.subplots(
            len(in_vec_sizes), 2, figsize=(8.5, 11), layout="constrained"
        )
@@ -215,7 +215,7 @@ for transpose in (False, True):
        fig.suptitle(f"{device_name}: {dtype} {op_name}")
        fig.savefig(
            os.path.join(
-                results_dir, f'{device_name.replace(" ", "_")}_{dtype}_{op_name}.pdf'
+                results_dir, f"{device_name.replace(' ', '_')}_{dtype}_{op_name}.pdf"
            )
        )
        plt.close(fig)
--- a/cmake/FindNCCL.cmake
+++ b/cmake/FindNCCL.cmake
@@ -0,0 +1,54 @@
+# FindNCCL.cmake: locates nccl.h and the NCCL library. Results are stored in
+# NCCL_INCLUDE_DIRS, NCCL_LIBRARIES, NCCL_FOUND and (if parsed) NCCL_MAJOR_VERSION.
+
+set(NCCL_ROOT_DIR
+    $ENV{NCCL_ROOT_DIR}
+    CACHE PATH "Folder contains NVIDIA NCCL")  # seeded from the NCCL_ROOT_DIR env var
+
+find_path(
+  NCCL_INCLUDE_DIRS
+  NAMES nccl.h
+  HINTS ${NCCL_INCLUDE_DIR} ${NCCL_ROOT_DIR} ${NCCL_ROOT_DIR}/include
+        ${CUDA_TOOLKIT_ROOT_DIR}/include)
+
+if($ENV{USE_STATIC_NCCL})  # env var opt-in: search for the static archive instead
+  message(
+    STATUS "USE_STATIC_NCCL detected. Linking against static NCCL library")
+  set(NCCL_LIBNAME "libnccl_static.a")
+else()
+  set(NCCL_LIBNAME "nccl")
+endif()
+
+find_library(
+  NCCL_LIBRARIES
+  NAMES ${NCCL_LIBNAME}
+  HINTS ${NCCL_LIB_DIR}
+        ${NCCL_ROOT_DIR}
+        ${NCCL_ROOT_DIR}/lib
+        ${NCCL_ROOT_DIR}/lib/x86_64-linux-gnu
+        ${NCCL_ROOT_DIR}/lib64
+        ${CUDA_TOOLKIT_ROOT_DIR}/lib
+        ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS
+                                  NCCL_LIBRARIES)  # sets NCCL_FOUND when both are set
+
+if(NCCL_FOUND)
+  set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
+  message(
+    STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}")
+  file(
+    STRINGS ${NCCL_HEADER_FILE} NCCL_MAJOR_VERSION_DEFINED
+    REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$"
+    LIMIT_COUNT 1)  # keep only the first matching line of the header
+  if(NCCL_MAJOR_VERSION_DEFINED)
+    string(REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+" ""
+                         NCCL_MAJOR_VERSION ${NCCL_MAJOR_VERSION_DEFINED})  # drop the macro prefix
+    message(STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}")
+  endif()
+  message(
+    STATUS
+      "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
+  mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)  # mark these cache entries as advanced
+endif()
--- a/docs/src/dev/custom_metal_kernels.rst
+++ b/docs/src/dev/custom_metal_kernels.rst
@@ -127,7 +127,8 @@ relying on a copy from ``ensure_row_contiguous``:
      name="myexp_strided",
      input_names=["inp"],
      output_names=["out"],
-      source=source
+      source=source,
+      ensure_row_contiguous=False,
  )

  def exp_elementwise(a: mx.array):
@@ -138,7 +139,6 @@ relying on a copy from ``ensure_row_contiguous``:
          threadgroup=(256, 1, 1),
          output_shapes=[a.shape],
          output_dtypes=[a.dtype],
-          ensure_row_contiguous=False,
      )
      return outputs[0]

--- a/docs/src/index.rst
+++ b/docs/src/index.rst
@@ -70,6 +70,7 @@ are the CPU and GPU.
   python/fft
   python/linalg
   python/metal
+   python/cuda
   python/memory_management
   python/nn
   python/optimizers
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -16,7 +16,7 @@ silicon computer is
 To install from PyPI your system must meet the following requirements:

 - Using an M series chip (Apple silicon)
-- Using a native Python >= 3.9
+- Using a native Python >= 3.10
 - macOS >= 13.5

 .. note::
@@ -39,7 +39,7 @@ requirements:
 - Nvidia driver >= 550.54.14
 - CUDA toolkit >= 12.0
 - Linux distribution with glibc >= 2.35
-- Python >= 3.9
+- Python >= 3.10


 CPU-only (Linux)
@@ -55,7 +55,7 @@ To install the CPU-only package from PyPi your system must meet the following
 requirements:

 - Linux distribution with glibc >= 2.35
-- Python >= 3.9
+- Python >= 3.10


 Troubleshooting
@@ -271,7 +271,7 @@ and the CUDA toolkit. For example on Ubuntu, run the following:
   dpkg -i cuda-keyring_1.1-1_all.deb
   apt-get update -y
   apt-get -y install cuda-toolkit-12-9
-   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+   apt-get install libblas-dev liblapack-dev liblapacke-dev libcudnn9-dev-cuda-12 -y


 When building either the Python or C++ APIs make sure to pass the cmake flag
--- a/docs/src/python/cuda.rst
+++ b/docs/src/python/cuda.rst
@@ -0,0 +1,9 @@
+CUDA
+=====
+
+.. currentmodule:: mlx.core.cuda
+
+.. autosummary::
+  :toctree: _autosummary
+
+  is_available
--- a/docs/src/python/fast.rst
+++ b/docs/src/python/fast.rst
@@ -13,3 +13,4 @@ Fast
  rope
  scaled_dot_product_attention
  metal_kernel
+  cuda_kernel
--- a/docs/src/python/nn/functions.rst
+++ b/docs/src/python/nn/functions.rst
@@ -27,6 +27,7 @@ simple functions.
   mish
   prelu
   relu
+   relu2
   relu6
   selu
   sigmoid
--- a/docs/src/python/nn/layers.rst
+++ b/docs/src/python/nn/layers.rst
@@ -50,6 +50,7 @@ Layers
   QuantizedLinear
   RMSNorm
   ReLU
+   ReLU2
   ReLU6
   RNN
   RoPE
--- a/docs/src/python/ops.rst
+++ b/docs/src/python/ops.rst
@@ -112,6 +112,7 @@ Operations
   max
   maximum
   mean
+   median
   meshgrid
   min
   minimum
--- a/docs/src/usage/compile.rst
+++ b/docs/src/usage/compile.rst
@@ -130,8 +130,8 @@ Now make an array, and benchmark both functions:
 .. code-block:: python

  x = mx.random.uniform(shape=(32, 1000, 4096))
-  timeit(nn.gelu, x)
-  timeit(mx.compile(nn.gelu), x)
+  timeit(gelu, x)
+  timeit(mx.compile(gelu), x)

 On an M1 Max the times are 15.5 and 3.1 milliseconds. The compiled ``gelu`` is
 five times faster.
@@ -225,7 +225,7 @@ In some cases returning updated state can be pretty inconvenient. Hence,
  def fun(x, y):
      z = x + y
      state.append(z)
-      return mx.exp(z), state
+      return mx.exp(z)

  fun(mx.array(1.0), mx.array(2.0))
  # Prints [array(3, dtype=float32)]
--- a/docs/src/usage/distributed.rst
+++ b/docs/src/usage/distributed.rst
@@ -184,7 +184,7 @@ almost identical to the example above:

    def step(model, x, y):
        loss, grads = loss_grad_fn(model, x, y)
-        grads = mlx.nn.average_gradients(grads) # <---- This line was added
+        grads = nn.average_gradients(grads)  # <---- This line was added
        optimizer.update(model, grads)
        return loss

--- a/docs/src/usage/export.rst
+++ b/docs/src/usage/export.rst
@@ -164,11 +164,11 @@ to export a function which can be used for inputs with variable shapes:

 .. code-block:: python

-  mx.export_function("fun.mlxfn", mx.abs, mx.array(0.0), shapeless=True)
+  mx.export_function("fun.mlxfn", mx.abs, mx.array([0.0]), shapeless=True)
  imported_abs = mx.import_function("fun.mlxfn")

  # Ok
-  out, = imported_abs(mx.array(-1.0))
+  out, = imported_abs(mx.array([-1.0]))

  # Also ok
  out, = imported_abs(mx.array([-1.0, -2.0]))
--- a/docs/src/usage/indexing.rst
+++ b/docs/src/usage/indexing.rst
@@ -107,8 +107,20 @@ same array:
  >>> a
  array([1, 2, 0], dtype=int32)

+Note that, unlike NumPy, slicing an array creates a copy, not a view, so
+mutating the slice does not mutate the original array:

-Note, unlike NumPy, updates to the same location are nondeterministic:
+.. code-block:: shell
+
+  >>> a = mx.array([1, 2, 3])
+  >>> b = a[:]
+  >>> b[2] = 0
+  >>> b
+  array([1, 2, 0], dtype=int32)
+  >>> a
+  array([1, 2, 3], dtype=int32)
+
+Also unlike NumPy, updates to the same location are nondeterministic:

 .. code-block:: shell

--- a/examples/cpp/tutorial.cpp
+++ b/examples/cpp/tutorial.cpp
@@ -14,14 +14,17 @@ void array_basics() {
  // Get the value out of it:
  auto s = x.item<float>();
  assert(s == 1.0);
+  (void)s;

  // Scalars have a size of 1:
-  size_t size = x.size();
+  int64_t size = x.size();
  assert(size == 1);
+  (void)size;

  // Scalars have 0 dimensions:
  int ndim = x.ndim();
  assert(ndim == 0);
+  (void)ndim;

  // The shape should be an empty vector:
  auto shape = x.shape();
@@ -30,6 +33,7 @@ void array_basics() {
  // The datatype should be float32:
  auto dtype = x.dtype();
  assert(dtype == mx::float32);
+  (void)dtype;

  // Specify the dtype when constructing the array:
  x = mx::array(1, mx::int32);
--- a/mlx/array.cpp
+++ b/mlx/array.cpp
@@ -44,11 +44,11 @@ std::vector<array> array::make_arrays(
    const std::shared_ptr<Primitive>& primitive,
    const std::vector<array>& inputs) {
  std::vector<array> outputs;
-  for (size_t i = 0; i < shapes.size(); ++i) {
+  for (int i = 0; i < std::ssize(shapes); ++i) {
    outputs.emplace_back(std::move(shapes[i]), dtypes[i], primitive, inputs);
  }
  // For each node in |outputs|, its siblings are the other nodes.
-  for (size_t i = 0; i < outputs.size(); ++i) {
+  for (int i = 0; i < std::ssize(outputs); ++i) {
    auto siblings = outputs;
    siblings.erase(siblings.begin() + i);
    outputs[i].set_siblings(std::move(siblings), i);
@@ -145,8 +145,9 @@ void array::set_data(allocator::Buffer buffer, Deleter d) {
  array_desc_->data_size = size();
  array_desc_->flags.contiguous = true;
  array_desc_->flags.row_contiguous = true;
-  auto max_dim = std::max_element(shape().begin(), shape().end());
-  array_desc_->flags.col_contiguous = size() <= 1 || size() == *max_dim;
+  auto max_dim = std::max_element(shape().begin(), shape().end());
+  array_desc_->flags.col_contiguous =
+      size() <= 1 || size() == static_cast<int64_t>(*max_dim);
 }

 void array::set_data(
@@ -192,7 +193,7 @@ array::~array() {
  }

  // Break circular reference for non-detached arrays with siblings
-  if (auto n = siblings().size(); n > 0) {
+  if (auto n = std::ssize(siblings()); n > 0) {
    bool do_detach = true;
    // If all siblings have siblings.size() references except
    // the one we are currently destroying (which has siblings.size() + 1)
@@ -241,8 +242,8 @@ array::ArrayDesc::ArrayDesc(
    std::vector<array> inputs)
    : shape(std::move(shape)),
      dtype(dtype),
-      status(Status::unscheduled),
      primitive(std::move(primitive)),
+      status(Status::unscheduled),
      inputs(std::move(inputs)) {
  init();
 }
@@ -274,7 +275,7 @@ array::ArrayDesc::~ArrayDesc() {
    ad.inputs.clear();
    for (auto& [_, a] : input_map) {
      bool is_deletable =
-          (a.array_desc_.use_count() <= a.siblings().size() + 1);
+          (a.array_desc_.use_count() <= std::ssize(a.siblings()) + 1);
      // An array with siblings is deletable only if all of its siblings
      // are deletable
      for (auto& s : a.siblings()) {
@@ -283,7 +284,7 @@ array::ArrayDesc::~ArrayDesc() {
        }
        int is_input = (input_map.find(s.id()) != input_map.end());
        is_deletable &=
-            s.array_desc_.use_count() <= a.siblings().size() + is_input;
+            s.array_desc_.use_count() <= std::ssize(a.siblings()) + is_input;
      }
      if (is_deletable) {
        for_deletion.push_back(std::move(a.array_desc_));
--- a/mlx/array.h
+++ b/mlx/array.h
@@ -81,22 +81,22 @@ class array {
  }

  /** The size of the array's datatype in bytes. */
-  size_t itemsize() const {
+  int itemsize() const {
    return size_of(dtype());
  }

  /** The number of elements in the array. */
-  size_t size() const {
+  int64_t size() const {
    return array_desc_->size;
  }

  /** The number of bytes in the array. */
-  size_t nbytes() const {
+  int64_t nbytes() const {
    return size() * itemsize();
  }

  /** The number of dimensions of the array. */
-  size_t ndim() const {
+  int ndim() const {
    return array_desc_->shape.size();
  }

@@ -329,7 +329,7 @@ class array {
   * corresponding to ``arr[-1, -1, ...]``) then ``data_size = last - first``.
   * Note, ``data_size`` is in units of ``item_size`` (not bytes).
   **/
-  size_t data_size() const {
+  int64_t data_size() const {
    return array_desc_->data_size;
  }

@@ -340,7 +340,7 @@ class array {
    return array_desc_->data->buffer;
  }

-  size_t buffer_size() const {
+  int64_t buffer_size() const {
    return allocator::allocator().size(buffer());
  }

@@ -530,7 +530,7 @@ array::array(
    Shape shape,
    Dtype dtype /* = TypeToDtype<T>() */)
    : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
-  if (data.size() != size()) {
+  if (std::ssize(data) != size()) {
    throw std::invalid_argument(
        "Data size and provided shape mismatch in array construction.");
  }
--- a/mlx/backend/common/common.cpp
+++ b/mlx/backend/common/common.cpp
@@ -21,8 +21,8 @@ void AsStrided::eval(const std::vector<array>& inputs, array& out) {

  // Compute the flags given the shape and strides
  bool row_contiguous = true, col_contiguous = true;
-  size_t r = 1, c = 1;
-  for (int i = strides_.size() - 1, j = 0; i >= 0; i--, j++) {
+  int64_t r = 1, c = 1;
+  for (int i = std::ssize(strides_) - 1, j = 0; i >= 0; i--, j++) {
    row_contiguous &= (r == strides_[i]) || (shape_[i] == 1);
    col_contiguous &= (c == strides_[j]) || (shape_[j] == 1);
    r *= shape_[i];
@@ -60,7 +60,8 @@ void CustomTransforms::eval(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() > outputs.size());
-  for (int i = 0, j = inputs.size() - outputs.size(); i < outputs.size();
+  for (int i = 0, j = std::ssize(inputs) - std::ssize(outputs);
+       i < std::ssize(outputs);
       i++, j++) {
    outputs[i].copy_shared_buffer(inputs[j]);
  }
@@ -70,7 +71,7 @@ void Depends::eval(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() > outputs.size());
-  for (int i = 0; i < outputs.size(); i++) {
+  for (int i = 0; i < std::ssize(outputs); i++) {
    outputs[i].copy_shared_buffer(inputs[i]);
  }
 }
@@ -206,11 +207,11 @@ void Split::eval(

  auto compute_new_flags = [](const auto& shape,
                              const auto& strides,
-                              size_t in_data_size,
+                              int64_t in_data_size,
                              auto flags) {
-    size_t data_size = 1;
-    size_t f_stride = 1;
-    size_t b_stride = 1;
+    int64_t data_size = 1;
+    int64_t f_stride = 1;
+    int64_t b_stride = 1;
    flags.row_contiguous = true;
    flags.col_contiguous = true;
    for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
@@ -240,7 +241,7 @@ void Split::eval(

  std::vector<int> indices(1, 0);
  indices.insert(indices.end(), indices_.begin(), indices_.end());
-  for (int i = 0; i < indices.size(); i++) {
+  for (int i = 0; i < std::ssize(indices); i++) {
    size_t offset = indices[i] * in.strides()[axis_];
    auto [new_flags, data_size] = compute_new_flags(
        outputs[i].shape(), in.strides(), in.data_size(), in.flags());
@@ -254,7 +255,7 @@ void Squeeze::eval(const std::vector<array>& inputs, array& out) {
  const auto& in = inputs[0];
  Strides strides;
  for (int i = 0, j = 0; i < in.ndim(); ++i) {
-    if (j < axes_.size() && i == axes_[j]) {
+    if (j < std::ssize(axes_) && i == axes_[j]) {
      j++;
    } else {
      strides.push_back(in.strides(i));
@@ -272,7 +273,7 @@ void Transpose::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  Strides out_strides(out.ndim());
  auto& in = inputs[0];
-  for (int ax = 0; ax < axes_.size(); ++ax) {
+  for (int ax = 0; ax < std::ssize(axes_); ++ax) {
    out_strides[ax] = in.strides()[axes_[ax]];
  }

--- a/mlx/backend/common/compiled.cpp
+++ b/mlx/backend/common/compiled.cpp
@@ -120,7 +120,7 @@ void compiled_allocate_outputs(
    Strides strides;
    size_t data_size;
    array::Flags flags;
-    for (int i = 0; i < inputs.size() && o < outputs.size(); ++i) {
+    for (int i = 0; i < std::ssize(inputs) && o < std::ssize(outputs); ++i) {
      auto& in = inputs[i];
      // Conditions for donation
      // - Correct size
@@ -138,7 +138,7 @@ void compiled_allocate_outputs(
        data_size = in.data_size();
      }
    }
-    for (; o < outputs.size(); ++o) {
+    for (; o < std::ssize(outputs); ++o) {
      outputs[o].set_data(
          allocator::malloc(data_size * outputs[o].itemsize()),
          data_size,
@@ -147,7 +147,7 @@ void compiled_allocate_outputs(
    }
  } else {
    int o = 0;
-    for (int i = 0; i < inputs.size() && o < outputs.size(); ++i) {
+    for (int i = 0; i < std::ssize(inputs) && o < std::ssize(outputs); ++i) {
      auto& in = inputs[i];
      // Conditions for donation
      // - Row contiguous
@@ -162,7 +162,7 @@ void compiled_allocate_outputs(
        o++;
      }
    }
-    for (; o < outputs.size(); ++o) {
+    for (; o < std::ssize(outputs); ++o) {
      outputs[o].set_data(allocator::malloc(outputs[o].nbytes()));
    }
  }
@@ -193,7 +193,7 @@ std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(

    // Broadcast the inputs to the output shape.
    Strides xstrides;
-    size_t j = 0;
+    int j = 0;
    for (; j < shape.size() - x.ndim(); ++j) {
      if (shape[j] == 1) {
        xstrides.push_back(out.strides()[j]);
@@ -201,7 +201,7 @@ std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
        xstrides.push_back(0);
      }
    }
-    for (size_t i = 0; i < x.ndim(); ++i, ++j) {
+    for (int i = 0; i < x.ndim(); ++i, ++j) {
      if (x.shape(i) == 1) {
        if (shape[j] == 1) {
          xstrides.push_back(out.strides()[j]);
@@ -224,13 +224,13 @@ bool compiled_use_large_index(
    const std::vector<array>& outputs,
    bool contiguous) {
  if (contiguous) {
-    size_t max_size = 0;
+    int64_t max_size = 0;
    for (const auto& in : inputs) {
      max_size = std::max(max_size, in.data_size());
    }
    return max_size > UINT32_MAX;
  } else {
-    size_t max_size = 0;
+    int64_t max_size = 0;
    for (const auto& o : outputs) {
      max_size = std::max(max_size, o.size());
    }
--- a/mlx/backend/common/load.cpp
+++ b/mlx/backend/common/load.cpp
@@ -27,7 +27,7 @@ void swap_endianness(uint8_t* data_bytes, size_t N) {

 namespace mlx::core {

-void Load::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Load::eval_cpu(const std::vector<array>& /* inputs */, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));
  auto read_task = [out_ptr = out.data<char>(),
                    size = out.size(),
--- a/mlx/backend/common/matmul.h
+++ b/mlx/backend/common/matmul.h
@@ -13,7 +13,7 @@ inline std::tuple<Shape, Strides, Strides> collapse_batches(
    const array& a,
    const array& b) {
  if (a.ndim() == 2) {
-    return {{1}, {0}, {0}};
+    return {Shape{1}, Strides{0}, Strides{0}};
  }

  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
@@ -38,7 +38,7 @@ inline std::tuple<Shape, Strides, Strides> collapse_batches(
 inline std::tuple<Shape, Strides, Strides, Strides>
 collapse_batches(const array& a, const array& b, const array& c) {
  if (a.ndim() == 2) {
-    return {{1}, {0}, {0}, {0}};
+    return {Shape{1}, Strides{0}, Strides{0}, Strides{0}};
  }

  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
--- a/mlx/backend/common/reduce.cpp
+++ b/mlx/backend/common/reduce.cpp
@@ -28,7 +28,7 @@ std::pair<Shape, Strides> shapes_without_reduction_axes(

 ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
  // The data is all there and we are reducing over everything
-  if (x.size() == x.data_size() && axes.size() == x.ndim() &&
+  if (x.size() == x.data_size() && std::ssize(axes) == x.ndim() &&
      x.flags().contiguous) {
    return ContiguousAllReduce;
  }
@@ -38,7 +38,7 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
    // Merge consecutive axes
    Shape shape = {x.shape(axes[0])};
    Strides strides = {x.strides()[axes[0]]};
-    for (int i = 1; i < axes.size(); i++) {
+    for (int i = 1; i < std::ssize(axes); i++) {
      if (axes[i] - 1 == axes[i - 1] && x.shape(axes[i]) > 1) {
        shape.back() *= x.shape(axes[i]);
        strides.back() = x.strides()[axes[i]];
--- a/mlx/backend/common/slicing.cpp
+++ b/mlx/backend/common/slicing.cpp
@@ -24,8 +24,8 @@ std::tuple<int64_t, Strides> prepare_slice(
 void shared_buffer_slice(
    const array& in,
    const Strides& out_strides,
-    size_t data_offset,
-    size_t data_size,
+    int64_t data_offset,
+    int64_t data_size,
    array& out) {
  // Compute row/col contiguity
  auto [no_bsx_size, is_row_contiguous, is_col_contiguous] =
@@ -61,7 +61,7 @@ void slice(
  if (data_end < 0) {
    data_end += in.data_size();
  }
-  size_t data_size = (data_end - data_offset);
+  int64_t data_size = (data_end - data_offset);
  shared_buffer_slice(in, inp_strides, data_offset, data_size, out);
 }

--- a/mlx/backend/common/ternary.h
+++ b/mlx/backend/common/ternary.h
@@ -11,6 +11,8 @@ namespace mlx::core {
 enum class TernaryOpType {
  ScalarScalarScalar,
  VectorVectorVector,
+  VectorVectorScalar,
+  VectorScalarVector,
  General,
 };

@@ -25,6 +27,14 @@ get_ternary_op_type(const array& a, const array& b, const array& c) {
      (a.flags().col_contiguous && b.flags().col_contiguous &&
       c.flags().col_contiguous)) {
    topt = TernaryOpType::VectorVectorVector;
+  } else if (
+      b.data_size() == 1 && a.flags().row_contiguous &&
+      c.flags().row_contiguous) {
+    topt = TernaryOpType::VectorScalarVector;
+  } else if (
+      c.data_size() == 1 && a.flags().row_contiguous &&
+      b.flags().row_contiguous) {
+    topt = TernaryOpType::VectorVectorScalar;
  } else {
    topt = TernaryOpType::General;
  }
@@ -59,6 +69,8 @@ inline void set_ternary_op_output_data(
            b.flags());
      }
      break;
+    case TernaryOpType::VectorVectorScalar:
+    case TernaryOpType::VectorScalarVector:
    case TernaryOpType::General:
      // Try to donate an input which is row_contiguous
      if (!((a.flags().row_contiguous && maybe_donate(a)) ||
--- a/mlx/backend/common/utils.cpp
+++ b/mlx/backend/common/utils.cpp
@@ -28,7 +28,7 @@ std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
    if (shape[0] != 1) {
      to_collapse.push_back(0);
    }
-    size_t size = shape[0];
+    int64_t size = shape[0];
    for (int i = 1; i < shape.size(); i++) {
      bool contiguous = true;
      size *= shape[i];
@@ -64,7 +64,7 @@ std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
      current_shape *= shape[to_collapse[k]];
    }
    out_shape.push_back(current_shape);
-    for (int j = 0; j < strides.size(); j++) {
+    for (int j = 0; j < std::ssize(strides); j++) {
      const auto& st = strides[j];
      out_strides[j].push_back(st[to_collapse[k - 1]]);
    }
@@ -228,31 +228,4 @@ std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2) {
      std::make_tuple(gx, gy, gz), std::make_tuple(bx, by, bz));
 }

-array swapaxes_in_eval(const array& x, int axis1, int axis2) {
-  int ndim = x.ndim();
-  if (axis1 < 0) {
-    axis1 += ndim;
-  }
-  if (axis2 < 0) {
-    axis2 += ndim;
-  }
-
-  auto shape = x.shape();
-  std::swap(shape[axis1], shape[axis2]);
-  auto strides = x.strides();
-  std::swap(strides[axis1], strides[axis2]);
-
-  auto [data_size, row_contiguous, col_contiguous] =
-      check_contiguity(shape, strides);
-  bool contiguous = data_size == x.data_size();
-
-  array out(std::move(shape), x.dtype(), nullptr, {});
-  out.copy_shared_buffer(
-      x,
-      std::move(strides),
-      {contiguous, row_contiguous, col_contiguous},
-      x.data_size());
-  return out;
-}
-
 } // namespace mlx::core
--- a/mlx/backend/common/utils.h
+++ b/mlx/backend/common/utils.h
@@ -162,7 +162,7 @@ struct ContiguousIterator {
 };

 inline auto check_contiguity(const Shape& shape, const Strides& strides) {
-  size_t no_broadcast_data_size = 1;
+  int64_t no_broadcast_data_size = 1;
  int64_t f_stride = 1;
  int64_t b_stride = 1;
  bool is_row_contiguous = true;
@@ -183,7 +183,7 @@ inline auto check_contiguity(const Shape& shape, const Strides& strides) {
 }

 inline bool is_donatable(const array& in, const array& out) {
-  constexpr size_t donation_extra = 16384;
+  constexpr int64_t donation_extra = 16384;

  return in.is_donatable() && in.itemsize() == out.itemsize() &&
      in.buffer_size() <= out.nbytes() + donation_extra;
@@ -196,9 +196,6 @@ void shared_buffer_reshape(
    const Strides& out_strides,
    array& out);

-// Like the swapaxes op but safe to call in eval_gpu.
-array swapaxes_in_eval(const array& x, int axis1, int axis2);
-
 template <typename T>
 inline SmallVector<T> remove_index(SmallVector<T> vec, size_t index) {
  vec.erase(std::next(vec.begin(), index));
--- a/mlx/backend/cpu/arange.h
+++ b/mlx/backend/cpu/arange.h
@@ -10,7 +10,7 @@ namespace mlx::core {
 namespace {

 template <typename T>
-void arange(T start, T next, array& out, size_t size, Stream stream) {
+void arange(T start, T next, array& out, int64_t size, Stream stream) {
  auto ptr = out.data<T>();
  auto step_size = next - start;
  auto& encoder = cpu::get_command_encoder(stream);
--- a/mlx/backend/cpu/arg_reduce.cpp
+++ b/mlx/backend/cpu/arg_reduce.cpp
@@ -19,12 +19,12 @@ void arg_reduce(const array& in, array& out, const OpT& op, int axis) {
  auto in_ptr = in.data<InT>();
  auto out_ptr = out.data<uint32_t>();

-  for (uint32_t i = 0; i < out.size(); ++i) {
+  for (int64_t i = 0; i < out.size(); ++i) {
    auto loc = elem_to_loc(i, shape, strides);
    auto local_in_ptr = in_ptr + loc;
    uint32_t ind_v = 0;
    InT v = (*local_in_ptr);
-    for (uint32_t j = 0; j < axis_size; ++j, local_in_ptr += axis_stride) {
+    for (int64_t j = 0; j < axis_size; ++j, local_in_ptr += axis_stride) {
      op(j, (*local_in_ptr), &ind_v, &v);
    }
    out_ptr[i] = ind_v;
--- a/mlx/backend/cpu/binary.cpp
+++ b/mlx/backend/cpu/binary.cpp
@@ -17,7 +17,12 @@ namespace mlx::core {
 namespace {

 template <typename Op>
-void binary(const array& a, const array& b, array& out, Op op, Stream stream) {
+void binary(
+    const array& a,
+    const array& b,
+    array& out,
+    Op /* op */,
+    Stream stream) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);

@@ -81,7 +86,7 @@ void comparison_op(
    const array& a,
    const array& b,
    array& out,
-    Op op,
+    Op /* op */,
    Stream stream) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);
@@ -146,7 +151,7 @@ void binary_float(
    const array& a,
    const array& b,
    array& out,
-    Op op,
+    Op /* op */,
    Stream stream) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);
@@ -187,7 +192,7 @@ void binary_int(
    const array& a,
    const array& b,
    array& out,
-    Op op,
+    Op /* op */,
    Stream stream) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);
--- a/mlx/backend/cpu/binary_two.h
+++ b/mlx/backend/cpu/binary_two.h
@@ -99,7 +99,7 @@ void binary_op_dispatch_dims(
  ContiguousIterator a_it(shape, a_strides, ndim - 2);
  ContiguousIterator b_it(shape, b_strides, ndim - 2);
  auto stride = out_strides[ndim - 3];
-  for (size_t elem = 0; elem < a.size(); elem += stride) {
+  for (int64_t elem = 0; elem < a.size(); elem += stride) {
    binary_op_dims<T, U, Op, 2>(
        a_ptr + a_it.loc,
        b_ptr + b_it.loc,
@@ -137,21 +137,21 @@ void binary_op(
  if (bopt == BinaryOpType::ScalarScalar) {
    std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
  } else if (bopt == BinaryOpType::ScalarVector) {
-    for (size_t i = 0; i < b.data_size(); ++i) {
+    for (int64_t i = 0; i < b.data_size(); ++i) {
      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
      out_a_ptr++;
      out_b_ptr++;
      b_ptr++;
    }
  } else if (bopt == BinaryOpType::VectorScalar) {
-    for (size_t i = 0; i < a.data_size(); ++i) {
+    for (int64_t i = 0; i < a.data_size(); ++i) {
      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
      out_a_ptr++;
      out_b_ptr++;
      a_ptr++;
    }
  } else { // VectorVector
-    for (size_t i = 0; i < a.size(); ++i) {
+    for (int64_t i = 0; i < a.size(); ++i) {
      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
      out_a_ptr++;
      out_b_ptr++;
--- a/mlx/backend/cpu/cholesky.cpp
+++ b/mlx/backend/cpu/cholesky.cpp
@@ -33,8 +33,8 @@ void cholesky_impl(const array& a, array& factor, bool upper, Stream stream) {
                    N = a.shape(-1),
                    size = a.size()]() mutable {
    char uplo = (upper) ? 'L' : 'U';
-    size_t num_matrices = size / (N * N);
-    for (int i = 0; i < num_matrices; i++) {
+    int64_t num_matrices = size / (N * N);
+    for (int64_t i = 0; i < num_matrices; i++) {
      // Compute Cholesky factorization.
      int info;
      potrf<T>(
--- a/mlx/backend/cpu/compiled.cpp
+++ b/mlx/backend/cpu/compiled.cpp
@@ -15,6 +15,7 @@
 #include "mlx/backend/cpu/jit_compiler.h"
 #include "mlx/device.h"
 #include "mlx/graph_utils.h"
+#include "mlx/version.h"

 namespace mlx::core {

@@ -48,7 +49,7 @@ static CompilerCache& cache() {
 // GPU compile is always available if the GPU is available and since we are in
 // this file CPU compile is also available.
 namespace detail {
-bool compile_available_for_device(const Device& device) {
+bool compile_available_for_device(const Device& /* device */) {
  return true;
 }

@@ -94,7 +95,11 @@ void* compile(
    kernel_file_name = kernel_name;
  }

-  auto output_dir = std::filesystem::temp_directory_path();
+  auto output_dir =
+      std::filesystem::temp_directory_path() / "mlx" / version() / "cpu";
+  if (!std::filesystem::exists(output_dir)) {
+    std::filesystem::create_directories(output_dir);
+  }

  std::string shared_lib_name = "lib" + kernel_file_name + ".so";
  auto shared_lib_path = (output_dir / shared_lib_name).string();
@@ -157,11 +162,13 @@ inline void build_kernel(
 #endif

  // Start the kernel
-  os << "void " << kernel_name << "(void** args) {" << std::endl;
+  os << "void " << kernel_name
+     << "(int* shape, int64_t** strides, void** args) {" << std::endl;

  // Add the input arguments
  int cnt = 0;
-  for (size_t i = 0; i < inputs.size(); ++i) {
+  int strides_index = 1;
+  for (int i = 0; i < std::ssize(inputs); ++i) {
    // Skip constants from the input list
    if (is_constant(i)) {
      continue;
@@ -175,8 +182,8 @@ inline void build_kernel(
       << "];" << std::endl;
    // Scalars and contiguous need no strides
    if (!is_scalar(x) && !contiguous) {
-      os << "  const size_t* " << xname << "_strides = (size_t*)args[" << cnt++
-         << "];" << std::endl;
+      os << "  const int64_t* " << xname << "_strides = strides["
+         << strides_index++ << "];" << std::endl;
    }
  }

@@ -186,10 +193,8 @@ inline void build_kernel(
    os << "  " << tstr << "* " << namer.get_name(x) << " = (" << tstr
       << "*)args[" << cnt++ << "];" << std::endl;
  }
-  // Add output strides and shape to extract the indices.
-  if (!contiguous) {
-    os << "  const int* shape = (int*)args[" << cnt++ << "];" << std::endl;
-  } else {
+  // Add output size
+  if (contiguous) {
    os << "  const size_t size = (size_t)args[" << cnt++ << "];" << std::endl;
  }

@@ -233,7 +238,7 @@ inline void build_kernel(
    } else {
      os << x.primitive().name();
      os << "()(";
-      for (int i = 0; i < x.inputs().size() - 1; i++) {
+      for (int i = 0; i < std::ssize(x.inputs()) - 1; i++) {
        os << "tmp_" << namer.get_name(x.inputs()[i]) << ", ";
      }
      os << "tmp_" << namer.get_name(x.inputs().back()) << ");" << std::endl;
@@ -288,17 +293,8 @@ void Compiled::eval_cpu(
  auto [contiguous, shape, strides] =
      compiled_collapse_contiguous_dims(inputs, outputs[0], is_constant_);

-  // Force allocating shape/strides on heap so we can take their data() first
-  // and then std::move them.
-  // TODO: Refactor code to avoid heap allocation.
-  shape.grow();
-  for (auto& s : strides) {
-    s.grow();
-  }
-
  // Collect function input arguments.
  std::vector<void*> args;
-  int strides_index = 1;
  for (size_t i = 0; i < inputs.size(); ++i) {
    if (is_constant_(i)) {
      continue;
@@ -306,9 +302,6 @@ void Compiled::eval_cpu(
    const auto& x = inputs[i];
    encoder.set_input_array(x);
    args.push_back((void*)x.data<void>());
-    if (!contiguous && !is_scalar(x)) {
-      args.push_back(strides[strides_index++].data());
-    }
  }

  // Get the kernel name from the lib
@@ -343,16 +336,20 @@ void Compiled::eval_cpu(
    args.push_back(x.data<void>());
    encoder.set_output_array(x);
  }
-  if (!contiguous) {
-    args.push_back((void*)shape.data());
-  } else {
+  if (contiguous) {
    args.push_back((void*)outputs[0].data_size());
  }
-  auto fun = (void (*)(void**))fn_ptr;
+  auto fun = reinterpret_cast<void (*)(int*, int64_t**, void**)>(fn_ptr);
  encoder.dispatch([fun,
                    args = std::move(args),
                    strides = std::move(strides),
-                    shape = std::move(shape)]() mutable { fun(args.data()); });
+                    shape = std::move(shape)]() mutable {
+    SmallVector<int64_t*> strides_ptrs;
+    for (auto& s : strides) {
+      strides_ptrs.push_back(s.data());
+    }
+    fun(shape.data(), strides_ptrs.data(), args.data());
+  });
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/conv.cpp
+++ b/mlx/backend/cpu/conv.cpp
@@ -860,7 +860,7 @@ void explicit_gemm_conv_1D_cpu(
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
-    const std::vector<int>& wt_dilation,
+    const std::vector<int>& /* wt_dilation */,
    Stream stream) {
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
  const int iH = in.shape(1); // Input spatial dim
@@ -996,131 +996,6 @@ void explicit_gemm_conv_1D_cpu(
  encoder.add_temporaries(std::move(temps));
 }

-void explicit_gemm_conv_2D_cpu(
-    const array& in,
-    const array& wt,
-    array out,
-    const std::vector<int>& padding_lo,
-    const std::vector<int>& padding_hi,
-    const std::vector<int>& wt_strides,
-    const std::vector<int>& wt_dilation,
-    Stream stream) {
-  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
-  const int iH = in.shape(1); // Input spatial dim
-  const int iW = in.shape(2); // Input spatial dim
-  const int oH = out.shape(1); // Output spatial dim
-  const int oW = out.shape(2); // Output spatial dim
-  const int O = wt.shape(0); // Out channels
-  const int C = wt.shape(3); // In channels
-  const int wH = wt.shape(1); // Weight spatial dim
-  const int wW = wt.shape(2); // Weight spatial dim
-
-  auto conv_dtype = out.dtype();
-  auto& encoder = cpu::get_command_encoder(stream);
-
-  // Pad input
-  Shape padded_shape = {
-      N,
-      iH + padding_lo[0] + padding_hi[0],
-      iW + padding_lo[1] + padding_hi[1],
-      C};
-  array in_padded(padded_shape, conv_dtype, nullptr, {});
-
-  // Fill with zeros
-  std::vector<array> temps;
-  temps.push_back(array(0, conv_dtype));
-  copy_cpu(temps.back(), in_padded, CopyType::Scalar, stream);
-
-  // Pick input slice from padded
-  size_t data_offset = padding_lo[0] * in_padded.strides()[1] +
-      padding_lo[1] * in_padded.strides()[2];
-  array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
-  in_padded_slice.copy_shared_buffer(
-      in_padded,
-      in_padded.strides(),
-      in_padded.flags(),
-      in_padded_slice.size(),
-      data_offset);
-  temps.push_back(in_padded_slice);
-
-  // Copy input values into the slice
-  copy_cpu_inplace(in, in_padded_slice, CopyType::GeneralGeneral, stream);
-
-  // Make strided view
-  Shape strided_shape = {N, oH, oW, wH, wW, C};
-
-  Strides strided_strides = {
-      in_padded.strides()[0],
-      in_padded.strides()[1] * wt_strides[0],
-      in_padded.strides()[2] * wt_strides[1],
-      in_padded.strides()[1],
-      in_padded.strides()[2],
-      in_padded.strides()[3]};
-  auto flags = in_padded.flags();
-
-  array in_strided_view(strided_shape, in_padded.dtype(), nullptr, {});
-  in_strided_view.copy_shared_buffer(
-      in_padded, strided_strides, flags, in_strided_view.size(), 0);
-
-  // Materialize strided view
-  Shape strided_reshape = {N * oH * oW, wH * wW * C};
-  array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
-  copy_cpu(in_strided_view, in_strided, CopyType::General, stream);
-  temps.push_back(in_strided);
-
-  // Check wt dtype and prepare
-  auto gemm_wt = wt;
-  auto gemm_out = out;
-
-  if (wt.dtype() != float32 || !wt.flags().row_contiguous) {
-    auto ctype =
-        wt.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-    gemm_wt = array(wt.shape(), float32, nullptr, {});
-    copy_cpu(wt, gemm_wt, ctype, stream);
-    temps.push_back(gemm_wt);
-  }
-
-  if (out.dtype() != float32) {
-    gemm_out = array(out.shape(), float32, nullptr, {});
-    gemm_out.set_data(allocator::malloc(gemm_out.nbytes()));
-    temps.push_back(gemm_out);
-  }
-
-  encoder.set_input_array(in_strided);
-  encoder.set_input_array(gemm_wt);
-  encoder.set_output_array(gemm_out);
-
-  encoder.dispatch([in_strided_ptr = in_strided.data<float>(),
-                    gemm_wt_ptr = gemm_wt.data<float>(),
-                    gemm_out_ptr = gemm_out.data<float>(),
-                    strided_reshape = std::move(strided_reshape),
-                    O]() {
-    // Perform gemm
-    cblas_sgemm(
-        CblasRowMajor,
-        CblasNoTrans, // no trans A
-        CblasTrans, // transB
-        strided_reshape[0], // M
-        O, // N
-        strided_reshape[1], // K
-        1.0f, // alpha
-        in_strided_ptr,
-        strided_reshape[1], // lda
-        gemm_wt_ptr,
-        strided_reshape[1], // ldb
-        0.0f, // beta
-        gemm_out_ptr,
-        O // ldc
-    );
-  });
-
-  // Copy results if needed
-  if (out.dtype() != float32) {
-    copy_cpu_inplace(gemm_out, out, CopyType::Vector, stream);
-  }
-  encoder.add_temporaries(std::move(temps));
-}
-
 void explicit_gemm_conv_ND_cpu(
    const array& in,
    const array& wt,
@@ -1128,7 +1003,7 @@ void explicit_gemm_conv_ND_cpu(
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
-    const std::vector<int>& wt_dilation,
+    const std::vector<int>& /* wt_dilation */,
    const bool flip,
    Stream stream) {
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
@@ -1148,7 +1023,7 @@ void explicit_gemm_conv_ND_cpu(
  // Pad input
  Shape padded_shape(in.shape().size());
  padded_shape.front() = N;
-  for (size_t i = 0; i < iDim.size(); i++) {
+  for (int i = 0; i < iDim.size(); i++) {
    padded_shape[i + 1] = iDim[i] + padding_lo[i] + padding_hi[i];
  }
  padded_shape.back() = C;
@@ -1179,20 +1054,20 @@ void explicit_gemm_conv_ND_cpu(
  // Make strided view
  Shape strided_shape(oDim.size() + wDim.size() + 2);
  strided_shape.front() = N;
-  for (size_t i = 0; i < oDim.size(); i++) {
+  for (int i = 0; i < oDim.size(); i++) {
    strided_shape[i + 1] = oDim[i];
  }
-  for (size_t i = 0; i < wDim.size(); i++) {
+  for (int i = 0; i < wDim.size(); i++) {
    strided_shape[i + 1 + oDim.size()] = wDim[i];
  }
  strided_shape.back() = C;

  Strides strided_strides(in.shape().size() * 2 - 2);
  strided_strides[0] = in_padded.strides()[0];
-  for (size_t i = 0; i < wt_strides.size(); i++) {
+  for (int i = 0; i < std::ssize(wt_strides); i++) {
    strided_strides[i + 1] = in_padded.strides()[i + 1] * wt_strides[i];
  }
-  for (size_t i = 1; i < in_padded.strides().size(); i++) {
+  for (int i = 1; i < std::ssize(in_padded.strides()); i++) {
    strided_strides[i + wt_strides.size()] = in_padded.strides()[i];
  }

--- a/mlx/backend/cpu/distributed.cpp
+++ b/mlx/backend/cpu/distributed.cpp
@@ -90,6 +90,7 @@ void Recv::eval_cpu(
    std::vector<array>& outputs) {
  assert(inputs.size() == 0);
  assert(outputs.size() == 1);
+  (void)inputs;

  outputs[0].set_data(allocator::malloc(outputs[0].nbytes()));
  distributed::detail::recv(group(), outputs[0], src_, stream());
--- a/mlx/backend/cpu/eig.cpp
+++ b/mlx/backend/cpu/eig.cpp
@@ -46,7 +46,6 @@ void eig_impl(
    int info;
    {
      T work;
-      int iwork;
      geev<T>(
          &jobl,
          &jobr,
@@ -71,7 +70,7 @@ void eig_impl(
    auto eig_tmp = static_cast<T*>(eig_tmp_data.buffer.raw_ptr());
    auto vec_tmp = static_cast<T*>(vec_tmp_data.buffer.raw_ptr());
    auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
-    for (size_t i = 0; i < size / (N * N); ++i) {
+    for (int64_t i = 0; i < size / (N * N); ++i) {
      geev<T>(
          &jobl,
          &jobr,
--- a/mlx/backend/cpu/eigh.cpp
+++ b/mlx/backend/cpu/eigh.cpp
@@ -165,7 +165,7 @@ void eigh_impl(
    EighWork<T> work(jobz, uplo, N);

    // Work loop
-    for (size_t i = 0; i < size / (N * N); ++i) {
+    for (int64_t i = 0; i < size / (N * N); ++i) {
      work.run(vec_ptr, eig_ptr);
      vec_ptr += N * N;
      eig_ptr += N;
--- a/mlx/backend/cpu/encoder.h
+++ b/mlx/backend/cpu/encoder.h
@@ -20,8 +20,8 @@ struct CommandEncoder {
  CommandEncoder(CommandEncoder&&) = delete;
  CommandEncoder& operator=(CommandEncoder&&) = delete;

-  void set_input_array(const array& a) {}
-  void set_output_array(array& a) {}
+  void set_input_array(const array& /* a */) {}
+  void set_output_array(array& /* a */) {}

  // Hold onto a temporary until any already scheduled tasks which use it as
  // an input are complete.
--- a/mlx/backend/cpu/gemm.h
+++ b/mlx/backend/cpu/gemm.h
@@ -12,12 +12,12 @@ void matmul(
    T* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
--- a/mlx/backend/cpu/gemms/bnns.cpp
+++ b/mlx/backend/cpu/gemms/bnns.cpp
@@ -1,5 +1,4 @@
 // Copyright © 2023-2024 Apple Inc.
-
 #include <Accelerate/Accelerate.h>

 #include "mlx/array.h"
@@ -35,7 +34,7 @@ void matmul_bnns(
    bool b_transposed,
    size_t lda,
    size_t ldb,
-    size_t ldc,
+    size_t /* ldc */,
    float alpha,
    float beta,
    size_t batch_size,
@@ -49,9 +48,15 @@ void matmul_bnns(
  size_t K = a_shape[ndim - 1];

  BNNSDataType bnns_dtype = to_bnns_dtype<T>();
-
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+  if (beta != 1.0 && beta != 0.0) {
+    // scale the output
+    for (size_t i = 0; i < batch_size * M * N; ++i) {
+      out[i] *= beta;
+    }
+    beta = 1.0;
+  }
  const BNNSLayerParametersBroadcastMatMul gemm_params{
      /* float alpha = */ alpha,
      /* float beta = */ beta,
@@ -122,7 +127,7 @@ void matmul_bnns(
  auto bnns_filter =
      BNNSFilterCreateLayerBroadcastMatMul(&gemm_params, nullptr);

-  for (int i = 0; i < batch_size; ++i) {
+  for (size_t i = 0; i < batch_size; ++i) {
    BNNSFilterApplyTwoInput(
        bnns_filter,
        reinterpret_cast<const uint8_t*>(
@@ -143,12 +148,12 @@ void matmul<float16_t>(
    float16_t* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
@@ -178,12 +183,12 @@ void matmul<bfloat16_t>(
    bfloat16_t* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
--- a/mlx/backend/cpu/gemms/cblas.cpp
+++ b/mlx/backend/cpu/gemms/cblas.cpp
@@ -13,20 +13,20 @@ void matmul<float>(
    float* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  auto ndim = a_shape.size();
-  size_t M = a_shape[ndim - 2];
-  size_t N = b_shape[ndim - 1];
-  size_t K = a_shape[ndim - 1];
+  int64_t M = a_shape[ndim - 2];
+  int64_t N = b_shape[ndim - 1];
+  int64_t K = a_shape[ndim - 1];

  for (int i = 0; i < batch_size; ++i) {
    cblas_sgemm(
@@ -54,20 +54,20 @@ void matmul<double>(
    double* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    size_t ldc,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
    float alpha,
    float beta,
-    size_t batch_size,
+    int64_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  auto ndim = a_shape.size();
-  size_t M = a_shape[ndim - 2];
-  size_t N = b_shape[ndim - 1];
-  size_t K = a_shape[ndim - 1];
+  int64_t M = a_shape[ndim - 2];
+  int64_t N = b_shape[ndim - 1];
+  int64_t K = a_shape[ndim - 1];

  for (int i = 0; i < batch_size; ++i) {
    cblas_dgemm(
@@ -88,4 +88,47 @@ void matmul<double>(
  }
 }

+template <>
+void matmul<complex64_t>( // batched complex GEMM through cblas_cgemm
+    const complex64_t* a,
+    const complex64_t* b,
+    complex64_t* out,
+    bool a_transposed,
+    bool b_transposed,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    float alpha,
+    float beta,
+    int64_t batch_size,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides) {
+  const int64_t rows = a_shape[a_shape.size() - 2];
+  const int64_t cols = b_shape[a_shape.size() - 1];
+  const int64_t inner = a_shape[a_shape.size() - 1];
+  auto alpha_c = static_cast<complex64_t>(alpha); // cgemm takes its scalars
+  auto beta_c = static_cast<complex64_t>(beta); // by pointer, as complex
+  const auto op_a = a_transposed ? CblasTrans : CblasNoTrans;
+  const auto op_b = b_transposed ? CblasTrans : CblasNoTrans;
+  for (int batch = 0; batch < batch_size; ++batch) {
+    cblas_cgemm(
+        CblasRowMajor,
+        op_a, // transA
+        op_b, // transB
+        rows, // M
+        cols, // N
+        inner, // K
+        &alpha_c,
+        a + elem_to_loc(rows * inner * batch, a_shape, a_strides),
+        lda,
+        b + elem_to_loc(inner * cols * batch, b_shape, b_strides),
+        ldb,
+        &beta_c,
+        out + rows * cols * batch, // output matrices are indexed densely
+        ldc);
+  }
+}
+
 } // namespace mlx::core
--- a/mlx/backend/cpu/hadamard.cpp
+++ b/mlx/backend/cpu/hadamard.cpp
@@ -11,9 +11,9 @@ namespace mlx::core {

 // n = 2^k component
 template <typename T>
-void hadamard_n(T* out, int n, int m, float scale, size_t size) {
+void hadamard_n(T* out, int n, int /* m */, float scale, int64_t size) {
  for (int b = 0; b < size / n; b++) {
-    size_t loc = b * n;
+    int64_t loc = b * n;
    T* data_ptr = out + loc;
    int h = 1;
    int n_over_2 = n / 2;
@@ -37,7 +37,7 @@ void hadamard_n(T* out, int n, int m, float scale, size_t size) {

 // m component
 template <typename T>
-void hadamard_m(T* out, int n, int m, float scale, size_t size) {
+void hadamard_m(T* out, int n, int m, float scale, int64_t size) {
  auto h_matrices = hadamard_matrices();
  auto& matrix = h_matrices[m];
  auto start = 1;
@@ -45,7 +45,7 @@ void hadamard_m(T* out, int n, int m, float scale, size_t size) {
  std::vector<bool> hmat_vec;
  while (end != std::string_view::npos) {
    auto row = matrix.substr(start, end - start);
-    for (int i = 0; i < row.length(); i++) {
+    for (int i = 0; i < std::ssize(row); i++) {
      hmat_vec.push_back(row[i] == '+');
    }
    start = end + 1;
@@ -53,7 +53,7 @@ void hadamard_m(T* out, int n, int m, float scale, size_t size) {
  }

  for (int b = 0; b < size / m / n; b++) {
-    size_t loc = b * n * m;
+    int64_t loc = b * n * m;
    T* data_ptr = out + loc;
    for (int i = 0; i < n; i++) {
      std::vector<float> out(m);
--- a/mlx/backend/cpu/indexing.cpp
+++ b/mlx/backend/cpu/indexing.cpp
@@ -78,7 +78,7 @@ void gather(
    can_copy = true;

    // Ignore leading 1s
-    int i = 0;
+    int64_t i = 0;
    for (; i < slice_sizes.size() && slice_sizes[i] == 1; ++i)
      ;

@@ -91,7 +91,7 @@ void gather(
    can_copy = true;

    // Ignore trailing 1s
-    int i = slice_sizes.size() - 1;
+    int64_t i = slice_sizes.size() - 1;
    for (; i >= 0 && slice_sizes[i] == 1; --i)
      ;

@@ -101,11 +101,11 @@ void gather(
      can_copy = (src.shape(i) == slice_sizes[i]);
    }
  }
-  size_t slice_size = 1;
+  int64_t slice_size = 1;
  for (auto s : slice_sizes) {
    slice_size *= s;
  }
-  size_t ind_size = slice_size == 0 ? 0 : out.size() / slice_size;
+  int64_t ind_size = slice_size == 0 ? 0 : out.size() / slice_size;
  const T* src_ptr = src.data<T>();
  T* dst_ptr = out.data<T>();

@@ -115,10 +115,10 @@ void gather(
    src_it = ContiguousIterator(slice_sizes, src.strides(), src.ndim());
  }

-  size_t out_idx = 0;
-  for (int idx = 0; idx < ind_size; idx++) {
-    size_t src_idx = 0;
-    for (int ii = 0; ii < inds.size(); ++ii) {
+  int64_t out_idx = 0;
+  for (int64_t idx = 0; idx < ind_size; idx++) {
+    int64_t src_idx = 0;
+    for (int ii = 0; ii < std::ssize(inds); ++ii) {
      auto ax = axes[ii];
      auto idx_loc = its[ii].loc;
      its[ii].step();
@@ -134,7 +134,7 @@ void gather(
          src_ptr + src_idx, src_ptr + src_idx + slice_size, dst_ptr + out_idx);
      out_idx += slice_size;
    } else {
-      for (int jj = 0; jj < slice_size; jj++) {
+      for (int64_t jj = 0; jj < slice_size; jj++) {
        dst_ptr[out_idx++] = src_ptr[src_idx + src_it.loc];
        src_it.step();
      }
@@ -403,11 +403,11 @@ void scatter(
    const std::vector<int>& axes) {
  int nind = inds.size();
  auto inds_ndim = updates.ndim() - out.ndim();
-  size_t n_updates = nind ? inds[0].size() : 1;
+  int64_t n_updates = nind ? inds[0].size() : 1;

  Shape update_shape(
      updates.shape().begin() + inds_ndim, updates.shape().end());
-  size_t update_size = 1;
+  int64_t update_size = 1;
  for (auto us : update_shape) {
    update_size *= us;
  }
@@ -418,9 +418,9 @@ void scatter(

  auto out_ptr = out.data<InT>();
  auto upd_ptr = updates.data<InT>();
-  for (int i = 0; i < n_updates; ++i) {
-    size_t out_offset = 0;
-    for (int j = 0; j < inds.size(); ++j) {
+  for (int64_t i = 0; i < n_updates; ++i) {
+    int64_t out_offset = 0;
+    for (int j = 0; j < std::ssize(inds); ++j) {
      auto ax = axes[j];
      auto idx_loc = its[j].loc;
      its[j].step();
@@ -429,7 +429,7 @@ void scatter(
      out_offset += (idx_val * out.strides()[ax]);
    }
    update_it.seek(i * update_size);
-    for (int j = 0; j < update_size; ++j) {
+    for (int64_t j = 0; j < update_size; ++j) {
      OpT{}(upd_ptr[update_it.loc], out_ptr + out_offset + out_it.loc);
      update_it.step();
      out_it.step();
--- a/mlx/backend/cpu/inverse.cpp
+++ b/mlx/backend/cpu/inverse.cpp
@@ -122,7 +122,7 @@ void inverse_impl(
      stream);

  const int N = a.shape(-1);
-  const size_t num_matrices = a.size() / (N * N);
+  const int64_t num_matrices = a.size() / (N * N);

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_output_array(inv);
@@ -130,13 +130,13 @@ void inverse_impl(
  auto inv_ptr = inv.data<T>();
  if (tri) {
    encoder.dispatch([inv_ptr, N, num_matrices, upper]() {
-      for (int i = 0; i < num_matrices; i++) {
+      for (int64_t i = 0; i < num_matrices; i++) {
        tri_inv<T>(inv_ptr + N * N * i, N, upper);
      }
    });
  } else {
    encoder.dispatch([inv_ptr, N, num_matrices]() {
-      for (int i = 0; i < num_matrices; i++) {
+      for (int64_t i = 0; i < num_matrices; i++) {
        general_inv<T>(inv_ptr + N * N * i, N);
      }
    });
--- a/mlx/backend/cpu/lapack.h
+++ b/mlx/backend/cpu/lapack.h
@@ -47,7 +47,7 @@ INSTANTIATE_LAPACK_REAL(orgqr)
 INSTANTIATE_LAPACK_REAL(syevd)
 INSTANTIATE_LAPACK_REAL(geev)
 INSTANTIATE_LAPACK_REAL(potrf)
-INSTANTIATE_LAPACK_REAL(gesvdx)
+INSTANTIATE_LAPACK_REAL(gesdd)
 INSTANTIATE_LAPACK_REAL(getrf)
 INSTANTIATE_LAPACK_REAL(getri)
 INSTANTIATE_LAPACK_REAL(trtri)
--- a/mlx/backend/cpu/masked_mm.cpp
+++ b/mlx/backend/cpu/masked_mm.cpp
@@ -25,7 +25,7 @@ inline void mask_matrix(
    const int64_t Y_data_str,
    const int64_t X_mask_str,
    const int64_t Y_mask_str,
-    const size_t mask_offset) {
+    const int64_t mask_offset) {
  int tX = (X + block_size - 1) / block_size;
  int tY = (Y + block_size - 1) / block_size;

@@ -61,13 +61,13 @@ inline void segmented_mm(
    T* out,
    bool a_transposed,
    bool b_transposed,
-    size_t lda,
-    size_t ldb,
+    int64_t lda,
+    int64_t ldb,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides,
-    size_t num_segments,
+    int64_t num_segments,
    const Shape& segments_shape,
    const Strides& segments_strides) {
  int ndim = a_shape.size();
@@ -149,9 +149,9 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto [b_transposed, ldb, b, b_copied] =
      check_transpose(b_pre, has_op_mask, inputs.back().dtype() != bool_);

-  size_t M = a.shape(-2);
-  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
+  int64_t M = a.shape(-2);
+  int64_t N = b.shape(-1);
+  int64_t K = a.shape(-1);

  if (M == 0 || N == 0) {
    return;
@@ -172,8 +172,8 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
                       int batch_idx,
                       int X,
                       int Y,
-                       size_t X_data_str,
-                       size_t Y_data_str,
+                       int64_t X_data_str,
+                       int64_t Y_data_str,
                       const Shape& mask_shape,
                       const Strides& mask_strides,
                       bool is_bool) {
@@ -215,18 +215,18 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {

  encoder.set_input_array(a);
  encoder.set_input_array(b);
-  const void* a_mask_ptr;
-  const void* b_mask_ptr;
-  const void* out_mask_ptr;
+  const void* a_mask_ptr = nullptr;
+  const void* b_mask_ptr = nullptr;
+  const void* out_mask_ptr = nullptr;
  Shape a_mask_shape;
  Shape b_mask_shape;
  Shape out_mask_shape;
  Strides a_mask_strides;
  Strides b_mask_strides;
  Strides out_mask_strides;
-  bool a_mask_bool;
-  bool b_mask_bool;
-  bool out_mask_bool;
+  bool a_mask_bool = false;
+  bool b_mask_bool = false;
+  bool out_mask_bool = false;
  if (has_op_mask) {
    auto& a_mask = inputs[inputs.size() - 2];
    auto& b_mask = inputs[inputs.size() - 1];
@@ -253,7 +253,7 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto a_ptr = a.data<float>();
  auto b_ptr = b.data<float>();
  auto out_ptr = out.data<float>();
-  size_t num_matrices = out.size() / (M * size_t(N));
+  int64_t num_matrices = out.size() / (M * int64_t(N));
  auto ldc = out.shape(-1);

  encoder.dispatch([a_ptr,
@@ -394,9 +394,9 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto [a_transposed, lda, a] = check_transpose(a_pre);
  auto [b_transposed, ldb, b] = check_transpose(b_pre);

-  size_t M = a.shape(-2);
-  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
+  int64_t M = a.shape(-2);
+  int64_t N = b.shape(-1);
+  int64_t K = a.shape(-1);

  if (M == 0 || N == 0) {
    return;
@@ -413,7 +413,7 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {

  // Get batch dims
  auto batch_size_out = out.size() / (M * N);
-  size_t matrix_stride_out = M * N;
+  int64_t matrix_stride_out = M * N;

  auto get_batch_dims = [](const auto& v) {
    return decltype(v){v.begin(), v.end() - 2};
@@ -423,7 +423,6 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& rhs_indices = inputs[3];

  auto batch_shape = get_batch_dims(out.shape());
-  int batch_ndim = batch_shape.size();

  auto batch_shape_A = get_batch_dims(a.shape());
  auto batch_strides_A = get_batch_dims(a.strides());
--- a/mlx/backend/cpu/matmul.cpp
+++ b/mlx/backend/cpu/matmul.cpp
@@ -91,7 +91,6 @@ void matmul_general(
  auto [b_transposed, ldb, b] = check_transpose(b_pre);
  size_t M = a.shape(-2);
  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
  if (M == 0 || N == 0) {
    return;
  }
@@ -108,6 +107,9 @@ void matmul_general(
  } else if (out.dtype() == float64) {
    matmul_dispatch<double>(
        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
+  } else if (out.dtype() == complex64) {
+    matmul_dispatch<complex64_t>(
+        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
  } else {
    throw std::runtime_error("[Matmul::eval_cpu] Invalid type.");
  }
@@ -128,10 +130,6 @@ void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
 }

 void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
-  if (out.dtype() != float32) {
-    throw std::runtime_error(
-        "[AddMM::eval_cpu] Currently only supports float32.");
-  }
  if (out.size() == 0) {
    out.set_data(allocator::malloc(out.nbytes()));
    return;
--- a/mlx/backend/cpu/primitives.cpp
+++ b/mlx/backend/cpu/primitives.cpp
@@ -48,7 +48,7 @@ static std::pair<array, bool> compute_dynamic_offset(
  auto compute_offset =
      [strides, axes, offset = offset.data<int64_t>()](const auto* indices) {
        int64_t offset_ = 0;
-        for (int i = 0; i < axes.size(); ++i) {
+        for (int i = 0; i < std::ssize(axes); ++i) {
          offset_ += indices[i] * strides[axes[i]];
        }
        offset[0] = offset_;
@@ -124,6 +124,7 @@ void Transpose::eval_cpu(const std::vector<array>& inputs, array& out) {

 void Arange::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 0);
+  (void)inputs;
  out.set_data(allocator::malloc(out.nbytes()));
  switch (out.dtype()) {
    case bool_:
@@ -193,9 +194,9 @@ void Concatenate::eval_cpu(const std::vector<array>& inputs, array& out) {
  flags.row_contiguous = false;
  flags.col_contiguous = false;
  flags.contiguous = false;
-  for (int i = 0; i < inputs.size(); i++) {
+  for (int i = 0; i < std::ssize(inputs); i++) {
    array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
-    size_t data_offset = strides[axis_] * sizes[i];
+    int64_t data_offset = strides[axis_] * sizes[i];
    out_slice.copy_shared_buffer(
        out, strides, flags, out_slice.size(), data_offset);
    copy_cpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, stream());
@@ -205,7 +206,7 @@ void Concatenate::eval_cpu(const std::vector<array>& inputs, array& out) {
 void Contiguous::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
-  constexpr size_t extra_bytes = 16384;
+  constexpr int64_t extra_bytes = 16384;
  if (in.buffer_size() <= out.nbytes() + extra_bytes &&
      (in.flags().row_contiguous ||
       (allow_col_major_ && in.flags().col_contiguous))) {
@@ -254,8 +255,8 @@ void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
  copy_cpu(val, out, CopyType::Scalar, stream());

  // Find offset for start of input values
-  size_t data_offset = 0;
-  for (int i = 0; i < axes_.size(); i++) {
+  int64_t data_offset = 0;
+  for (int i = 0; i < std::ssize(axes_); i++) {
    auto ax = axes_[i] < 0 ? out.ndim() + axes_[i] : axes_[i];
    data_offset += out.strides()[ax] * low_pad_size_[i];
  }
@@ -274,10 +275,10 @@ void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
  // keys has shape (N1, ..., NK, 2)
  // out has shape (N1, ..., NK, M1, M2, ...)
  auto& keys = inputs[0];
-  size_t num_keys = keys.size() / 2;
+  int64_t num_keys = keys.size() / 2;

-  size_t elems_per_key = out.size() / num_keys;
-  size_t bytes_per_key = out.itemsize() * elems_per_key;
+  int64_t elems_per_key = out.size() / num_keys;
+  int64_t bytes_per_key = out.itemsize() * elems_per_key;
  out.set_data(allocator::malloc(out.nbytes()));

  auto kptr = inputs[0].data<uint32_t>();
@@ -291,8 +292,8 @@ void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
                    num_keys,
                    kshape = keys.shape(),
                    kstrides = keys.strides()]() mutable {
-    size_t out_skip = (bytes_per_key + 4 - 1) / 4;
-    auto half_size = out_skip / 2;
+    int64_t out_skip = (bytes_per_key + 4 - 1) / 4;
+    uintptr_t half_size = out_skip / 2;
    bool even = out_skip % 2 == 0;
    for (int i = 0; i < num_keys; ++i, cptr += bytes_per_key) {
      auto ptr = reinterpret_cast<uint32_t*>(cptr);
--- a/mlx/backend/cpu/qrf.cpp
+++ b/mlx/backend/cpu/qrf.cpp
@@ -13,7 +13,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
  const int M = a.shape(-2);
  const int N = a.shape(-1);
  const int lda = M;
-  size_t num_matrices = a.size() / (M * N);
+  int64_t num_matrices = a.size() / (M * N);

  // Copy A to inplace input and make it col-contiguous
  array in(a.shape(), a.dtype(), nullptr, {});
@@ -54,7 +54,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
    auto work = allocator::malloc(sizeof(T) * lwork);

    // Loop over matrices
-    for (int i = 0; i < num_matrices; ++i) {
+    for (int64_t i = 0; i < num_matrices; ++i) {
      // Solve
      geqrf<T>(
          &M,
@@ -68,7 +68,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
    }
    allocator::free(work);

-    for (int i = 0; i < num_matrices; ++i) {
+    for (int64_t i = 0; i < num_matrices; ++i) {
      /// num_reflectors x N
      for (int j = 0; j < num_reflectors; ++j) {
        for (int k = 0; k < j; ++k) {
@@ -97,7 +97,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
    work = allocator::malloc(sizeof(T) * lwork);

    // Loop over matrices
-    for (int i = 0; i < num_matrices; ++i) {
+    for (int64_t i = 0; i < num_matrices; ++i) {
      // Compute Q
      orgqr<T>(
          &M,
@@ -111,7 +111,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
          &info);
    }

-    for (int i = 0; i < num_matrices; ++i) {
+    for (int64_t i = 0; i < num_matrices; ++i) {
      // M x num_reflectors
      for (int j = 0; j < M; ++j) {
        for (int k = 0; k < num_reflectors; ++k) {
--- a/mlx/backend/cpu/quantized.cpp
+++ b/mlx/backend/cpu/quantized.cpp
@@ -1,7 +1,5 @@
 // Copyright © 2023 Apple Inc.

-#include <cassert>
-
 #include "mlx/backend/cpu/copy.h"
 #include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/simd/simd.h"
@@ -13,6 +11,35 @@ namespace mlx::core {

 namespace {

+const static float MXFP4_LUT[16] = { // value of each 4-bit weight code
+    +0.0f,
+    +0.5f,
+    +1.0f,
+    +1.5f,
+    +2.0f,
+    +3.0f,
+    +4.0f,
+    +6.0f,
+    -0.0f, // codes 8..15 are codes 0..7 with the sign flipped
+    -0.5f,
+    -1.0f,
+    -1.5f,
+    -2.0f,
+    -3.0f,
+    -4.0f,
+    -6.0f};
+
+template <typename T>
+static inline T dequantize_scale(uint8_t s) { // decode a one-byte scale to T
+  using FOrI = union { // same 16 bits viewed as bfloat16_t or as raw uint16_t
+    bfloat16_t f;
+    uint16_t i;
+  };
+  FOrI out; // NOTE(review): presumably an E8M0 scale, 2^(s - 127) — confirm
+  out.i = (s == 0 ? 0x40 : (static_cast<uint16_t>(s) << 7)); // raw bit pattern
+  return static_cast<T>(out.f); // reinterpret the bits, then convert to T
+}
+
 inline constexpr short get_pack_factor(int bits, int wsize = 8) {
  return (bits == 3 || bits == 5) ? 8 : (bits == 6 ? 4 : wsize / bits);
 }
@@ -407,6 +434,229 @@ void _qmm_dispatch(
  }
 }

+template <typename T>
+void mxfp4_qmm( // result (M rows of N) = x (M rows of K) times 4-bit w
+    T* result,
+    const T* x,
+    const uint32_t* w, // read byte-wise below: two 4-bit codes per byte
+    const uint8_t* scales, // one scale byte per group of 32 weights
+    int M,
+    int N,
+    int K) {
+  constexpr int group_size = 32;
+  constexpr int pack_factor = get_pack_factor(4, 8); // 8 / 4 = 2 codes per byte
+  constexpr int packs_in_group = group_size / pack_factor; // bytes per group
+
+  for (int m = 0; m < M; m++) {
+    const uint8_t* w_local = (const uint8_t*)w; // weights restart for each row
+    const uint8_t* scales_local = scales;
+
+    std::fill(result, result + N, 0); // the row is accumulated in place below
+
+    for (int k = 0; k < K; k++) {
+      T* result_local = result;
+      T xi = *x++;
+
+      for (int n = 0; n < N; n += group_size) { // assumes N % 32 == 0 — TODO confirm
+        T scale = dequantize_scale<T>(*scales_local++);
+        for (int ng = 0; ng < packs_in_group; ng++) {
+          uint8_t wi = *w_local++;
+#pragma clang loop unroll(full)
+          for (int p = 0; p < pack_factor; p++) {
+            (*result_local++) +=
+                xi * scale * static_cast<T>(MXFP4_LUT[wi & 0xf]);
+            wi >>= 4; // the low nibble is consumed first
+          }
+        }
+      }
+    }
+
+    result += N; // advance to the next output row
+  }
+}
+
+template <typename T>
+void mxfp4_qmm_t( // scalar kernel: each output is a dot product over K
+    T* result,
+    const T* x,
+    const uint32_t* w, // read byte-wise below: two 4-bit codes per byte
+    const uint8_t* scales, // one scale byte per group of 32 weights
+    int M,
+    int N,
+    int K) {
+  constexpr int group_size = 32;
+  constexpr int pack_factor = get_pack_factor(4, 8); // 8 / 4 = 2 codes per byte
+  constexpr int packs_in_group = group_size / pack_factor; // bytes per group
+
+  for (int m = 0; m < M; m++) {
+    const uint8_t* w_local = (const uint8_t*)w; // weights restart for each row
+    const uint8_t* scales_local = scales;
+
+    for (int n = 0; n < N; n++) {
+      const T* x_local = x; // re-read the same K inputs for every output n
+      T sum = 0;
+      for (int k = 0; k < K; k += group_size) { // assumes K % 32 == 0 — TODO confirm
+        T scale = dequantize_scale<T>(*scales_local++);
+
+        T gsum = 0; // unscaled partial sum for this group
+        for (int kw = 0; kw < packs_in_group; kw++) {
+          uint8_t wi = *w_local++;
+#pragma clang loop unroll(full)
+          for (int p = 0; p < pack_factor; p++) {
+            gsum += (*x_local++) * static_cast<T>(MXFP4_LUT[wi & 0xf]);
+            wi >>= 4; // the low nibble is consumed first
+          }
+        }
+        sum += scale * gsum; // the scale is applied once per group
+      }
+      *result = sum;
+      result++;
+    }
+
+    x += K; // advance to the next input row
+  }
+}
+
+template <int S>
+simd::Simd<float, S> mxfp4_extract_bits_simd(const uint32_t* w) { // word -> floats
+  if constexpr (S == 8) {
+    constexpr std::array<uint32_t, 8> shifts_ = {{0, 4, 8, 12, 16, 20, 24, 28}};
+    auto shifts(*(simd::Simd<uint32_t, S>*)&shifts_);
+    auto wi = simd::Simd<uint32_t, S>(*w); // presumably broadcasts *w to all lanes
+    wi = wi >> shifts; // lane i is shifted right by 4 * i bits
+    wi = wi & 0xf; // keep one 4-bit code per lane
+    simd::Simd<float, S> w_out;
+    for (int i = 0; i < S; ++i) {
+      w_out[i] = MXFP4_LUT[wi[i]]; // table lookup done lane by lane
+    }
+    return w_out;
+  } else {
+    // Appease compiler.. but should never get here
+    throw std::runtime_error("Unsupported combination for simd qmm.");
+  }
+}
+
+template <typename T>
+void mxfp4_qmm_t_simd( // SIMD kernel: each output is a dot product over K
+    T* result,
+    const T* x,
+    const uint32_t* w, // consumed one 32-bit word (8 codes) at a time
+    const uint8_t* scales, // one scale byte per group of 32 weights
+    int M,
+    int N,
+    int K) {
+  constexpr int group_size = 32;
+  constexpr int pack_factor = 32 / 4; // 4-bit codes per 32-bit word
+  constexpr int packs_in_group = group_size / pack_factor; // words per group
+  constexpr int S = simd::max_size<T>;
+  static_assert(
+      S % pack_factor == 0, "SIMD size must be divisible by pack factor");
+  constexpr int packs_per_simd = S / pack_factor; // words consumed per SIMD step
+
+  for (int m = 0; m < M; m++) {
+    const uint32_t* w_local = w; // weights restart for each row
+    const uint8_t* scales_local = scales;
+
+    for (int n = 0; n < N; n++) {
+      simd::Simd<float, S> acc(0); // accumulation is done in float lanes
+      auto x_local = x; // re-read the same K inputs for every output n
+      for (int k = 0; k < K; k += group_size) { // assumes K % 32 == 0 — TODO confirm
+        T scale = dequantize_scale<T>(*scales_local++);
+
+        simd::Simd<float, S> g_acc(0); // unscaled partial sums for this group
+        for (int kw = 0; kw < packs_in_group; kw += packs_per_simd) {
+          // Extract bits
+          auto wf = mxfp4_extract_bits_simd<S>(w_local);
+          w_local += packs_per_simd;
+          simd::Simd<float, S> x_simd = simd::load<T, S>(x_local);
+          g_acc = g_acc + x_simd * wf;
+          x_local += S;
+        }
+        acc = acc + scale * g_acc; // the scale is applied once per group
+      }
+
+      *result = T(simd::sum(acc)); // reduce the lanes, then convert back to T
+      result++;
+    }
+    x += K; // advance to the next input row
+  }
+}
+
+template <typename T>
+void mxfp4_qmm_dispatch_transpose( // choose the kernel for the weight layout
+    T* result,
+    const T* x,
+    const uint32_t* w,
+    const uint8_t* scales,
+    int M,
+    int N,
+    int K,
+    bool transposed_w) {
+  if (!transposed_w) {
+    mxfp4_qmm<T>(result, x, w, scales, M, N, K);
+    return;
+  }
+  // The SIMD kernel needs a lane count that is a multiple of 8 codes per word.
+  if constexpr (simd::max_size<T> % 8 != 0) {
+    mxfp4_qmm_t<T>(result, x, w, scales, M, N, K);
+  } else {
+    mxfp4_qmm_t_simd<T>(result, x, w, scales, M, N, K);
+  }
+}
+
+template <typename T>
+void mxfp4_qmm_dispatch_typed( // run the mxfp4 kernel once per batch matrix
+    array& out,
+    const array& x,
+    const array& w,
+    const array& scales,
+    bool transposed_w) {
+  int K = x.shape(-1);
+  int M = x.ndim() > 1 ? x.shape(-2) : 1; // a 1-D x is handled as a single row
+  int N = out.shape(-1);
+  int64_t w_els = w.ndim() > 2 ? int64_t(w.shape(-1)) * w.shape(-2) : 0;
+  int64_t g_els = w.ndim() > 2 ? int64_t(scales.shape(-1)) * scales.shape(-2) : 0;
+  int64_t batch_size = x.size() / (int64_t(K) * M);
+  // Sizes and offsets are 64-bit: i * M * N and friends can exceed int range.
+  auto out_ptr = out.data<T>();
+  auto x_ptr = x.data<T>();
+  auto w_ptr = w.data<uint32_t>();
+  auto scales_ptr = scales.data<uint8_t>();
+  for (int64_t i = 0; i < batch_size; i++) {
+    mxfp4_qmm_dispatch_transpose<T>(
+        out_ptr + i * M * N, // output matrices are indexed densely
+        x_ptr + elem_to_loc(i * M * K, x.shape(), x.strides()),
+        w_ptr + elem_to_loc(i * w_els, w.shape(), w.strides()), // 0 if w.ndim() <= 2
+        scales_ptr + elem_to_loc(i * g_els, scales.shape(), scales.strides()),
+        M,
+        N,
+        K,
+        transposed_w);
+  }
+}
+
+void mxfp4_qmm_dispatch(
+    array& out,
+    const array& x,
+    const array& w,
+    const array& scales,
+    bool transposed_w) {
+  // Pick the kernel instantiation that matches the dtype of x.
+  const auto x_type = x.dtype();
+  if (x_type == float32) {
+    mxfp4_qmm_dispatch_typed<float>(out, x, w, scales, transposed_w);
+  } else if (x_type == float16) {
+    mxfp4_qmm_dispatch_typed<float16_t>(out, x, w, scales, transposed_w);
+  } else if (x_type == bfloat16) {
+    mxfp4_qmm_dispatch_typed<bfloat16_t>(out, x, w, scales, transposed_w);
+  } else {
+    // Every other dtype is rejected.
+    throw std::invalid_argument(
+        "[quantized_matmul] only floating types are supported");
+  }
+}
+
+
 template <typename T>
 void _bs_qmm_dispatch_typed(
    array& out,
@@ -513,115 +763,198 @@ void _bs_qmm_dispatch(
  }
 }

+// Gathered mxfp4 matmul: step e multiplies the x and w batches picked by
+// lhs_indices[e] and rhs_indices[e] into the e-th M x N block of out.
+template <typename T>
+void mxfp4_bs_qmm_dispatch_typed(
+    array& out,
+    const array& x,
+    const array& w,
+    const array& scales,
+    const array& lhs_indices,
+    const array& rhs_indices,
+    bool transposed_w) {
+  const int K = x.shape(-1);
+  const int M = x.shape(-2);
+  const int N = out.shape(-1);
+  const int w_els = w.shape(-1) * w.shape(-2);
+  const int g_els = scales.shape(-1) * scales.shape(-2);
+  auto out_ptr = out.data<T>();
+  auto x_ptr = x.data<T>();
+  auto w_ptr = w.data<uint32_t>();
+  auto scales_ptr = scales.data<uint8_t>();
+  auto lhs_indices_ptr = lhs_indices.data<uint32_t>();
+  auto rhs_indices_ptr = rhs_indices.data<uint32_t>();
+
+  for (int e = 0; e < lhs_indices.size(); ++e) {
+    const int x_batch = lhs_indices_ptr[elem_to_loc(
+        e, lhs_indices.shape(), lhs_indices.strides())];
+    const int w_batch = rhs_indices_ptr[elem_to_loc(
+        e, rhs_indices.shape(), rhs_indices.strides())];
+    mxfp4_qmm_dispatch_transpose<T>(
+        out_ptr + e * M * N,
+        x_ptr + elem_to_loc(x_batch * M * K, x.shape(), x.strides()),
+        w_ptr + elem_to_loc(w_batch * w_els, w.shape(), w.strides()),
+        scales_ptr +
+            elem_to_loc(w_batch * g_els, scales.shape(), scales.strides()),
+        M,
+        N,
+        K,
+        transposed_w);
+  }
+}
+
+// Entry point for the gathered (lhs_indices / rhs_indices) mxfp4 matmul.
+// Forwards to the implementation typed on the dtype of x; any dtype other
+// than float32, float16 or bfloat16 raises std::invalid_argument.
+void mxfp4_bs_qmm_dispatch(
+    array& out,
+    const array& x,
+    const array& w,
+    const array& scales,
+    const array& lhs_indices,
+    const array& rhs_indices,
+    bool transposed_w) {
+  const auto dtype = x.dtype();
+  if (dtype == float32) {
+    mxfp4_bs_qmm_dispatch_typed<float>(
+        out, x, w, scales, lhs_indices, rhs_indices, transposed_w);
+  } else if (dtype == float16) {
+    mxfp4_bs_qmm_dispatch_typed<float16_t>(
+        out, x, w, scales, lhs_indices, rhs_indices, transposed_w);
+  } else if (dtype == bfloat16) {
+    mxfp4_bs_qmm_dispatch_typed<bfloat16_t>(
+        out, x, w, scales, lhs_indices, rhs_indices, transposed_w);
+  } else {
+    throw std::invalid_argument(
+        "[quantized_matmul] only floating types are supported");
+  }
+}
+
 } // namespace

 void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 4);
-
  auto& x_pre = inputs[0];
  auto& w_pre = inputs[1];
  auto& scales_pre = inputs[2];
-  auto& biases_pre = inputs[3];

-  std::vector<array> temps;
-  auto ensure_row_contiguous = [s = stream(), &temps](const array& arr) {
+  auto& encoder = cpu::get_command_encoder(stream());
+  auto ensure_row_contiguous = [s = stream(), &encoder](const array& arr) {
    if (arr.flags().row_contiguous) {
      return arr;
    } else {
-      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy_cpu(arr, temps.back(), CopyType::General, s);
-      return temps.back();
+      auto arr_cpy = array(arr.shape(), arr.dtype(), nullptr, {});
+      copy_cpu(arr, arr_cpy, CopyType::General, s);
+      encoder.add_temporary(arr_cpy);
+      return arr_cpy;
    }
  };

  auto x = ensure_row_contiguous(x_pre);
  auto w = ensure_row_contiguous(w_pre);
  auto scales = ensure_row_contiguous(scales_pre);
-  auto biases = ensure_row_contiguous(biases_pre);

  out.set_data(allocator::malloc(out.nbytes()));

-  auto& encoder = cpu::get_command_encoder(stream());
-  encoder.add_temporaries(std::move(temps));
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(scales);
-  encoder.set_input_array(biases);
  encoder.set_output_array(out);
-  encoder.dispatch([out = array::unsafe_weak_copy(out),
-                    x = array::unsafe_weak_copy(x),
-                    w = array::unsafe_weak_copy(w),
-                    scales = array::unsafe_weak_copy(scales),
-                    biases = array::unsafe_weak_copy(biases),
-                    group_size_ = group_size_,
-                    bits_ = bits_,
-                    transpose_ = transpose_]() mutable {
-    _qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
-  });
+  if (mode_ == QuantizationMode::Affine) {
+    auto biases = ensure_row_contiguous(inputs[3]);
+    encoder.set_input_array(biases);
+    encoder.dispatch([out = array::unsafe_weak_copy(out),
+                      x = array::unsafe_weak_copy(x),
+                      w = array::unsafe_weak_copy(w),
+                      scales = array::unsafe_weak_copy(scales),
+                      biases = array::unsafe_weak_copy(biases),
+                      group_size_ = group_size_,
+                      bits_ = bits_,
+                      transpose_ = transpose_]() mutable {
+      _qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
+    });
+  } else {
+    encoder.dispatch([out = array::unsafe_weak_copy(out),
+                      x = array::unsafe_weak_copy(x),
+                      w = array::unsafe_weak_copy(w),
+                      scales = array::unsafe_weak_copy(scales),
+                      transpose_ = transpose_]() mutable {
+      mxfp4_qmm_dispatch(out, x, w, scales, transpose_);
+    });
+  }
 }

 void GatherQMM::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 6);
-
  auto& x_pre = inputs[0];
  auto& w_pre = inputs[1];
  auto& scales_pre = inputs[2];
-  auto& biases_pre = inputs[3];
-  auto& lhs_indices = inputs[4];
-  auto& rhs_indices = inputs[5];
+  auto& lhs_indices = inputs[inputs.size() - 2];
+  auto& rhs_indices = inputs[inputs.size() - 1];

-  std::vector<array> temps;
+  auto& encoder = cpu::get_command_encoder(stream());
  auto ensure_row_contiguous_last_dims = [s = stream(),
-                                          &temps](const array& arr) {
+                                          &encoder](const array& arr) {
    auto stride_0 = arr.strides()[arr.ndim() - 2];
    auto stride_1 = arr.strides()[arr.ndim() - 1];
    if (stride_0 == arr.shape(-1) && stride_1 == 1) {
      return arr;
    } else {
-      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy_cpu(arr, temps.back(), CopyType::General, s);
-      return temps.back();
+      auto arr_cpy = array(arr.shape(), arr.dtype(), nullptr, {});
+      copy_cpu(arr, arr_cpy, CopyType::General, s);
+      encoder.add_temporary(arr_cpy);
+      return arr_cpy;
    }
  };

  auto x = ensure_row_contiguous_last_dims(x_pre);
  auto w = ensure_row_contiguous_last_dims(w_pre);
  auto scales = ensure_row_contiguous_last_dims(scales_pre);
-  auto biases = ensure_row_contiguous_last_dims(biases_pre);

  out.set_data(allocator::malloc(out.nbytes()));

-  auto& encoder = cpu::get_command_encoder(stream());
-  encoder.add_temporaries(std::move(temps));
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(scales);
-  encoder.set_input_array(biases);
  encoder.set_input_array(lhs_indices);
  encoder.set_input_array(rhs_indices);
  encoder.set_output_array(out);
-  encoder.dispatch([out = array::unsafe_weak_copy(out),
-                    x = array::unsafe_weak_copy(x),
-                    w = array::unsafe_weak_copy(w),
-                    scales = array::unsafe_weak_copy(scales),
-                    biases = array::unsafe_weak_copy(biases),
-                    lhs_indices = array::unsafe_weak_copy(lhs_indices),
-                    rhs_indices = array::unsafe_weak_copy(rhs_indices),
-                    group_size_ = group_size_,
-                    bits_ = bits_,
-                    transpose_ = transpose_]() mutable {
-    _bs_qmm_dispatch(
-        out,
-        x,
-        w,
-        scales,
-        biases,
-        lhs_indices,
-        rhs_indices,
-        group_size_,
-        bits_,
-        transpose_);
-  });
+  if (mode_ == QuantizationMode::Affine) {
+    auto biases = ensure_row_contiguous_last_dims(inputs[3]);
+    encoder.set_input_array(biases);
+    encoder.dispatch([out = array::unsafe_weak_copy(out),
+                      x = array::unsafe_weak_copy(x),
+                      w = array::unsafe_weak_copy(w),
+                      scales = array::unsafe_weak_copy(scales),
+                      biases = array::unsafe_weak_copy(biases),
+                      lhs_indices = array::unsafe_weak_copy(lhs_indices),
+                      rhs_indices = array::unsafe_weak_copy(rhs_indices),
+                      group_size_ = group_size_,
+                      bits_ = bits_,
+                      transpose_ = transpose_]() mutable {
+      _bs_qmm_dispatch(
+          out,
+          x,
+          w,
+          scales,
+          biases,
+          lhs_indices,
+          rhs_indices,
+          group_size_,
+          bits_,
+          transpose_);
+    });
+  } else {
+    encoder.dispatch([out = array::unsafe_weak_copy(out),
+                      x = array::unsafe_weak_copy(x),
+                      w = array::unsafe_weak_copy(w),
+                      scales = array::unsafe_weak_copy(scales),
+                      lhs_indices = array::unsafe_weak_copy(lhs_indices),
+                      rhs_indices = array::unsafe_weak_copy(rhs_indices),
+                      transpose_ = transpose_]() mutable {
+      mxfp4_bs_qmm_dispatch(
+          out, x, w, scales, lhs_indices, rhs_indices, transpose_);
+    });
+  }
 }

 template <typename T, typename U>
@@ -705,7 +1038,7 @@ void dispatch_quantize(
      w_ptr, out_ptr, scales_ptr, biases_ptr, bits, group_size, w.size());
 }

-void fast::AffineQuantize::eval_cpu(
+void fast::Quantize::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto ensure_row_contiguous = [s = stream()](const array& arr) {
@@ -764,7 +1097,7 @@ void fast::AffineQuantize::eval_cpu(
      }
    } else {
      throw std::runtime_error(
-          "[fast::AffineQuantize::eval_cpu] Only supports floating point inputs");
+          "[fast::Quantize::eval_cpu] Only supports floating point inputs");
    }
  });
 }
--- a/mlx/backend/cpu/simd/accelerate_simd.h
+++ b/mlx/backend/cpu/simd/accelerate_simd.h
@@ -9,7 +9,7 @@

 #include "mlx/backend/cpu/simd/base_simd.h"

-// There seems to be a bug in sims/base.h
+// There seems to be a bug in simd/base_simd.h
 // __XROS_2_0 is not defined, the expression evaluates
 // to true instead of false setting the SIMD library
 // higher than it should be even on macOS < 15
@@ -234,6 +234,7 @@ Simd<T, N> remainder(Simd<T, N> a, Simd<T, N> b) {

 template <typename MaskT, typename T1, typename T2, int N>
 Simd<T1, N> select(Simd<MaskT, N> mask, Simd<T1, N> x, Simd<T2, N> y) {
+  static_assert(std::is_same_v<MaskT, bool>);
  if constexpr (sizeof(T1) == 1) {
    return asd::bitselect(y.value, x.value, asd::convert<char>(mask.value));
  } else if constexpr (sizeof(T1) == 2) {
@@ -251,9 +252,13 @@ Simd<T, N> pow(Simd<T, N> base, Simd<T, N> exp) {
    return asd::pow(base.value, exp.value);
  } else {
    Simd<T, N> res = 1;
-    while (any(exp)) {
-      res = select(exp & 1, res * base, res);
-      base = select(exp, base * base, base);
+    // Raising an integer to a negative power is undefined
+    if (any(exp < static_cast<T>(0))) {
+      return 0;
+    }
+    while (any(exp > static_cast<T>(0))) {
+      res = select((exp & 1) != 0, res * base, res);
+      base = select(exp > static_cast<T>(0), base * base, base);
      exp = exp >> 1;
    }
    return res;
--- a/mlx/backend/cpu/simd/math.h
+++ b/mlx/backend/cpu/simd/math.h
@@ -79,7 +79,8 @@ Simd<T, N> sincos(Simd<T, N> in) {

  // Get the polynom selection mask. There is one polynom for 0 <= x <= Pi/4
  // and another one for Pi/4<x<=Pi/2. Both branches will be computed.
-  auto poly_mask = (emm2 & 2) != 0;
+  auto poly_mask =
+      (emm2 & static_cast<uint32_t>(2)) != static_cast<uint32_t>(0);

  // The magic pass: "Extended precision modular arithmetic"
  // x = ((x - y * DP1) - y * DP2) - y * DP3
@@ -87,8 +88,8 @@ Simd<T, N> sincos(Simd<T, N> in) {
  x = fma(y, Simd<float, N>(-2.4187564849853515625e-4f), x);
  x = fma(y, Simd<float, N>(-3.77489497744594108e-8f), x);

-  sign_mask_sin = sign_mask_sin ^ ((emm2 & 4) != 0);
-  auto sign_mask_cos = ((emm2 - 2) & 4) != 0;
+  sign_mask_sin = sign_mask_sin ^ ((emm2 & 4) != static_cast<uint32_t>(0));
+  auto sign_mask_cos = ((emm2 - 2) & 4) != static_cast<uint32_t>(0);

  // Evaluate the first polynom  (0 <= x <= Pi/4) in y1,
  // and the second polynom      (Pi/4 <= x <= 0) in y2
--- a/mlx/backend/cpu/sort.cpp
+++ b/mlx/backend/cpu/sort.cpp
@@ -15,6 +15,18 @@ namespace mlx::core {

 namespace {

+// Less-than comparator that orders every NaN after all non-NaN values
+template <typename T>
+bool nan_aware_less(T a, T b) {
+  constexpr bool may_be_nan =
+      std::is_floating_point_v<T> || std::is_same_v<T, complex64_t>;
+  if constexpr (may_be_nan) {
+    if (std::isnan(a) || std::isnan(b))
+      return !std::isnan(a);
+  }
+  return a < b;
+}
+
 template <typename T>
 struct StridedIterator {
  using iterator_category = std::random_access_iterator_tag;
@@ -27,7 +39,7 @@ struct StridedIterator {
  StridedIterator() = default;

  explicit StridedIterator(T* ptr, int64_t stride, difference_type offset = 0)
-      : ptr_(ptr + offset * stride), stride_(stride) {}
+      : stride_(stride), ptr_(ptr + offset * stride) {}

  explicit StridedIterator(array& arr, int axis, difference_type offset = 0)
      : StridedIterator(arr.data<T>(), arr.strides()[axis], offset) {}
@@ -108,8 +120,8 @@ template <typename T>
 void sort(array& out, int axis) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + out.ndim() : axis;
-  size_t in_size = out.size();
-  size_t n_rows = in_size / out.shape(axis);
+  int64_t in_size = out.size();
+  int64_t n_rows = in_size / out.shape(axis);

  auto remaining_shape = out.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);
@@ -124,13 +136,13 @@ void sort(array& out, int axis) {
  ContiguousIterator src_it(
      remaining_shape, remaining_strides, remaining_shape.size());
  auto out_ptr = out.data<T>();
-  for (int i = 0; i < n_rows; i++) {
+  for (int64_t i = 0; i < n_rows; i++) {
    T* data_ptr = out_ptr + src_it.loc;

    StridedIterator st(data_ptr, axis_stride, 0);
    StridedIterator ed(data_ptr, axis_stride, axis_size);

-    std::stable_sort(st, ed);
+    std::stable_sort(st, ed, nan_aware_less<T>);
    src_it.step();
  }
 }
@@ -139,7 +151,7 @@ template <typename T, typename IdxT = uint32_t>
 void argsort(const array& in, array& out, int axis) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;
-  size_t n_rows = in.size() / in.shape(axis);
+  int64_t n_rows = in.size() / in.shape(axis);

  auto in_remaining_shape = in.shape();
  in_remaining_shape.erase(in_remaining_shape.begin() + axis);
@@ -164,7 +176,7 @@ void argsort(const array& in, array& out, int axis) {
      out_remaining_shape, out_remaining_strides, out_remaining_shape.size());
  auto in_ptr = in.data<T>();
  auto out_ptr = out.data<IdxT>();
-  for (int i = 0; i < n_rows; i++) {
+  for (int64_t i = 0; i < n_rows; i++) {
    const T* data_ptr = in_ptr + in_it.loc;
    IdxT* idx_ptr = out_ptr + out_it.loc;

@@ -184,6 +196,15 @@ void argsort(const array& in, array& out, int axis) {
    std::stable_sort(st, ed, [data_ptr, in_stride](IdxT a, IdxT b) {
      auto v1 = data_ptr[a * in_stride];
      auto v2 = data_ptr[b * in_stride];
+
+      // Handle NaNs (place them at the end)
+      if (std::is_floating_point<T>::value) {
+        if (std::isnan(v1))
+          return false;
+        if (std::isnan(v2))
+          return true;
+      }
+
      return v1 < v2 || (v1 == v2 && a < b);
    });
  }
@@ -193,8 +214,8 @@ template <typename T>
 void partition(array& out, int axis, int kth) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + out.ndim() : axis;
-  size_t in_size = out.size();
-  size_t n_rows = in_size / out.shape(axis);
+  int64_t in_size = out.size();
+  int64_t n_rows = in_size / out.shape(axis);

  auto remaining_shape = out.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);
@@ -211,7 +232,7 @@ void partition(array& out, int axis, int kth) {
  ContiguousIterator src_it(
      remaining_shape, remaining_strides, remaining_shape.size());
  auto out_ptr = out.data<T>();
-  for (int i = 0; i < n_rows; i++) {
+  for (int64_t i = 0; i < n_rows; i++) {
    T* data_ptr = out_ptr + src_it.loc;
    src_it.step();

@@ -219,7 +240,7 @@ void partition(array& out, int axis, int kth) {
    StridedIterator md(data_ptr, axis_stride, kth);
    StridedIterator ed(data_ptr, axis_stride, axis_size);

-    std::nth_element(st, md, ed);
+    std::nth_element(st, md, ed, nan_aware_less<T>);
  }
 }

@@ -227,7 +248,7 @@ template <typename T, typename IdxT = uint32_t>
 void argpartition(const array& in, array& out, int axis, int kth) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;
-  size_t n_rows = in.size() / in.shape(axis);
+  int64_t n_rows = in.size() / in.shape(axis);

  auto in_remaining_shape = in.shape();
  in_remaining_shape.erase(in_remaining_shape.begin() + axis);
@@ -256,7 +277,7 @@ void argpartition(const array& in, array& out, int axis, int kth) {
  auto in_ptr = in.data<T>();
  auto out_ptr = out.data<IdxT>();

-  for (int i = 0; i < n_rows; i++) {
+  for (int64_t i = 0; i < n_rows; i++) {
    const T* data_ptr = in_ptr + in_it.loc;
    IdxT* idx_ptr = out_ptr + out_it.loc;
    in_it.step();
@@ -276,6 +297,15 @@ void argpartition(const array& in, array& out, int axis, int kth) {
    std::nth_element(st, md, ed, [data_ptr, in_stride](IdxT a, IdxT b) {
      auto v1 = data_ptr[a * in_stride];
      auto v2 = data_ptr[b * in_stride];
+
+      // Handle NaNs (place them at the end)
+      if (std::is_floating_point<T>::value) {
+        if (std::isnan(v1))
+          return false;
+        if (std::isnan(v2))
+          return true;
+      }
+
      return v1 < v2 || (v1 == v2 && a < b);
    });
  }
--- a/mlx/backend/cpu/svd.cpp
+++ b/mlx/backend/cpu/svd.cpp
@@ -27,7 +27,7 @@ void svd_impl(
  const int N = a.shape(-1);
  const int K = std::min(M, N);

-  size_t num_matrices = a.size() / (M * N);
+  int64_t num_matrices = a.size() / (M * N);

  // lapack clobbers the input, so we have to make a copy.
  array in(a.shape(), a.dtype(), nullptr, {});
@@ -81,40 +81,26 @@ void svd_impl(
    // Vᵀ of shape N x N. (M x M in lapack).
    const int ldvt = M;

-    auto job_u = (u_ptr) ? "V" : "N";
-    auto job_vt = (u_ptr) ? "V" : "N";
-    static constexpr auto range = "A";
+    auto jobz = (u_ptr) ? "A" : "N";

-    // Will contain the number of singular values after the call has returned.
-    int ns = 0;
    T workspace_dimension = 0;

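-    // Will contain the indices of eigenvectors that failed to converge (not
-    // used here but required by lapack).
+    // Integer workspace required by gesdd (8 * min(M, N) entries); its
+    // contents are not used here.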
-    auto iwork = array::Data{allocator::malloc(sizeof(int) * 12 * K)};
+    auto iwork = array::Data{allocator::malloc(sizeof(int) * 8 * K)};

    static const int lwork_query = -1;

-    static const int ignored_int = 0;
-    static const T ignored_float = 0;
-
    int info;

    // Compute workspace size.
-    gesvdx<T>(
-        /* jobu = */ job_u,
-        /* jobvt = */ job_vt,
-        /* range = */ range,
+    gesdd<T>(
+        /* jobz = */ jobz,
        // M and N are swapped since lapack expects column-major.
        /* m = */ &N,
        /* n = */ &M,
        /* a = */ nullptr,
        /* lda = */ &lda,
-        /* vl = */ &ignored_float,
-        /* vu = */ &ignored_float,
-        /* il = */ &ignored_int,
-        /* iu = */ &ignored_int,
-        /* ns = */ &ns,
        /* s = */ nullptr,
        /* u = */ nullptr,
        /* ldu = */ &ldu,
@@ -135,21 +121,14 @@ void svd_impl(
    auto scratch = array::Data{allocator::malloc(sizeof(T) * lwork)};

    // Loop over matrices.
-    for (int i = 0; i < num_matrices; i++) {
-      gesvdx<T>(
-          /* jobu = */ job_u,
-          /* jobvt = */ job_vt,
-          /* range = */ range,
+    for (int64_t i = 0; i < num_matrices; i++) {
+      gesdd<T>(
+          /* jobz = */ jobz,
          // M and N are swapped since lapack expects column-major.
          /* m = */ &N,
          /* n = */ &M,
          /* a = */ in_ptr + M * N * i,
          /* lda = */ &lda,
-          /* vl = */ &ignored_float,
-          /* vu = */ &ignored_float,
-          /* il = */ &ignored_int,
-          /* iu = */ &ignored_int,
-          /* ns = */ &ns,
          /* s = */ s_ptr + K * i,
          // According to the identity above, lapack will write Vᵀᵀ as U.
          /* u = */ vt_ptr ? vt_ptr + N * N * i : nullptr,
@@ -167,13 +146,6 @@ void svd_impl(
        ss << "svd_impl: sgesvdx_ failed with code " << info;
        throw std::runtime_error(ss.str());
      }
-
-      if (ns != K) {
-        std::stringstream ss;
-        ss << "svd_impl: expected " << K << " singular values, but " << ns
-           << " were computed.";
-        throw std::runtime_error(ss.str());
-      }
    }
  });
  encoder.add_temporary(in);
@@ -181,10 +153,10 @@ void svd_impl(

 template <typename T>
 void compute_svd(
-    const array& a,
-    bool compute_uv,
-    std::vector<array>& outputs,
-    Stream stream) {}
+    const array& /* a */,
+    bool /* compute_uv */,
+    std::vector<array>& /* outputs */,
+    Stream /* stream */) {}

 void SVD::eval_cpu(
    const std::vector<array>& inputs,
--- a/mlx/backend/cpu/ternary.h
+++ b/mlx/backend/cpu/ternary.h
@@ -136,7 +136,7 @@ void ternary_op(
  if (topt == TernaryOpType::ScalarScalarScalar) {
    *out_ptr = op(*a_ptr, *b_ptr, *c_ptr);
  } else if (topt == TernaryOpType::VectorVectorVector) {
-    for (size_t i = 0; i < out.size(); ++i) {
+    for (int64_t i = 0; i < out.size(); ++i) {
      *out_ptr = op(*a_ptr, *b_ptr, *c_ptr);
      a_ptr++;
      b_ptr++;
--- a/mlx/backend/cpu/unary.h
+++ b/mlx/backend/cpu/unary.h
@@ -10,8 +10,8 @@
 namespace mlx::core {

 template <typename T, typename U = T, typename Op>
-void unary_op(const T* a, U* out, size_t shape, size_t stride) {
-  for (size_t i = 0; i < shape; i += 1) {
+void unary_op(const T* a, U* out, int64_t shape, int64_t stride) {
+  for (int64_t i = 0; i < shape; i += 1) {
    out[i] = Op{}(*a);
    a += stride;
  }
@@ -38,14 +38,14 @@ void unary_op(const array& a, array& out, Op) {
      src++;
    }
  } else {
-    size_t shape = ndim > 0 ? a.shape().back() : 1;
-    size_t stride = ndim > 0 ? a.strides().back() : 1;
+    int64_t shape = ndim > 0 ? a.shape().back() : 1;
+    int64_t stride = ndim > 0 ? a.strides().back() : 1;
    if (ndim <= 1) {
      unary_op<T, U, Op>(src, dst, shape, stride);
      return;
    }
    auto it = ContiguousIterator(a.shape(), a.strides(), ndim - 1);
-    for (size_t elem = 0; elem < a.size(); elem += shape) {
+    for (int64_t elem = 0; elem < a.size(); elem += shape) {
      unary_op<T, U, Op>(src + it.loc, dst + elem, shape, stride);
      it.step();
    }
--- a/mlx/backend/cpu/unary_ops.h
+++ b/mlx/backend/cpu/unary_ops.h
@@ -77,7 +77,8 @@ struct Real {
 struct Sigmoid {
  template <int N, typename T>
  Simd<T, N> operator()(Simd<T, N> x) {
-    return 1.0f / (1.0f + simd::exp(-x));
+    auto y = 1.0f / (1.0f + simd::exp(simd::abs(x)));
+    return simd::select(x < Simd<T, N>{0}, y, Simd<T, N>{1} - y);
  }
  SINGLE()
 };
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -8,7 +8,6 @@ target_sources(
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/arange.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/binary_two.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cu
@@ -17,14 +16,18 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_dynamic.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_input.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/conv/gemm_conv.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/conv/gemm_grouped_conv.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/cuda.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/cudnn_utils.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/custom_kernel.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/event.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/gemms/gemv.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/gemms/steel_gemm.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/jit_module.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/kernel_utils.cu
@@ -46,12 +49,14 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)

+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/binary)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unary)
+
 if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9.0)
  target_sources(
    mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm_batched_12_9.cu)
@@ -149,7 +154,7 @@ target_link_libraries(mlx PRIVATE CUDA::nvrtc CUDA::cuda_driver)
 FetchContent_Declare(
  cudnn
  GIT_REPOSITORY https://github.com/NVIDIA/cudnn-frontend.git
-  GIT_TAG v1.12.1
+  GIT_TAG v1.14.0
  GIT_SHALLOW TRUE
  EXCLUDE_FROM_ALL)
 set(CUDNN_FRONTEND_SKIP_JSON_LIB ON)
@@ -165,6 +170,10 @@ target_link_libraries(mlx PRIVATE CUDNN::cudnn_all)
 # Suppress nvcc warnings on MLX headers.
 target_compile_options(mlx PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe
                                   --diag_suppress=997>)
+# Suppress warnings: note: parameter passing for argument of type
+# ‘std::pair<float, float>’ when C++17 is enabled changed to match C++14 in GCC
+# 10.1
+target_compile_options(mlx PRIVATE -Wno-psabi)

 # Install CCCL headers for JIT.
 install(DIRECTORY ${cccl_SOURCE_DIR}/include/cuda
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -30,8 +30,20 @@ SmallSizePool::SmallSizePool() {
  next_free_ = buffer_;

  CHECK_CUDA_ERROR(cudaMallocManaged(&data_, small_pool_size));
-  CHECK_CUDA_ERROR(
-      cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetReadMostly, 0));
+
+  int device_count = 0;
+  CHECK_CUDA_ERROR(cudaGetDeviceCount(&device_count));
+  for (int i = 0; i < device_count; ++i) {
+#if CUDART_VERSION >= 13000
+    cudaMemLocation loc;
+    loc.type = cudaMemLocationTypeDevice;
+    loc.id = i;
+#else
+    int loc = i;
+#endif // CUDART_VERSION >= 13000
+    CHECK_CUDA_ERROR(
+        cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetAccessedBy, loc));
+  }

  auto curr = next_free_;
  for (size_t i = 1; i < num_blocks; ++i) {
@@ -79,7 +91,7 @@ CudaAllocator::CudaAllocator()
  // TODO: Set memory limit for multi-device.
  size_t free, total;
  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total));
-  memory_limit_ = total * 0.8;
+  memory_limit_ = total * 0.95;
  max_pool_size_ = memory_limit_;
 }

--- a/mlx/backend/cuda/arange.cu
+++ b/mlx/backend/cuda/arange.cu
@@ -6,23 +6,33 @@
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"

+#include <cooperative_groups.h>
 #include <nvtx3/nvtx3.hpp>
-#include <thrust/device_ptr.h>
-#include <thrust/transform.h>

 namespace mlx::core {

 namespace cu {

-template <typename T>
-struct Arange {
-  const T start;
-  const T step;
+namespace cg = cooperative_groups;

-  __device__ T operator()(uint32_t i) const {
-    return start + i * step;
+template <typename T, typename IdxT, int N_WRITES>
+__global__ void arange(T* out, IdxT size, T start, T step) {
+  IdxT index = cg::this_grid().thread_rank();
+
+  if ((index + 1) * N_WRITES > size) {
+    for (IdxT i = index * N_WRITES; i < size; ++i) {
+      out[i] = start + i * step;
+    }
+  } else {
+    AlignedVector<T, N_WRITES> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_WRITES; ++i) {
+      out_vec[i] = start + (index * N_WRITES + i) * step;
+    }
+
+    store_vector<N_WRITES>(out, index, out_vec);
  }
-};
+}

 } // namespace cu

@@ -36,19 +46,23 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(stream());
  encoder.set_output_array(out);

-  auto capture = encoder.capture_context();
  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
    using CTYPE = MLX_GET_TYPE(type_tag);
    using OutType = cuda_type_t<CTYPE>;
-    CTYPE step =
-        static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
-    thrust::transform(
-        cu::thrust_policy(encoder.stream()),
-        thrust::counting_iterator<uint32_t>(0),
-        thrust::counting_iterator<uint32_t>(out.data_size()),
-        thrust::device_pointer_cast(out.data<OutType>()),
-        cu::Arange<OutType>{
-            static_cast<OutType>(start_), static_cast<OutType>(step)});
+    constexpr int N_WRITES = 16 / sizeof(OutType);
+    dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
+      using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+      auto [num_blocks, block_dims] = get_launch_args(out, large(), N_WRITES);
+      encoder.add_kernel_node(
+          cu::arange<OutType, IdxT, N_WRITES>,
+          num_blocks,
+          block_dims,
+          0,
+          out.data<OutType>(),
+          out.data_size(),
+          static_cast<CTYPE>(start_),
+          static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_));
+    });
  });
 }

--- a/mlx/backend/cuda/binary/CMakeLists.txt
+++ b/mlx/backend/cuda/binary/CMakeLists.txt
@@ -0,0 +1,21 @@
+target_sources(
+  mlx
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/add.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/arctan2.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/bitwise_binary.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/divide.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/equal.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/greater.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/greater_equal.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/less.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/less_equal.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/logical_and.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/logical_or.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/log_add_exp.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/minimum.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/maximum.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/multiply.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/power.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/remainder.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/not_equal.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/subtract.cu)
--- a/mlx/backend/cuda/binary/add.cu
+++ b/mlx/backend/cuda/binary/add.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(Add)
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/arctan2.cu
+++ b/mlx/backend/cuda/binary/arctan2.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(ArcTan2)
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/binary.cuh
+++ b/mlx/backend/cuda/binary/binary.cuh
@@ -99,39 +99,89 @@ __global__ void binary_vv(const In* a, const In* b, Out* out, IdxT size) {
  }
 }

-template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
+template <
+    typename Op,
+    typename In,
+    typename Out,
+    typename IdxT,
+    int NDIM,
+    int N_READS> // elements each thread handles along the last axis
 __global__ void binary_g_nd(
    const In* a,
    const In* b,
    Out* out,
-    IdxT size,
+    IdxT size_rest, // row count; the host passes out.size() / last dim
    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>(
-        index, shape.data(), a_strides.data(), b_strides.data());
-    out[index] = Op{}(a[a_idx], b[b_idx]);
+  auto block = cg::this_thread_block();
+  auto grid = cg::this_grid();
+  IdxT index_rest =
+      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
+  if (index_rest >= size_rest) { // y-threads past the last row have no work
+    return;
   }
+
+  auto shape_x = shape[NDIM - 1]; // last-axis length; x indexes along it
+  auto a_stride_x = a_strides[NDIM - 1];
+  auto b_stride_x = b_strides[NDIM - 1];
+  IdxT index_x =
+      grid.block_index().x * block.dim_threads().x + block.thread_index().x;
+  auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>( // input offsets of the row start
+      index_rest * shape_x, shape.data(), a_strides.data(), b_strides.data());
+  auto a_vec = // NOTE(review): In(0) looks like the out-of-range fill; confirm
+      load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
+  auto b_vec =
+      load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));
+
+  AlignedVector<Out, N_READS> out_vec; // one result per loaded pair
+#pragma unroll
+  for (int i = 0; i < N_READS; ++i) {
+    out_vec[i] = Op{}(a_vec[i], b_vec[i]);
+  }
+  store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x); // out rows are dense
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void binary_g(
    const In* a,
    const In* b,
    Out* out,
-    IdxT size,
+    IdxT size_rest, // row count; the host passes out.size() / last dim
    const __grid_constant__ Shape shape,
    const __grid_constant__ Strides a_strides,
    const __grid_constant__ Strides b_strides,
    int ndim) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto [a_idx, b_idx] = elem_to_loc(
-        index, shape.data(), a_strides.data(), b_strides.data(), ndim);
-    out[index] = Op{}(a[a_idx], b[b_idx]);
+  auto block = cg::this_thread_block();
+  auto grid = cg::this_grid();
+  IdxT index_rest =
+      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
+  if (index_rest >= size_rest) { // y-threads past the last row have no work
+    return;
   }
+
+  auto shape_x = shape[ndim - 1]; // needs ndim >= 1; host uses this kernel for ndim > 3
+  auto a_stride_x = a_strides[ndim - 1];
+  auto b_stride_x = b_strides[ndim - 1];
+  IdxT index_x =
+      grid.block_index().x * block.dim_threads().x + block.thread_index().x;
+  auto [a_idx, b_idx] = elem_to_loc( // input offsets of the row start
+      index_rest * shape_x,
+      shape.data(),
+      a_strides.data(),
+      b_strides.data(),
+      ndim);
+  auto a_vec = // NOTE(review): In(0) looks like the out-of-range fill; confirm
+      load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
+  auto b_vec =
+      load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));
+
+  AlignedVector<Out, N_READS> out_vec; // one result per loaded pair
+#pragma unroll
+  for (int i = 0; i < N_READS; ++i) {
+    out_vec[i] = Op{}(a_vec[i], b_vec[i]);
+  }
+  store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x); // out rows are dense
 }

 template <typename Op, typename In, typename Out>
@@ -209,39 +259,61 @@ void binary_op_gpu_inplace(
                auto& a_strides = strides[0];
                auto& b_strides = strides[1];
                int ndim = shape.size();
+                int work_per_thread = 1;
+                auto dim0 = ndim > 0 ? shape.back() : 1; // last-axis length
+                auto rest = out.size() / dim0; // NOTE(review): divides by zero if dim0 == 0; confirm empty outputs return earlier
+                if (dim0 >= 4) { // only the N_READS = 1 and 4 kernels are selected below
+                  work_per_thread = 4;
+                }
+                dim0 = (dim0 + work_per_thread - 1) / work_per_thread; // x-threads needed (ceil division)
+                auto block_dims = get_block_dims(dim0, rest, 1);
+                uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
+                uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y); // NOTE(review): CUDA caps gridDim.y at 65535; confirm large `rest` cannot exceed it
                if (ndim <= 3) {
                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                    auto [num_blocks, block_dims] =
-                        get_launch_args(out, large());
+                    auto kernel = cu::binary_g_nd<
+                        Op,
+                        InType,
+                        OutType,
+                        IdxT,
+                        dims_constant(),
+                        1>;
+                    if (work_per_thread == 4) {
+                      kernel = cu::binary_g_nd<
+                          Op,
+                          InType,
+                          OutType,
+                          IdxT,
+                          dims_constant(),
+                          4>;
+                    }
                    encoder.add_kernel_node(
-                        cu::binary_g_nd<
-                            Op,
-                            InType,
-                            OutType,
-                            IdxT,
-                            dims_constant()>,
-                        num_blocks,
+                        kernel,
+                        {num_blocks_x, num_blocks_y},
                        block_dims,
                        0,
                        a.data<InType>(),
                        b.data<InType>(),
                        out.data<OutType>(),
-                        out.size(),
+                        rest,
                        const_param<dims_constant()>(shape),
                        const_param<dims_constant()>(a_strides),
                        const_param<dims_constant()>(b_strides));
                  });
                } else {
-                  auto [num_blocks, block_dims] = get_launch_args(out, large());
+                  auto kernel = cu::binary_g<Op, InType, OutType, IdxT, 1>;
+                  if (work_per_thread == 4) {
+                    kernel = cu::binary_g<Op, InType, OutType, IdxT, 4>;
+                  }
                  encoder.add_kernel_node(
-                      cu::binary_g<Op, InType, OutType, IdxT>,
-                      num_blocks,
+                      kernel,
+                      {num_blocks_x, num_blocks_y},
                      block_dims,
                      0,
                      a.data<InType>(),
                      b.data<InType>(),
                      out.data<OutType>(),
-                      out.size(),
+                      rest,
                      const_param(shape),
                      const_param(a_strides),
                      const_param(b_strides),
@@ -304,54 +376,4 @@ void binary_op_gpu(
    binary_op_gpu<cu::func>(inputs, out, name(), s);                  \
  }

-BINARY_GPU(Add)
-BINARY_GPU(ArcTan2)
-BINARY_GPU(Divide)
-BINARY_GPU(Remainder)
-BINARY_GPU(Greater)
-BINARY_GPU(GreaterEqual)
-BINARY_GPU(Less)
-BINARY_GPU(LessEqual)
-BINARY_GPU(LogicalAnd)
-BINARY_GPU(LogicalOr)
-BINARY_GPU(LogAddExp)
-BINARY_GPU(Maximum)
-BINARY_GPU(Minimum)
-BINARY_GPU(Multiply)
-BINARY_GPU(NotEqual)
-BINARY_GPU(Power)
-BINARY_GPU(Subtract)
-
-void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
-  nvtx3::scoped_range r("Equal::eval_gpu");
-  auto& s = out.primitive().stream();
-  if (equal_nan_) {
-    binary_op_gpu<cu::NaNEqual>(inputs, out, name(), s);
-  } else {
-    binary_op_gpu<cu::Equal>(inputs, out, name(), s);
-  }
-}
-
-void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
-  nvtx3::scoped_range r("BitwiseBinary::eval_gpu");
-  auto& s = out.primitive().stream();
-  switch (op_) {
-    case BitwiseBinary::And:
-      binary_op_gpu<cu::BitwiseAnd>(inputs, out, name(), s);
-      break;
-    case BitwiseBinary::Or:
-      binary_op_gpu<cu::BitwiseOr>(inputs, out, name(), s);
-      break;
-    case BitwiseBinary::Xor:
-      binary_op_gpu<cu::BitwiseXor>(inputs, out, name(), s);
-      break;
-    case BitwiseBinary::LeftShift:
-      binary_op_gpu<cu::LeftShift>(inputs, out, name(), s);
-      break;
-    case BitwiseBinary::RightShift:
-      binary_op_gpu<cu::RightShift>(inputs, out, name(), s);
-      break;
-  }
-}
-
 } // namespace mlx::core
--- a/mlx/backend/cuda/binary/bitwise_binary.cu
+++ b/mlx/backend/cuda/binary/bitwise_binary.cu
@@ -0,0 +1,27 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("BitwiseBinary::eval_gpu"); // NVTX range for this call
+  auto& s = out.primitive().stream(); // stream taken from out's primitive
+  switch (op_) { // forward to binary_op_gpu with the functor matching op_
+    case BitwiseBinary::And:
+      binary_op_gpu<cu::BitwiseAnd>(inputs, out, name(), s);
+      break;
+    case BitwiseBinary::Or:
+      binary_op_gpu<cu::BitwiseOr>(inputs, out, name(), s);
+      break;
+    case BitwiseBinary::Xor:
+      binary_op_gpu<cu::BitwiseXor>(inputs, out, name(), s);
+      break;
+    case BitwiseBinary::LeftShift:
+      binary_op_gpu<cu::LeftShift>(inputs, out, name(), s);
+      break;
+    case BitwiseBinary::RightShift:
+      binary_op_gpu<cu::RightShift>(inputs, out, name(), s);
+      break;
+  } // no default case: an op_ outside these five falls through without launching
+}
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/divide.cu
+++ b/mlx/backend/cuda/binary/divide.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(Divide) // presumably expands to Divide::eval_gpu calling binary_op_gpu<cu::Divide>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/equal.cu
+++ b/mlx/backend/cuda/binary/equal.cu
@@ -0,0 +1,15 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("Equal::eval_gpu"); // NVTX range for this call
+  auto& s = out.primitive().stream(); // stream taken from out's primitive
+  if (equal_nan_) { // equal_nan_ selects the NaNEqual functor over plain Equal
+    binary_op_gpu<cu::NaNEqual>(inputs, out, name(), s);
+  } else {
+    binary_op_gpu<cu::Equal>(inputs, out, name(), s);
+  }
+}
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/greater.cu
+++ b/mlx/backend/cuda/binary/greater.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(Greater) // presumably expands to Greater::eval_gpu calling binary_op_gpu<cu::Greater>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/greater_equal.cu
+++ b/mlx/backend/cuda/binary/greater_equal.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(GreaterEqual) // presumably expands to GreaterEqual::eval_gpu calling binary_op_gpu<cu::GreaterEqual>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/less.cu
+++ b/mlx/backend/cuda/binary/less.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(Less) // presumably expands to Less::eval_gpu calling binary_op_gpu<cu::Less>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/less_equal.cu
+++ b/mlx/backend/cuda/binary/less_equal.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(LessEqual) // presumably expands to LessEqual::eval_gpu calling binary_op_gpu<cu::LessEqual>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/log_add_exp.cu
+++ b/mlx/backend/cuda/binary/log_add_exp.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(LogAddExp) // presumably expands to LogAddExp::eval_gpu calling binary_op_gpu<cu::LogAddExp>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/logical_and.cu
+++ b/mlx/backend/cuda/binary/logical_and.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(LogicalAnd) // presumably expands to LogicalAnd::eval_gpu calling binary_op_gpu<cu::LogicalAnd>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/logical_or.cu
+++ b/mlx/backend/cuda/binary/logical_or.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(LogicalOr) // presumably expands to LogicalOr::eval_gpu calling binary_op_gpu<cu::LogicalOr>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/maximum.cu
+++ b/mlx/backend/cuda/binary/maximum.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(Maximum) // presumably expands to Maximum::eval_gpu calling binary_op_gpu<cu::Maximum>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/minimum.cu
+++ b/mlx/backend/cuda/binary/minimum.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(Minimum) // presumably expands to Minimum::eval_gpu calling binary_op_gpu<cu::Minimum>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/multiply.cu
+++ b/mlx/backend/cuda/binary/multiply.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(Multiply) // presumably expands to Multiply::eval_gpu calling binary_op_gpu<cu::Multiply>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/not_equal.cu
+++ b/mlx/backend/cuda/binary/not_equal.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(NotEqual) // presumably expands to NotEqual::eval_gpu calling binary_op_gpu<cu::NotEqual>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/power.cu
+++ b/mlx/backend/cuda/binary/power.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(Power) // presumably expands to Power::eval_gpu calling binary_op_gpu<cu::Power>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/remainder.cu
+++ b/mlx/backend/cuda/binary/remainder.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(Remainder) // presumably expands to Remainder::eval_gpu calling binary_op_gpu<cu::Remainder>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary/subtract.cu
+++ b/mlx/backend/cuda/binary/subtract.cu
@@ -0,0 +1,7 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/binary/binary.cuh"
+
+namespace mlx::core {
+BINARY_GPU(Subtract) // presumably expands to Subtract::eval_gpu calling binary_op_gpu<cu::Subtract>; confirm in binary.cuh
+} // namespace mlx::core
--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@@ -127,45 +127,99 @@ binary_two_vv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
  }
 }

-template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
+template <
+    typename Op,
+    typename In,
+    typename Out,
+    typename IdxT,
+    int NDIM,
+    int N_READS> // elements each thread handles along the last axis
 __global__ void binary_two_g_nd(
    const In* a,
    const In* b,
    Out* out_a,
    Out* out_b,
-    IdxT size,
+    IdxT size_rest, // row count; the host passes out_a.size() / last dim
    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>(
-        index, shape.data(), a_strides.data(), b_strides.data());
-    auto out = Op{}(a[a_idx], b[b_idx]);
-    out_a[index] = out[0];
-    out_b[index] = out[1];
+  auto block = cg::this_thread_block();
+  auto grid = cg::this_grid();
+  IdxT index_rest =
+      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
+  if (index_rest >= size_rest) { // y-threads past the last row have no work
+    return;
   }
+
+  auto shape_x = shape[NDIM - 1]; // last-axis length; x indexes along it
+  auto a_stride_x = a_strides[NDIM - 1];
+  auto b_stride_x = b_strides[NDIM - 1];
+  IdxT index_x =
+      grid.block_index().x * block.dim_threads().x + block.thread_index().x;
+  auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>( // input offsets of the row start
+      index_rest * shape_x, shape.data(), a_strides.data(), b_strides.data());
+  auto a_vec = // NOTE(review): In(0) looks like the out-of-range fill; confirm
+      load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
+  auto b_vec =
+      load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));
+
+  AlignedVector<Out, N_READS> out_vec_a; // first component of each Op result
+  AlignedVector<Out, N_READS> out_vec_b; // second component of each Op result
+#pragma unroll
+  for (int i = 0; i < N_READS; ++i) {
+    auto out = Op{}(a_vec[i], b_vec[i]);
+    out_vec_a[i] = out[0];
+    out_vec_b[i] = out[1];
+  }
+  store_vector(out_a + shape_x * index_rest, index_x, out_vec_a, shape_x); // both outputs are row-dense
+  store_vector(out_b + shape_x * index_rest, index_x, out_vec_b, shape_x);
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void binary_two_g(
    const In* a,
    const In* b,
    Out* out_a,
    Out* out_b,
-    IdxT size,
+    IdxT size_rest, // row count; the host passes out_a.size() / last dim
    const __grid_constant__ Shape shape,
    const __grid_constant__ Strides a_strides,
    const __grid_constant__ Strides b_strides,
    int ndim) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto [a_idx, b_idx] = elem_to_loc(
-        index, shape.data(), a_strides.data(), b_strides.data(), ndim);
-    auto out = Op{}(a[a_idx], b[b_idx]);
-    out_a[index] = out[0];
-    out_b[index] = out[1];
+  auto block = cg::this_thread_block();
+  auto grid = cg::this_grid();
+  IdxT index_rest =
+      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
+  if (index_rest >= size_rest) { // y-threads past the last row have no work
+    return;
   }
+
+  auto shape_x = shape[ndim - 1]; // needs ndim >= 1; host uses this kernel for ndim > 3
+  auto a_stride_x = a_strides[ndim - 1];
+  auto b_stride_x = b_strides[ndim - 1];
+  IdxT index_x =
+      grid.block_index().x * block.dim_threads().x + block.thread_index().x;
+  auto [a_idx, b_idx] = elem_to_loc( // input offsets of the row start
+      index_rest * shape_x,
+      shape.data(),
+      a_strides.data(),
+      b_strides.data(),
+      ndim);
+  auto a_vec = // NOTE(review): In(0) looks like the out-of-range fill; confirm
+      load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
+  auto b_vec =
+      load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));
+
+  AlignedVector<Out, N_READS> out_vec_a; // first component of each Op result
+  AlignedVector<Out, N_READS> out_vec_b; // second component of each Op result
+#pragma unroll
+  for (int i = 0; i < N_READS; ++i) {
+    auto out = Op{}(a_vec[i], b_vec[i]);
+    out_vec_a[i] = out[0];
+    out_vec_b[i] = out[1];
+  }
+  store_vector(out_a + shape_x * index_rest, index_x, out_vec_a, shape_x); // both outputs are row-dense
+  store_vector(out_b + shape_x * index_rest, index_x, out_vec_b, shape_x);
 }

 template <typename Op, typename In, typename Out>
@@ -225,42 +279,64 @@ void binary_two_op_gpu_inplace(
                auto& a_strides = strides[0];
                auto& b_strides = strides[1];
                int ndim = shape.size();
+                int work_per_thread = 1;
+                auto dim0 = ndim > 0 ? shape.back() : 1; // last-axis length
+                auto rest = out_a.size() / dim0; // NOTE(review): divides by zero if dim0 == 0; confirm empty outputs return earlier
+                if (dim0 >= 4) { // only the N_READS = 1 and 4 kernels are selected below
+                  work_per_thread = 4;
+                }
+                dim0 = (dim0 + work_per_thread - 1) / work_per_thread; // x-threads needed (ceil division)
+                auto block_dims = get_block_dims(dim0, rest, 1);
+                uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
+                uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y); // NOTE(review): CUDA caps gridDim.y at 65535; confirm large `rest` cannot exceed it
+
                if (ndim <= 3) {
                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                    auto [num_blocks, block_dims] =
-                        get_launch_args(out_a, large());
+                    auto kernel = cu::binary_two_g_nd<
+                        Op,
+                        InType,
+                        OutType,
+                        IdxT,
+                        dims_constant(),
+                        1>;
+                    if (work_per_thread == 4) {
+                      kernel = cu::binary_two_g_nd<
+                          Op,
+                          InType,
+                          OutType,
+                          IdxT,
+                          dims_constant(),
+                          4>;
+                    }
                    encoder.add_kernel_node(
-                        cu::binary_two_g_nd<
-                            Op,
-                            InType,
-                            OutType,
-                            IdxT,
-                            dims_constant()>,
-                        num_blocks,
+                        kernel,
+                        {num_blocks_x, num_blocks_y},
                        block_dims,
                        0,
                        a.data<InType>(),
                        b.data<InType>(),
                        out_a.data<OutType>(),
                        out_b.data<OutType>(),
-                        out_a.size(),
+                        rest,
                        const_param<dims_constant()>(shape),
                        const_param<dims_constant()>(a_strides),
                        const_param<dims_constant()>(b_strides));
                  });
                } else {
-                  auto [num_blocks, block_dims] =
-                      get_launch_args(out_a, large());
+                  auto kernel = cu::binary_two_g<Op, InType, OutType, IdxT, 1>;
+                  if (work_per_thread == 4) {
+                    kernel = cu::binary_two_g<Op, InType, OutType, IdxT, 4>;
+                  }
                  encoder.add_kernel_node(
-                      cu::binary_two_g<Op, InType, OutType, IdxT>,
-                      num_blocks,
+                      kernel,
+                      {num_blocks_x, num_blocks_y},
                      block_dims,
                      0,
                      a.data<InType>(),
                      b.data<InType>(),
                      out_a.data<OutType>(),
                      out_b.data<OutType>(),
-                      out_a.size(),
+                      rest,
                      const_param(shape),
                      const_param(a_strides),
                      const_param(b_strides),
--- a/mlx/backend/cuda/compiled.cpp
+++ b/mlx/backend/cuda/compiled.cpp
@@ -267,7 +267,8 @@ void Compiled::eval_gpu(
      }
    }

-    return std::make_pair(std::move(builder.os), std::move(kernel_names));
+    return std::make_tuple(
+        false, std::move(builder.os), std::move(kernel_names));
  });

  // Collapse contiguous dims to route to a faster kernel if possible. Also
@@ -331,9 +332,9 @@ void Compiled::eval_gpu(
    encoder.set_output_array(out);
  }

-  auto kernel = mod.get_kernel(kernel_name);
+  auto [kernel, max_block_dims] = mod.get_kernel_and_dims(kernel_name);
  auto [num_blocks, block_dims] =
-      get_launch_args(outputs[0], large, work_per_thread);
+      get_launch_args(outputs[0], large, work_per_thread, max_block_dims);
  encoder.add_kernel_node(kernel, num_blocks, block_dims, 0, args.args());
 }

--- a/mlx/backend/cuda/conv.cpp
+++ b/mlx/backend/cuda/conv.cpp
@@ -1,18 +1,12 @@
 // Copyright © 2025 Apple Inc.

+#include "mlx/backend/cuda/conv/conv.h"
+#include "mlx/backend/cuda/cudnn_utils.h"
 #include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/config.h"
 #include "mlx/backend/cuda/lru_cache.h"
 #include "mlx/backend/gpu/copy.h"
-#include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"

-// cudnn_frontend.h redefines this macro.
-#undef CHECK_CUDA_ERROR
-
-#include <cudnn_frontend.h>
-#include <cudnn_frontend_find_plan.h>
-#include <fmt/format.h>
 #include <nvtx3/nvtx3.hpp>

 #include <cassert>
@@ -21,9 +15,6 @@ namespace mlx::core {

 namespace {

-// Not all engines support it so can not use this API now.
-#define MLX_USE_CUDNN_NATIVE_CUDA_GRAPH_API 0
-
 // Alias for better readability.
 #define CONV_FORWARD CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR
 #define CONV_BACKWARD_INPUT \
@@ -31,6 +22,9 @@ namespace {
 #define CONV_BACKWARD_WEIGHT \
  CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR

+// Custom placeholder (-1 cast to the cuDNN descriptor enum) representing the fallback kernel.
+#define CONV_FALLBACK static_cast<cudnnBackendDescriptorType_t>(-1)
+
 struct ConvCacheKey {
  int device_id;
  cudnnDataType_t cudnn_dtype;
@@ -50,203 +44,13 @@ struct ConvCacheKey {
 auto& conv_cache() {
  static LRUBytesKeyCache<
      ConvCacheKey,
-      std::pair<cudnnBackendDescriptorType_t, cudnn_frontend::ExecutionPlan>>
-      cache(/* capacity */ 128);
+      std::pair<
+          cudnnBackendDescriptorType_t,
+          std::optional<cudnn_frontend::ExecutionPlan>>> // NOTE(review): presumably empty for CONV_FALLBACK entries; confirm
+      cache("MLX_CUDA_CONV_CACHE_SIZE", /* default_capacity */ 128); // NOTE(review): string looks like an env var overriding the capacity; confirm in lru_cache.h
  return cache;
 }

-template <typename T, typename Vec>
-inline SmallVector<T> convert_vector(const Vec& vec) {
-  return SmallVector<T>(vec.begin(), vec.end());
-}
-
-template <typename T, template <typename U> class Vec>
-inline std::array<T, MAX_NDIM> fixed_vector(const Vec<T>& vec) {
-  if (vec.size() > MAX_NDIM) {
-    throw std::runtime_error(
-        fmt::format("ndim can not be larger than {}.", MAX_NDIM));
-  }
-  std::array<T, MAX_NDIM> result = {};
-  std::copy_n(vec.begin(), vec.size(), result.begin());
-  return result;
-}
-
-auto nhwc_to_nchw(const array& x) {
-  auto shape = convert_vector<int64_t>(x.shape());
-  shape.insert(shape.begin() + 1, shape.back());
-  shape.erase(shape.end() - 1);
-  auto strides = convert_vector<int64_t>(x.strides());
-  strides.insert(strides.begin() + 1, strides.back());
-  strides.erase(strides.end() - 1);
-  return std::make_tuple(std::move(shape), std::move(strides));
-}
-
-inline cudnnDataType_t dtype_to_cudnn_type(Dtype dtype) {
-  switch (dtype) {
-    case int8:
-      return CUDNN_DATA_INT8;
-    case int32:
-      return CUDNN_DATA_INT32;
-    case uint8:
-      return CUDNN_DATA_UINT8;
-    case float16:
-      return CUDNN_DATA_HALF;
-    case bfloat16:
-      return CUDNN_DATA_BFLOAT16;
-    case float32:
-      return CUDNN_DATA_FLOAT;
-    case float64:
-      return CUDNN_DATA_DOUBLE;
-    default:
-      throw std::runtime_error(fmt::format(
-          "Unsupported dtype in Convolution: {}.", dtype_to_string(dtype)));
-  }
-}
-
-inline uint8_t get_alignment(const array& x) {
-  uint8_t alignment = 1;
-  uintptr_t address = reinterpret_cast<uintptr_t>(x.data<void>());
-  for (; alignment < 32; alignment *= 2) {
-    if (address % (alignment * 2)) {
-      return alignment;
-    }
-  }
-  return alignment;
-}
-
-inline cudnn_frontend::Tensor build_tensor(int64_t id, const array& x) {
-  auto [shape, strides] = nhwc_to_nchw(x);
-  return cudnn_frontend::TensorBuilder()
-      .setDim(shape.size(), shape.data())
-      .setStrides(strides.size(), strides.data())
-      .setId(id)
-      .setAlignment(get_alignment(x))
-      .setDataType(dtype_to_cudnn_type(x.dtype()))
-      .build();
-}
-
-cudnn_frontend::EngineConfigList get_engine_configs(
-    cudnnBackendDescriptorType_t backend_type,
-    Dtype dtype,
-    cudnn_frontend::OperationGraph& op_graph,
-    bool use_fallback = false) {
-  cudnn_frontend::GeneratorSource source;
-  if (use_fallback) {
-    source = [&backend_type](cudnn_frontend::OperationGraph& op_graph) {
-      auto fallback = cudnn_frontend::EngineFallbackListBuilder()
-                          .setOperationGraph(op_graph)
-                          .setOperation(backend_type)
-                          .build();
-      return fallback.getFallbackList();
-    };
-  } else {
-    source = [](cudnn_frontend::OperationGraph& op_graph) {
-      auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
-                            .setOperationGraph(op_graph)
-                            .setHeurMode(CUDNN_HEUR_MODE_A)
-                            .build();
-      return heuristics.getEngineConfig(heuristics.getEngineConfigCount());
-    };
-  }
-
-  cudnn_frontend::EngineConfigGenerator generator(1, &source);
-  auto configs = generator.generate_engine_config(op_graph);
-
-  cudnn_frontend::EngineConfigList filtered_configs;
-  cudnn_frontend::filter(configs, filtered_configs, [dtype](auto c) {
-    if (cudnn_frontend::hasNumericalNote<
-            CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(c)) {
-      return true;
-    }
-    if (cudnn_frontend::hasNumericalNote<CUDNN_NUMERICAL_NOTE_TENSOR_CORE>(c) &&
-        dtype == float32 && !env::enable_tf32()) {
-      return true;
-    }
-    return false;
-  });
-  return filtered_configs;
-}
-
-bool execute_plan(
-    cu::CommandEncoder& encoder,
-    cudnn_frontend::ExecutionPlan& plan,
-    array& x,
-    array& w,
-    array& y) {
-  int workspace_size = plan.getWorkspaceSize();
-  array workspace(allocator::malloc(workspace_size), {workspace_size}, uint8);
-
-  int64_t uids[3] = {'x', 'w', 'y'};
-  void* data_ptrs[3] = {
-      x.data<void>(),
-      w.data<void>(),
-      y.data<void>(),
-  };
-
-  auto variantPack = cudnn_frontend::VariantPackBuilder()
-                         .setWorkspacePointer(workspace.data<void>())
-                         .setDataPointers(3, data_ptrs)
-                         .setUids(3, uids)
-                         .build();
-
-  auto handle = encoder.device().cudnn_handle();
-  cudnnSetStream(handle, encoder.stream());
-
-#if CUDNN_VERSION >= 90500 && MLX_USE_CUDNN_NATIVE_CUDA_GRAPH_API
-  cudaGraph_t graph;
-  cudaGraphCreate(&graph, 0);
-  std::unique_ptr<cudaGraph_t, void (*)(cudaGraph_t*)> graph_freer(
-      &graph, [](cudaGraph_t* p) { cudaGraphDestroy(*p); });
-  if (cudnnBackendPopulateCudaGraph(
-          handle, plan.get_raw_desc(), variantPack.get_raw_desc(), graph) !=
-      CUDNN_STATUS_SUCCESS) {
-    return false;
-  }
-  encoder.add_graph_node(graph);
-#else
-  auto capture = encoder.capture_context();
-  if (cudnnBackendExecute(
-          handle, plan.get_raw_desc(), variantPack.get_raw_desc()) !=
-      CUDNN_STATUS_SUCCESS) {
-    // Discard the captured graph when failed.
-    capture.discard = true;
-    return false;
-  }
-#endif
-
-  encoder.add_temporary(workspace);
-  return true;
-}
-
-bool try_engines(
-    cu::CommandEncoder& encoder,
-    const ConvCacheKey& cache_key,
-    cudnnBackendDescriptorType_t backend_type,
-    cudnn_frontend::EngineConfigList& configs,
-    const std::string& op_graph_tag,
-    array& x,
-    array& w,
-    array& y) {
-  for (auto& config : configs) {
-    try {
-      auto plan = cudnn_frontend::ExecutionPlanBuilder()
-                      .setHandle(encoder.device().cudnn_handle())
-                      .setEngineConfig(config, op_graph_tag)
-                      .build();
-      if (execute_plan(encoder, plan, x, w, y)) {
-        conv_cache().emplace(
-            cache_key, std::make_pair(backend_type, std::move(plan)));
-        return true;
-      }
-    } catch (cudnn_frontend::cudnnException& error) {
-      if (error.getCudnnStatus() != CUDNN_STATUS_NOT_SUPPORTED) {
-        throw;
-      }
-    }
-  }
-  return false;
-}
-
 auto get_conv_op_settings(
    cudnnBackendDescriptorType_t backend_type,
    array& x,
@@ -291,7 +95,7 @@ auto get_conv_op_settings(
  }
 }

-std::optional<cudnn_frontend::OperationGraph> build_op_graph(
+std::optional<cudnn_frontend::OperationGraph> build_conv_op_graph(
    cu::CommandEncoder& encoder,
    cudnnBackendDescriptorType_t backend_type,
    Dtype dtype,
@@ -317,9 +121,9 @@ std::optional<cudnn_frontend::OperationGraph> build_op_graph(
                         .build();

    auto op = cudnn_frontend::OperationBuilder(backend_type)
-                  .setxDesc(build_tensor('x', x))
-                  .setwDesc(build_tensor('w', w))
-                  .setyDesc(build_tensor('y', y))
+                  .setxDesc(build_cudnn_tensor_nchw('x', x))
+                  .setwDesc(build_cudnn_tensor_nchw('w', w))
+                  .setyDesc(build_cudnn_tensor_nchw('y', y))
                  .setcDesc(conv_desc)
                  .build();

@@ -336,6 +140,42 @@ std::optional<cudnn_frontend::OperationGraph> build_op_graph(
  }
 }

+// Per-group swapaxes, e.g. (C_out, H, W, C_in / groups) -> (C_in, H, W, C_out / groups).
+array group_transpose(
+    const array& x,
+    int groups,
+    int group_dim, // axis that gets split into (groups, size / groups)
+    int axis1,
+    int axis2,
+    Stream s) {
+  if (groups == 1) { // nothing to split, so a plain swap is enough
+    return swapaxes_in_eval(x, axis1, axis2);
+  }
+  int ndim = x.ndim();
+  if (group_dim < 0) { // normalize negative axes against the original rank
+    group_dim += ndim;
+  }
+  if (axis1 < 0) {
+    axis1 += ndim;
+  }
+  if (axis2 < 0) {
+    axis2 += ndim;
+  }
+  if (group_dim <= axis1) { // axes at or after group_dim shift by one once the groups axis is inserted
+    axis1 += 1;
+  }
+  if (group_dim <= axis2) {
+    axis2 += 1;
+  }
+  auto shape = x.shape();
+  shape.insert(shape.begin() + group_dim, groups); // new leading groups axis at group_dim
+  shape[group_dim + 1] = shape[group_dim + 1] / groups; // NOTE(review): assumes the split dim is divisible by groups
+  array x_trans = reshape_in_eval(x, std::move(shape), s);
+  x_trans = swapaxes_in_eval(x_trans, axis1, axis2);
+  x_trans = flatten_in_eval(x_trans, group_dim, group_dim + 1, s); // merge the groups axis with the axis after it
+  return x_trans;
+}
+
 // Do necessary transposes and copies to prepare the inputs and outputs for
 // building the cuDNN conv op. It is safe to be called multiple times in one
 // eval_gpu, with cost of possible redundant copies.
@@ -345,13 +185,14 @@ std::tuple<array, array, array> prepare_args(
    array in,
    array wt,
    array out,
+    int groups,
    Stream s) {
  // Transpose the args depending on the backend type.
  // TODO: Handle groups.
  if (backend_type == CONV_BACKWARD_INPUT) {
-    wt = swapaxes_in_eval(wt, 0, -1);
+    wt = group_transpose(wt, groups, 0, 0, -1, s);
  } else if (backend_type == CONV_BACKWARD_WEIGHT) {
-    in = swapaxes_in_eval(in, 0, -1);
+    in = group_transpose(in, groups, -1, 0, -1, s);
    wt = swapaxes_in_eval(wt, 0, -1);
    // Create a contiguous array that shares the data with |out|, but with dim
    // C_in and C_out swapped.
@@ -444,12 +285,12 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
  ConvCacheKey cache_key{
      encoder.device().cuda_device(),
      dtype_to_cudnn_type(dtype),
-      fixed_vector(in.shape()),
-      fixed_vector(wt.shape()),
-      fixed_vector(kernel_strides_),
-      fixed_vector(padding_lo_),
-      fixed_vector(padding_hi_),
-      fixed_vector(kernel_dilation_),
+      vector_key(in.shape()),
+      vector_key(wt.shape()),
+      vector_key(kernel_strides_),
+      vector_key(padding_lo_),
+      vector_key(padding_hi_),
+      vector_key(kernel_dilation_),
      groups_,
      flip_,
      get_alignment(in),
@@ -457,11 +298,29 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
      get_alignment(out)};
  if (auto it = conv_cache().find(cache_key); it != conv_cache().end()) {
    auto& [backend_type, plan] = it->second;
-    std::tie(in, wt, out) = prepare_args(encoder, backend_type, in, wt, out, s);
-    register_args(encoder, backend_type, in, wt, out, out_);
-    auto [x, w, y] = dispatch_args(backend_type, in, wt, out);
-    if (!execute_plan(encoder, plan, x, w, y)) {
-      throw std::runtime_error("[conv] Cached plan failed to execute.");
+    if (plan) {
+      // Run cached plan.
+      std::tie(in, wt, out) =
+          prepare_args(encoder, backend_type, in, wt, out, groups_, s);
+      register_args(encoder, backend_type, in, wt, out, out_);
+      auto [x, w, y] = dispatch_args(backend_type, in, wt, out);
+      if (!encode_cudnn_plan(encoder, *plan, {'x', 'w', 'y'}, x, w, y)) {
+        throw std::runtime_error("[conv] Cached plan failed to execute.");
+      }
+    } else {
+      // Run fallback kernel.
+      gemm_conv(
+          encoder,
+          in,
+          wt,
+          out,
+          kernel_strides_,
+          padding_lo_,
+          kernel_dilation_,
+          input_dilation_,
+          groups_,
+          flip_,
+          s);
    }
    return;
  }
@@ -490,7 +349,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
  std::optional<cudnn_frontend::OperationGraph> op_graph;
  for (auto try_backend : try_backends) {
    auto [in_copy, wt_copy, out_copy] =
-        prepare_args(encoder, try_backend, in, wt, out, s);
+        prepare_args(encoder, try_backend, in, wt, out, groups_, s);
    auto [x, w, y] = dispatch_args(try_backend, in_copy, wt_copy, out_copy);
    auto [stride, padding_lo, padding_hi, dilation] = get_conv_op_settings(
        try_backend,
@@ -502,7 +361,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
        padding_hi_,
        kernel_dilation_,
        input_dilation_);
-    op_graph = build_op_graph(
+    op_graph = build_conv_op_graph(
        encoder,
        try_backend,
        dtype,
@@ -521,26 +380,38 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
      break;
    }
  }
-  if (!op_graph) {
-    throw std::runtime_error("[conv] Can not build op graph.");
+
+  if (op_graph) {
+    // Find a plan for the graph and execute it.
+    auto plan = find_cudnn_plan_from_op_graph(
+        encoder.device().cudnn_handle(), backend_type, dtype, *op_graph);
+    if (plan) {
+      // Setup inputs and outputs.
+      register_args(encoder, backend_type, in, wt, out, out_);
+
+      auto [x, w, y] = dispatch_args(backend_type, in, wt, out);
+      if (encode_cudnn_plan(encoder, *plan, {'x', 'w', 'y'}, x, w, y)) {
+        conv_cache().emplace(
+            cache_key, std::make_pair(backend_type, std::move(*plan)));
+        return;
+      }
+    }
  }

-  // Get ready to execute the graph.
-  register_args(encoder, backend_type, in, wt, out, out_);
-
-  // Try to run plans based on heuristics.
-  auto configs = get_engine_configs(backend_type, dtype, *op_graph);
-  auto tag = op_graph->getTag();
-  auto [x, w, y] = dispatch_args(backend_type, in, wt, out);
-  if (try_engines(encoder, cache_key, backend_type, configs, tag, x, w, y)) {
-    return;
-  }
-  // Then try fallback plans.
-  configs = get_engine_configs(backend_type, dtype, *op_graph);
-  if (try_engines(encoder, cache_key, backend_type, configs, tag, x, w, y)) {
-    return;
-  }
-  throw std::runtime_error("[conv] Unable to find a working engine.");
+  // Use fallback kernel for settings not supported by cuDNN.
+  gemm_conv(
+      encoder,
+      in,
+      wt,
+      out,
+      kernel_strides_,
+      padding_lo_,
+      kernel_dilation_,
+      input_dilation_,
+      groups_,
+      flip_,
+      s);
+  conv_cache().emplace(cache_key, std::make_pair(CONV_FALLBACK, std::nullopt));
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/conv/conv.h
+++ b/mlx/backend/cuda/conv/conv.h
@@ -0,0 +1,126 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/gpu/copy.h"
+
+namespace mlx::core {
+
+// Plain-data description of an NDIM-dimensional convolution. The gemm conv
+// kernels take it by value as a __grid_constant__ parameter, so it only holds
+// scalars and fixed-size arrays. Shapes are read as (N, spatial..., C) for the
+// input and (O, spatial..., ...) for the weight.
+template <int NDIM>
+struct ConvParams {
+  int N; // in.shape(0): batch size
+  int C; // in.shape(-1): input channels
+  int O; // wt.shape(0): output channels
+  int strides[NDIM];
+  int padding[NDIM];
+  int kernel_dilation[NDIM];
+  int input_dilation[NDIM];
+  int groups;
+  bool flip;
+  int in_spatial_dims[NDIM];
+  int wt_spatial_dims[NDIM];
+  int out_spatial_dims[NDIM];
+  int64_t in_strides[NDIM + 2]; // in.strides() for all NDIM + 2 axes
+
+  // Captures the first NDIM entries of every per-dimension setting together
+  // with the spatial extents of |in|, |wt| and |out|.
+  ConvParams(
+      const array& in,
+      const array& wt,
+      const array& out,
+      const std::vector<int>& strides,
+      const std::vector<int>& padding,
+      const std::vector<int>& kernel_dilation,
+      const std::vector<int>& input_dilation,
+      int groups,
+      bool flip)
+      : N(in.shape(0)),
+        C(in.shape(-1)),
+        O(wt.shape(0)),
+        groups(groups),
+        flip(flip) {
+    for (int i = 0; i < NDIM; ++i) {
+      // The parameters shadow the members, hence the explicit this->.
+      this->strides[i] = strides[i];
+      this->padding[i] = padding[i];
+      this->kernel_dilation[i] = kernel_dilation[i];
+      this->input_dilation[i] = input_dilation[i];
+      // Axis 0 is not a spatial axis, the spatial extents start at axis 1.
+      in_spatial_dims[i] = in.shape(i + 1);
+      wt_spatial_dims[i] = wt.shape(i + 1);
+      out_spatial_dims[i] = out.shape(i + 1);
+    }
+    for (int i = 0; i < NDIM + 2; ++i) {
+      in_strides[i] = in.strides(i);
+    }
+  }
+};
+
+// Convolution with groups computed through an unfold of |in| followed by a
+// gemm batched over |groups|. Implemented in gemm_grouped_conv.cu, where it
+// throws std::runtime_error unless in.ndim() - 2 is 1, 2 or 3.
+// NOTE(review): the inline gemm_conv wrapper below makes |in| and |wt| row
+// contiguous before calling this; direct callers presumably have to do the
+// same - confirm.
+void gemm_grouped_conv(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    const array& wt,
+    array& out,
+    const std::vector<int>& strides,
+    const std::vector<int>& padding,
+    const std::vector<int>& kernel_dilation,
+    const std::vector<int>& input_dilation,
+    int groups,
+    bool flip,
+    Stream s);
+
+// Convolution without groups computed through an unfold of |in| followed by
+// a single gemm. Implemented in gemm_conv.cu, where it throws
+// std::runtime_error unless in.ndim() - 2 is 1, 2 or 3.
+// NOTE(review): the inline gemm_conv wrapper below makes |in| and |wt| row
+// contiguous before calling this; direct callers presumably have to do the
+// same - confirm.
+void gemm_conv(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    const array& wt,
+    array& out,
+    const std::vector<int>& strides,
+    const std::vector<int>& padding,
+    const std::vector<int>& kernel_dilation,
+    const std::vector<int>& input_dilation,
+    bool flip,
+    Stream s);
+
+// Entry point of the gemm based convolution. Replaces |in| and |wt| by row
+// contiguous copies when they are not row contiguous already, then forwards
+// to the plain implementation for groups == 1 and to the grouped one
+// otherwise.
+inline void gemm_conv(
+    cu::CommandEncoder& encoder,
+    array in,
+    array wt,
+    array& out,
+    const std::vector<int>& strides,
+    const std::vector<int>& padding,
+    const std::vector<int>& kernel_dilation,
+    const std::vector<int>& input_dilation,
+    int groups,
+    bool flip,
+    Stream s) {
+  // Swap |arr| for a contiguous copy made on |s| and register that copy with
+  // the encoder as a temporary.
+  auto ensure_row_contiguous = [&encoder, &s](array& arr) {
+    if (arr.flags().row_contiguous) {
+      return;
+    }
+    arr = contiguous_copy_gpu(arr, s);
+    encoder.add_temporary(arr);
+  };
+  ensure_row_contiguous(in);
+  ensure_row_contiguous(wt);
+
+  if (groups != 1) {
+    gemm_grouped_conv(
+        encoder,
+        in,
+        wt,
+        out,
+        strides,
+        padding,
+        kernel_dilation,
+        input_dilation,
+        groups,
+        flip,
+        s);
+    return;
+  }
+
+  // Resolves to the overload without the |groups| parameter.
+  gemm_conv(
+      encoder,
+      in,
+      wt,
+      out,
+      strides,
+      padding,
+      kernel_dilation,
+      input_dilation,
+      flip,
+      s);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/conv/gemm_conv.cu
+++ b/mlx/backend/cuda/conv/gemm_conv.cu
@@ -0,0 +1,217 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/conv/conv.h"
+#include "mlx/backend/cuda/gemms/cublas_gemm.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/dtype_utils.h"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Unfolds the input of an NDIM-dimensional convolution into a matrix with one
+// row per (batch, output pixel) and |filter_size| = C * prod(wt_spatial_dims)
+// columns, ordered as (kernel position, channel). Every thread writes exactly
+// one element: the input value the kernel position touches, or zero when the
+// position falls into the padding or between input-dilated elements.
+//
+// Launch layout:
+//   grid.x            -> row of the unfolded matrix, [0, N * out_pixels)
+//   grid.y            -> input channel, [0, C)
+//   grid.z * block.x  -> linear kernel position, [0, prod(wt_spatial_dims))
+// The rows are mapped to grid.x because it is the only grid dimension that is
+// not capped at 65535 blocks.
+template <typename T, int NDIM>
+__global__ void naive_unfold_nd(
+    const T* in,
+    T* out,
+    int filter_size,
+    int out_pixels,
+    const __grid_constant__ ConvParams<NDIM> params) {
+  auto block = cg::this_thread_block();
+  auto tid = block.group_index();
+  auto lid = block.thread_index();
+
+  int index_batch = tid.x / out_pixels; // [0, N)
+  int index_out_spatial = tid.x % out_pixels; // [0, H_out * W_out)
+  int index_wt_spatial =
+      tid.z * block.dim_threads().x + lid.x; // [0, H_wt * W_wt)
+
+  // The block may be wider than the number of kernel positions.
+  if (index_wt_spatial >= filter_size / params.C) {
+    return;
+  }
+
+  in += tid.y; // [0, C)
+  // Compute the row offset with 64 bits, the unfolded matrix can hold more
+  // than 2^32 elements.
+  out += static_cast<int64_t>(tid.x) * filter_size +
+      index_wt_spatial * params.C + tid.y;
+
+  bool valid = index_batch < params.N;
+
+  // Get the coordinates in input, peeling the linear indices from the
+  // innermost spatial dimension outwards.
+  int index_in[NDIM] = {};
+#pragma unroll
+  for (int i = NDIM - 1; i >= 0; --i) {
+    int index_out = index_out_spatial % params.out_spatial_dims[i];
+    int index_wt = index_wt_spatial % params.wt_spatial_dims[i];
+
+    if (params.flip) {
+      index_wt = params.wt_spatial_dims[i] - index_wt - 1;
+    }
+
+    // Coordinate in the (virtually) input-dilated and padded input.
+    int index = index_out * params.strides[i] - params.padding[i] +
+        index_wt * params.kernel_dilation[i];
+    int index_max =
+        1 + params.input_dilation[i] * (params.in_spatial_dims[i] - 1);
+
+    // Only multiples of the input dilation map to real input elements.
+    valid &= (index >= 0) && (index < index_max) &&
+        (index % params.input_dilation[i] == 0);
+
+    index_in[i] = index / params.input_dilation[i];
+
+    index_out_spatial /= params.out_spatial_dims[i];
+    index_wt_spatial /= params.wt_spatial_dims[i];
+  }
+
+  if (valid) {
+    // The strides are 64 bit, keep the accumulated offset 64 bit as well.
+    int64_t in_offset = index_batch * params.in_strides[0];
+#pragma unroll
+    for (int i = 0; i < NDIM; ++i) {
+      in_offset += index_in[i] * params.in_strides[i + 1];
+    }
+    *out = in[in_offset];
+  } else {
+    *out = T{0};
+  }
+}
+
+} // namespace cu
+
+// Allocates the (mat_M, mat_K) unfolded matrix as a temporary of |encoder| and
+// encodes the kernel that fills it from |in|. |mat_N| is not used here.
+template <int NDIM>
+array unfold_inputs_nd(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    int mat_M,
+    int mat_K,
+    int mat_N,
+    ConvParams<NDIM>& params) {
+  array unfolded({mat_M, mat_K}, in.dtype(), nullptr, {});
+  unfolded.set_data(allocator::malloc(unfolded.nbytes()));
+  encoder.add_temporary(unfolded);
+
+  // Number of columns of the unfolded matrix.
+  int filter_size = params.C;
+#pragma unroll
+  for (int i = 0; i < NDIM; ++i) {
+    filter_size *= params.wt_spatial_dims[i];
+  }
+
+  // Number of output positions per batch element.
+  int out_pixels = 1;
+#pragma unroll
+  for (int i = 0; i < NDIM; ++i) {
+    out_pixels *= params.out_spatial_dims[i];
+  }
+
+  int wt_spatial_size = mat_K / params.C;
+  dim3 block_dims;
+  block_dims.x = std::min(std::max(wt_spatial_size, 32), 1024);
+  // grid.y and grid.z are limited to 65535 blocks while grid.x allows
+  // 2^31 - 1, so the row count, which grows with batch size and output
+  // resolution, has to go to grid.x. The kernel position blocks (usually just
+  // one) go to grid.z instead.
+  // NOTE: the channel count in grid.y is still bound by the 65535 limit.
+  dim3 num_blocks;
+  num_blocks.x = mat_M;
+  num_blocks.y = params.C;
+  num_blocks.z = cuda::ceil_div(wt_spatial_size, block_dims.x);
+
+  encoder.set_input_array(in);
+  encoder.set_output_array(unfolded);
+  dispatch_float_types(in.dtype(), "unfold", [&](auto type_tag) {
+    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+    encoder.add_kernel_node(
+        cu::naive_unfold_nd<DataType, NDIM>,
+        num_blocks,
+        block_dims,
+        0,
+        in.data<DataType>(),
+        unfolded.data<DataType>(),
+        filter_size,
+        out_pixels,
+        params);
+  });
+
+  return unfolded;
+}
+
+// Convolution without groups as one matrix multiplication: the unfolded input
+// of shape (M, K) times the weight viewed as (K, N), written to |out|.
+template <int NDIM>
+void gemm_conv_nd(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    const array& wt,
+    array& out,
+    ConvParams<NDIM>& params,
+    Stream s) {
+  // Problem size of the gemm.
+  const int mat_N = params.O; // O
+  const int mat_M = out.size() / mat_N; // N * H_out * W_out
+  const int mat_K = wt.size() / mat_N; // C * H_wt * W_wt
+
+  // Left operand: the input unfolded to (N * H_out * W_out, C * H_wt * W_wt).
+  array lhs = unfold_inputs_nd<NDIM>(encoder, in, mat_M, mat_K, mat_N, params);
+
+  // Right operand: the buffer of the weight seen as a (C * H_wt * W_wt, O)
+  // matrix with strides (1, K). The buffer is shared, nothing is copied.
+  array rhs({mat_K, mat_N}, wt.dtype(), nullptr, {});
+  rhs.copy_shared_buffer(
+      wt,
+      {1, mat_K},
+      {false, false, /* col_contiguous */ true},
+      wt.data_size());
+
+  // There is only one matrix product, described as a batch of size one.
+  Shape batch_shape{1};
+  Strides lhs_batch_strides{0};
+  Strides rhs_batch_strides{0};
+
+  CublasGemm gemm(
+      encoder.device(),
+      in.dtype(),
+      /* a_transposed */ false,
+      /* a_rows */ mat_M,
+      /* a_cols */ mat_K,
+      /* lda */ mat_K,
+      /* b_transposed */ true,
+      /* b_rows */ mat_K,
+      /* b_cols */ mat_N,
+      /* ldb */ mat_K,
+      batch_shape.back(),
+      lhs_batch_strides.back(),
+      rhs_batch_strides.back());
+  gemm.run(
+      encoder,
+      out,
+      lhs,
+      rhs,
+      batch_shape,
+      lhs_batch_strides,
+      rhs_batch_strides);
+}
+
+// Runs the gemm based convolution without groups for 1D, 2D and 3D inputs by
+// dispatching on the number of spatial dimensions of |in|. Any other rank is
+// rejected with std::runtime_error.
+void gemm_conv(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    const array& wt,
+    array& out,
+    const std::vector<int>& strides,
+    const std::vector<int>& padding,
+    const std::vector<int>& kernel_dilation,
+    const std::vector<int>& input_dilation,
+    bool flip,
+    Stream s) {
+  // All axes except the first and the last one are spatial.
+  const int conv_ndim = in.ndim() - 2;
+  const bool supported = conv_ndim >= 1 && conv_ndim <= 3;
+  if (!supported) {
+    throw std::runtime_error(
+        fmt::format("[conv] Unsupported gemm_conv for {}D conv.", conv_ndim));
+  }
+
+  dispatch_1_2_3(conv_ndim, [&](auto ndim_constant) {
+    using Params = ConvParams<ndim_constant()>;
+    Params params(
+        in,
+        wt,
+        out,
+        strides,
+        padding,
+        kernel_dilation,
+        input_dilation,
+        /* groups */ 1,
+        flip);
+    gemm_conv_nd<ndim_constant()>(encoder, in, wt, out, params, s);
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/conv/gemm_grouped_conv.cu
+++ b/mlx/backend/cuda/conv/gemm_grouped_conv.cu
@@ -0,0 +1,231 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/conv/conv.h"
+#include "mlx/backend/cuda/gemms/cublas_gemm.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/dtype_utils.h"
+
+#include <cooperative_groups.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Unfolds the input of an NDIM-dimensional convolution into a matrix with one
+// row per (batch, output pixel) and |filter_size| = C * prod(wt_spatial_dims)
+// columns. Unlike the layout that puts the channel last, the columns are
+// ordered as (channel, kernel position), which keeps the columns of the
+// channels of one group next to each other. Every thread writes exactly one
+// element: the input value the kernel position touches, or zero when the
+// position falls into the padding or between input-dilated elements.
+//
+// Launch layout:
+//   grid.x            -> row of the unfolded matrix, [0, N * out_pixels)
+//   grid.y            -> input channel, [0, C)
+//   grid.z * block.x  -> linear kernel position, [0, prod(wt_spatial_dims))
+// The rows are mapped to grid.x because it is the only grid dimension that is
+// not capped at 65535 blocks.
+template <typename T, int NDIM>
+__global__ void naive_grouped_unfold_transpose_nd(
+    const T* in,
+    T* out,
+    int filter_size,
+    int out_pixels,
+    const __grid_constant__ ConvParams<NDIM> params) {
+  auto block = cg::this_thread_block();
+  auto tid = block.group_index();
+  auto lid = block.thread_index();
+
+  int index_batch = tid.x / out_pixels; // [0, N)
+  int index_out_spatial = tid.x % out_pixels; // [0, H_out * W_out)
+  int index_wt_spatial =
+      tid.z * block.dim_threads().x + lid.x; // [0, H_wt * W_wt)
+
+  // The block may be wider than the number of kernel positions.
+  if (index_wt_spatial >= filter_size / params.C) {
+    return;
+  }
+
+  in += tid.y; // [0, C)
+  // Compute the row offset with 64 bits, the unfolded matrix can hold more
+  // than 2^32 elements. The kernel position is added inside the loop below.
+  out += static_cast<int64_t>(tid.x) * filter_size +
+      tid.y * (filter_size / params.C);
+
+  bool valid = index_batch < params.N;
+
+  // Get the coordinates in input, peeling the linear indices from the
+  // innermost spatial dimension outwards.
+  int index_in[NDIM] = {};
+  int wt_stride = 1;
+#pragma unroll
+  for (int i = NDIM - 1; i >= 0; --i) {
+    int index_out = index_out_spatial % params.out_spatial_dims[i];
+    int index_wt = index_wt_spatial % params.wt_spatial_dims[i];
+    // The output column uses the kernel position before it gets flipped.
+    out += index_wt * wt_stride;
+
+    if (params.flip) {
+      index_wt = params.wt_spatial_dims[i] - index_wt - 1;
+    }
+
+    // Coordinate in the (virtually) input-dilated and padded input.
+    int index = index_out * params.strides[i] - params.padding[i] +
+        index_wt * params.kernel_dilation[i];
+    int index_max =
+        1 + params.input_dilation[i] * (params.in_spatial_dims[i] - 1);
+
+    // Only multiples of the input dilation map to real input elements.
+    valid &= (index >= 0) && (index < index_max) &&
+        (index % params.input_dilation[i] == 0);
+
+    index_in[i] = index / params.input_dilation[i];
+
+    index_out_spatial /= params.out_spatial_dims[i];
+    index_wt_spatial /= params.wt_spatial_dims[i];
+    wt_stride *= params.wt_spatial_dims[i];
+  }
+
+  if (valid) {
+    // The strides are 64 bit, keep the accumulated offset 64 bit as well.
+    int64_t in_offset = index_batch * params.in_strides[0];
+#pragma unroll
+    for (int i = 0; i < NDIM; ++i) {
+      in_offset += index_in[i] * params.in_strides[i + 1];
+    }
+    *out = in[in_offset];
+  } else {
+    *out = T{0};
+  }
+}
+
+} // namespace cu
+
+// Allocates the (mat_M, mat_K * groups) unfolded matrix as a temporary of
+// |encoder| and encodes the kernel that fills it from |in|. |mat_K| is the
+// per-group inner dimension of the gemm; |mat_N| is not used here.
+template <int NDIM>
+array grouped_unfold_transpose_inputs_nd(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    int mat_M,
+    int mat_K,
+    int mat_N,
+    ConvParams<NDIM>& params) {
+  array unfolded({mat_M, mat_K * params.groups}, in.dtype(), nullptr, {});
+  unfolded.set_data(allocator::malloc(unfolded.nbytes()));
+  encoder.add_temporary(unfolded);
+
+  // Number of columns of the unfolded matrix.
+  int filter_size = params.C;
+#pragma unroll
+  for (int i = 0; i < NDIM; ++i) {
+    filter_size *= params.wt_spatial_dims[i];
+  }
+
+  // Number of output positions per batch element.
+  int out_pixels = 1;
+#pragma unroll
+  for (int i = 0; i < NDIM; ++i) {
+    out_pixels *= params.out_spatial_dims[i];
+  }
+
+  int wt_spatial_size = (mat_K * params.groups) / params.C;
+  dim3 block_dims;
+  block_dims.x = std::min(std::max(wt_spatial_size, 32), 1024);
+  // grid.y and grid.z are limited to 65535 blocks while grid.x allows
+  // 2^31 - 1, so the row count, which grows with batch size and output
+  // resolution, has to go to grid.x. The kernel position blocks (usually just
+  // one) go to grid.z instead.
+  // NOTE: the channel count in grid.y is still bound by the 65535 limit.
+  dim3 num_blocks;
+  num_blocks.x = mat_M;
+  num_blocks.y = params.C;
+  num_blocks.z = cuda::ceil_div(wt_spatial_size, block_dims.x);
+
+  encoder.set_input_array(in);
+  encoder.set_output_array(unfolded);
+  dispatch_float_types(in.dtype(), "unfold", [&](auto type_tag) {
+    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+    encoder.add_kernel_node(
+        cu::naive_grouped_unfold_transpose_nd<DataType, NDIM>,
+        num_blocks,
+        block_dims,
+        0,
+        in.data<DataType>(),
+        unfolded.data<DataType>(),
+        filter_size,
+        out_pixels,
+        params);
+  });
+
+  return unfolded;
+}
+
+// Convolution with groups as one batched gemm with a batch entry per group.
+// Group g multiplies columns [g * K, (g + 1) * K) of the unfolded input with
+// rows [g * O_per_group, (g + 1) * O_per_group) of the rearranged weight and
+// writes columns [g * O_per_group, (g + 1) * O_per_group) of |out|.
+template <int NDIM>
+void gemm_grouped_conv_nd(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    const array& wt,
+    array& out,
+    ConvParams<NDIM>& params,
+    Stream s) {
+  // Get gemm shapes.
+  int C_per_group = params.C / params.groups;
+  int O_per_group = params.O / params.groups;
+  int mat_M = out.size() / params.O; // N * H_out * W_out
+  int mat_K = wt.size() / params.O; // C_per_group * H_wt * W_wt
+  int mat_N = O_per_group; // O_per_group
+
+  // Unfold input to (N * H_out * W_out, C * H_wt * W_wt) for gemm, with the
+  // columns ordered as (C, H_wt * W_wt) so that each group owns a contiguous
+  // range of mat_K columns.
+  array in_unfolded = grouped_unfold_transpose_inputs_nd<NDIM>(
+      encoder, in, mat_M, mat_K, mat_N, params);
+
+  // Reshape weight to (O, C_per_group, H_wt * W_wt) for gemm, which matches
+  // the column order of the unfolded input. The view only permutes strides,
+  // the contiguous copy materializes the new layout.
+  int wt_spatial_size = (wt.size() / wt.shape(0)) / wt.shape(-1);
+  array wt_view(
+      {params.O, C_per_group, wt_spatial_size}, wt.dtype(), nullptr, {});
+  wt_view.copy_shared_buffer(
+      wt, {wt.strides(0), 1, C_per_group}, wt.flags(), wt.size());
+  array wt_reshaped = contiguous_copy_gpu(wt_view, s);
+  // Register the copy as a temporary of the encoder, like the unfolded input
+  // and the contiguous copies made by the inline gemm_conv wrapper, so it is
+  // not released while the encoded gemm is still pending.
+  // NOTE(review): assumes add_temporary extends the lifetime - confirm.
+  encoder.add_temporary(wt_reshaped);
+
+  // Batch with size of groups. Strides are in elements: a group advances
+  // mat_K columns in the unfolded input and mat_N rows of length mat_K in the
+  // weight.
+  Shape batch_shape{params.groups};
+  Strides a_batch_strides{mat_K};
+  Strides b_batch_strides{mat_N * mat_K};
+
+  // Run matmul.
+  CublasGemm gemm(
+      encoder.device(),
+      in.dtype(),
+      false, // a_transposed
+      mat_M, // a_rows
+      mat_K, // a_cols
+      mat_K * params.groups, // lda
+      true, // b_transposed
+      mat_K, // b_rows
+      mat_N, // b_cols
+      mat_K, // ldb
+      batch_shape.back(),
+      a_batch_strides.back(),
+      b_batch_strides.back());
+  // The groups are interleaved along the last axis of the output, so every
+  // batch entry writes a (mat_M, mat_N) slice of rows that are O apart.
+  gemm.set_out(
+      out.dtype(),
+      false, // out_transposed
+      mat_M, // out_rows
+      mat_N, // out_cols
+      mat_N * params.groups, // out_ld
+      params.groups, // batch_count
+      mat_N); // batch_stride
+  gemm.run(
+      encoder,
+      out,
+      in_unfolded,
+      wt_reshaped,
+      batch_shape,
+      a_batch_strides,
+      b_batch_strides);
+}
+
+// Runs the gemm based convolution with groups for 1D, 2D and 3D inputs by
+// dispatching on the number of spatial dimensions of |in|. Any other rank is
+// rejected with std::runtime_error.
+void gemm_grouped_conv(
+    cu::CommandEncoder& encoder,
+    const array& in,
+    const array& wt,
+    array& out,
+    const std::vector<int>& strides,
+    const std::vector<int>& padding,
+    const std::vector<int>& kernel_dilation,
+    const std::vector<int>& input_dilation,
+    int groups,
+    bool flip,
+    Stream s) {
+  // All axes except the first and the last one are spatial.
+  const int conv_ndim = in.ndim() - 2;
+  const bool supported = conv_ndim >= 1 && conv_ndim <= 3;
+  if (!supported) {
+    throw std::runtime_error(
+        fmt::format("[conv] Unsupported gemm_conv for {}D conv.", conv_ndim));
+  }
+
+  dispatch_1_2_3(conv_ndim, [&](auto ndim_constant) {
+    using Params = ConvParams<ndim_constant()>;
+    Params params(
+        in,
+        wt,
+        out,
+        strides,
+        padding,
+        kernel_dilation,
+        input_dilation,
+        groups,
+        flip);
+    gemm_grouped_conv_nd<ndim_constant()>(encoder, in, wt, out, params, s);
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/copy.cu
+++ b/mlx/backend/cuda/copy.cu
@@ -15,8 +15,8 @@ void copy_gpu_inplace(
    int64_t offset_out,
    CopyType ctype,
    const Stream& s,
-    const std::optional<array>& dynamic_offset_in,
-    const std::optional<array>& dynamic_offset_out) {
+    std::optional<array> dynamic_offset_in,
+    std::optional<array> dynamic_offset_out) {
  if (out.size() == 0) {
    return;
  }
@@ -44,6 +44,16 @@ void copy_gpu_inplace(
          strides_vec[0]);
    } else {
      if (dynamic_offset_in || dynamic_offset_out) {
+        if (!dynamic_offset_in) {
+          dynamic_offset_in = array(0, int64);
+          encoder.add_temporary(*dynamic_offset_in);
+        }
+        if (!dynamic_offset_out) {
+          dynamic_offset_out = array(0, int64);
+          encoder.add_temporary(*dynamic_offset_out);
+        }
+        encoder.set_input_array(*dynamic_offset_in);
+        encoder.set_input_array(*dynamic_offset_out);
        copy_general_dynamic(
            encoder,
            ctype,
@@ -54,8 +64,8 @@ void copy_gpu_inplace(
            shape_collapsed,
            strides_vec[0],
            strides_vec[1],
-            dynamic_offset_in ? *dynamic_offset_in : array(0, int64),
-            dynamic_offset_out ? *dynamic_offset_out : array(0, int64));
+            *dynamic_offset_in,
+            *dynamic_offset_out);
      } else {
        copy_general(
            encoder,
--- a/mlx/backend/cuda/copy/copy_general.cu
+++ b/mlx/backend/cuda/copy/copy_general.cu
@@ -10,37 +10,80 @@ namespace cu {

 namespace cg = cooperative_groups;

-template <typename In, typename Out, typename IdxT, int NDIM>
+template <typename In, typename Out, typename IdxT, int NDIM, int N_READS>
 __global__ void copy_gg_nd(
    const In* in,
    Out* out,
-    IdxT size,
+    IdxT size_rest,
    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto [idx_in, idx_out] = elem_to_loc_nd<NDIM>(
-        index, shape.data(), strides_in.data(), strides_out.data());
-    out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
+  auto block = cg::this_thread_block();
+  auto grid = cg::this_grid();
+  IdxT index_rest =
+      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
+  if (index_rest >= size_rest) {
+    return;
  }
+
+  auto shape_x = shape[NDIM - 1];
+  auto in_stride_x = strides_in[NDIM - 1];
+  auto out_stride_x = strides_out[NDIM - 1];
+  IdxT index_x =
+      grid.block_index().x * block.dim_threads().x + block.thread_index().x;
+  auto [idx_in, idx_out] = elem_to_loc_nd<NDIM>(
+      index_rest * shape_x,
+      shape.data(),
+      strides_in.data(),
+      strides_out.data());
+
+  auto in_vec =
+      load_vector<N_READS>(in + idx_in, index_x, shape_x, in_stride_x, In(0));
+  AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+  for (int i = 0; i < N_READS; ++i) {
+    out_vec[i] = CastOp<In, Out>{}(in_vec[i]);
+  }
+  store_vector(out + idx_out, index_x, out_vec, shape_x, out_stride_x);
 }

-template <typename In, typename Out, typename IdxT>
+template <typename In, typename Out, typename IdxT, int N_READS>
 __global__ void copy_gg(
    const In* in,
    Out* out,
-    IdxT size,
+    IdxT size_rest,
    const __grid_constant__ Shape shape,
    const __grid_constant__ Strides strides_in,
    const __grid_constant__ Strides strides_out,
    int ndim) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto [idx_in, idx_out] = elem_to_loc(
-        index, shape.data(), strides_in.data(), strides_out.data(), ndim);
-    out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
+  auto block = cg::this_thread_block();
+  auto grid = cg::this_grid();
+  IdxT index_rest =
+      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
+  if (index_rest >= size_rest) {
+    return;
  }
+
+  auto shape_x = shape[ndim - 1];
+  auto in_stride_x = strides_in[ndim - 1];
+  auto out_stride_x = strides_out[ndim - 1];
+  IdxT index_x =
+      grid.block_index().x * block.dim_threads().x + block.thread_index().x;
+  auto [idx_in, idx_out] = elem_to_loc(
+      index_rest * shape_x,
+      shape.data(),
+      strides_in.data(),
+      strides_out.data(),
+      ndim);
+
+  auto in_vec =
+      load_vector<N_READS>(in + idx_in, index_x, shape_x, in_stride_x, In(0));
+  AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+  for (int i = 0; i < N_READS; ++i) {
+    out_vec[i] = CastOp<In, Out>{}(in_vec[i]);
+  }
+  store_vector(out + idx_out, index_x, out_vec, shape_x, out_stride_x);
 }

 } // namespace cu
@@ -69,33 +112,52 @@ void copy_general(
            size_t data_size = 1;
            for (auto& s : shape)
              data_size *= s;
+
+            int work_per_thread = 1;
+            auto dim0 = ndim > 0 ? shape.back() : 1;
+            auto rest = data_size / dim0;
+            if (dim0 >= 4) {
+              work_per_thread = 4;
+            }
+
+            dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
+            auto block_dims = get_block_dims(dim0, rest, 1);
+            uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
+            uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
+
            if (ndim <= 3) {
              dispatch_1_2_3(ndim, [&](auto ndim_constant) {
-                auto [num_blocks, block_dims] =
-                    get_launch_args(data_size, shape, out.strides(), large());
+                auto kernel =
+                    cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant(), 1>;
+                if (work_per_thread == 4) {
+                  kernel =
+                      cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant(), 4>;
+                }
                encoder.add_kernel_node(
-                    cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>,
-                    num_blocks,
+                    kernel,
+                    {num_blocks_x, num_blocks_y},
                    block_dims,
                    0,
                    in_ptr,
                    out_ptr,
-                    data_size,
+                    rest,
                    const_param<ndim_constant()>(shape),
                    const_param<ndim_constant()>(strides_in),
                    const_param<ndim_constant()>(strides_out));
              });
            } else { // ndim >= 4
-              auto [num_blocks, block_dims] =
-                  get_launch_args(data_size, shape, out.strides(), large());
+              auto kernel = cu::copy_gg<InType, OutType, IdxT, 1>;
+              if (work_per_thread == 4) {
+                kernel = cu::copy_gg<InType, OutType, IdxT, 4>;
+              }
              encoder.add_kernel_node(
-                  cu::copy_gg<InType, OutType, IdxT>,
-                  num_blocks,
+                  kernel,
+                  {num_blocks_x, num_blocks_y},
                  block_dims,
                  0,
                  in_ptr,
                  out_ptr,
-                  data_size,
+                  rest,
                  const_param(shape),
                  const_param(strides_in),
                  const_param(strides_out),
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -10,33 +10,67 @@ namespace cu {

 namespace cg = cooperative_groups;

-template <typename In, typename Out, typename IdxT, int NDIM>
+template <typename In, typename Out, typename IdxT, int NDIM, int N_READS>
 __global__ void copy_g_nd(
    const In* in,
    Out* out,
-    IdxT size,
+    IdxT size_rest,
    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
-    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    IdxT idx_in = elem_to_loc_nd<NDIM>(index, shape.data(), strides_in.data());
-    out[index] = CastOp<In, Out>{}(in[idx_in]);
+    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides) {
+  auto block = cg::this_thread_block();
+  auto grid = cg::this_grid();
+  IdxT index_rest =
+      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
+  if (index_rest >= size_rest) {
+    return;
  }
+
+  auto shape_x = shape[NDIM - 1];
+  auto stride_x = strides[NDIM - 1];
+  IdxT index_x =
+      grid.block_index().x * block.dim_threads().x + block.thread_index().x;
+  auto idx =
+      elem_to_loc_nd<NDIM>(index_rest * shape_x, shape.data(), strides.data());
+  auto in_vec =
+      load_vector<N_READS>(in + idx, index_x, shape_x, stride_x, In(0));
+  AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+  for (int i = 0; i < N_READS; ++i) {
+    out_vec[i] = CastOp<In, Out>{}(in_vec[i]);
+  }
+  store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
 }

-template <typename In, typename Out, typename IdxT>
+template <typename In, typename Out, typename IdxT, int N_READS>
 __global__ void copy_g(
    const In* in,
    Out* out,
-    IdxT size,
+    IdxT size_rest,
    const __grid_constant__ Shape shape,
-    const __grid_constant__ Strides strides_in,
+    const __grid_constant__ Strides strides,
    int ndim) {
-  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    IdxT idx_in = elem_to_loc(index, shape.data(), strides_in.data(), ndim);
-    out[index] = CastOp<In, Out>{}(in[idx_in]);
+  auto block = cg::this_thread_block();
+  auto grid = cg::this_grid();
+  IdxT index_rest =
+      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
+  if (index_rest >= size_rest) {
+    return;
  }
+
+  auto shape_x = shape[ndim - 1];
+  auto stride_x = strides[ndim - 1];
+  IdxT index_x =
+      grid.block_index().x * block.dim_threads().x + block.thread_index().x;
+  auto idx =
+      elem_to_loc(index_rest * shape_x, shape.data(), strides.data(), ndim);
+  auto in_vec =
+      load_vector<N_READS>(in + idx, index_x, shape_x, stride_x, In(0));
+  AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+  for (int i = 0; i < N_READS; ++i) {
+    out_vec[i] = CastOp<In, Out>{}(in_vec[i]);
+  }
+  store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
 }

 } // namespace cu
@@ -61,30 +95,49 @@ void copy_general_input(
            const InType* in_ptr = in.data<InType>() + offset_in;
            OutType* out_ptr = out.data<OutType>() + offset_out;
            int ndim = shape.size();
+            int work_per_thread = 1;
+            auto dim0 = ndim > 0 ? shape.back() : 1;
+            auto rest = out.size() / dim0;
+            if (dim0 >= 4) {
+              work_per_thread = 4;
+            }
+            dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
+            auto block_dims = get_block_dims(dim0, rest, 1);
+            uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
+            uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
+
            if (ndim <= 3) {
              dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                auto [num_blocks, block_dims] = get_launch_args(out, large());
+                auto kernel =
+                    cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 1>;
+                if (work_per_thread == 4) {
+                  kernel =
+                      cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 4>;
+                }
                encoder.add_kernel_node(
-                    cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>,
-                    num_blocks,
+                    kernel,
+                    {num_blocks_x, num_blocks_y},
                    block_dims,
                    0,
                    in_ptr,
                    out_ptr,
-                    out.size(),
+                    rest,
                    const_param<dims_constant()>(shape),
                    const_param<dims_constant()>(strides_in));
              });
            } else { // ndim >= 4
-              auto [num_blocks, block_dims] = get_launch_args(out, large());
+              auto kernel = cu::copy_g<InType, OutType, IdxT, 1>;
+              if (work_per_thread == 4) {
+                kernel = cu::copy_g<InType, OutType, IdxT, 4>;
+              }
              encoder.add_kernel_node(
-                  cu::copy_g<InType, OutType, IdxT>,
-                  num_blocks,
+                  kernel,
+                  {num_blocks_x, num_blocks_y},
                  block_dims,
                  0,
                  in_ptr,
                  out_ptr,
-                  out.size(),
+                  rest,
                  const_param(shape),
                  const_param(strides_in),
                  ndim);
--- a/mlx/backend/cuda/cudnn_utils.cpp
+++ b/mlx/backend/cuda/cudnn_utils.cpp
@@ -0,0 +1,275 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/cudnn_utils.h"
+#include "mlx/backend/cuda/device.h"
+
+namespace mlx::core {
+
+namespace {
+
+// Build the cuDNN tensor descriptor |id| for |x| from an explicitly supplied
+// |shape| and |strides|; the data type and pointer alignment come from |x|.
+template <typename Vec>
+inline cudnn_frontend::Tensor build_cudnn_tensor(
+    int64_t id,
+    const array& x,
+    const Vec& shape,
+    const Vec& strides) {
+  cudnn_frontend::TensorBuilder builder;
+  builder.setId(id)
+      .setDataType(dtype_to_cudnn_type(x.dtype()))
+      .setAlignment(get_alignment(x))
+      .setDim(shape.size(), shape.data())
+      .setStrides(strides.size(), strides.data());
+  return builder.build();
+}
+
+// MLX allows a singleton dim (shape[dim] == 1) to carry an arbitrary stride,
+// whereas cuDNN decides whether a tensor is contiguous by checking:
+// strides[dim] == shape[dim + 1] * strides[dim + 1]
+// A contiguous MLX array with singleton dims could therefore be mistaken for
+// a strided one by cuDNN, so the strides of singleton dims are rewritten.
+Strides normalized_strides(const array& x) {
+  Strides result = x.strides();
+  if (x.flags().row_contiguous && x.ndim() >= 2) {
+    // Go from the inner dims outwards so each fix-up sees its inner neighbour
+    // already normalized.
+    for (int dim = x.ndim() - 1; dim > 0; --dim) {
+      if (x.shape(dim - 1) == 1) {
+        result[dim - 1] = x.shape(dim) * result[dim];
+      }
+    }
+  }
+  return result;
+}
+
+// Return the shape and strides after transposing from NHWC to NCHW: the last
+// (channel) entry of each vector is moved to index 1.
+//
+// Both vectors are taken by value and returned as a tuple (shape, strides).
+auto nhwc_to_nchw(SmallVector<int64_t> shape, SmallVector<int64_t> strides) {
+  assert(shape.size() >= 3);
+  // Insert a copy of the last entry at index 1, then drop the original.
+  shape.insert(shape.begin() + 1, shape.back());
+  shape.erase(shape.end() - 1);
+  strides.insert(strides.begin() + 1, strides.back());
+  strides.erase(strides.end() - 1);
+  return std::make_tuple(std::move(shape), std::move(strides));
+}
+
+// Convenience overload: transpose the int64 shape and the cuDNN-normalized
+// strides of |x| from NHWC to NCHW.
+inline auto nhwc_to_nchw(const array& x) {
+  auto shape = convert_vector<int64_t>(x.shape());
+  return nhwc_to_nchw(std::move(shape), normalized_strides(x));
+}
+
+// Return available engines for a |op_graph|.
+//
+// Candidates come from cuDNN's heuristics (mode A) and, when |use_fallback| is
+// true, from the fallback list for |backend_type|. Configs flagged by the
+// predicate below (input down-conversion, or tensor cores for float32 while
+// TF32 is disabled) are handed to cudnn_frontend::filter as unwanted.
+// NOTE(review): this relies on filter() dropping the configs for which the
+// predicate returns true - confirm against the cudnn_frontend version in use.
+cudnn_frontend::EngineConfigList get_cudnn_engine_configs(
+    cudnnBackendDescriptorType_t backend_type,
+    Dtype dtype,
+    cudnn_frontend::OperationGraph& op_graph,
+    bool use_fallback = true) {
+  SmallVector<cudnn_frontend::GeneratorSource, 2> sources;
+  // Source 1: every config suggested by the heuristics.
+  sources.push_back([](auto& op_graph) {
+    auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
+                          .setOperationGraph(op_graph)
+                          .setHeurMode(CUDNN_HEUR_MODE_A)
+                          .build();
+    return heuristics.getEngineConfig(heuristics.getEngineConfigCount());
+  });
+  // Source 2 (optional): the fallback list. |backend_type| is captured by
+  // reference, which is fine because the sources are only used inside this
+  // function.
+  if (use_fallback) {
+    sources.push_back([&backend_type](auto& op_graph) {
+      auto fallback = cudnn_frontend::EngineFallbackListBuilder()
+                          .setOperationGraph(op_graph)
+                          .setOperation(backend_type)
+                          .build();
+      return fallback.getFallbackList();
+    });
+  }
+
+  auto configs =
+      cudnn_frontend::EngineConfigGenerator(sources.size(), sources.data())
+          .generate_engine_config(op_graph);
+
+  cudnn_frontend::EngineConfigList filtered_configs;
+  cudnn_frontend::filter(configs, filtered_configs, [dtype](auto c) {
+    // Flag engines that down-convert their inputs.
+    if (cudnn_frontend::hasNumericalNote<
+            CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(c)) {
+      return true;
+    }
+    // Flag tensor-core engines for float32 unless TF32 has been enabled.
+    if (cudnn_frontend::hasNumericalNote<CUDNN_NUMERICAL_NOTE_TENSOR_CORE>(c) &&
+        dtype == float32 && !env::enable_tf32()) {
+      return true;
+    }
+    return false;
+  });
+  return filtered_configs;
+}
+
+// Take |engine_configs| and |op_graph| and find a working execution plan
+// from them.
+//
+// The configs are tried in order and the first plan that builds is returned.
+// A config rejected with CUDNN_STATUS_NOT_SUPPORTED is skipped; any other
+// cuDNN error is rethrown. Returns std::nullopt when no config works.
+std::optional<cudnn_frontend::ExecutionPlan>
+find_cudnn_plan_from_engine_configs(
+    cudnnHandle_t handle,
+    const cudnn_frontend::EngineConfigList& engine_configs,
+    const cudnn_frontend::OperationGraph& op_graph) {
+  auto op_graph_tag = op_graph.getTag();
+  for (const auto& config : engine_configs) {
+    try {
+      return cudnn_frontend::ExecutionPlanBuilder()
+          .setHandle(handle)
+          .setEngineConfig(config, op_graph_tag)
+          .build();
+    } catch (cudnn_frontend::cudnnException& error) {
+      // Only "not supported" means "try the next config".
+      if (error.getCudnnStatus() != CUDNN_STATUS_NOT_SUPPORTED) {
+        throw;
+      }
+    }
+  }
+  return std::nullopt;
+}
+
+// Prepare workspace and args to execute plan.
+//
+// Allocates the workspace |plan| asks for, packs |num_args| (uid, pointer)
+// pairs into a variant pack, binds the cuDNN handle to the encoder's stream
+// and calls |execute|(handle, plan_desc, args_desc). Returns false when
+// |execute| does; on success the workspace is registered as a temporary of
+// |encoder|.
+template <typename F>
+bool prepare_cudnn_plan(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    int num_args,
+    const int64_t* uids,
+    void** data_ptrs,
+    F&& execute) {
+  // NOTE(review): the size is kept in an int; if getWorkspaceSize() returns a
+  // wider type, very large workspaces would be truncated - confirm.
+  int workspace_size = plan.getWorkspaceSize();
+  // A plan without workspace gets an array backed by a null buffer.
+  array workspace(
+      workspace_size > 0 ? allocator::malloc(workspace_size)
+                         : allocator::Buffer(nullptr),
+      {workspace_size},
+      uint8);
+
+  auto args = cudnn_frontend::VariantPackBuilder()
+                  .setWorkspacePointer(workspace.data<void>())
+                  .setDataPointers(num_args, data_ptrs)
+                  .setUids(num_args, uids)
+                  .build();
+
+  auto handle = encoder.device().cudnn_handle();
+  cudnnSetStream(handle, encoder.stream());
+
+  if (!execute(handle, plan.get_raw_desc(), args.get_raw_desc())) {
+    return false;
+  }
+
+  // Hand the workspace to the encoder only when the plan was encoded.
+  encoder.add_temporary(workspace);
+  return true;
+}
+
+} // namespace
+
+// Describe |x| to cuDNN as is: its shape converted to int64_t and its strides
+// passed through normalized_strides().
+cudnn_frontend::Tensor build_cudnn_tensor(int64_t id, const array& x) {
+  auto shape = convert_vector<int64_t>(x.shape());
+  return build_cudnn_tensor(id, x, shape, normalized_strides(x));
+}
+
+// Describe |x| to cuDNN with its shape and strides permuted from NHWC to
+// NCHW; only the descriptor is built here, from the permuted vectors.
+cudnn_frontend::Tensor build_cudnn_tensor_nchw(int64_t id, const array& x) {
+  auto [shape, strides] = nhwc_to_nchw(x);
+  return build_cudnn_tensor(id, x, shape, strides);
+}
+
+// Describe |x| to cuDNN as a 4D NCHW tensor. Arrays with 3 or 4 dims are
+// transposed from NHWC; arrays with 0, 1 or 2 dims are padded with singleton
+// dims. Throws std::runtime_error for more than 4 dims.
+cudnn_frontend::Tensor build_cudnn_tensor_4d_nchw(int64_t id, const array& x) {
+  const auto rank = x.ndim();
+  if (rank == 3 || rank == 4) {
+    return build_cudnn_tensor_nchw(id, x);
+  }
+  if (rank > 4) {
+    throw std::runtime_error(
+        fmt::format("Unsupported array with {} dims.", x.ndim()));
+  }
+  // Ranks 0-2 all become {n, c, 1, 1} with strides
+  // {outer, c_stride, outer, outer}; a scalar keeps every value at 1.
+  int64_t n = 1, c = 1, c_stride = 1, outer = 1;
+  if (rank == 1) {
+    c = outer = x.shape(0);
+  } else if (rank == 2) {
+    n = x.shape(0);
+    c = x.shape(1);
+    c_stride = x.strides(1);
+    outer = x.flags().row_contiguous ? c * c_stride : x.strides(0);
+  }
+  SmallVector<int64_t, 4> shape = {n, c, 1, 1};
+  SmallVector<int64_t, 4> strides = {outer, c_stride, outer, outer};
+  return build_cudnn_tensor(id, x, shape, strides);
+}
+
+// Describe a scalar of |dtype| that is passed to cuDNN by value, as a
+// 1x1x1x1 tensor with alignment 16.
+cudnn_frontend::Tensor build_cudnn_scalar_4d(int64_t id, Dtype dtype) {
+  // The same all-ones vector serves as both shape and strides.
+  SmallVector<int64_t, 4> ones = {1, 1, 1, 1};
+  return cudnn_frontend::TensorBuilder()
+      .setId(id)
+      .setByValue(true)
+      .setDataType(dtype_to_cudnn_type(dtype))
+      .setAlignment(16)
+      .setStrides(ones.size(), ones.data())
+      .setDim(ones.size(), ones.data())
+      .build();
+}
+
+// Collect the engine configs usable for |op_graph| and return the first
+// execution plan that can be built from them, or std::nullopt if none can.
+std::optional<cudnn_frontend::ExecutionPlan> find_cudnn_plan_from_op_graph(
+    cudnnHandle_t handle,
+    cudnnBackendDescriptorType_t backend_type,
+    Dtype dtype,
+    cudnn_frontend::OperationGraph& op_graph) {
+  auto candidates = get_cudnn_engine_configs(backend_type, dtype, op_graph);
+  // Only search when there is at least one candidate to try.
+  if (!candidates.empty()) {
+    return find_cudnn_plan_from_engine_configs(handle, candidates, op_graph);
+  }
+  return std::nullopt;
+}
+
+// Encode |plan| by running cudnnBackendExecute inside a capture context of
+// |encoder|. Returns false, and marks the capture to be discarded, when the
+// execution fails.
+bool encode_cudnn_plan_with_capturing(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    int num_args,
+    const int64_t* uids,
+    void** data_ptrs) {
+  return prepare_cudnn_plan(
+      encoder,
+      plan,
+      num_args,
+      uids,
+      data_ptrs,
+      [&](auto handle, auto plan, auto args) {
+        // The capture context lives until the end of this lambda.
+        auto capture = encoder.capture_context();
+        if (cudnnBackendExecute(handle, plan, args) != CUDNN_STATUS_SUCCESS) {
+          // Discard the captured graph when failed.
+          capture.discard = true;
+          return false;
+        }
+        return true;
+      });
+}
+
+#if CUDNN_VERSION >= 90500
+// Encode |plan| through cuDNN's native CUDA graph API and add the resulting
+// graph to |encoder| as a node. Returns false when cuDNN reports a failure.
+bool encode_cudnn_plan_with_graph_api(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    CudaGraph& graph,
+    int num_args,
+    const int64_t* uids,
+    void** data_ptrs) {
+  return prepare_cudnn_plan(
+      encoder,
+      plan,
+      num_args,
+      uids,
+      data_ptrs,
+      [&](auto handle, auto plan, auto args) {
+        // An existing |graph| is updated with the new arguments; an empty
+        // one is created and populated first.
+        cudnnStatus_t status;
+        if (graph) {
+          status = cudnnBackendUpdateCudaGraph(handle, plan, args, graph);
+        } else {
+          graph = CudaGraph(encoder.device());
+          status = cudnnBackendPopulateCudaGraph(handle, plan, args, graph);
+        }
+        if (status != CUDNN_STATUS_SUCCESS) {
+          return false;
+        }
+        encoder.add_graph_node(graph);
+        return true;
+      });
+}
+#endif
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/cudnn_utils.h
+++ b/mlx/backend/cuda/cudnn_utils.h
@@ -0,0 +1,164 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/array.h"
+#include "mlx/backend/cuda/device/config.h"
+#include "mlx/backend/cuda/utils.h"
+#include "mlx/dtype_utils.h"
+
+#include <cudnn_frontend.h>
+#include <cudnn_frontend_find_plan.h>
+#include <fmt/format.h>
+
+#include <algorithm>
+#include <array>
+
+namespace mlx::core {
+
+namespace cu {
+class CommandEncoder;
+}
+
+// Return the pointer alignment of |x|'s data, i.e. the largest power of two
+// that divides the address, capped at 32.
+inline uint8_t get_alignment(const array& x) {
+  const auto address = reinterpret_cast<uintptr_t>(x.data<void>());
+  uint8_t alignment = 1;
+  // Keep doubling while the next power of two still divides the address.
+  while (alignment < 32 && address % (alignment * 2) == 0) {
+    alignment *= 2;
+  }
+  return alignment;
+}
+
+// Convert the type of elements in |vec| to |T|.
+//
+// The result is a new SmallVector<T> constructed from the iterator range
+// [vec.begin(), vec.end()).
+template <typename T, typename Vec>
+inline SmallVector<T> convert_vector(const Vec& vec) {
+  return SmallVector<T>(vec.begin(), vec.end());
+}
+
+// Return a fixed-size array that can serve as a map key for |vec|, whose size
+// must not exceed MAX_NDIM (std::runtime_error is thrown otherwise).
+//
+// Compared to the const_param util from kernel_utils.cuh:
+// 1. The unused tail of the array is filled with 0.
+// 2. This util can be used in .cpp files.
+template <typename T, template <typename U> class Vec>
+inline std::array<T, MAX_NDIM> vector_key(const Vec<T>& vec) {
+  if (vec.size() > MAX_NDIM) {
+    throw std::runtime_error(
+        fmt::format("ndim can not be larger than {}.", MAX_NDIM));
+  }
+  std::array<T, MAX_NDIM> key{};
+  std::copy(vec.begin(), vec.end(), key.begin());
+  return key;
+}
+
+// Helpers used by get_data_ptrs to get pointers.
+//
+// For an array the pointer to its data is returned, with constness cast away.
+inline void* get_data_ptr(const array& arr) {
+  return const_cast<void*>(arr.data<void>());
+}
+
+// For a scalar the address of the caller's variable is returned, so the
+// variable must stay alive for as long as the pointer is used.
+template <typename T, typename = std::enable_if_t<std::is_scalar_v<T>>>
+inline void* get_data_ptr(T& scalar) {
+  return &scalar;
+}
+
+// Return an array filled with data pointers of args.
+//
+// One get_data_ptr() result per argument, in argument order. Arguments are
+// taken by lvalue reference because scalar pointers refer to the originals.
+template <typename... Args>
+inline std::array<void*, sizeof...(Args)> get_data_ptrs(Args&... args) {
+  return {get_data_ptr(args)...};
+}
+
+// Map dtype to cudnn data type.
+//
+// Throws std::runtime_error for dtypes without a mapping below.
+inline cudnnDataType_t dtype_to_cudnn_type(Dtype dtype) {
+  switch (dtype) {
+    case int8:
+      return CUDNN_DATA_INT8;
+    case int32:
+      return CUDNN_DATA_INT32;
+    case uint8:
+      return CUDNN_DATA_UINT8;
+    case float16:
+      return CUDNN_DATA_HALF;
+    case bfloat16:
+      return CUDNN_DATA_BFLOAT16;
+    case float32:
+      return CUDNN_DATA_FLOAT;
+    case float64:
+      return CUDNN_DATA_DOUBLE;
+    default:
+      // This helper is shared by every cuDNN user, so the message must not
+      // blame one specific primitive.
+      throw std::runtime_error(fmt::format(
+          "Unsupported dtype in cuDNN: {}.", dtype_to_string(dtype)));
+  }
+}
+
+// Create a tensor descriptor from |x|.
+cudnn_frontend::Tensor build_cudnn_tensor(int64_t id, const array& x);
+
+// Create a tensor descriptor from |x|, and transpose from NHWC to NCHW.
+cudnn_frontend::Tensor build_cudnn_tensor_nchw(int64_t id, const array& x);
+
+// Create a tensor descriptor from |x|, make sure it is 4D, and transpose it
+// from NHWC to NCHW.
+cudnn_frontend::Tensor build_cudnn_tensor_4d_nchw(int64_t id, const array& x);
+
+// Create a 4D scalar tensor descriptor, which is passed by value.
+cudnn_frontend::Tensor build_cudnn_scalar_4d(int64_t id, Dtype dtype);
+
+// Find a working plan for |op_graph|.
+std::optional<cudnn_frontend::ExecutionPlan> find_cudnn_plan_from_op_graph(
+    cudnnHandle_t handle,
+    cudnnBackendDescriptorType_t backend_type,
+    Dtype dtype,
+    cudnn_frontend::OperationGraph& op_graph);
+
+// Encode the plan to command buffer by capturing.
+bool encode_cudnn_plan_with_capturing(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    int num_args,
+    const int64_t* uids,
+    void** data_ptrs);
+
+#if CUDNN_VERSION >= 90500
+// Encode the plan to command buffer by using native graph api of cudnn. If the
+// |graph| is empty it will be populated, otherwise it will be updated.
+bool encode_cudnn_plan_with_graph_api(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    CudaGraph& graph,
+    int num_args,
+    const int64_t* uids,
+    void** data_ptrs);
+#endif
+
+// Helpers to make calls like encode_cudnn_plan(..., {'x', 'y', 'z'}, x, y, z).
+//
+// The i-th uid is paired with the data pointer of the i-th argument and the
+// plan is encoded by capturing.
+template <typename... Args>
+bool encode_cudnn_plan(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    std::initializer_list<int64_t> uids,
+    Args&... args) {
+  assert(uids.size() == sizeof...(args));
+  std::array<void*, sizeof...(Args)> ptrs = get_data_ptrs(args...);
+  return encode_cudnn_plan_with_capturing(
+      encoder, plan, uids.size(), uids.begin(), ptrs.data());
+}
+
+#if CUDNN_VERSION >= 90500
+// Same as above but encodes through cuDNN's native graph API, populating or
+// updating |graph|.
+template <typename... Args>
+bool encode_cudnn_plan(
+    cu::CommandEncoder& encoder,
+    cudnn_frontend::ExecutionPlan& plan,
+    CudaGraph& graph,
+    std::initializer_list<int64_t> uids,
+    Args&... args) {
+  assert(uids.size() == sizeof...(args));
+  std::array<void*, sizeof...(Args)> ptrs = get_data_ptrs(args...);
+  return encode_cudnn_plan_with_graph_api(
+      encoder, plan, graph, uids.size(), uids.begin(), ptrs.data());
+}
+#endif
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/custom_kernel.cpp
+++ b/mlx/backend/cuda/custom_kernel.cpp
@@ -0,0 +1,379 @@
+// Copyright © 2025 Apple Inc.
+
+#include <iostream>
+
+#include "mlx/backend/common/compiled.h"
+#include "mlx/backend/cuda/jit_module.h"
+#include "mlx/backend/cuda/utils.h"
+#include "mlx/backend/gpu/copy.h"
+#include "mlx/fast.h"
+#include "mlx/fast_primitives.h"
+
+#include <fmt/format.h>
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core::fast {
+
+namespace {
+
+constexpr const char* default_header = R"(
+#include "mlx/backend/cuda/device/utils.cuh"
+
+#include <cooperative_groups.h>
+
+#define inf cuda::std::numeric_limits<float>::infinity()
+
+)";
+
+// Build the kernel-name suffix that encodes |template_args|. Every argument
+// contributes one fragment: "_<n>" for an int, "_t"/"_f" for a bool and
+// "_<type string>" for a Dtype. No arguments yield an empty string.
+std::string template_arguments_hash(
+    const std::vector<std::pair<std::string, TemplateArg>>& template_args) {
+  std::string hash;
+  if (template_args.empty()) {
+    return hash;
+  }
+  hash.reserve(512);
+
+  for (const auto& entry : template_args) {
+    // Only the value takes part in the suffix, not the argument name.
+    const TemplateArg& arg = entry.second;
+    if (const int* as_int = std::get_if<int>(&arg)) {
+      hash += fmt::format("_{}", *as_int);
+    } else if (const bool* as_bool = std::get_if<bool>(&arg)) {
+      hash += *as_bool ? "_t" : "_f";
+    } else if (const Dtype* as_dtype = std::get_if<Dtype>(&arg)) {
+      hash += "_";
+      hash += get_type_string(*as_dtype);
+    }
+  }
+
+  return hash;
+}
+
+// Generate the full CUDA source of a custom kernel named |func_name|.
+//
+// The result is: the default header, the user |header|, then inside namespace
+// mlx::core::cu a __global__ function whose parameters are the inputs (each
+// optionally followed by its shape/strides/ndim), then the outputs, and whose
+// body is the template constants followed by the user |source|.
+std::string build_kernel(
+    const std::string& func_name,
+    const std::string& header,
+    const std::string& source,
+    const std::vector<std::string>& input_names,
+    const std::vector<array>& inputs,
+    const std::vector<std::string>& output_names,
+    const std::vector<Dtype>& output_dtypes,
+    const std::vector<std::pair<std::string, TemplateArg>>& template_args,
+    const std::vector<CustomKernelShapeInfo>& shape_infos) {
+  std::string kernel_source;
+  kernel_source.reserve(header.size() + source.size() + 8192);
+  kernel_source += default_header;
+  kernel_source += header;
+  kernel_source +=
+      "namespace mlx::core::cu {\n\n"
+      "namespace cg = cooperative_groups;\n\n";
+
+  kernel_source += "__global__ void ";
+  kernel_source += func_name;
+  kernel_source += "(\n";
+
+  // Add inputs: one const pointer parameter per input array.
+  for (int i = 0; i < inputs.size(); ++i) {
+    const auto& name = input_names[i];
+    const auto& arr = inputs[i];
+    kernel_source += "    const ";
+    kernel_source += dtype_to_cuda_type(arr.dtype());
+    kernel_source += "* ";
+    kernel_source += name;
+    kernel_source += ",\n";
+    // Add input shape, strides and ndim if present in the source. Inputs with
+    // ndim == 0 never get these parameters, so whoever builds the launch
+    // arguments has to skip them for such inputs as well.
+    if (arr.ndim() > 0) {
+      if (shape_infos[i].shape) {
+        kernel_source += "    const __grid_constant__ Shape ";
+        kernel_source += name;
+        kernel_source += "_shape,\n";
+      }
+      if (shape_infos[i].strides) {
+        kernel_source += "    const __grid_constant__ Strides ";
+        kernel_source += name;
+        kernel_source += "_strides,\n";
+      }
+      if (shape_infos[i].ndim) {
+        kernel_source += "    const __grid_constant__ int ";
+        kernel_source += name;
+        kernel_source += "_ndim,\n";
+      }
+    }
+  }
+
+  // Add outputs: mutable pointers; the last one closes the parameter list.
+  for (int i = 0; i < output_names.size(); ++i) {
+    const auto& name = output_names[i];
+    const auto& dtype = output_dtypes[i];
+    kernel_source += "    ";
+    kernel_source += dtype_to_cuda_type(dtype);
+    kernel_source += "* ";
+    kernel_source += name;
+    if (i < output_names.size() - 1) {
+      kernel_source += ",\n";
+    } else {
+      kernel_source += ") {\n";
+    }
+  }
+
+  // Set compile time constants: ints and bools become constexpr variables,
+  // anything else is treated as a Dtype and becomes a type alias.
+  if (!template_args.empty()) {
+    for (const auto& [name, arg] : template_args) {
+      if (std::holds_alternative<int>(arg)) {
+        kernel_source +=
+            fmt::format("  constexpr int {} = {};\n", name, std::get<int>(arg));
+      } else if (std::holds_alternative<bool>(arg)) {
+        kernel_source += fmt::format(
+            "  constexpr bool {} = {};\n", name, std::get<bool>(arg));
+      } else {
+        kernel_source += fmt::format(
+            "  using {} = {};\n",
+            name,
+            dtype_to_cuda_type(std::get<Dtype>(arg)));
+      }
+    }
+    kernel_source += "\n";
+  }
+
+  // The user source is the function body; close the function and namespace.
+  kernel_source += source;
+  kernel_source += "\n}\n\n} // namespace mlx::core::cu\n";
+
+  return kernel_source;
+}
+
+} // namespace
+
+// Create a callable that builds and schedules a JIT-compiled CUDA kernel.
+//
+// |source| is the kernel body and |header| is prepended to the generated
+// file. Throws std::invalid_argument if |output_names| is empty. The returned
+// function validates its arguments, generates the kernel source and returns
+// the output arrays of a CustomKernel primitive.
+CustomKernelFunction cuda_kernel(
+    const std::string& name,
+    const std::vector<std::string>& input_names,
+    const std::vector<std::string>& output_names,
+    const std::string& source,
+    const std::string& header,
+    bool ensure_row_contiguous,
+    int shared_memory) {
+  if (output_names.empty()) {
+    throw std::invalid_argument(
+        "[custom_kernel] Must specify at least one output.");
+  }
+
+  // Record, per input, whether the source text mentions <name>_shape,
+  // <name>_strides or <name>_ndim (a plain substring search).
+  std::vector<CustomKernelShapeInfo> shape_infos;
+  for (auto& n : input_names) {
+    CustomKernelShapeInfo shape_info;
+    shape_info.shape = source.find(n + "_shape") != std::string::npos;
+    shape_info.strides = source.find(n + "_strides") != std::string::npos;
+    shape_info.ndim = source.find(n + "_ndim") != std::string::npos;
+    shape_infos.push_back(shape_info);
+  }
+
+  // Everything is captured by copy so the callable can outlive this call.
+  return [=, shape_infos = std::move(shape_infos)](
+             const std::vector<array>& inputs,
+             const std::vector<Shape>& output_shapes,
+             const std::vector<Dtype>& output_dtypes,
+             std::tuple<int, int, int> grid,
+             std::tuple<int, int, int> threadgroup,
+             const std::vector<std::pair<std::string, TemplateArg>>&
+                 template_args = {},
+             std::optional<float> init_value = std::nullopt,
+             bool verbose = false,
+             StreamOrDevice s_ = {}) {
+    // The argument counts must match the names given at creation time.
+    if (inputs.size() != input_names.size()) {
+      std::ostringstream msg;
+      msg << "[custom_kernel] Expected `inputs` to have size "
+          << input_names.size() << " but got size " << inputs.size() << "."
+          << std::endl;
+      throw std::invalid_argument(msg.str());
+    }
+    if (output_shapes.size() != output_names.size()) {
+      std::ostringstream msg;
+      msg << "[custom_kernel] Expected `output_shapes` to have size "
+          << output_names.size() << " but got size " << output_shapes.size()
+          << "." << std::endl;
+      throw std::invalid_argument(msg.str());
+    }
+    if (output_dtypes.size() != output_names.size()) {
+      std::ostringstream msg;
+      msg << "[custom_kernel] Expected `output_dtypes` to have size "
+          << output_names.size() << " but got size " << output_dtypes.size()
+          << "." << std::endl;
+      throw std::invalid_argument(msg.str());
+    }
+
+    auto s = to_stream(s_);
+    if (s.device != Device::gpu) {
+      throw std::invalid_argument("[custom_kernel] Only supports the GPU.");
+    }
+
+    // The template arguments are encoded in the kernel name.
+    std::string kernel_name =
+        "custom_kernel_" + name + template_arguments_hash(template_args);
+    std::string kernel_source = build_kernel(
+        kernel_name,
+        header,
+        source,
+        input_names,
+        inputs,
+        output_names,
+        output_dtypes,
+        template_args,
+        shape_infos);
+
+    if (verbose) {
+      std::cout << "Generated source code for `" << kernel_name
+                << "`:" << std::endl
+                << "```" << std::endl
+                << kernel_source << std::endl
+                << "```" << std::endl;
+    }
+
+    // NOTE(review): output_shapes, output_dtypes and inputs are const
+    // references, so the std::move calls on them below result in copies.
+    return array::make_arrays(
+        std::move(output_shapes),
+        std::move(output_dtypes),
+        std::make_shared<CustomKernel>(
+            s,
+            std::move(kernel_name),
+            std::move(kernel_source),
+            grid,
+            threadgroup,
+            shape_infos,
+            ensure_row_contiguous,
+            init_value,
+            std::vector<ScalarArg>{}, // no scalar arguments for JIT kernels
+            false, // not precompiled
+            shared_memory),
+        std::move(inputs));
+  };
+}
+
+// Schedule an already compiled kernel |name| contained in |compiled_source|
+// and return its output arrays.
+//
+// No shape, strides or ndim arguments are passed for the inputs; |scalars|
+// are forwarded to the CustomKernel primitive as extra kernel arguments.
+// NOTE(review): unlike cuda_kernel() there is no check here that the stream
+// is on the GPU - confirm whether that is intended.
+std::vector<array> precompiled_cuda_kernel(
+    const std::string& name,
+    const std::string& compiled_source,
+    const std::vector<array>& inputs,
+    const std::vector<Shape>& output_shapes,
+    const std::vector<Dtype>& output_dtypes,
+    const std::vector<ScalarArg>& scalars,
+    std::tuple<int, int, int> grid,
+    std::tuple<int, int, int> threadgroup,
+    int shared_memory,
+    std::optional<float> init_value,
+    bool ensure_row_contiguous,
+    StreamOrDevice s) {
+  // One all-false entry per input.
+  std::vector<CustomKernelShapeInfo> shape_infos(
+      inputs.size(), CustomKernelShapeInfo{false, false, false});
+  return array::make_arrays(
+      output_shapes,
+      output_dtypes,
+      std::make_shared<CustomKernel>(
+          to_stream(s),
+          name,
+          compiled_source,
+          grid,
+          threadgroup,
+          shape_infos,
+          ensure_row_contiguous,
+          init_value,
+          scalars,
+          true, // precompiled
+          shared_memory),
+      inputs);
+}
+
+// Enqueue the custom kernel on this primitive's stream: set up the outputs,
+// make the inputs row contiguous when requested, load the JIT module, build
+// the argument list and add the kernel node to the command encoder.
+void CustomKernel::eval_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  nvtx3::scoped_range r("CustomKernel::eval_gpu");
+  auto& s = stream();
+
+  // Arrays that only have to live until the kernel has run.
+  std::vector<array> copies;
+
+  // Allocate the outputs, filling them with init_value_ when one was given.
+  for (auto& out : outputs) {
+    if (init_value_) {
+      copies.emplace_back(init_value_.value(), out.dtype());
+      fill_gpu(copies.back(), out, s);
+    } else {
+      out.set_data(allocator::malloc(out.nbytes()));
+    }
+  }
+
+  // Copy inputs that are not row contiguous if the kernel requires it.
+  auto check_input = [&copies, &s, this](const array& x) -> const array {
+    if (!ensure_row_contiguous_ || x.flags().row_contiguous) {
+      return x;
+    }
+    copies.push_back(array(x.shape(), x.dtype(), nullptr, {}));
+    copy_gpu(x, copies.back(), CopyType::General, s);
+    return copies.back();
+  };
+  std::vector<array> checked_inputs;
+  for (const array& in : inputs) {
+    checked_inputs.push_back(check_input(in));
+  }
+
+  // Compile the custom kernel. Generated kernels are qualified with the
+  // mlx::core::cu namespace, precompiled ones are looked up by plain name.
+  std::string kernel_name =
+      (is_precompiled_) ? name_ : "mlx::core::cu::" + name_;
+  cu::JitModule& mod = cu::get_jit_module(
+      s.device,
+      name_,
+      [&]() {
+        return std::make_tuple(
+            is_precompiled_, source_, std::vector{kernel_name});
+      },
+      false);
+
+  // Make the arguments in the order of the kernel signature.
+  cu::KernelArgs args;
+  for (int i = 0; i < checked_inputs.size(); i++) {
+    const array& in = checked_inputs[i];
+    auto& shape_info = shape_infos_[i];
+    args.append(in);
+    // build_kernel() declares the shape/strides/ndim parameters only for
+    // inputs with ndim > 0. Passing them for a scalar input would shift every
+    // following argument relative to the generated signature.
+    const bool has_dims = in.ndim() > 0;
+    if (has_dims && shape_info.shape) {
+      args.append_ndim(in.shape());
+    }
+    if (has_dims && shape_info.strides) {
+      args.append_ndim(in.strides());
+    }
+    if (has_dims && shape_info.ndim) {
+      args.append<int32_t>(in.ndim());
+    }
+  }
+  for (auto& out : outputs) {
+    args.append(out);
+  }
+  // Scalars come last; the loop variable must not shadow the stream |s|.
+  for (auto& scalar : scalar_arguments_) {
+    if (std::holds_alternative<bool>(scalar)) {
+      args.append(std::get<bool>(scalar));
+    } else if (std::holds_alternative<int>(scalar)) {
+      args.append(std::get<int>(scalar));
+    } else if (std::holds_alternative<float>(scalar)) {
+      args.append(std::get<float>(scalar));
+    }
+  }
+
+  // Make the grid: grid_ counts threads, so round up to whole blocks and
+  // never make a block larger than the grid itself.
+  const auto [tx, ty, tz] = threadgroup_;
+  const auto [gx, gy, gz] = grid_;
+  dim3 block(std::min(tx, gx), std::min(ty, gy), std::min(tz, gz));
+  dim3 grid((gx + tx - 1) / tx, (gy + ty - 1) / ty, (gz + tz - 1) / tz);
+
+  // Call the kernel
+  auto& encoder = cu::get_command_encoder(s);
+  for (const auto& in : checked_inputs) {
+    encoder.set_input_array(in);
+  }
+  for (const auto& out : outputs) {
+    encoder.set_output_array(out);
+  }
+  for (const auto& t : copies) {
+    encoder.add_temporary(t);
+  }
+  auto kernel =
+      mod.get_kernel(kernel_name, [smem = shared_memory_](CUfunction kernel) {
+        // Requests above 48000 bytes raise the kernel's dynamic shared memory
+        // limit first; a rejected request is reported instead of ignored.
+        if (smem > 48000) {
+          CHECK_CUDA_ERROR(cuFuncSetAttribute(
+              kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
+        }
+      });
+  encoder.add_kernel_node(kernel, grid, block, shared_memory_, args.args());
+}
+
+} // namespace mlx::core::fast
--- a/mlx/backend/cuda/device.cpp
+++ b/mlx/backend/cuda/device.cpp
@@ -14,10 +14,6 @@ namespace mlx::core::cu {

 namespace {

-// Can be tuned with MLX_MAX_OPS_PER_BUFFER
-// This should be less than 255
-constexpr int default_max_nodes_per_graph = 20;
-
 #define CHECK_CUDNN_ERROR(cmd) check_cudnn_error(#cmd, (cmd))

 void check_cudnn_error(const char* name, cudnnStatus_t err) {
@@ -27,11 +23,11 @@ void check_cudnn_error(const char* name, cudnnStatus_t err) {
  }
 }

-int cuda_graph_cache_size() {
-  static int cache_size = []() {
-    return env::get_var("MLX_CUDA_GRAPH_CACHE_SIZE", 100);
+// Whether work is recorded into CUDA graphs. The setting is looked up under
+// the name MLX_USE_CUDA_GRAPHS (default true) on the first call and then
+// cached for the lifetime of the process.
+bool use_cuda_graphs() {
+  static bool use_graphs = []() {
+    return env::get_var("MLX_USE_CUDA_GRAPHS", true);
  }();
-  return cache_size;
+  return use_graphs;
 }

 } // namespace
@@ -68,8 +64,8 @@ Device::~Device() {

 void Device::make_current() {
  // We need to set/get current CUDA device very frequently, cache it to reduce
-  // actual calls of CUDA APIs. This function assumes single-thread in host.
-  static int current = 0;
+  // actual calls of CUDA APIs.
+  static thread_local int current = 0;
  if (current != device_) {
    CHECK_CUDA_ERROR(cudaSetDevice(device_));
    current = device_;
@@ -86,14 +82,20 @@ CommandEncoder& Device::get_command_encoder(Stream s) {

 CommandEncoder::CaptureContext::CaptureContext(CommandEncoder& enc) : enc(enc) {
  enc.device().make_current();
+  // With CUDA graphs disabled nothing is captured; work issued while this
+  // context is alive goes to the stream directly.
+  if (!use_cuda_graphs()) {
+    return;
+  }
  CHECK_CUDA_ERROR(
      cudaStreamBeginCapture(enc.stream(), cudaStreamCaptureModeGlobal));
 }

 CommandEncoder::CaptureContext::~CaptureContext() {
-  CHECK_CUDA_ERROR(cudaStreamEndCapture(enc.stream(), &graph));
-  std::unique_ptr<cudaGraph_t, void (*)(cudaGraph_t*)> graph_freer(
-      &graph, [](cudaGraph_t* p) { CHECK_CUDA_ERROR(cudaGraphDestroy(*p)); });
+  if (!use_cuda_graphs()) {
+    enc.node_count_++;
+    return;
+  }
+
+  graph.end_capture(enc.stream());
  if (discard) {
    return;
  }
@@ -107,6 +109,9 @@ CommandEncoder::ConcurrentContext::ConcurrentContext(CommandEncoder& enc)

 CommandEncoder::ConcurrentContext::~ConcurrentContext() {
  enc.in_concurrent_ = false;
+  if (!use_cuda_graphs()) {
+    return;
+  }

  // Use an empty graph node for synchronization
  CommandEncoder::GraphNode empty{NULL, 'E', std::to_string(enc.node_count_++)};
@@ -185,37 +190,46 @@ void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) {
 }

 CommandEncoder::CommandEncoder(Device& d)
-    : device_(d), stream_(d), graph_cache_(cuda_graph_cache_size()) {
-  CHECK_CUDA_ERROR(cudaGraphCreate(&graph_, 0));
-}
+    // The stream, graph and worker are all constructed from the device.
+    // NOTE(review): the cache presumably reads its capacity from the named
+    // environment variable and falls back to 400 - confirm in the cache type.
+    : device_(d),
+      stream_(d),
+      graph_(d),
+      worker_(d),
+      graph_cache_("MLX_CUDA_GRAPH_CACHE_SIZE", /* default_capacity */ 400) {}

 void CommandEncoder::add_completed_handler(std::function<void()> task) {
  worker_.add_task(std::move(task));
 }

 void CommandEncoder::set_input_array(const array& arr) {
+  // Dependencies are only tracked when CUDA graphs are in use.
+  if (!use_cuda_graphs()) {
+    return;
+  }
  auto id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
  active_deps_.push_back(id);
 }

 void CommandEncoder::set_output_array(const array& arr) {
+  // Dependencies are only tracked when CUDA graphs are in use.
+  if (!use_cuda_graphs()) {
+    return;
+  }
+
  auto id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
  active_deps_.push_back(id);
  active_outputs_.push_back(id);
 }

-void CommandEncoder::maybe_commit() {
-  if (node_count_ >= env::max_ops_per_buffer(default_max_nodes_per_graph)) {
-    commit();
-  }
-}
-
 void CommandEncoder::add_kernel_node(
    void* func,
    dim3 grid_dim,
    dim3 block_dim,
    uint32_t smem_bytes,
    void** params) {
+  if (!use_cuda_graphs()) {
+    node_count_++;
+    CHECK_CUDA_ERROR(cudaLaunchKernel(
+        func, grid_dim, block_dim, params, smem_bytes, stream()));
+    return;
+  }
  cudaKernelNodeParams kernel_params = {0};
  kernel_params.func = func;
  kernel_params.gridDim = grid_dim;
@@ -231,6 +245,23 @@ void CommandEncoder::add_kernel_node(
    dim3 block_dim,
    uint32_t smem_bytes,
    void** params) {
+  if (!use_cuda_graphs()) {
+    node_count_++;
+    CHECK_CUDA_ERROR(cuLaunchKernel(
+        func,
+        grid_dim.x,
+        grid_dim.y,
+        grid_dim.z,
+        block_dim.x,
+        block_dim.y,
+        block_dim.z,
+        smem_bytes,
+        stream(),
+        params,
+        nullptr));
+    return;
+  }
+
  CUDA_KERNEL_NODE_PARAMS kernel_params = {0};
  kernel_params.func = func;
  kernel_params.gridDimX = grid_dim.x;
@@ -257,20 +288,38 @@ void CommandEncoder::add_kernel_node(const CUDA_KERNEL_NODE_PARAMS& params) {
 }

 void CommandEncoder::add_graph_node(cudaGraph_t child) {
+  if (!use_cuda_graphs()) {
+    node_count_++;
+    CudaGraphExec graph_exec;
+    graph_exec.instantiate(child);
+    device_.make_current();
+    CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream()));
+    return;
+  }
  cudaGraphNode_t node;
  CHECK_CUDA_ERROR(cudaGraphAddChildGraphNode(&node, graph_, NULL, 0, child));
  insert_graph_dependencies(GraphNode{node, 'G'});
 }

+int CommandEncoder::get_num_ops() {
+  return node_count_;
+}
+
 void CommandEncoder::commit() {
  nvtx3::scoped_range r("CommandEncoder::commit");
  if (!temporaries_.empty()) {
    add_completed_handler([temporaries = std::move(temporaries_)]() {});
  }
-  if (node_count_ > 0) {
+  if (use_cuda_graphs() && node_count_ > 0) {
    if (!from_nodes_.empty()) {
      CHECK_CUDA_ERROR(cudaGraphAddDependencies(
-          graph_, from_nodes_.data(), to_nodes_.data(), from_nodes_.size()));
+          graph_,
+          from_nodes_.data(),
+          to_nodes_.data(),
+#if CUDART_VERSION >= 13000
+          nullptr, // edgeData
+#endif // CUDART_VERSION >= 13000
+          from_nodes_.size()));
    }

    graph_key_ += ".";
@@ -304,19 +353,18 @@ void CommandEncoder::commit() {
    CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));

    // Reset state
-    node_count_ = 0;
    graph_node_count_ = 0;
    empty_node_count_ = 0;
    from_nodes_.clear();
    to_nodes_.clear();
    graph_key_.clear();
    node_map_.clear();
-    CHECK_CUDA_ERROR(cudaGraphDestroy(graph_));
-    CHECK_CUDA_ERROR(cudaGraphCreate(&graph_, 0));
+    graph_ = CudaGraph(device_);
  }

  // Put completion handlers in a batch.
  worker_.commit(stream_);
+  node_count_ = 0;
 }

 void CommandEncoder::synchronize() {
--- a/mlx/backend/cuda/device.h
+++ b/mlx/backend/cuda/device.h
@@ -21,7 +21,7 @@ class CommandEncoder {
  struct CaptureContext {
    CaptureContext(CommandEncoder& enc);
    ~CaptureContext();
-    cudaGraph_t graph;
+    CudaGraph graph;
    CommandEncoder& enc;
    bool discard{false};
  };
@@ -76,9 +76,6 @@ class CommandEncoder {
      uint32_t smem_bytes,
      void** params);

-  // Low-level graph helpers.
-  void add_kernel_node(const cudaKernelNodeParams& params);
-  void add_kernel_node(const CUDA_KERNEL_NODE_PARAMS& params);
  void add_graph_node(cudaGraph_t child);

  void add_temporary(const array& arr) {
@@ -86,7 +83,7 @@ class CommandEncoder {
  }

  void add_completed_handler(std::function<void()> task);
-  void maybe_commit();
+  int get_num_ops();
  void commit();

  Device& device() {
@@ -101,6 +98,9 @@ class CommandEncoder {
  void synchronize();

 private:
+  void add_kernel_node(const cudaKernelNodeParams& params);
+  void add_kernel_node(const CUDA_KERNEL_NODE_PARAMS& params);
+
  struct GraphNode {
    cudaGraphNode_t node;
    // K = kernel
@@ -115,7 +115,7 @@ class CommandEncoder {

  Device& device_;
  CudaStream stream_;
-  cudaGraph_t graph_;
+  CudaGraph graph_;
  Worker worker_;
  char node_count_{0};
  char graph_node_count_{0};
@@ -140,7 +140,7 @@ class Device {
  Device(const Device&) = delete;
  Device& operator=(const Device&) = delete;

-  // Make this device the current cuda device, required by some cuda calls.
+  // Make this device the current cuda device, this method is thread-safe.
  void make_current();

  CommandEncoder& get_command_encoder(Stream s);
--- a/mlx/backend/cuda/device/binary_ops.cuh
+++ b/mlx/backend/cuda/device/binary_ops.cuh
@@ -204,6 +204,12 @@ struct Power {
  __device__ T operator()(T base, T exp) {
    if constexpr (cuda::std::is_integral_v<T>) {
      T res = 1;
+      // Raising an integer to a negative power is undefined
+      if constexpr (cuda::std::is_signed_v<T>) {
+        if (exp < 0) {
+          return 0;
+        }
+      }
      while (exp) {
        if (exp & 1) {
          res *= base;
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Ronan Collobert	24828b1b2f	CMakeLists.txt update	2025-10-31 16:55:04 -07:00
Ronan Collobert	9f649b5658	WIP (python)	2025-10-31 16:24:51 -07:00
Ronan Collobert	18aa921388	WIP	2025-10-31 16:24:35 -07:00
Ronan Collobert	8d13a0bc6b	WIP (metal)	2025-10-31 16:24:21 -07:00
Ronan Collobert	ac75c87fd7	WIP (cpu)	2025-10-31 16:24:09 -07:00
Ronan Collobert	7107802e09	WIP (examples)	2025-10-31 16:23:51 -07:00
Ronan Collobert	c5913131cf	WIP (distributed)	2025-10-31 13:32:56 -07:00
Ronan Collobert	19ab7911f6	WIP (cuda)	2025-10-31 13:32:43 -07:00
Ronan Collobert	4a1b1796b7	WIP (io)	2025-10-31 13:20:47 -07:00
Ronan Collobert	b48d298205	WIP (distributed)	2025-10-31 13:20:09 -07:00
Ronan Collobert	8277e71ea9	WIP (gpu)	2025-10-31 13:19:54 -07:00
Ronan Collobert	b0d985416a	fix arg_reduce	2025-10-31 13:13:15 -07:00
Ronan Collobert	8d10f3ec75	WIP (metal)	2025-10-31 11:47:03 -07:00
Ronan Collobert	6343622c67	fix small vector indexing checks	2025-10-31 11:46:36 -07:00
Ronan Collobert	979abf462b	WIP (metal)	2025-10-31 09:43:29 -07:00
Ronan Collobert	981d2fdaf0	WIP (cpu)	2025-10-31 09:40:50 -07:00
Ronan Collobert	5a306d3495	WIP (common)	2025-10-31 09:40:13 -07:00
Ronan Collobert	5baa361779	WIP (tests)	2025-10-31 09:39:38 -07:00
Ronan Collobert	1bac0db7e3	WIP	2025-10-30 16:25:36 -07:00
Ronan Collobert	a1212b4e44	WIP (distributed)	2025-10-30 16:25:11 -07:00
Ronan Collobert	45a8b226af	WIP (cpu)	2025-10-30 16:24:51 -07:00
Ronan Collobert	76ef1e98f3	WIP (common)	2025-10-30 16:18:59 -07:00
Ronan Collobert	63d91557e0	fix FFT (PocketFFT requires size_t for axis)	2025-10-29 17:05:48 -07:00
Ronan Collobert	310e501e6a	WIP (cpu)	2025-10-29 16:52:25 -07:00
Ronan Collobert	cacc3ab7fd	WIP (common)	2025-10-29 16:51:42 -07:00
Ronan Collobert	53525cba23	WIP	2025-10-29 16:51:05 -07:00
Ronan Collobert	3d67b717a0	the cpu simd case	2025-10-29 16:43:18 -07:00
Ronan Collobert	953b2f5be2	WIP	2025-10-29 16:11:32 -07:00
Ronan Collobert	26f7155537	SmallVector: keep sizes small (int)	2025-10-29 16:06:10 -07:00
Ronan Collobert	66fcb9fe94	array: use int or int64_t instead of size_t	2025-10-29 16:04:04 -07:00
Awni Hannun	d1e06117e8	bump python (#2694 )	2025-10-27 11:34:31 -07:00
Awni Hannun	539d8322d1	add median op (#2705 )	2025-10-27 11:33:42 -07:00
Awni Hannun	c4767d110f	fix addmm cpu (#2699 )	2025-10-27 11:33:32 -07:00
David Koski	895217f25b	optionally load metallib from framework (#2702 ) * optionally load metallib from framework * pre-commit * adjust logic	2025-10-27 07:52:03 -07:00
Manuel Villanueva	0cfeeb60ca	Einsum error msg improvement (#2690 ) * Improved error message for Einsum * Modifications via pre-commit * format * nits --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-10-27 06:31:47 -07:00
Ronan Collobert	8f8af61a37	fix warnings showing up with -Wall (#2692 )	2025-10-24 11:43:35 -07:00
Manuel Villanueva	233384161e	Improved mx.split() docs (#2689 ) * Improved mx.split() documentation * Fix typo in docstring for array split function * add example --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-10-24 09:48:41 -07:00
Awni Hannun	5bcf3a6794	format	2025-10-22 16:08:47 -07:00
wickedcoder	7707196297	Merge commit from fork * add length validation to the header * fix accessing out of bound index with .at()	2025-10-22 15:31:25 -07:00
wickedcoder	7e3471c987	Merge commit from fork * add tensor->weights_data validation * add null pointer check for tensor	2025-10-22 15:31:03 -07:00
Awni Hannun	9f0ba3ddf1	patch bump (#2680 )	2025-10-17 12:12:07 -07:00
Awni Hannun	4bce5f9b2d	suppress gcc 10.1 warnings (#2679 ) * suppress gcc 10.1 warnings * suppress gcc 10.1 warnings	2025-10-17 12:09:21 -07:00
Anastasiia Filippova	e9eab527eb	Nccl timeout (#2673 ) * print the error & delete nccl group * timeout for nccl binding * typo * revert error * fixed a typo	2025-10-14 12:29:54 -07:00
Awni Hannun	36ca62dba8	remove unused unary file (#2672 )	2025-10-13 19:36:26 -07:00
Manuel Villanueva	9cbb1b0148	Modified sort behavior when running CPU or Metal to match NumPy/JAX (#2667 ) * Modified sort behavior when running CPU or Metal to match NumPy/JAX sorting behavior. * Modified sort behavior when running CPU or Metal to match NumPy/JAX * nits --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-10-13 14:36:45 -07:00
Fabrizio Milo	9bfc476d72	Normalize README bullet formatting (#2671 )	2025-10-13 12:13:30 -07:00
Awni Hannun	25e2356316	speed up scalars (#2669 )	2025-10-13 12:10:15 -07:00
Awni Hannun	226a1d24e0	Debug cuda conv (#2662 ) * use t4 * use t4	2025-10-10 16:12:47 -07:00
Awni Hannun	630350ad3e	Precise sigmoid (#2659 ) * bump patch * Sigmoid matches PyTorch and is more precise on tails	2025-10-10 10:05:23 -07:00
Awni Hannun	380aeb58ae	enable admm low-precision cpu (#2661 )	2025-10-10 09:50:54 -07:00
Awni Hannun	f37389d100	bump patch (#2658 )	2025-10-10 08:36:41 -07:00
Awni Hannun	e89e8b4272	Export with callback (#2612 ) * export with callback * export with callback * Add types, fix kwarg ordering bug + test * cleanup, test, fix * typos	2025-10-08 19:24:33 -07:00
AN Long	85a8824a8c	Fix cumulative operations when axis=None (#2653 )	2025-10-08 15:25:38 -07:00
Awni Hannun	f5d4397e5c	Fix fast synch when fence is waited before a command buffer is created (#2657 )	2025-10-08 11:23:46 -07:00
Awni Hannun	343e33b6d5	fix all_gather vjp (#2654 )	2025-10-07 06:05:23 -07:00
Angelos Katharopoulos	0073096dd1	Split name into directories for cuda jit (#2656 )	2025-10-07 01:52:58 -07:00
Angelos Katharopoulos	e3d004fed9	Fix and refactor row-reduce (#2650 )	2025-10-07 01:51:08 -07:00
Awni Hannun	a393435d28	Speed up compile for node with many parents (#2649 )	2025-10-03 19:30:36 -07:00
Awni Hannun	a7a94b29d7	Fix compile when outputs change (#2648 )	2025-10-03 08:40:57 -07:00
Daniel Yeh	22a5da76c8	Faster complex matmul (#2571 )	2025-10-02 23:33:15 -07:00
Andrey Portnoy	287c63a093	Configure CMake to export compile_commands.json (#2645 ) This helps enable LSP for code navigation using clangd.	2025-10-02 15:40:32 -07:00
Awni Hannun	1c9ae1eaa1	cuda fix flaky test (#2646 )	2025-10-02 15:40:04 -07:00
Angelos Katharopoulos	c2c3e0b0a2	[CUDA] Add a small column specialization to reduce (#2642 )	2025-10-02 14:41:05 -07:00
Awni Hannun	b0cc71ae71	Faster triu, tril, where with scalar (#2644 )	2025-10-02 12:21:27 -07:00
Awni Hannun	e88f2d4a8e	fix cross entropy axis param (#2641 ) * fix cross entropy axis param * faster grad clipping	2025-10-01 16:49:55 -07:00
Angelos Katharopoulos	9cee557423	Fix status message (#2638 )	2025-10-01 16:43:45 -07:00
Awni Hannun	bbf1423953	wait for tasks in cuda (#2636 )	2025-09-30 16:08:46 -07:00
Angelos Katharopoulos	eb24267b56	Compile now can attach arbitrary data to an entry (#2634 )	2025-09-30 13:33:27 -07:00
Awni Hannun	dc371ae7a5	fix for max block dim (#2631 )	2025-09-29 08:59:25 -07:00
AN Long	e76a8dd5c5	Fix incorrect path and typos (#2630 )	2025-09-28 06:03:04 -07:00
Cheng	b466dea982	[CUDA] Make CudaEvent work with multi-device (#2614 ) * Set current device when creating cuda event * Separate cuda events by device * Avoid race condition in pool	2025-09-27 11:27:17 +09:00
Angelos Katharopoulos	7a6adda1e6	Bump the version (#2627 )	2025-09-26 15:15:28 -07:00
Angelos Katharopoulos	1a9f820af6	Compiled should not end in broadcast (#2622 )	2025-09-26 13:36:09 -07:00
Awni Hannun	d4f4ff3c5e	Allow None input to compiled functions (#2621 ) * Allow None input to compiled functions * Allow None input to compiled functions	2025-09-25 08:42:23 -07:00
Jagrit Digani	7c7e48dbd1	New tuning for small K gemv (#2620 ) * New tuning for small K gemv	2025-09-23 12:28:35 -07:00
Daniel Yeh	fbbf3b9b3e	Support pickling array for bfloat16 (#2586 ) * add bfloat16 pickling * Improvements * improve --------- Co-authored-by: Chen-Chen Yeh <ge96noj@mytum.de>	2025-09-22 20:12:15 -07:00
Daniel Yeh	bf01ad9367	fix (#2613 ) Co-authored-by: Chen-Chen Yeh <ge96noj@mytum.de>	2025-09-22 20:12:04 -07:00
Cheng	ae438d05fa	[CUDA] Recycle CUDA events (#2604 ) * Make CudaEvent a CudaHandle * Add caching for CudaEvent * Make sure cuda events are destroyed at last * Fix headers * SharedEvent => AtomicEvent * RawCudaEvent => CudaEventHandle, CudaEventWrapper => CopyableCudaEvent * Remove unneeded asserts	2025-09-23 10:42:03 +09:00
Awni Hannun	711a645807	avoid producing NaN in attention (#2608 )	2025-09-22 13:10:43 -07:00
Josh Bleecher Snyder	aa9d44b3d4	implement Convolution::output_shape (#2601 ) - pull conv_out_shape out for re-use - add Conv::output_shape - add e2e python tests confirming shapeless=True support and correctness Updates #2599	2025-09-22 10:09:45 -07:00
Awni Hannun	ec2ab42888	Lower sorted QMM gather threshold (#2609 )	2025-09-19 18:22:55 -07:00
Cheng	787c0d90cd	Detect cache thrashing in LRUCache (#2600 ) * Detect cache thrashing in LRUCache * Do not check cache thrashing in tests	2025-09-19 09:12:14 +09:00
Oleksandr Bilous	e8b604a6a3	fix: library loading for swift dynamic frameworks (#2568 )	2025-09-18 13:54:59 -07:00
Awni Hannun	50cc09887f	expose depends (#2606 )	2025-09-18 10:06:15 -07:00
Umberto Mignozzetti	3f730e77aa	Update export function example for array input (#2598 ) After changing the shape to conform (same shapes for all objects), the example works.	2025-09-16 14:38:05 -07:00
Awni Hannun	caecbe876a	no copy batch rope (#2595 )	2025-09-15 14:23:48 -07:00
Umberto Mignozzetti	8afb6d62f2	Fix typo in average_gradients function call (#2594 )	2025-09-15 11:29:21 -07:00
Awni Hannun	6ccfa603cd	fix metal scan (#2591 )	2025-09-15 11:01:57 -07:00
Umberto Mignozzetti	36cad99a11	Refactor code examples to use 'gelu' (#2592 ) Updated code examples to use 'gelu' directly instead of 'nn.gelu'.	2025-09-15 09:47:02 -07:00
Awni Hannun	ee18e1cbf0	patch bump (#2588 )	2025-09-11 17:10:09 -07:00
Awni Hannun	af120c2bc0	set nccl ABI version (#2587 )	2025-09-11 16:55:53 -07:00
Cheng	6a3acf2301	[CUDA] Set bias as input when using bias epilogue (#2584 )	2025-09-11 15:31:09 +09:00
Awni Hannun	d6977f2a57	Add sdpa with sinks (#2558 ) * add sdpa with sinks * fix 2 pass * fix matrix sdpa * fix perf regression * add to cuda (#2580)	2025-09-10 14:53:00 -07:00
Gökdeniz Gülmez	db5443e831	Adding Relu2 (#2582 ) * in. com. * upd. ackn. * update __init__ * nits * nits + format * used mx.maximum(x, 0) instead of calling the function and moves relu6 under relu2 to make it nicer * same with _make_activation_module * Update python/mlx/nn/layers/activations.py upd Co-authored-by: Awni Hannun <awni.hannun@gmail.com> * update funct.rst * upd. layers.rst --------- Co-authored-by: Awni Hannun <awni.hannun@gmail.com>	2025-09-10 07:24:30 -07:00
Cheng	52b8384d10	Fix flaky addmm tests (#2581 )	2025-09-10 14:22:22 +09:00
Cheng	44cc5da4bc	[CUDA] Fix alpha not respected when using bias epilogue (#2578 )	2025-09-10 09:08:01 +09:00
Cheng	dde3682b69	[CUDA] Use GEMM with epilogue instead of AddMM (#2569 )	2025-09-09 13:18:49 +09:00
Awni Hannun	17310d91a6	Add batch offsets for mx.fast.rope (#2564 ) * implement batch rope for Metal * cuda rope (#2576)	2025-09-08 17:35:07 -07:00
Cheng	b194d65a6a	Some tweaks in cmake files (#2574 ) * Do proper check of Metal lib * Update doctest to get rid of cmake version hack	2025-09-09 08:27:18 +09:00
Cheng	a44b27f5f8	Fix a few ccache cache miss (#2573 ) * Fix ccache cache miss * Do not define _VERSION_ in python bindings	2025-09-09 07:41:05 +09:00
Awni Hannun	e5a33f2223	faster depthwise 1D conv (#2567 )	2025-09-08 11:37:23 -07:00
Cheng	c1e3340b23	Set ccache size before building (#2570 )	2025-09-07 09:00:31 +09:00
XXXXRT666	8f163a367d	typing: add type hints to mlx.core.array, linalg, distributed, and random (#2565 ) * Add type annotations to mlx methods * Missing list_or_scalar	2025-09-04 09:08:11 -07:00
Manuel Villanueva	89a3df9014	Fixed several type annotations in the MLX stubs which degraded to Unknown/Any (#2560 ) * Added scalar to stubs to fix Unkown Type Hint ### Proposed changes Issue #2478 reports that several type annotations in the MLX stubs degrade to Unknown/Any in editors like VS Code with Pylance, due to missing imports (Union, Optional, Tuple) and an undefined scalar type alias. This PR updates the stub generation patterns to: • Add missing typing imports in mlx.core.__prefix__ so that Union, Optional, Tuple, etc. are always available. • Define and export scalar: TypeAlias = Union[int, float, bool] in mlx.core.__suffix__ so that functions typed with Union[scalar, array] resolve correctly instead of falling back to Any. • Update submodule stub prefixes (distributed, fast, linalg, metal, random) to import scalar alongside array, Device, and Stream, ensuring type checkers resolve the union consistently across modules. With these changes, functions like mlx.add now display rich type signatures such as: ``` def add( a: scalar \| array, b: scalar \| array, stream: Stream \| Device \| None = None ) -> array ``` instead of degrading to Any. ### Checklist • I have read the CONTRIBUTING document • I have run pre-commit run --all-files to format my code / installed pre-commit prior to committing changes • I have added tests that prove my fix is effective or that my feature works (n/a — stub generation only) • I have updated the necessary documentation (if needed) * add bool to patterns --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-09-03 12:52:08 -07:00
Krishi Saripalli	c5d2937aa5	chore: Update Docs With Slice Copy Example (#2559 ) * chore: updated docs with slice copy example * nits --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-09-02 22:07:02 -07:00
Awni Hannun	b61a65e313	fix copies in sdpa (#2563 )	2025-09-02 11:00:36 -07:00
wrmsr	04cbb4191c	Fix dequantize python sig (#2562 )	2025-09-01 11:50:20 -07:00
Artur Antonov	c5460762e7	Fix AdamW weight_decay default value in docstring (#2557 )	2025-08-31 21:29:30 -07:00
Awni Hannun	8ce49cd39e	fix quantized vjp for mxfp4 (#2555 )	2025-08-29 10:06:15 -07:00
Awni Hannun	9c68b50853	version bump (#2554 )	2025-08-29 06:54:17 -07:00
Awni Hannun	111f1e71af	Faster contiguous gather for indices in the first axis (#2552 ) * faster contiguous gather for indices in the first axis * work per thread > 1 * angelos suggestion for scales / biases	2025-08-28 21:26:30 -07:00
Awni Hannun	827003d568	fix METAL quantization in JIT (#2553 )	2025-08-28 18:26:25 -07:00
Awni Hannun	d363a76aa4	Bump xcode in circle (#2551 ) * bump xcode in circle * bump xcode in circle * bump xcode in circle	2025-08-28 13:13:34 -07:00
Awni Hannun	70560b6bd5	Add mode parameter for quantization (#2499 ) * add mode parameter for quantization * mxfp4 quantize/dequantize + start of optional biases * mxfp4 works * speedup * cpu mxfp4 * fix * fix test tol * fix * refactor * add quant mode enum	2025-08-28 06:45:26 -07:00
Awni Hannun	7ef8a6f2d5	[CUDA] fix sort (#2550 ) * [CUDA] fix sort * fix test	2025-08-27 19:48:43 -07:00
Cheng	31c6f6e33f	[CUDA] Use ConcurrentContext in concatenate_gpu (#2549 )	2025-08-28 09:30:08 +09:00
Awni Hannun	584d48458e	link with nccl (#2546 )	2025-08-27 10:01:07 -07:00
Cheng	5cf984ca87	Separate cpu compilation cache by versions (#2548 )	2025-08-27 11:25:15 +09:00
Cheng	a9bac3d9e5	Run CPP tests for CUDA build in CI (#2544 )	2025-08-27 08:06:46 +09:00
Awni Hannun	5458d43247	add load with path tests (#2543 )	2025-08-26 14:24:47 -07:00
Awni Hannun	a4dba65220	Enable cuda graph toggle (#2545 ) * enable cuda graph toggle * increase cache size	2025-08-26 12:50:38 -07:00
Awni Hannun	3dcb286baf	Remove stream from average grads so it uses default (#2532 ) * Remove stream from average grads so it uses default * comment	2025-08-25 15:56:29 -07:00
Cheng	4822c3dbe9	[CUDA] Implement DynamicSlice/DynamicSliceUpdate (#2533 ) * Move DynamicSlice to gpu/primitives * Implement compute_dynamic_offset in CUDA	2025-08-26 07:31:39 +09:00
Awni Hannun	2ca75bb529	Remove nccl install in release (#2542 )	2025-08-25 15:20:18 -07:00
Awni Hannun	db14e29a0b	allow pathlib.Path to save/load functions (#2541 )	2025-08-25 14:58:49 -07:00
Awni Hannun	d2f540f4e0	Use nccl header only when nccl is not present (#2539 ) * use nccl header only when nccl is not present * larger machine for cuda build	2025-08-25 14:17:25 -07:00
Cheng	333ffea273	[CUDA] Remove thrust in arange (#2535 )	2025-08-24 16:22:36 +09:00
Cheng	f55b6f1f2f	Enable COMPILE_WARNING_AS_ERROR for linux builds in CI (#2534 )	2025-08-24 15:33:08 +09:00
Awni Hannun	30561229c7	Fix allocation bug in NCCL (#2530 )	2025-08-22 14:39:43 -07:00
Awni Hannun	068a4612e9	nccl default for backend=any (#2528 ) * nccl default for backend=any * check num gpus + ensure row contiguous for all reduce * comment	2025-08-22 12:24:27 -07:00
Andrey Portnoy	5722c147de	[CUDA] Update calls to `cudaMemAdvise` and `cudaGraphAddDependencies` for CUDA 13 (#2525 ) * [CUDA] Update cudaMemAdvise and cudaGraphAddDependencies for CUDA 13 These functions' signatures changed in CUDA 13, so we differentiate between CUDA 13 and preceding releases at compile time. * Mention NVIDIA in ACKNOWLEDGMENTS.md	2025-08-21 19:57:20 -07:00
Cheng	f6819a1f26	Fix warning 186-D from nvcc (#2527 )	2025-08-22 10:29:55 +09:00
Awni Hannun	f93f87c802	nccl dep + default for cuda (#2526 )	2025-08-21 17:57:49 -07:00
Anastasiia Filippova	9392fc3f88	NCCL backend (#2476 )	2025-08-21 11:56:15 -07:00
Awni Hannun	e843c4d8d5	fix power (#2523 )	2025-08-21 06:46:01 -07:00
Angelos Katharopoulos	0c5fc63a36	Fix docs omission (#2524 )	2025-08-20 17:56:06 -07:00
Angelos Katharopoulos	e397177f6e	Custom cuda kernel (#2517 )	2025-08-20 17:20:22 -07:00
Cheng	f4c8888cbe	[CUDA] Fix stride of singleton dims before passing to cuDNN (#2521 )	2025-08-21 08:55:26 +09:00
Angelos Katharopoulos	25c1e03205	Fix overflow in large filter small channels (#2520 )	2025-08-20 08:03:29 -07:00
russellizadi	512281781c	Remove state return from function example in compile documentation (#2518 )	2025-08-20 00:45:05 -07:00
Cheng	ac85ddfdb7	[CUDA] Add GEMM-based fallback convolution kernels (#2511 ) * Add gemm_conv * Add gemm_grouped_conv	2025-08-20 10:06:22 +09:00
Cheng	65d0d40232	Split cuDNN helpers into a separate header (#2491 ) * Add RAII managed CudaGraph class * Implement forward rms_norm with cuDNN * Revert back to old rms norm kernel	2025-08-20 09:29:28 +09:00
Awni Hannun	cea9369610	fix lapack svd (#2515 )	2025-08-18 15:07:59 -07:00
Awni Hannun	e7c6e1db82	no segfault with uninitialized array.at (#2514 )	2025-08-18 08:33:38 -07:00
Awni Hannun	c5fcd5b61b	fix custom kernel test (#2510 )	2025-08-18 06:45:59 -07:00
Angelos Katharopoulos	1df9887998	Ensure no oob read in gemv_masked (#2508 )	2025-08-17 08:42:33 -07:00
Angelos Katharopoulos	73f22d6226	Ensure small sort doesn't use indices if not argsort (#2506 )	2025-08-17 08:42:20 -07:00
Cheng	c422050ca7	Update cuDNN Frontend to v1.14 (#2505 )	2025-08-17 19:13:01 +09:00
Cheng	1ba18ff7d9	[CUDA] Fix conv grads with groups (#2495 ) * Put reshape utils in one file * [CUDA] Fix conv grads with groups * Put the reshape utils in gpu/copy.h	2025-08-16 10:09:18 +09:00
Cheng	37b440faa8	Clean up code handling both std::vector and SmallVector (#2493 )	2025-08-16 09:01:10 +09:00
Cheng	888b13ed63	Remove the hack around SmallVector in cpu compile (#2494 )	2025-08-16 08:17:24 +09:00
Cheng	4abb218d21	The naive_conv_2d is no longer used (#2496 )	2025-08-16 07:57:30 +09:00
Awni Hannun	6441c21a94	Faster general unary op (#2472 ) * faster general unary op * faster general ops + reorg * fix + comment * binary two * copy general	2025-08-15 15:04:12 -07:00