Test with CUDA 12.2 (#2375)

* Test with CUDA 12.0 * try older image * fix cpu sort
fix ring distributed test (#2380)
2025-12-16 01:49:05 +08:00 · 2025-07-16 13:00:37 -07:00 · 2025-07-16 11:25:24 -07:00 · 2025-07-16 07:34:24 -07:00 · 2025-07-15 18:19:47 -07:00 · 2025-07-15 14:22:07 -07:00
144 changed files with 5244 additions and 2647 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7,18 +7,6 @@ parameters:
  nightly_build:
    type: boolean
    default: false
-  weekly_build:
-    type: boolean
-    default: false
-  test_release:
-    type: boolean
-    default: false
-  linux_release:
-    type: boolean
-    default: false
-  cuda_release:
-    type: boolean
-    default: false

 jobs:
  build_documentation:
@@ -41,7 +29,7 @@ jobs:
            pip install --upgrade pip
            pip install --upgrade cmake
            pip install -r docs/requirements.txt
-            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install . -v
+            pip install . -v
      - when:
          condition:
            not: << parameters.upload-docs >>
@@ -73,9 +61,9 @@ jobs:
                 git push -f origin gh-pages

  linux_build_and_test:
-    docker:
-      - image: cimg/python:3.9
-
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
    steps:
      - checkout
      - run:
@@ -87,21 +75,17 @@ jobs:
      - run:
          name: Install dependencies
          command: |
-            pip install --upgrade cmake
-            pip install nanobind==2.4.0
-            pip install numpy
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
            sudo apt-get update
-            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get upgrade -y
+            pip install --upgrade cmake
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
            sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev
      - run:
          name: Install Python package
          command: |
-            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              python3 setup.py build_ext --inplace
-            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              python3 setup.py develop
+            pip install -e ".[dev]"
      - run:
          name: Generate package stubs
          command: |
@@ -111,9 +95,10 @@ jobs:
      - run:
          name: Run Python tests
          command: |
-            python3 -m unittest discover python/tests -v
+            python -m unittest discover python/tests -v
            mpirun --bind-to none -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
-            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+            if grep -q "\[WARN\]" stderr.log; then echo "Distributed ring test failed"; exit 1; fi  # fail the step when stderr.log contains [WARN]
      - run:
          name: Build CPP only
          command: |
@@ -157,8 +142,7 @@ jobs:
          name: Install Python package
          command: |
            source env/bin/activate
-            DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-            CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
              pip install -e . -v
      - run:
          name: Generate package stubs
@@ -173,7 +157,8 @@ jobs:
            LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
            mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
-            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+            if grep -q "\[WARN\]" stderr.log; then echo "Distributed ring test failed"; exit 1; fi  # fail the step when stderr.log contains [WARN]
      - run:
          name: Build example extension
          command: |
@@ -208,8 +193,7 @@ jobs:
          name: Run Python tests with JIT
          command: |
            source env/bin/activate
-            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-              CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
+            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
              pip install -e . -v
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
              METAL_DEBUG_ERROR_MODE=0 \
@@ -217,7 +201,7 @@ jobs:

  cuda_build_and_test:
    machine:
-      image: linux-cuda-12:default
+      image: linux-cuda-12:2023.11.1
      resource_class: gpu.nvidia.small.gen2
    steps:
      - checkout
@@ -226,10 +210,9 @@ jobs:
          command: |
            sudo apt-get update
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
-            python -m venv env
+            python3 -m venv env
            source env/bin/activate
-            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
              pip install -e ".[dev]"
      - run:
          name: Run Python tests
@@ -278,7 +261,6 @@ jobs:
          command: |
            source env/bin/activate
            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
-              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
              pip install . -v
      - run:
          name: Generate package stubs
@@ -290,9 +272,17 @@ jobs:
          name: Build Python package
          command: |
            source env/bin/activate
-            << parameters.build_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-              python -m build -w
+            << parameters.build_env >> MLX_BUILD_STAGE=1 python -m build -w
+      - when:
+          condition:
+            equal: ["3.9", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  source env/bin/activate
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 python -m build -w
      - when:
          condition: << parameters.build_env >>
          steps:
@@ -309,63 +299,70 @@ jobs:
      python_version:
        type: string
        default: "3.9"
-      extra_env:
+      build_env:
        type: string
-        default: "DEV_RELEASE=1"
-    docker:
-      - image: ubuntu:20.04
+        default: ""
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
    steps:
      - checkout
      - run:
          name: Build wheel
          command: |
            PYTHON=python<< parameters.python_version >>
-            apt-get update
-            apt-get upgrade -y
-            DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
-            apt-get install -y apt-utils
-            apt-get install -y software-properties-common
-            add-apt-repository -y ppa:deadsnakes/ppa
-            apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
-            apt-get install -y libblas-dev liblapack-dev liblapacke-dev
-            apt-get install -y build-essential git
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            sudo apt-get update
+            sudo apt-get upgrade -y
+            TZ=Etc/UTC sudo apt-get -y install tzdata
+            sudo apt-get install -y apt-utils
+            sudo apt-get install -y software-properties-common
+            sudo add-apt-repository -y ppa:deadsnakes/ppa
+            sudo apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y build-essential git
            $PYTHON -m venv env
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.4.0
-            pip install --upgrade setuptools
-            pip install numpy
            pip install auditwheel
            pip install patchelf
            pip install build
            pip install twine
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              pip install . -v
+            << parameters.build_env >> pip install ".[dev]" -v
            pip install typing_extensions
            python setup.py generate_stubs
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              python -m build --wheel
-            auditwheel show dist/*
-            auditwheel repair dist/* --plat manylinux_2_31_x86_64
-      - run:
-          name: Upload package
-          command: |
-            source env/bin/activate
-            twine upload wheelhouse/*
+            MLX_BUILD_STAGE=1 << parameters.build_env >> python -m build -w
+            bash python/scripts/repair_linux.sh
+      - when:
+          condition:
+            equal: ["3.9", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  source env/bin/activate
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 \
+                    python -m build -w
+                  auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_x86_64
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload packages
+                command: |
+                  source env/bin/activate
+                  twine upload wheelhouse/*.whl
      - store_artifacts:
          path: wheelhouse/

  build_cuda_release:
    parameters:
-      python_version:
+      build_env:
        type: string
-        default: "3.9"
-      extra_env:
-        type: string
-        default: "DEV_RELEASE=1"
+        default: ""
    machine:
      image: linux-cuda-12:default
      resource_class: gpu.nvidia.small.gen2
@@ -376,27 +373,25 @@ jobs:
          command: |
            sudo apt-get update
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y zip
            python -m venv env
            source env/bin/activate
            pip install auditwheel
            pip install patchelf
            pip install build
            pip install twine
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+            << parameters.build_env >> MLX_BUILD_STAGE=2 \
              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
-              pip install ".[dev]" -v
-            python setup.py generate_stubs
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
-              python -m build --wheel
+              python -m build -w
            bash python/scripts/repair_cuda.sh
-      - run:
-          name: Upload package
-          command: |
-            source env/bin/activate
-            twine upload wheelhouse/*.whl
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload package
+                command: |
+                  source env/bin/activate
+                  twine upload wheelhouse/*.whl
      - store_artifacts:
          path: wheelhouse/

@@ -408,8 +403,6 @@ workflows:
            pattern: "^(?!pull/)[-\\w]+$"
            value: << pipeline.git.branch >>
        - not: << pipeline.parameters.nightly_build >>
-        - not: << pipeline.parameters.weekly_build >>
-        - not: << pipeline.parameters.test_release >>
    jobs:
      - mac_build_and_test:
          matrix:
@@ -423,8 +416,6 @@ workflows:
    when:
      and:
        - not: << pipeline.parameters.nightly_build >>
-        - not: << pipeline.parameters.weekly_build >>
-        - not: << pipeline.parameters.test_release >>
    jobs:
      - build_release:
          filters:
@@ -506,6 +497,25 @@ workflows:
            branches:
              ignore: /.*/
          upload-docs: true
+      - build_linux_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              build_env: ["PYPI_RELEASE=1"]
+      - build_cuda_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              build_env: ["PYPI_RELEASE=1"]

  prb:
    when:
@@ -584,99 +594,8 @@ workflows:
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.13"
-  weekly_build:
-    when:
-      and:
-        - equal: [ main, << pipeline.git.branch >> ]
-        - << pipeline.parameters.weekly_build >>
-    jobs:
-      - build_release:
-          matrix:
-            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-              macosx_deployment_target: ["13.5", "14.0", "15.0"]
-              build_env: ["DEV_RELEASE=1"]
-              xcode_version: ["16.2.0", "15.0.0"]
-            exclude:
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.9"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.10"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.11"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.12"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.13"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-                build_env: "DEV_RELEASE=1"
-  linux_test_release:
-    when:
-      and:
-        - equal: [ main, << pipeline.git.branch >> ]
-        - << pipeline.parameters.linux_release >>
-    jobs:
      - build_linux_release:
          matrix:
            parameters:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-              extra_env: ["PYPI_RELEASE=1"]
-  cuda_test_release:
-    when:
-      and:
-        - equal: [ main, << pipeline.git.branch >> ]
-        - << pipeline.parameters.cuda_release >>
-    jobs:
-      - build_cuda_release:
-          matrix:
-            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-              extra_env: ["PYPI_RELEASE=1"]
+      - build_cuda_release
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,10 +64,8 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
      message(WARNING "Building for x86_64 arch is not officially supported.")
    endif()
  endif()
-
 else()
  set(MLX_BUILD_METAL OFF)
-  message(WARNING "MLX is prioritised for Apple silicon systems using macOS.")
 endif()

 # ----------------------------- Lib -----------------------------
--- a/benchmarks/cpp/single_ops.cpp
+++ b/benchmarks/cpp/single_ops.cpp
@@ -192,6 +192,22 @@ void time_reductions() {

  auto argmin_along_1 = [&a]() { return mx::argmin(a, 1, false); };
  TIME(argmin_along_1);
+
+  auto indices = mx::array({1});
+  auto updates = mx::reshape(mx::array({NAN}), {1, 1, 1});
+  std::vector<int> axes{0};
+  auto b = mx::scatter(a, {indices}, updates, axes);
+  mx::eval(b);
+
+  auto max_along_0 = [&b]() { return mx::max(b, 0, false); };
+  TIME(max_along_0);
+  auto max_along_1 = [&b]() { return mx::max(b, 1, false); };
+  TIME(max_along_1);
+
+  auto min_along_0 = [&b]() { return mx::min(b, 0, false); };
+  TIME(min_along_0);
+  auto min_along_1 = [&b]() { return mx::min(b, 1, false); };
+  TIME(min_along_1);
 }

 void time_gather_scatter() {
--- a/benchmarks/python/single_ops.py
+++ b/benchmarks/python/single_ops.py
@@ -51,6 +51,20 @@ def time_maximum():
    time_fn(mx.maximum, a, b)


+def time_max():  # benchmark mx.max on a (32, 1024, 1024) input that contains NaN
+    a = mx.random.uniform(shape=(32, 1024, 1024))
+    a[1, 1] = mx.nan  # overwrite the [1, 1] slice with NaN
+    mx.eval(a)  # evaluate the input up front (presumably so setup is not timed)
+    time_fn(mx.max, a, 0)  # 0 is forwarded after a; presumably the reduction axis
+
+
+def time_min():  # benchmark mx.min on a (32, 1024, 1024) input that contains NaN
+    a = mx.random.uniform(shape=(32, 1024, 1024))
+    a[1, 1] = mx.nan  # overwrite the [1, 1] slice with NaN
+    mx.eval(a)  # evaluate the input up front (presumably so setup is not timed)
+    time_fn(mx.min, a, 0)  # 0 is forwarded after a; presumably the reduction axis
+
+
 def time_negative():
    a = mx.random.uniform(shape=(10000, 1000))
    mx.eval(a)
@@ -108,6 +122,8 @@ if __name__ == "__main__":

    time_add()
    time_matmul()
+    time_min()
+    time_max()
    time_maximum()
    time_exp()
    time_negative()
--- a/docs/src/dev/extensions.rst
+++ b/docs/src/dev/extensions.rst
@@ -138,13 +138,13 @@ more concrete:
        * representing the vectorized computation and the axis which
        * corresponds to the output vectorized dimension.
        */
-        virtual std::pair<std::vector<array>, std::vector<int>> vmap(
+        std::pair<std::vector<array>, std::vector<int>> vmap(
            const std::vector<array>& inputs,
            const std::vector<int>& axes) override;

-        /** Print the primitive. */
-        void print(std::ostream& os) override {
-            os << "Axpby";
+        /** The name of the primitive. */
+        const char* name() const override {
+          return "Axpby";
        }

        /** Equivalence check **/
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -23,13 +23,6 @@ To install from PyPI you must meet the following requirements:
    MLX is only available on devices running macOS >= 13.5
    It is highly recommended to use macOS 14 (Sonoma)

-
-MLX is also available on conda-forge. To install MLX with conda do:
-
-.. code-block:: shell
-
-   conda install conda-forge::mlx
-
 CUDA
 ^^^^

@@ -38,8 +31,16 @@ and SM 7.0 (Volta) and up. To install MLX with CUDA support, run:

 .. code-block:: shell

-    pip install mlx-cuda
+    pip install "mlx[cuda]"

+CPU-only (Linux)
+^^^^^^^^^^^^^^^^
+
+For a CPU-only version of MLX that runs on Linux use:
+
+.. code-block:: shell
+
+    pip install "mlx[cpu]"

 Troubleshooting
 ^^^^^^^^^^^^^^^
@@ -88,20 +89,20 @@ Then simply build and install MLX using pip:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install .
+  pip install .

 For developing, install the package with development dependencies, and use an
 editable install:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -e ".[dev]"
+  pip install -e ".[dev]"

 Once the development dependencies are installed, you can build faster with:

 .. code-block:: shell

- CMAKE_BUILD_PARALLEL_LEVEL=8 python setup.py build_ext --inplace
+ python setup.py build_ext --inplace

 Run the tests with:

@@ -262,7 +263,7 @@ When building either the Python or C++ APIs make sure to pass the cmake flag

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"
+  CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"

 To build the C++ package run:

--- a/examples/extensions/axpby/axpby.h
+++ b/examples/extensions/axpby/axpby.h
@@ -74,9 +74,9 @@ class Axpby : public mx::Primitive {
      const std::vector<mx::array>& inputs,
      const std::vector<int>& axes) override;

-  /** Print the primitive. */
-  void print(std::ostream& os) override {
-    os << "Axpby";
+  /** The name of the primitive. */
+  const char* name() const override {
+    return "Axpby";
  }

  /** Equivalence check **/
--- a/mlx/backend/common/matmul.h
+++ b/mlx/backend/common/matmul.h
@@ -12,16 +12,11 @@ namespace mlx::core {
 inline std::tuple<Shape, Strides, Strides> collapse_batches(
    const array& a,
    const array& b) {
-  // Get and check the shape for the batched dims
-  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
-  Shape B_bshape{b.shape().begin(), b.shape().end() - 2};
-  if (A_bshape != B_bshape) {
-    std::ostringstream msg;
-    msg << "[matmul] Got matrices with incorrectly broadcasted shapes: " << "A "
-        << a.shape() << ", B " << b.shape() << ".";
-    throw std::runtime_error(msg.str());
+  if (a.ndim() == 2) {
+    return {{1}, {0}, {0}};
  }

+  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};

@@ -42,17 +37,11 @@ inline std::tuple<Shape, Strides, Strides> collapse_batches(

 inline std::tuple<Shape, Strides, Strides, Strides>
 collapse_batches(const array& a, const array& b, const array& c) {
-  // Get and check the shape for the batched dims
-  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
-  Shape B_bshape{b.shape().begin(), b.shape().end() - 2};
-  Shape C_bshape{c.shape().begin(), c.shape().end() - 2};
-  if (A_bshape != B_bshape || A_bshape != C_bshape) {
-    std::ostringstream msg;
-    msg << "[addmm] Got matrices with incorrectly broadcasted shapes: " << "A "
-        << a.shape() << ", B " << b.shape() << ", B " << c.shape() << ".";
-    throw std::runtime_error(msg.str());
+  if (a.ndim() == 2) {
+    return {{1}, {0}, {0}, {0}};
  }

+  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};
  Strides C_bstride{c.strides().begin(), c.strides().end() - 2};
--- a/mlx/backend/common/utils.cpp
+++ b/mlx/backend/common/utils.cpp
@@ -1,14 +1,20 @@
 // Copyright © 2023-2024 Apple Inc.

+#include <dlfcn.h>
+
 #include "mlx/backend/common/utils.h"
-#include "mlx/primitives.h"

 namespace mlx::core {

-std::string get_primitive_string(Primitive* primitive) {
-  std::ostringstream op_t;
-  primitive->print(op_t);
-  return op_t.str();
+std::filesystem::path current_binary_dir() {
+  static std::filesystem::path binary_dir = []() { // runs once; result is cached
+    Dl_info info; // filled in by dladdr for the object containing this function
+    if (!dladdr(reinterpret_cast<void*>(&current_binary_dir), &info)) {
+      throw std::runtime_error("Unable to get current binary dir.");
+    }
+    return std::filesystem::path(info.dli_fname).parent_path(); // dir of that file
+  }();
+  return binary_dir;
+}

 std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
--- a/mlx/backend/common/utils.h
+++ b/mlx/backend/common/utils.h
@@ -2,6 +2,7 @@

 #pragma once

+#include <filesystem>
 #include <tuple>
 #include <vector>

@@ -9,7 +10,8 @@

 namespace mlx::core {

-std::string get_primitive_string(Primitive* primitive);
+// Return the directory that contains the current shared library.
+std::filesystem::path current_binary_dir();

 inline int64_t
 elem_to_loc(int elem, const Shape& shape, const Strides& strides) {
--- a/mlx/backend/cpu/cholesky.cpp
+++ b/mlx/backend/cpu/cholesky.cpp
@@ -20,7 +20,7 @@ void cholesky_impl(const array& a, array& factor, bool upper, Stream stream) {

  // The decomposition is computed in place, so just copy the input to the
  // output.
-  copy(
+  copy_cpu(
      a,
      factor,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
--- a/mlx/backend/cpu/compiled.cpp
+++ b/mlx/backend/cpu/compiled.cpp
@@ -231,7 +231,7 @@ inline void build_kernel(
      os << "static_cast<" << get_type_string(x.dtype()) << ">(tmp_"
         << namer.get_name(x.inputs()[0]) << ");" << std::endl;
    } else {
-      x.primitive().print(os);
+      os << x.primitive().name();
      os << "()(";
      for (int i = 0; i < x.inputs().size() - 1; i++) {
        os << "tmp_" << namer.get_name(x.inputs()[i]) << ", ";
--- a/mlx/backend/cpu/conv.cpp
+++ b/mlx/backend/cpu/conv.cpp
@@ -883,7 +883,7 @@ void explicit_gemm_conv_1D_cpu(
  // Fill with zeros
  std::vector<array> temps;
  temps.push_back(array(0, conv_dtype));
-  copy(temps.back(), in_padded, CopyType::Scalar, stream);
+  copy_cpu(temps.back(), in_padded, CopyType::Scalar, stream);

  // Pick input slice from padded
  size_t data_offset = padding_lo[0] * in_padded.strides()[1];
@@ -895,7 +895,7 @@ void explicit_gemm_conv_1D_cpu(
      in_padded_slice.size(),
      data_offset);
  // Copy input values into the slice
-  copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral, stream);
+  copy_cpu_inplace(in, in_padded_slice, CopyType::GeneralGeneral, stream);
  temps.push_back(in_padded_slice);

  // Make strided view
@@ -920,7 +920,7 @@ void explicit_gemm_conv_1D_cpu(
  // Materialize strided view
  Shape strided_reshape = {N * oH, wH * C};
  array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
-  copy(in_strided_view, in_strided, CopyType::General, stream);
+  copy_cpu(in_strided_view, in_strided, CopyType::General, stream);
  temps.push_back(in_strided);

  // Check wt dtype and prepare
@@ -938,13 +938,13 @@ void explicit_gemm_conv_1D_cpu(
        wt.size(),
        0);
    gemm_wt = array(wt_transpose.shape(), float32, nullptr, {});
-    copy(wt_transpose, gemm_wt, CopyType::General, stream);
+    copy_cpu(wt_transpose, gemm_wt, CopyType::General, stream);
    temps.push_back(gemm_wt);
  } else if (wt.dtype() != float32 || !wt.flags().row_contiguous) {
    auto ctype =
        wt.flags().row_contiguous ? CopyType::Vector : CopyType::General;
    gemm_wt = array(wt.shape(), float32, nullptr, {});
-    copy(wt, gemm_wt, ctype, stream);
+    copy_cpu(wt, gemm_wt, ctype, stream);
    temps.push_back(gemm_wt);
  }

@@ -991,7 +991,7 @@ void explicit_gemm_conv_1D_cpu(

  // Copy results if needed
  if (out.dtype() != float32) {
-    copy_inplace(gemm_out, out, CopyType::Vector, stream);
+    copy_cpu_inplace(gemm_out, out, CopyType::Vector, stream);
  }
  encoder.add_temporaries(std::move(temps));
 }
@@ -1029,7 +1029,7 @@ void explicit_gemm_conv_2D_cpu(
  // Fill with zeros
  std::vector<array> temps;
  temps.push_back(array(0, conv_dtype));
-  copy(temps.back(), in_padded, CopyType::Scalar, stream);
+  copy_cpu(temps.back(), in_padded, CopyType::Scalar, stream);

  // Pick input slice from padded
  size_t data_offset = padding_lo[0] * in_padded.strides()[1] +
@@ -1044,7 +1044,7 @@ void explicit_gemm_conv_2D_cpu(
  temps.push_back(in_padded_slice);

  // Copy input values into the slice
-  copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral, stream);
+  copy_cpu_inplace(in, in_padded_slice, CopyType::GeneralGeneral, stream);

  // Make strided view
  Shape strided_shape = {N, oH, oW, wH, wW, C};
@@ -1065,7 +1065,7 @@ void explicit_gemm_conv_2D_cpu(
  // Materialize strided view
  Shape strided_reshape = {N * oH * oW, wH * wW * C};
  array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
-  copy(in_strided_view, in_strided, CopyType::General, stream);
+  copy_cpu(in_strided_view, in_strided, CopyType::General, stream);
  temps.push_back(in_strided);

  // Check wt dtype and prepare
@@ -1076,7 +1076,7 @@ void explicit_gemm_conv_2D_cpu(
    auto ctype =
        wt.flags().row_contiguous ? CopyType::Vector : CopyType::General;
    gemm_wt = array(wt.shape(), float32, nullptr, {});
-    copy(wt, gemm_wt, ctype, stream);
+    copy_cpu(wt, gemm_wt, ctype, stream);
    temps.push_back(gemm_wt);
  }

@@ -1116,7 +1116,7 @@ void explicit_gemm_conv_2D_cpu(

  // Copy results if needed
  if (out.dtype() != float32) {
-    copy_inplace(gemm_out, out, CopyType::Vector, stream);
+    copy_cpu_inplace(gemm_out, out, CopyType::Vector, stream);
  }
  encoder.add_temporaries(std::move(temps));
 }
@@ -1156,7 +1156,7 @@ void explicit_gemm_conv_ND_cpu(

  // Fill with zeros
  std::vector<array> temps = {array(0, conv_dtype)};
-  copy(temps.back(), in_padded, CopyType::Scalar, stream);
+  copy_cpu(temps.back(), in_padded, CopyType::Scalar, stream);

  // Pick input slice from padded
  size_t data_offset = 0;
@@ -1173,7 +1173,7 @@ void explicit_gemm_conv_ND_cpu(
      data_offset);

  // Copy input values into the slice
-  copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral, stream);
+  copy_cpu_inplace(in, in_padded_slice, CopyType::GeneralGeneral, stream);
  temps.push_back(in_padded_slice);

  // Make strided view
@@ -1212,7 +1212,7 @@ void explicit_gemm_conv_ND_cpu(
  }

  array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
-  copy(in_strided_view, in_strided, CopyType::General, stream);
+  copy_cpu(in_strided_view, in_strided, CopyType::General, stream);
  temps.push_back(in_strided);

  // Check wt dtype and prepare
@@ -1223,13 +1223,13 @@ void explicit_gemm_conv_ND_cpu(
    auto ctype =
        wt.flags().row_contiguous ? CopyType::Vector : CopyType::General;
    gemm_wt = array(wt.shape(), float32, nullptr, {});
-    copy(wt, gemm_wt, ctype, stream);
+    copy_cpu(wt, gemm_wt, ctype, stream);
    temps.push_back(gemm_wt);
  }

  if (flip) {
    auto gemm_wt_ = array(gemm_wt.shape(), float32, nullptr, {});
-    copy(gemm_wt, gemm_wt_, CopyType::Vector, stream);
+    copy_cpu(gemm_wt, gemm_wt_, CopyType::Vector, stream);
    temps.push_back(gemm_wt_);

    // Calculate the total size of the spatial dimensions
@@ -1284,7 +1284,7 @@ void explicit_gemm_conv_ND_cpu(

  // Copy results if needed
  if (out.dtype() != float32) {
-    copy_inplace(gemm_out, out, CopyType::Vector, stream);
+    copy_cpu_inplace(gemm_out, out, CopyType::Vector, stream);
  }
  encoder.add_temporaries(std::move(temps));
 }
--- a/mlx/backend/cpu/copy.cpp
+++ b/mlx/backend/cpu/copy.cpp
@@ -295,7 +295,11 @@ inline void copy_inplace_dispatch(

 } // namespace

-void copy_inplace(const array& src, array& dst, CopyType ctype, Stream stream) {
+void copy_cpu_inplace(
+    const array& src,
+    array& dst,
+    CopyType ctype,
+    Stream stream) {
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(src);
  encoder.set_output_array(dst);
@@ -305,7 +309,7 @@ void copy_inplace(const array& src, array& dst, CopyType ctype, Stream stream) {
       ctype]() mutable { copy_inplace_dispatch(src, dst, ctype); });
 }

-void copy(const array& src, array& dst, CopyType ctype, Stream stream) {
+void copy_cpu(const array& src, array& dst, CopyType ctype, Stream stream) {
  bool donated = set_copy_output_data(src, dst, ctype);
  if (donated && src.dtype() == dst.dtype()) {
    // If the output has the same type as the input then there is nothing to
@@ -315,10 +319,10 @@ void copy(const array& src, array& dst, CopyType ctype, Stream stream) {
  if (ctype == CopyType::GeneralGeneral) {
    ctype = CopyType::General;
  }
-  copy_inplace(src, dst, ctype, stream);
+  copy_cpu_inplace(src, dst, ctype, stream);
 }

-void copy_inplace(
+void copy_cpu_inplace(
    const array& src,
    array& dst,
    const Shape& data_shape,
--- a/mlx/backend/cpu/copy.h
+++ b/mlx/backend/cpu/copy.h
@@ -10,10 +10,14 @@

 namespace mlx::core {

-void copy(const array& src, array& dst, CopyType ctype, Stream stream);
-void copy_inplace(const array& src, array& dst, CopyType ctype, Stream stream);
+void copy_cpu(const array& src, array& dst, CopyType ctype, Stream stream);
+void copy_cpu_inplace(
+    const array& src,
+    array& dst,
+    CopyType ctype,
+    Stream stream);

-void copy_inplace(
+void copy_cpu_inplace(
    const array& src,
    array& dst,
    const Shape& data_shape,
--- a/mlx/backend/cpu/distributed.cpp
+++ b/mlx/backend/cpu/distributed.cpp
@@ -14,7 +14,7 @@ std::pair<array, bool> ensure_row_contiguous(const array& arr, Stream stream) {
    return {arr, false};
  } else {
    array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-    copy(arr, arr_copy, CopyType::General, stream);
+    copy_cpu(arr, arr_copy, CopyType::General, stream);
    return {arr_copy, true};
  }
 };
@@ -35,7 +35,7 @@ void AllReduce::eval_cpu(
      return in;
    } else {
      array arr_copy(in.shape(), in.dtype(), nullptr, {});
-      copy(in, arr_copy, CopyType::General, s);
+      copy_cpu(in, arr_copy, CopyType::General, s);
      out.copy_shared_buffer(arr_copy);
      return arr_copy;
    }
--- a/mlx/backend/cpu/eig.cpp
+++ b/mlx/backend/cpu/eig.cpp
@@ -135,7 +135,7 @@ void Eig::eval_cpu(
      : array(a.shape(), complex64, nullptr, {});

  auto a_copy = array(a.shape(), a.dtype(), nullptr, {});
-  copy(
+  copy_cpu(
      a,
      a_copy,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
--- a/mlx/backend/cpu/eigh.cpp
+++ b/mlx/backend/cpu/eigh.cpp
@@ -196,7 +196,7 @@ void Eigh::eval_cpu(

  values.set_data(allocator::malloc(values.nbytes()));

-  copy(
+  copy_cpu(
      a,
      vectors,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
--- a/mlx/backend/cpu/hadamard.cpp
+++ b/mlx/backend/cpu/hadamard.cpp
@@ -96,7 +96,7 @@ void Hadamard::eval_cpu(const std::vector<array>& inputs, array& out) {
  if (in.flags().row_contiguous && in.is_donatable()) {
    out.copy_shared_buffer(in);
  } else {
-    copy(
+    copy_cpu(
        in,
        out,
        in.flags().row_contiguous ? CopyType::Vector : CopyType::General,
--- a/mlx/backend/cpu/indexing.cpp
+++ b/mlx/backend/cpu/indexing.cpp
@@ -517,7 +517,7 @@ void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {
  // Copy src into out (copy allocates memory for out)
  auto ctype =
      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-  copy(src, out, ctype, stream());
+  copy_cpu(src, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  std::vector<array> inds;
@@ -686,7 +686,7 @@ void ScatterAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
  // Copy src into out (copy allocates memory for out)
  auto ctype =
      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-  copy(src, out, ctype, stream());
+  copy_cpu(src, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(idx);
--- a/mlx/backend/cpu/inverse.cpp
+++ b/mlx/backend/cpu/inverse.cpp
@@ -115,7 +115,7 @@ void inverse_impl(
  //   (A⁻¹)ᵀ = (Aᵀ)⁻¹

  // The inverse is computed in place, so just copy the input to the output.
-  copy(
+  copy_cpu(
      a,
      inv,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
--- a/mlx/backend/cpu/logsumexp.cpp
+++ b/mlx/backend/cpu/logsumexp.cpp
@@ -88,7 +88,7 @@ void LogSumExp::eval_cpu(const std::vector<array>& inputs, array& out) {
      return x;
    } else {
      auto x_copy = array(x.shape(), x.dtype(), nullptr, {});
-      copy(x, x_copy, CopyType::General, s);
+      copy_cpu(x, x_copy, CopyType::General, s);
      encoder.add_temporary(x_copy);
      return x_copy;
    }
--- a/mlx/backend/cpu/luf.cpp
+++ b/mlx/backend/cpu/luf.cpp
@@ -31,7 +31,7 @@ void luf_impl(
  strides[ndim - 1] = M;
  strides[ndim - 2] = 1;
  lu.set_data(allocator::malloc(lu.nbytes()), lu.nbytes(), strides, flags);
-  copy_inplace(
+  copy_cpu_inplace(
      a,
      lu,
      a.shape(),
--- a/mlx/backend/cpu/masked_mm.cpp
+++ b/mlx/backend/cpu/masked_mm.cpp
@@ -6,6 +6,7 @@
 #include "mlx/backend/common/utils.h"
 #include "mlx/backend/cpu/copy.h"
 #include "mlx/backend/cpu/encoder.h"
+#include "mlx/backend/cpu/gemm.h"
 #include "mlx/backend/cpu/lapack.h"
 #include "mlx/primitives.h"

@@ -52,6 +53,58 @@ inline void mask_matrix(
  }
 }

+template <typename T> // One GEMM per (k_start, k_end) range listed in `segments`
+inline void segmented_mm(
+    const T* a,
+    const T* b,
+    const uint32_t* segments, // Read in pairs through elem_to_loc
+    T* out, // Receives num_segments consecutive M x N results
+    bool a_transposed,
+    bool b_transposed,
+    size_t lda,
+    size_t ldb,
+    const Shape& a_shape,
+    const Strides& a_strides,
+    const Shape& b_shape,
+    const Strides& b_strides,
+    size_t num_segments,
+    const Shape& segments_shape,
+    const Strides& segments_strides) {
+  int ndim = a_shape.size();
+  Shape a_copy = a_shape; // Local copies so the K extent can be set per segment
+  Shape b_copy = b_shape;
+  int64_t M = a_copy[ndim - 2]; // 64-bit so i * M * N below cannot overflow int
+  int64_t N = b_copy[ndim - 1];
+  for (int i = 0; i < num_segments; i++) {
+    uint32_t k_start = // Segment i occupies logical elements 2 * i and 2 * i + 1
+        segments[elem_to_loc(2 * i, segments_shape, segments_strides)];
+    uint32_t k_end =
+        segments[elem_to_loc(2 * i + 1, segments_shape, segments_strides)];
+    if (k_end <= k_start) { // Empty or inverted range: the result is all zeros
+      std::fill_n(out + i * M * N, M * N, T(0));
+      continue;
+    }
+    a_copy[ndim - 1] = k_end - k_start;
+    b_copy[ndim - 2] = k_end - k_start;
+    matmul<T>(
+        a + k_start * a_strides[ndim - 1], // Skip to k_start along a's last axis
+        b + k_start * b_strides[ndim - 2], // Skip to k_start along b's row axis
+        out + i * M * N,
+        a_transposed,
+        b_transposed,
+        lda,
+        ldb,
+        N,
+        1.0,
+        0.0,
+        1,
+        a_copy,
+        a_strides,
+        b_copy,
+        b_strides);
+  }
+}
+
 } // namespace

 void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -71,20 +124,20 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
        if (!expand_all && stx == arr.shape(-1) && sty == 1) {
          if (do_copy) {
            array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-            copy(arr, arr_copy, CopyType::Vector, s);
+            copy_cpu(arr, arr_copy, CopyType::Vector, s);
            return std::make_tuple(false, stx, arr_copy, true);
          }
          return std::make_tuple(false, stx, arr, false);
        } else if (!expand_all && stx == 1 && sty == arr.shape(-2)) {
          if (do_copy) {
            array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-            copy(arr, arr_copy, CopyType::Vector, s);
+            copy_cpu(arr, arr_copy, CopyType::Vector, s);
            return std::make_tuple(true, sty, arr_copy, true);
          }
          return std::make_tuple(true, sty, arr, false);
        } else {
          array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-          copy(arr, arr_copy, CopyType::General, s);
+          copy_cpu(arr, arr_copy, CopyType::General, s);
          int64_t stx = arr.shape(-1);
          return std::make_tuple(false, stx, arr_copy, true);
        }
@@ -333,7 +386,7 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
      return std::make_tuple(true, sty, arr);
    } else {
      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy(arr, temps.back(), CopyType::General, s);
+      copy_cpu(arr, temps.back(), CopyType::General, s);
      int64_t stx = arr.shape(-1);
      return std::make_tuple(false, stx, temps.back());
    }
@@ -437,4 +490,121 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  encoder.add_temporaries(std::move(temps));
 }

+void SegmentedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
+  out.set_data(allocator::malloc(out.nbytes())); // Allocate the output buffer
+
+  auto& s = stream();
+  auto& encoder = cpu::get_command_encoder(stream());
+  auto check_transpose = [&s, &encoder](const array& x) { // -> (transposed, leading dim, array)
+    auto stx = x.strides()[x.ndim() - 2];
+    auto sty = x.strides()[x.ndim() - 1];
+    if (stx == x.shape(-1) && sty == 1) { // Last two dims row-major: use as is
+      return std::make_tuple(false, stx, x);
+    } else if (stx == 1 && sty == x.shape(-2)) { // Column-major: flag as transposed
+      return std::make_tuple(true, sty, x);
+    } else { // Any other layout: copy into a fresh array first
+      array xc(x.shape(), x.dtype(), nullptr, {});
+      copy_cpu(x, xc, CopyType::General, s);
+      encoder.add_temporary(xc);
+      int64_t stx = x.shape(-1);
+      return std::make_tuple(false, stx, xc);
+    }
+  };
+
+  auto [a_transposed, lda, a] = check_transpose(inputs[0]);
+  auto [b_transposed, ldb, b] = check_transpose(inputs[1]);
+  auto& segments = inputs[2];
+
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_input_array(segments);
+  encoder.set_output_array(out);
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    segments = array::unsafe_weak_copy(segments),
+                    out_ptr = out.data<void>(), // Cast to T* per dtype in the switch below
+                    a_transposed = a_transposed,
+                    b_transposed = b_transposed,
+                    lda = lda,
+                    ldb = ldb]() {
+    switch (a.dtype()) { // Dispatch on a's dtype; b and out are read as the same T
+      case float64:
+        segmented_mm<double>(
+            a.data<double>(),
+            b.data<double>(),
+            segments.data<uint32_t>(), // NOTE(review): assumes segments holds uint32 - confirm
+            static_cast<double*>(out_ptr),
+            a_transposed,
+            b_transposed,
+            lda,
+            ldb,
+            a.shape(),
+            a.strides(),
+            b.shape(),
+            b.strides(),
+            segments.size() / 2, // Two entries (start, end) per segment
+            segments.shape(),
+            segments.strides());
+        break;
+      case float32:
+        segmented_mm<float>(
+            a.data<float>(),
+            b.data<float>(),
+            segments.data<uint32_t>(),
+            static_cast<float*>(out_ptr),
+            a_transposed,
+            b_transposed,
+            lda,
+            ldb,
+            a.shape(),
+            a.strides(),
+            b.shape(),
+            b.strides(),
+            segments.size() / 2,
+            segments.shape(),
+            segments.strides());
+        break;
+      case float16:
+        segmented_mm<float16_t>(
+            a.data<float16_t>(),
+            b.data<float16_t>(),
+            segments.data<uint32_t>(),
+            static_cast<float16_t*>(out_ptr),
+            a_transposed,
+            b_transposed,
+            lda,
+            ldb,
+            a.shape(),
+            a.strides(),
+            b.shape(),
+            b.strides(),
+            segments.size() / 2,
+            segments.shape(),
+            segments.strides());
+        break;
+      case bfloat16:
+        segmented_mm<bfloat16_t>(
+            a.data<bfloat16_t>(),
+            b.data<bfloat16_t>(),
+            segments.data<uint32_t>(),
+            static_cast<bfloat16_t*>(out_ptr),
+            a_transposed,
+            b_transposed,
+            lda,
+            ldb,
+            a.shape(),
+            a.strides(),
+            b.shape(),
+            b.strides(),
+            segments.size() / 2,
+            segments.shape(),
+            segments.strides());
+        break;
+      default: // Every dtype other than the four handled above
+        throw std::invalid_argument(
+            "Segmented mm supports only real float types.");
+    }
+  });
+}
+
 } // namespace mlx::core
--- a/mlx/backend/cpu/matmul.cpp
+++ b/mlx/backend/cpu/matmul.cpp
@@ -81,7 +81,7 @@ void matmul_general(
      return std::make_tuple(true, sty, arr);
    } else {
      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy(arr, temps.back(), CopyType::General, stream);
+      copy_cpu(arr, temps.back(), CopyType::General, stream);
      stx = arr.shape(-1);
      return std::make_tuple(false, stx, temps.back());
    }
@@ -142,7 +142,7 @@ void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  CopyType ctype = c.data_size() == 1
      ? CopyType::Scalar
      : (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
-  copy(c, out, ctype, stream());
+  copy_cpu(c, out, ctype, stream());
  if (inputs[0].shape(-1) == 0) {
    return;
  }
--- a/mlx/backend/cpu/primitives.cpp
+++ b/mlx/backend/cpu/primitives.cpp
@@ -22,7 +22,7 @@ void reshape(const array& in, array& out) {
  auto [copy_necessary, out_strides] = prepare_reshape(in, out);
  if (copy_necessary) {
    out.set_data(allocator::malloc(out.nbytes()));
-    copy_inplace(in, out, CopyType::General, out.primitive().stream());
+    copy_cpu_inplace(in, out, CopyType::General, out.primitive().stream());
  } else {
    shared_buffer_reshape(in, out_strides, out);
  }
@@ -175,7 +175,7 @@ void AsType::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
-  copy(in, out, ctype, stream());
+  copy_cpu(in, out, ctype, stream());
 }

 void Concatenate::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -198,7 +198,7 @@ void Concatenate::eval_cpu(const std::vector<array>& inputs, array& out) {
    size_t data_offset = strides[axis_] * sizes[i];
    out_slice.copy_shared_buffer(
        out, strides, flags, out_slice.size(), data_offset);
-    copy_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, stream());
+    copy_cpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, stream());
  }
 }

@@ -211,7 +211,7 @@ void Contiguous::eval_cpu(const std::vector<array>& inputs, array& out) {
       (allow_col_major_ && in.flags().col_contiguous))) {
    out.copy_shared_buffer(in);
  } else {
-    copy(in, out, CopyType::General, stream());
+    copy_cpu(in, out, CopyType::General, stream());
  }
 }

@@ -235,7 +235,7 @@ void Full::eval_cpu(const std::vector<array>& inputs, array& out) {
  } else {
    ctype = CopyType::General;
  }
-  copy(in, out, ctype, stream());
+  copy_cpu(in, out, ctype, stream());
 }

 void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -251,7 +251,7 @@ void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(val.dtype() == in.dtype() && in.dtype() == out.dtype());

  // Fill output with val
-  copy(val, out, CopyType::Scalar, stream());
+  copy_cpu(val, out, CopyType::Scalar, stream());

  // Find offset for start of input values
  size_t data_offset = 0;
@@ -266,7 +266,7 @@ void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
      out, out.strides(), out.flags(), out_slice.size(), data_offset);

  // Copy input values into the slice
-  copy_inplace(in, out_slice, CopyType::GeneralGeneral, stream());
+  copy_cpu_inplace(in, out_slice, CopyType::GeneralGeneral, stream());
 }

 void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -340,7 +340,7 @@ void DynamicSlice::eval_cpu(const std::vector<array>& inputs, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));
  auto [in_offset, donated] =
      compute_dynamic_offset(inputs[1], in.strides(), axes_, stream());
-  copy_inplace(
+  copy_cpu_inplace(
      /* const array& src = */ in,
      /* array& dst = */ out,
      /* const Shape& data_shape = */ out.shape(),
@@ -372,11 +372,11 @@ void DynamicSliceUpdate::eval_cpu(
  auto ctype = in.flags().contiguous && in.size() == in.data_size()
      ? CopyType::Vector
      : CopyType::General;
-  copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());
+  copy_cpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());

  auto [out_offset, donated] =
      compute_dynamic_offset(inputs[2], out.strides(), axes_, stream());
-  copy_inplace(
+  copy_cpu_inplace(
      /* const array& src = */ upd,
      /* array& dst = */ out,
      /* const std::vector<int>& data_shape = */ upd.shape(),
@@ -412,14 +412,14 @@ void SliceUpdate::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto ctype = in.flags().contiguous && in.size() == in.data_size()
      ? CopyType::Vector
      : CopyType::General;
-  copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());
+  copy_cpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());

  // Calculate out strides, initial offset and if copy needs to be made
  auto [data_offset, out_strides] =
      prepare_slice(out, start_indices_, strides_);

  // Do copy
-  copy_inplace(
+  copy_cpu_inplace(
      /* const array& src = */ upd,
      /* array& dst = */ out,
      /* const std::vector<int>& data_shape = */ upd.shape(),
@@ -456,9 +456,9 @@ void View::eval_cpu(const std::vector<array>& inputs, array& out) {
    if (in.dtype() == bool_) {
      auto in_tmp = array(in.shape(), uint8, nullptr, {});
      in_tmp.copy_shared_buffer(in);
-      copy_inplace(in_tmp, tmp, CopyType::General, stream());
+      copy_cpu_inplace(in_tmp, tmp, CopyType::General, stream());
    } else {
-      copy_inplace(in, tmp, CopyType::General, stream());
+      copy_cpu_inplace(in, tmp, CopyType::General, stream());
    }

    auto flags = out.flags();
--- a/mlx/backend/cpu/qrf.cpp
+++ b/mlx/backend/cpu/qrf.cpp
@@ -26,7 +26,7 @@ void qrf_impl(const array& a, array& q, array& r, Stream stream) {
  strides[in.ndim() - 2] = 1;
  strides[in.ndim() - 1] = M;
  in.set_data(allocator::malloc(in.nbytes()), in.nbytes(), strides, flags);
-  copy_inplace(a, in, CopyType::GeneralGeneral, stream);
+  copy_cpu_inplace(a, in, CopyType::GeneralGeneral, stream);
  auto& encoder = cpu::get_command_encoder(stream);
  q.set_data(allocator::malloc(q.nbytes()));
  r.set_data(allocator::malloc(r.nbytes()));
--- a/mlx/backend/cpu/quantized.cpp
+++ b/mlx/backend/cpu/quantized.cpp
@@ -529,7 +529,7 @@ void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
      return arr;
    } else {
      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy(arr, temps.back(), CopyType::General, s);
+      copy_cpu(arr, temps.back(), CopyType::General, s);
      return temps.back();
    }
  };
@@ -579,7 +579,7 @@ void GatherQMM::eval_cpu(const std::vector<array>& inputs, array& out) {
      return arr;
    } else {
      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
-      copy(arr, temps.back(), CopyType::General, s);
+      copy_cpu(arr, temps.back(), CopyType::General, s);
      return temps.back();
    }
  };
@@ -713,7 +713,7 @@ void fast::AffineQuantize::eval_cpu(
      return std::make_pair(arr, false);
    } else {
      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-      copy(arr, arr_copy, CopyType::General, s);
+      copy_cpu(arr, arr_copy, CopyType::General, s);
      return std::make_pair(arr_copy, true);
    }
  };
--- a/mlx/backend/cpu/reduce.cpp
+++ b/mlx/backend/cpu/reduce.cpp
@@ -325,7 +325,15 @@ struct MaxReduce {
  };

  template <int N, typename T>
-  T operator()(simd::Simd<T, N> x) {
+  std::enable_if_t<std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) { // Integral T: plain max
+    return simd::max(x);
+  };
+
+  template <int N, typename T>
+  std::enable_if_t<!std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) { // Non-integral T
+    if (simd::any(x != x)) { // x != x holds only for NaN, so any NaN lane yields NaN
+      return static_cast<T>(NAN);
+    }
    return simd::max(x);
  };
 };
@@ -342,7 +350,15 @@ struct MinReduce {
  };

  template <int N, typename T>
-  T operator()(simd::Simd<T, N> x) {
+  std::enable_if_t<std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) { // Integral T: plain min
+    return simd::min(x);
+  };
+
+  template <int N, typename T>
+  std::enable_if_t<!std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) { // Non-integral T
+    if (simd::any(x != x)) { // x != x holds only for NaN, so any NaN lane yields NaN
+      return static_cast<T>(NAN);
+    }
    return simd::min(x);
  };
 };
@@ -527,10 +543,10 @@ void Reduce::eval_cpu(const std::vector<array>& inputs, array& out) {
            reduce_dispatch_min_max<uint64_t>(in, out, reduce_type_, axes_);
            break;
          case int8:
-            reduce_dispatch_min_max<uint8_t>(in, out, reduce_type_, axes_);
+            reduce_dispatch_min_max<int8_t>(in, out, reduce_type_, axes_);
            break;
          case int16:
-            reduce_dispatch_min_max<uint16_t>(in, out, reduce_type_, axes_);
+            reduce_dispatch_min_max<int16_t>(in, out, reduce_type_, axes_);
            break;
          case int32:
            reduce_dispatch_min_max<int32_t>(in, out, reduce_type_, axes_);
--- a/mlx/backend/cpu/scan.cpp
+++ b/mlx/backend/cpu/scan.cpp
@@ -251,7 +251,7 @@ void Scan::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto in = inputs[0];
  if (!in.flags().row_contiguous) {
    array arr_copy(in.shape(), in.dtype(), nullptr, {});
-    copy(in, arr_copy, CopyType::General, stream());
+    copy_cpu(in, arr_copy, CopyType::General, stream());
    in = arr_copy;
    encoder.add_temporary(arr_copy);
  }
--- a/mlx/backend/cpu/softmax.cpp
+++ b/mlx/backend/cpu/softmax.cpp
@@ -132,7 +132,7 @@ void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
      return x;
    } else {
      array x_copy(x.shape(), x.dtype(), nullptr, {});
-      copy(x, x_copy, CopyType::General, s);
+      copy_cpu(x, x_copy, CopyType::General, s);
      out.copy_shared_buffer(x_copy);
      return x_copy;
    }
--- a/mlx/backend/cpu/sort.cpp
+++ b/mlx/backend/cpu/sort.cpp
@@ -334,8 +334,10 @@ void Sort::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];

  // Copy input to output
-  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
-  copy(in, out, ctype, stream());
+  CopyType ctype = (in.flags().contiguous && in.strides()[axis_] != 0)
+      ? CopyType::Vector
+      : CopyType::General;
+  copy_cpu(in, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_output_array(out);
@@ -426,8 +428,10 @@ void Partition::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];

  // Copy input to output
-  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
-  copy(in, out, ctype, stream());
+  CopyType ctype = (in.flags().contiguous && in.strides()[axis_] != 0)
+      ? CopyType::Vector
+      : CopyType::General;
+  copy_cpu(in, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_output_array(out);
--- a/mlx/backend/cpu/svd.cpp
+++ b/mlx/backend/cpu/svd.cpp
@@ -31,7 +31,7 @@ void svd_impl(

  // lapack clobbers the input, so we have to make a copy.
  array in(a.shape(), a.dtype(), nullptr, {});
-  copy(
+  copy_cpu(
      a,
      in,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -35,12 +35,14 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/row_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/rms_norm.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/rope.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)

 target_compile_definitions(mlx PRIVATE MLX_USE_CUDA)
@@ -67,6 +69,11 @@ target_include_directories(mlx PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/gen")
 target_compile_options(mlx
                       PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>")

+# Enable calling host constexpr functions from device. This is needed because
+# the constexpr version of isnan is host only.
+target_compile_options(
+  mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>")
+
 # CUDA 12.8 emits warning #20280-D for copy kernels which is a false positive.
 # Explicitly pass this flag to suppress the warning, it is safe to set it to
 # true but the warning wouldn't be suppressed.
@@ -119,3 +126,7 @@ target_link_libraries(mlx PRIVATE CUDA::nvrtc CUDA::cuda_driver)
 # Suppress nvcc warnings on MLX headers.
 target_compile_options(mlx PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe
                                   --diag_suppress=997>)
+
+# Install CCCL headers for JIT.
+install(DIRECTORY ${cccl_SOURCE_DIR}/include/cuda
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cccl)
--- a/mlx/backend/cuda/arg_reduce.cu
+++ b/mlx/backend/cuda/arg_reduce.cu
@@ -1,6 +1,7 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/common/utils.h"
 #include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/fp16_math.cuh"
 #include "mlx/backend/cuda/iterators/strided_iterator.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/dtype_utils.h"
@@ -151,30 +152,29 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_real_types(in.dtype(), "ArgReduce", [&](auto type_tag) {
-      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-      constexpr uint32_t N_READS = 4;
-      dispatch_block_dim(
-          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-            dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
-            auto kernel =
-                cu::arg_reduce_general<T, cu::ArgMax<T>, block_dim(), N_READS>;
-            if (reduce_type_ == ArgReduce::ArgMin) {
-              kernel = cu::
-                  arg_reduce_general<T, cu::ArgMin<T>, block_dim(), N_READS>;
-            }
-            kernel<<<num_blocks, block_dim(), 0, stream>>>(
-                in.data<T>(),
-                out.data<uint32_t>(),
-                out.size(),
-                const_param(shape),
-                const_param(in_strides),
-                const_param(out_strides),
-                ndim,
-                axis_stride,
-                axis_size);
-          });
+  dispatch_real_types(in.dtype(), "ArgReduce", [&](auto type_tag) {
+    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+    constexpr uint32_t N_READS = 4;
+    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+      dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
+      auto kernel =
+          cu::arg_reduce_general<T, cu::ArgMax<T>, block_dim(), N_READS>;
+      if (reduce_type_ == ArgReduce::ArgMin) {
+        kernel = cu::arg_reduce_general<T, cu::ArgMin<T>, block_dim(), N_READS>;
+      }
+      encoder.add_kernel_node(
+          kernel,
+          num_blocks,
+          block_dim(),
+          in.data<T>(),
+          out.data<uint32_t>(),
+          out.size(),
+          const_param(shape),
+          const_param(in_strides),
+          const_param(out_strides),
+          ndim,
+          axis_stride,
+          axis_size);
    });
  });
 }
--- a/mlx/backend/cuda/binary.cu
+++ b/mlx/backend/cuda/binary.cu
@@ -3,7 +3,6 @@
 #include "mlx/backend/common/binary.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/device/binary_ops.cuh"
-#include "mlx/backend/cuda/device/cucomplex_math.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"
@@ -17,35 +16,86 @@ namespace cu {

 namespace cg = cooperative_groups;

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS> // N_READS outputs per thread
 __global__ void binary_ss(const In* a, const In* b, Out* out, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[0], b[0]);
+
+  if ((index + 1) * N_READS > size) { // Chunk would run past size: finish element by element
+    for (IdxT i = index * N_READS; i < size; ++i) { // IdxT, not int: index * N_READS may exceed 32 bits
+      out[i] = Op{}(a[0], b[0]);
+    }
+  } else { // Full chunk: build N_READS results, then write them with store_vector
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a[0], b[0]); // Both operands are read only at index 0
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
   }
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS> // N_READS outputs per thread
 __global__ void binary_sv(const In* a, const In* b, Out* out, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[0], b[index]);
+
+  if ((index + 1) * N_READS > size) { // Chunk would run past size: finish element by element
+    for (IdxT i = index * N_READS; i < size; ++i) {
+      out[i] = Op{}(a[0], b[i]); // a is read only at index 0; b is indexed per element
+    }
+  } else { // Full chunk: go through load_vector / store_vector
+    auto b_vec = load_vector<N_READS>(b, index);
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a[0], b_vec.val[i]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
   }
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS> // N_READS outputs per thread
 __global__ void binary_vs(const In* a, const In* b, Out* out, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[index], b[0]);
+
+  if ((index + 1) * N_READS > size) { // Chunk would run past size: finish element by element
+    for (IdxT i = index * N_READS; i < size; ++i) {
+      out[i] = Op{}(a[i], b[0]); // a is indexed per element; b is read only at index 0
+    }
+  } else { // Full chunk: go through load_vector / store_vector
+    auto a_vec = load_vector<N_READS>(a, index);
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a_vec.val[i], b[0]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
   }
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS> // N_READS outputs per thread
 __global__ void binary_vv(const In* a, const In* b, Out* out, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[index], b[index]);
+
+  if ((index + 1) * N_READS > size) { // Chunk would run past size: finish element by element
+    for (IdxT i = index * N_READS; i < size; ++i) {
+      out[i] = Op{}(a[i], b[i]); // Both operands are indexed per element
+    }
+  } else { // Full chunk: go through load_vector / store_vector
+    auto a_vec = load_vector<N_READS>(a, index);
+    auto b_vec = load_vector<N_READS>(b, index);
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a_vec.val[i], b_vec.val[i]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
   }
 }

@@ -126,7 +176,7 @@ template <typename Op>
 void binary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
-    std::string_view op,
+    const char* op,
    const Stream& s) {
  assert(inputs.size() > 1);
  const auto& a = inputs[0];
@@ -139,90 +189,99 @@ void binary_op_gpu_inplace(
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
-      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-        using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
-        using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
-        if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
-          using InType = cuda_type_t<CTYPE_IN>;
-          using OutType = cuda_type_t<CTYPE_OUT>;
-          auto bopt = get_binary_op_type(a, b);
-          if (bopt == BinaryOpType::General) {
-            dispatch_bool(
-                a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
-                    out.data_size() > INT32_MAX,
-                [&](auto large) {
-                  using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-                  Shape shape;
-                  std::vector<Strides> strides;
-                  std::tie(shape, strides) =
-                      collapse_contiguous_dims(a, b, out);
-                  auto& a_strides = strides[0];
-                  auto& b_strides = strides[1];
-                  int ndim = shape.size();
-                  if (ndim <= 3) {
-                    dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                      auto kernel = cu::binary_g_nd<
-                          Op,
-                          InType,
-                          OutType,
-                          IdxT,
-                          dims_constant()>;
-                      auto [num_blocks, block_dims] =
-                          get_launch_args(kernel, out, large());
-                      kernel<<<num_blocks, block_dims, 0, stream>>>(
-                          a.data<InType>(),
-                          b.data<InType>(),
-                          out.data<OutType>(),
-                          out.size(),
-                          const_param<dims_constant()>(shape),
-                          const_param<dims_constant()>(a_strides),
-                          const_param<dims_constant()>(b_strides));
-                    });
-                  } else {
-                    auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
+  dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
+      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
+      if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+        using InType = cuda_type_t<CTYPE_IN>;
+        using OutType = cuda_type_t<CTYPE_OUT>;
+        auto bopt = get_binary_op_type(a, b);
+        if (bopt == BinaryOpType::General) {
+          dispatch_bool(
+              a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+                  out.data_size() > INT32_MAX,
+              [&](auto large) {
+                using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+                Shape shape;
+                std::vector<Strides> strides;
+                std::tie(shape, strides) = collapse_contiguous_dims(a, b, out);
+                auto& a_strides = strides[0];
+                auto& b_strides = strides[1];
+                int ndim = shape.size();
+                if (ndim <= 3) {
+                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                    auto kernel = cu::
+                        binary_g_nd<Op, InType, OutType, IdxT, dims_constant()>;
                    auto [num_blocks, block_dims] =
                        get_launch_args(kernel, out, large());
-                    kernel<<<num_blocks, block_dims, 0, stream>>>(
+                    encoder.add_kernel_node(
+                        kernel,
+                        num_blocks,
+                        block_dims,
                        a.data<InType>(),
                        b.data<InType>(),
                        out.data<OutType>(),
                        out.size(),
-                        const_param(shape),
-                        const_param(a_strides),
-                        const_param(b_strides),
-                        ndim);
-                  }
-                });
-          } else {
-            dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
-              using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-              auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
-              if (bopt == BinaryOpType::ScalarVector) {
-                kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
-              } else if (bopt == BinaryOpType::VectorScalar) {
-                kernel = cu::binary_vs<Op, InType, OutType, IdxT>;
-              } else if (bopt == BinaryOpType::VectorVector) {
-                kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
-              }
-              auto [num_blocks, block_dims] = get_launch_args(
-                  kernel, out.data_size(), out.shape(), out.strides(), large());
-              kernel<<<num_blocks, block_dims, 0, stream>>>(
-                  a.data<InType>(),
-                  b.data<InType>(),
-                  out.data<OutType>(),
-                  out.data_size());
-            });
-          }
+                        const_param<dims_constant()>(shape),
+                        const_param<dims_constant()>(a_strides),
+                        const_param<dims_constant()>(b_strides));
+                  });
+                } else {
+                  auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out, large());
+                  encoder.add_kernel_node(
+                      kernel,
+                      num_blocks,
+                      block_dims,
+                      a.data<InType>(),
+                      b.data<InType>(),
+                      out.data<OutType>(),
+                      out.size(),
+                      const_param(shape),
+                      const_param(a_strides),
+                      const_param(b_strides),
+                      ndim);
+                }
+              });
        } else {
-          throw std::runtime_error(fmt::format(
-              "Can not do binary op {} on inputs of {} with result of {}.",
-              op,
-              dtype_to_string(a.dtype()),
-              dtype_to_string(out.dtype())));
+          dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
+            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+            // TODO: Choose optimized value based on type size.
+            constexpr int N_READS = 4;
+            auto kernel = cu::binary_ss<Op, InType, OutType, IdxT, N_READS>;
+            if (bopt == BinaryOpType::ScalarVector) {
+              kernel = cu::binary_sv<Op, InType, OutType, IdxT, N_READS>;
+            } else if (bopt == BinaryOpType::VectorScalar) {
+              kernel = cu::binary_vs<Op, InType, OutType, IdxT, N_READS>;
+            } else if (bopt == BinaryOpType::VectorVector) {
+              kernel = cu::binary_vv<Op, InType, OutType, IdxT, N_READS>;
+            }
+            auto [num_blocks, block_dims] = get_launch_args(
+                kernel,
+                out.data_size(),
+                out.shape(),
+                out.strides(),
+                large(),
+                N_READS);
+            encoder.add_kernel_node(
+                kernel,
+                num_blocks,
+                block_dims,
+                a.data<InType>(),
+                b.data<InType>(),
+                out.data<OutType>(),
+                out.data_size());
+          });
        }
-      });
+      } else {
+        throw std::runtime_error(fmt::format(
+            "Can not do binary op {} on inputs of {} with result of {}.",
+            op,
+            dtype_to_string(a.dtype()),
+            dtype_to_string(out.dtype())));
+      }
    });
  });
 }
@@ -231,7 +290,7 @@ template <typename Op>
 void binary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
-    std::string_view op,
+    const char* op,
    const Stream& s) {
  auto& a = inputs[0];
  auto& b = inputs[1];
@@ -240,11 +299,11 @@ void binary_op_gpu(
  binary_op_gpu_inplace<Op>(inputs, out, op, s);
 }

-#define BINARY_GPU(func)                                                 \
-  void func::eval_gpu(const std::vector<array>& inputs, array& out) {    \
-    nvtx3::scoped_range r(#func "::eval_gpu");                           \
-    auto& s = out.primitive().stream();                                  \
-    binary_op_gpu<cu::func>(inputs, out, get_primitive_string(this), s); \
+#define BINARY_GPU(func) /* Defines func::eval_gpu: opens an NVTX range named after func, then calls binary_op_gpu<cu::func> with name() as its `op` argument. */ \
+  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
+    nvtx3::scoped_range r(#func "::eval_gpu");                        \
+    auto& s = out.primitive().stream();                               \
+    binary_op_gpu<cu::func>(inputs, out, name(), s);                  \
   }

 BINARY_GPU(Add)
@@ -268,33 +327,31 @@ BINARY_GPU(Subtract)
 void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
   nvtx3::scoped_range r("Equal::eval_gpu");
   auto& s = out.primitive().stream();
-  auto op = get_primitive_string(this);
   if (equal_nan_) {
-    binary_op_gpu<cu::NaNEqual>(inputs, out, op, s);
+    binary_op_gpu<cu::NaNEqual>(inputs, out, name(), s); // equal_nan_ is set: use the NaNEqual functor; name() is passed as the `op` argument.
   } else {
-    binary_op_gpu<cu::Equal>(inputs, out, op, s);
+    binary_op_gpu<cu::Equal>(inputs, out, name(), s); // equal_nan_ is not set: use the plain Equal functor.
   }
 }

 void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
   nvtx3::scoped_range r("BitwiseBinary::eval_gpu");
   auto& s = out.primitive().stream();
-  auto op = get_primitive_string(this);
   switch (op_) {
     case BitwiseBinary::And:
-      binary_op_gpu<cu::BitwiseAnd>(inputs, out, op, s);
+      binary_op_gpu<cu::BitwiseAnd>(inputs, out, name(), s); // Each case maps op_ to its cu:: functor; name() is passed as binary_op_gpu's `op` argument.
       break;
     case BitwiseBinary::Or:
-      binary_op_gpu<cu::BitwiseOr>(inputs, out, op, s);
+      binary_op_gpu<cu::BitwiseOr>(inputs, out, name(), s);
       break;
     case BitwiseBinary::Xor:
-      binary_op_gpu<cu::BitwiseXor>(inputs, out, op, s);
+      binary_op_gpu<cu::BitwiseXor>(inputs, out, name(), s);
       break;
     case BitwiseBinary::LeftShift:
-      binary_op_gpu<cu::LeftShift>(inputs, out, op, s);
+      binary_op_gpu<cu::LeftShift>(inputs, out, name(), s);
       break;
     case BitwiseBinary::RightShift:
-      binary_op_gpu<cu::RightShift>(inputs, out, op, s);
+      binary_op_gpu<cu::RightShift>(inputs, out, name(), s); // No default case: any other op_ value falls through the switch without launching anything.
       break;
   }
 }
--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@@ -3,7 +3,6 @@
 #include "mlx/backend/common/binary.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/device/binary_ops.cuh"
-#include "mlx/backend/cuda/device/cucomplex_math.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"
@@ -17,52 +16,119 @@ namespace cu {

 namespace cg = cooperative_groups;

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void
-binary_ss(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+binary_two_ss(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto out = Op{}(a[0], b[0]);
-    out_a[0] = out[0];
-    out_b[0] = out[1];
+  // Scalar-scalar, two-output kernel: every element i gets Op(a[0], b[0]); component 0 goes to out_a, component 1 to out_b.
+  if ((index + 1) * N_READS > size) { // Fewer than N_READS elements (possibly none) remain for this thread.
+    for (IdxT i = index * N_READS; i < size; ++i) { // NOTE(review): both products are computed in IdxT; confirm they cannot wrap for a 32-bit IdxT.
+      auto out = Op{}(a[0], b[0]);
+      out_a[i] = out[0];
+      out_b[i] = out[1];
+    }
+  } else { // A full chunk of N_READS elements fits below `size`.
+    AlignedVector<Out, N_READS> out_a_vec;
+    AlignedVector<Out, N_READS> out_b_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) { // Fills both chunk buffers with the same pair of results.
+      auto out = Op{}(a[0], b[0]);
+      out_a_vec.val[i] = out[0];
+      out_b_vec.val[i] = out[1];
+    }
+
+    store_vector<N_READS>(out_a, index, out_a_vec); // Presumably writes the index-th N_READS-wide chunk of out_a (helper not shown here); confirm.
+    store_vector<N_READS>(out_b, index, out_b_vec); // Same chunk position, written to out_b.
   }
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void
-binary_sv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+binary_two_sv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto out = Op{}(a[0], b[index]);
-    out_a[index] = out[0];
-    out_b[index] = out[1];
+  // Scalar-vector, two-output kernel: element i gets Op(a[0], b[i]); component 0 goes to out_a, component 1 to out_b.
+  if ((index + 1) * N_READS > size) { // Fewer than N_READS elements (possibly none) remain for this thread.
+    for (IdxT i = index * N_READS; i < size; ++i) { // NOTE(review): both products are computed in IdxT; confirm they cannot wrap for a 32-bit IdxT.
+      auto out = Op{}(a[0], b[i]);
+      out_a[i] = out[0];
+      out_b[i] = out[1];
+    }
+  } else { // A full chunk of N_READS elements fits below `size`.
+    auto b_vec = load_vector<N_READS>(b, index); // Presumably reads the index-th N_READS-wide chunk of b (helper not shown here); confirm.
+
+    AlignedVector<Out, N_READS> out_a_vec;
+    AlignedVector<Out, N_READS> out_b_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) { // a[0] is paired with every element of b's chunk.
+      auto out = Op{}(a[0], b_vec.val[i]);
+      out_a_vec.val[i] = out[0];
+      out_b_vec.val[i] = out[1];
+    }
+
+    store_vector<N_READS>(out_a, index, out_a_vec); // Presumably writes the chunk to the matching position in out_a; confirm.
+    store_vector<N_READS>(out_b, index, out_b_vec); // Same chunk position, written to out_b.
   }
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void
-binary_vs(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+binary_two_vs(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto out = Op{}(a[index], b[0]);
-    out_a[index] = out[0];
-    out_b[index] = out[1];
+  // Vector-scalar, two-output kernel: element i gets Op(a[i], b[0]); component 0 goes to out_a, component 1 to out_b.
+  if ((index + 1) * N_READS > size) { // Fewer than N_READS elements (possibly none) remain for this thread.
+    for (IdxT i = index * N_READS; i < size; ++i) { // NOTE(review): both products are computed in IdxT; confirm they cannot wrap for a 32-bit IdxT.
+      auto out = Op{}(a[i], b[0]);
+      out_a[i] = out[0];
+      out_b[i] = out[1];
+    }
+  } else { // A full chunk of N_READS elements fits below `size`.
+    auto a_vec = load_vector<N_READS>(a, index); // Presumably reads the index-th N_READS-wide chunk of a (helper not shown here); confirm.
+
+    AlignedVector<Out, N_READS> out_a_vec;
+    AlignedVector<Out, N_READS> out_b_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) { // b[0] is paired with every element of a's chunk.
+      auto out = Op{}(a_vec.val[i], b[0]);
+      out_a_vec.val[i] = out[0];
+      out_b_vec.val[i] = out[1];
+    }
+
+    store_vector<N_READS>(out_a, index, out_a_vec); // Presumably writes the chunk to the matching position in out_a; confirm.
+    store_vector<N_READS>(out_b, index, out_b_vec); // Same chunk position, written to out_b.
   }
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void
-binary_vv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
+binary_two_vv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    auto out = Op{}(a[index], b[index]);
-    out_a[index] = out[0];
-    out_b[index] = out[1];
+  // Vector-vector, two-output kernel: element i gets Op(a[i], b[i]); component 0 goes to out_a, component 1 to out_b.
+  if ((index + 1) * N_READS > size) { // Fewer than N_READS elements (possibly none) remain for this thread.
+    for (IdxT i = index * N_READS; i < size; ++i) { // NOTE(review): both products are computed in IdxT; confirm they cannot wrap for a 32-bit IdxT.
+      auto out = Op{}(a[i], b[i]);
+      out_a[i] = out[0];
+      out_b[i] = out[1];
+    }
+  } else { // A full chunk of N_READS elements fits below `size`.
+    auto a_vec = load_vector<N_READS>(a, index); // Presumably reads the index-th N_READS-wide chunk of a (helper not shown here); confirm.
+    auto b_vec = load_vector<N_READS>(b, index); // Same chunk position, taken from b.
+
+    AlignedVector<Out, N_READS> out_a_vec;
+    AlignedVector<Out, N_READS> out_b_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) { // Element i of a's chunk is combined with element i of b's chunk.
+      auto out = Op{}(a_vec.val[i], b_vec.val[i]);
+      out_a_vec.val[i] = out[0];
+      out_b_vec.val[i] = out[1];
+    }
+
+    store_vector<N_READS>(out_a, index, out_a_vec); // Presumably writes the chunk to the matching position in out_a; confirm.
+    store_vector<N_READS>(out_b, index, out_b_vec); // Same chunk position, written to out_b.
   }
 }

 template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
-__global__ void binary_g_nd(
+__global__ void binary_two_g_nd(
    const In* a,
    const In* b,
    Out* out_a,
@@ -82,7 +148,7 @@ __global__ void binary_g_nd(
 }

 template <typename Op, typename In, typename Out, typename IdxT>
-__global__ void binary_g(
+__global__ void binary_two_g(
    const In* a,
    const In* b,
    Out* out_a,
@@ -103,7 +169,7 @@ __global__ void binary_g(
 }

 template <typename Op, typename In, typename Out>
-constexpr bool supports_binary_op() {
+constexpr bool supports_binary_two_op() {
  if (std::is_same_v<Op, DivMod>) {
    return std::is_same_v<In, Out> &&
        (std::is_integral_v<Out> || is_floating_v<Out>);
@@ -114,10 +180,10 @@ constexpr bool supports_binary_op() {
 } // namespace cu

 template <typename Op>
-void binary_op_gpu_inplace(
+void binary_two_op_gpu_inplace(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    std::string_view op,
+    const char* op,
    const Stream& s) {
  assert(inputs.size() > 1);
  const auto& a = inputs[0];
@@ -137,114 +203,124 @@ void binary_op_gpu_inplace(
  encoder.set_input_array(b);
  encoder.set_output_array(out_a);
  encoder.set_output_array(out_b);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
-      dispatch_all_types(out_a.dtype(), [&](auto out_type_tag) {
-        using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
-        using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
-        if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
-          using InType = cuda_type_t<CTYPE_IN>;
-          using OutType = cuda_type_t<CTYPE_OUT>;
+  dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out_a.dtype(), [&](auto out_type_tag) {
+      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
+      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
+      if constexpr (cu::supports_binary_two_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+        using InType = cuda_type_t<CTYPE_IN>;
+        using OutType = cuda_type_t<CTYPE_OUT>;

-          auto bopt = get_binary_op_type(a, b);
-          if (bopt == BinaryOpType::General) {
-            dispatch_bool(
-                a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
-                    out_a.data_size() > INT32_MAX,
-                [&](auto large) {
-                  using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-                  Shape shape;
-                  std::vector<Strides> strides;
-                  std::tie(shape, strides) =
-                      collapse_contiguous_dims(a, b, out_a);
-                  auto& a_strides = strides[0];
-                  auto& b_strides = strides[1];
-                  int ndim = shape.size();
-                  if (ndim <= 3) {
-                    dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                      auto kernel = cu::binary_g_nd<
-                          Op,
-                          InType,
-                          OutType,
-                          IdxT,
-                          dims_constant()>;
-                      auto [num_blocks, block_dims] =
-                          get_launch_args(kernel, out_a, large());
-                      kernel<<<num_blocks, block_dims, 0, stream>>>(
-                          a.data<InType>(),
-                          b.data<InType>(),
-                          out_a.data<OutType>(),
-                          out_b.data<OutType>(),
-                          out_a.size(),
-                          const_param<dims_constant()>(shape),
-                          const_param<dims_constant()>(a_strides),
-                          const_param<dims_constant()>(b_strides));
-                    });
-                  } else {
-                    auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
+        auto bopt = get_binary_op_type(a, b);
+        if (bopt == BinaryOpType::General) {
+          dispatch_bool(
+              a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+                  out_a.data_size() > INT32_MAX,
+              [&](auto large) {
+                using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+                Shape shape;
+                std::vector<Strides> strides;
+                std::tie(shape, strides) =
+                    collapse_contiguous_dims(a, b, out_a);
+                auto& a_strides = strides[0];
+                auto& b_strides = strides[1];
+                int ndim = shape.size();
+                if (ndim <= 3) {
+                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                    auto kernel = cu::binary_two_g_nd<
+                        Op,
+                        InType,
+                        OutType,
+                        IdxT,
+                        dims_constant()>;
                    auto [num_blocks, block_dims] =
                        get_launch_args(kernel, out_a, large());
-                    kernel<<<num_blocks, block_dims, 0, stream>>>(
+                    encoder.add_kernel_node(
+                        kernel,
+                        num_blocks,
+                        block_dims,
                        a.data<InType>(),
                        b.data<InType>(),
                        out_a.data<OutType>(),
                        out_b.data<OutType>(),
                        out_a.size(),
-                        const_param(shape),
-                        const_param(a_strides),
-                        const_param(b_strides),
-                        ndim);
-                  }
-                });
-          } else {
-            dispatch_bool(out_a.data_size() > INT32_MAX, [&](auto large) {
-              using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-              auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
-              if (bopt == BinaryOpType::ScalarVector) {
-                kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
-              } else if (bopt == BinaryOpType::VectorScalar) {
-                kernel = cu::binary_vs<Op, InType, OutType, IdxT>;
-              } else if (bopt == BinaryOpType::VectorVector) {
-                kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
-              }
-              auto [num_blocks, block_dims] = get_launch_args(
-                  kernel,
-                  out_a.data_size(),
-                  out_a.shape(),
-                  out_a.strides(),
-                  large());
-              kernel<<<num_blocks, block_dims, 0, stream>>>(
-                  a.data<InType>(),
-                  b.data<InType>(),
-                  out_a.data<OutType>(),
-                  out_b.data<OutType>(),
-                  out_a.data_size());
-            });
-          }
+                        const_param<dims_constant()>(shape),
+                        const_param<dims_constant()>(a_strides),
+                        const_param<dims_constant()>(b_strides));
+                  });
+                } else {
+                  auto kernel = cu::binary_two_g<Op, InType, OutType, IdxT>;
+                  auto [num_blocks, block_dims] =
+                      get_launch_args(kernel, out_a, large());
+                  encoder.add_kernel_node(
+                      kernel,
+                      num_blocks,
+                      block_dims,
+                      a.data<InType>(),
+                      b.data<InType>(),
+                      out_a.data<OutType>(),
+                      out_b.data<OutType>(),
+                      out_a.size(),
+                      const_param(shape),
+                      const_param(a_strides),
+                      const_param(b_strides),
+                      ndim);
+                }
+              });
        } else {
-          throw std::runtime_error(fmt::format(
-              "Can not do binary op {} on inputs of {} with result of {}.",
-              op,
-              dtype_to_string(a.dtype()),
-              dtype_to_string(out_a.dtype())));
+          dispatch_bool(out_a.data_size() > UINT32_MAX, [&](auto large) {
+            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+            // TODO: Choose optimized value based on type size.
+            constexpr int N_READS = 4;
+            auto kernel = cu::binary_two_ss<Op, InType, OutType, IdxT, N_READS>;
+            if (bopt == BinaryOpType::ScalarVector) {
+              kernel = cu::binary_two_sv<Op, InType, OutType, IdxT, N_READS>;
+            } else if (bopt == BinaryOpType::VectorScalar) {
+              kernel = cu::binary_two_vs<Op, InType, OutType, IdxT, N_READS>;
+            } else if (bopt == BinaryOpType::VectorVector) {
+              kernel = cu::binary_two_vv<Op, InType, OutType, IdxT, N_READS>;
+            }
+            auto [num_blocks, block_dims] = get_launch_args(
+                kernel,
+                out_a.data_size(),
+                out_a.shape(),
+                out_a.strides(),
+                large(),
+                N_READS);
+            encoder.add_kernel_node(
+                kernel,
+                num_blocks,
+                block_dims,
+                a.data<InType>(),
+                b.data<InType>(),
+                out_a.data<OutType>(),
+                out_b.data<OutType>(),
+                out_a.data_size());
+          });
        }
-      });
+      } else {
+        throw std::runtime_error(fmt::format(
+            "Can not do binary op {} on inputs of {} with result of {}.",
+            op,
+            dtype_to_string(a.dtype()),
+            dtype_to_string(out_a.dtype())));
+      }
    });
  });
 }

 template <typename Op>
-void binary_op_gpu(
+void binary_two_op_gpu( // Calls set_binary_op_output_data on outputs[0] and outputs[1], then delegates to binary_two_op_gpu_inplace<Op>.
     const std::vector<array>& inputs,
     std::vector<array>& outputs,
-    std::string_view op,
+    const char* op, // Not used here; forwarded unchanged to binary_two_op_gpu_inplace.
     const Stream& s) {
   auto& a = inputs[0];
   auto& b = inputs[1];
   auto bopt = get_binary_op_type(a, b);
   set_binary_op_output_data(a, b, outputs[0], bopt);
   set_binary_op_output_data(a, b, outputs[1], bopt);
-  binary_op_gpu_inplace<Op>(inputs, outputs, op, s);
+  binary_two_op_gpu_inplace<Op>(inputs, outputs, op, s); // Runs only after both outputs went through set_binary_op_output_data with the same bopt.
 }

 void DivMod::eval_gpu(
@@ -252,7 +328,7 @@ void DivMod::eval_gpu(
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("DivMod::eval_gpu");
  auto& s = outputs[0].primitive().stream();
-  binary_op_gpu<cu::DivMod>(inputs, outputs, get_primitive_string(this), s);
+  binary_two_op_gpu<cu::DivMod>(inputs, outputs, name(), s);
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/compiled.cpp
+++ b/mlx/backend/cuda/compiled.cpp
@@ -3,6 +3,7 @@
 #include "mlx/backend/common/compiled.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/jit_module.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/graph_utils.h"
 #include "mlx/primitives.h"

@@ -105,9 +106,7 @@ struct FusedKernelBuilder {
        value = fmt::format(
            "static_cast<{}>(tmp_{})", type, namer.get_name(x.inputs()[0]));
      } else {
-        std::ostringstream ss;
-        x.primitive().print(ss);
-        value = ss.str();
+        value = x.primitive().name();
        value += "{}(";
        for (size_t i = 0; i < x.inputs().size() - 1; ++i) {
          value += fmt::format("tmp_{}, ", namer.get_name(x.inputs()[i]));
@@ -178,6 +177,7 @@ void Compiled::eval_gpu(
  // Whether to use large index.
  bool large = compiled_use_large_index(inputs, outputs, contiguous);

+  cu::KernelArgs args;
  // Put inputs.
  int strides_index = 1;
  for (size_t i = 0; i < inputs.size(); ++i) {
@@ -185,26 +185,26 @@ void Compiled::eval_gpu(
      continue;
    }
    const auto& x = inputs[i];
-    mod.append_arg(x);
+    args.append(x);
    if (!contiguous && !is_scalar(x)) {
-      mod.append_arg(strides_vec[strides_index++]);
+      args.append_ptr(strides_vec[strides_index++].data());
    }
  }

  // Put outputs.
  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);
  for (auto& x : outputs) {
-    mod.append_arg(x);
+    args.append(x);
  }

  // Put shape and size.
  if (!contiguous) {
-    mod.append_arg(shape);
+    args.append_ptr(shape.data());
  }
  if (large) {
-    mod.append_arg<int64_t>(outputs[0].data_size());
+    args.append<int64_t>(outputs[0].data_size());
  } else {
-    mod.append_arg<uint32_t>(outputs[0].data_size());
+    args.append<uint32_t>(outputs[0].data_size());
  }

  // Launch kernel.
@@ -222,9 +222,10 @@ void Compiled::eval_gpu(
  for (const auto& out : outputs) {
    encoder.set_output_array(out);
  }
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    mod.launch_kernel(stream, kernel_name, outputs[0], large);
-  });
+
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, outputs[0], large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@@ -10,19 +10,43 @@ namespace cu {

 namespace cg = cooperative_groups;

-template <typename In, typename Out, typename IdxT>
+template <typename In, typename Out, typename IdxT, int N_READS>
 __global__ void copy_s(const In* in, Out* out, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = CastOp<In, Out>{}(in[0]);
+  // Scalar copy kernel: every out[i] receives cast_to<Out>(in[0]); thread `index` starts at element index * N_READS.
+  if ((index + 1) * N_READS > size) { // Fewer than N_READS elements (possibly none) remain for this thread.
+    for (IdxT i = index * N_READS; i < size; ++i) { // NOTE(review): both products are computed in IdxT; confirm they cannot wrap for a 32-bit IdxT.
+      out[i] = cast_to<Out>(in[0]);
+    }
+  } else { // A full chunk of N_READS elements fits below `size`.
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) { // Fills the chunk buffer with the same converted value.
+      out_vec.val[i] = cast_to<Out>(in[0]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec); // Presumably writes the index-th N_READS-wide chunk of out (helper not shown here); confirm.
   }
 }

-template <typename In, typename Out, typename IdxT>
+template <typename In, typename Out, typename IdxT, int N_READS>
 __global__ void copy_v(const In* in, Out* out, IdxT size) {
   IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = CastOp<In, Out>{}(in[index]);
+  // Vector copy kernel: out[i] = cast_to<Out>(in[i]); thread `index` starts at element index * N_READS.
+  if ((index + 1) * N_READS > size) { // Fewer than N_READS elements (possibly none) remain for this thread.
+    for (IdxT i = index * N_READS; i < size; ++i) { // NOTE(review): both products are computed in IdxT; confirm they cannot wrap for a 32-bit IdxT.
+      out[i] = cast_to<Out>(in[i]);
+    }
+  } else { // A full chunk of N_READS elements fits below `size`.
+    auto in_vec = load_vector<N_READS>(in, index); // Presumably reads the index-th N_READS-wide chunk of in (helper not shown here); confirm.
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) { // Converts the chunk one element at a time.
+      out_vec.val[i] = cast_to<Out>(in_vec.val[i]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec); // Presumably writes the chunk to the matching position in out; confirm.
   }
 }

@@ -35,24 +59,32 @@ void copy_contiguous(
    array& out,
    int64_t in_offset,
    int64_t out_offset) {
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-        dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
-          using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-          using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-          using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-          auto kernel = cu::copy_s<InType, OutType, IdxT>;
-          if (ctype == CopyType::Vector) {
-            kernel = cu::copy_v<InType, OutType, IdxT>;
-          }
-          auto [num_blocks, block_dims] = get_launch_args(
-              kernel, out.data_size(), out.shape(), out.strides(), large());
-          kernel<<<num_blocks, block_dims, 0, stream>>>(
-              in.data<InType>() + in_offset,
-              out.data<OutType>() + out_offset,
-              out.data_size());
-        });
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
+        using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+        using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+        // TODO: Choose optimized value based on type size.
+        constexpr int N_READS = 4;
+        auto kernel = cu::copy_s<InType, OutType, IdxT, N_READS>;
+        if (ctype == CopyType::Vector) {
+          kernel = cu::copy_v<InType, OutType, IdxT, N_READS>;
+        }
+        auto [num_blocks, block_dims] = get_launch_args(
+            kernel,
+            out.data_size(),
+            out.shape(),
+            out.strides(),
+            large(),
+            N_READS);
+        encoder.add_kernel_node(
+            kernel,
+            num_blocks,
+            block_dims,
+            in.data<InType>() + in_offset,
+            out.data<OutType>() + out_offset,
+            out.data_size());
      });
    });
  });
--- a/mlx/backend/cuda/copy/copy_general.cu
+++ b/mlx/backend/cuda/copy/copy_general.cu
@@ -55,50 +55,54 @@ void copy_general(
    const Shape& shape,
    const Strides& strides_in,
    const Strides& strides_out) {
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-        dispatch_bool(
-            in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
-            [&](auto large) {
-              using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-              using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-              const InType* in_ptr = in.data<InType>() + offset_in;
-              OutType* out_ptr = out.data<OutType>() + offset_out;
-              int ndim = shape.size();
-              size_t data_size = 1;
-              for (auto& s : shape)
-                data_size *= s;
-              if (ndim <= 3) {
-                dispatch_1_2_3(ndim, [&](auto ndim_constant) {
-                  auto kernel =
-                      cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>;
-                  auto [num_blocks, block_dims] = get_launch_args(
-                      kernel, data_size, shape, out.strides(), large());
-                  kernel<<<num_blocks, block_dims, 0, stream>>>(
-                      in_ptr,
-                      out_ptr,
-                      data_size,
-                      const_param<ndim_constant()>(shape),
-                      const_param<ndim_constant()>(strides_in),
-                      const_param<ndim_constant()>(strides_out));
-                });
-              } else { // ndim >= 4
-                auto kernel = cu::copy_gg<InType, OutType, IdxT>;
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      dispatch_bool(
+          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+          [&](auto large) {
+            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+            const InType* in_ptr = in.data<InType>() + offset_in;
+            OutType* out_ptr = out.data<OutType>() + offset_out;
+            int ndim = shape.size();
+            size_t data_size = 1;
+            for (auto& s : shape)
+              data_size *= s;
+            if (ndim <= 3) {
+              dispatch_1_2_3(ndim, [&](auto ndim_constant) {
+                auto kernel =
+                    cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>;
                auto [num_blocks, block_dims] = get_launch_args(
                    kernel, data_size, shape, out.strides(), large());
-                kernel<<<num_blocks, block_dims, 0, stream>>>(
+                encoder.add_kernel_node(
+                    kernel,
+                    num_blocks,
+                    block_dims,
                    in_ptr,
                    out_ptr,
                    data_size,
-                    const_param(shape),
-                    const_param(strides_in),
-                    const_param(strides_out),
-                    ndim);
-              }
-            });
-      });
+                    const_param<ndim_constant()>(shape),
+                    const_param<ndim_constant()>(strides_in),
+                    const_param<ndim_constant()>(strides_out));
+              });
+            } else { // ndim >= 4
+              auto kernel = cu::copy_gg<InType, OutType, IdxT>;
+              auto [num_blocks, block_dims] = get_launch_args(
+                  kernel, data_size, shape, out.strides(), large());
+              encoder.add_kernel_node(
+                  kernel,
+                  num_blocks,
+                  block_dims,
+                  in_ptr,
+                  out_ptr,
+                  data_size,
+                  const_param(shape),
+                  const_param(strides_in),
+                  const_param(strides_out),
+                  ndim);
+            }
+          });
    });
  });
 }
--- a/mlx/backend/cuda/copy/copy_general_dynamic.cu
+++ b/mlx/backend/cuda/copy/copy_general_dynamic.cu
@@ -61,54 +61,55 @@ void copy_general_dynamic(
    const Strides& strides_out,
    const array& dynamic_offset_in,
    const array& dynamic_offset_out) {
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-        dispatch_bool(
-            in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
-            [&](auto large) {
-              using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-              using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-              const InType* in_ptr = in.data<InType>() + offset_in;
-              OutType* out_ptr = out.data<OutType>() + offset_out;
-              int ndim = shape.size();
-              if (ndim <= 3) {
-                dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                  auto kernel = cu::copy_gg_dynamic_nd<
-                      InType,
-                      OutType,
-                      IdxT,
-                      dims_constant()>;
-                  auto [num_blocks, block_dims] =
-                      get_launch_args(kernel, out, large());
-                  kernel<<<num_blocks, block_dims, 0, stream>>>(
-                      in_ptr,
-                      out_ptr,
-                      out.size(),
-                      const_param<dims_constant()>(shape),
-                      const_param<dims_constant()>(strides_in),
-                      const_param<dims_constant()>(strides_out),
-                      dynamic_offset_in.data<int64_t>(),
-                      dynamic_offset_out.data<int64_t>());
-                });
-              } else { // ndim >= 4
-                auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      dispatch_bool(
+          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+          [&](auto large) {
+            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+            const InType* in_ptr = in.data<InType>() + offset_in;
+            OutType* out_ptr = out.data<OutType>() + offset_out;
+            int ndim = shape.size();
+            if (ndim <= 3) {
+              dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                auto kernel = cu::
+                    copy_gg_dynamic_nd<InType, OutType, IdxT, dims_constant()>;
                auto [num_blocks, block_dims] =
                    get_launch_args(kernel, out, large());
-                kernel<<<num_blocks, block_dims, 0, stream>>>(
+                encoder.add_kernel_node(
+                    kernel,
+                    num_blocks,
+                    block_dims,
                    in_ptr,
                    out_ptr,
                    out.size(),
-                    const_param(shape),
-                    const_param(strides_in),
-                    const_param(strides_out),
-                    ndim,
+                    const_param<dims_constant()>(shape),
+                    const_param<dims_constant()>(strides_in),
+                    const_param<dims_constant()>(strides_out),
                    dynamic_offset_in.data<int64_t>(),
                    dynamic_offset_out.data<int64_t>());
-              }
-            });
-      });
+              });
+            } else { // ndim >= 4
+              auto kernel = cu::copy_gg_dynamic<InType, OutType, IdxT>;
+              auto [num_blocks, block_dims] =
+                  get_launch_args(kernel, out, large());
+              encoder.add_kernel_node(
+                  kernel,
+                  num_blocks,
+                  block_dims,
+                  in_ptr,
+                  out_ptr,
+                  out.size(),
+                  const_param(shape),
+                  const_param(strides_in),
+                  const_param(strides_out),
+                  ndim,
+                  dynamic_offset_in.data<int64_t>(),
+                  dynamic_offset_out.data<int64_t>());
+            }
+          });
    });
  });
 }
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -50,45 +50,49 @@ void copy_general_input(
    int64_t offset_out,
    const Shape& shape,
    const Strides& strides_in) {
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-        dispatch_bool(
-            in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
-            [&](auto large) {
-              using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
-              using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
-              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-              const InType* in_ptr = in.data<InType>() + offset_in;
-              OutType* out_ptr = out.data<OutType>() + offset_out;
-              int ndim = shape.size();
-              if (ndim <= 3) {
-                dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                  auto kernel =
-                      cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>;
-                  auto [num_blocks, block_dims] =
-                      get_launch_args(kernel, out, large());
-                  kernel<<<num_blocks, block_dims, 0, stream>>>(
-                      in_ptr,
-                      out_ptr,
-                      out.size(),
-                      const_param<dims_constant()>(shape),
-                      const_param<dims_constant()>(strides_in));
-                });
-              } else { // ndim >= 4
-                auto kernel = cu::copy_g<InType, OutType, IdxT>;
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      dispatch_bool(
+          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+          [&](auto large) {
+            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
+            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
+            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+            const InType* in_ptr = in.data<InType>() + offset_in;
+            OutType* out_ptr = out.data<OutType>() + offset_out;
+            int ndim = shape.size();
+            if (ndim <= 3) {
+              dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                auto kernel =
+                    cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>;
                auto [num_blocks, block_dims] =
                    get_launch_args(kernel, out, large());
-                kernel<<<num_blocks, block_dims, 0, stream>>>(
+                encoder.add_kernel_node(
+                    kernel,
+                    num_blocks,
+                    block_dims,
                    in_ptr,
                    out_ptr,
                    out.size(),
-                    const_param(shape),
-                    const_param(strides_in),
-                    ndim);
-              }
-            });
-      });
+                    const_param<dims_constant()>(shape),
+                    const_param<dims_constant()>(strides_in));
+              });
+            } else { // ndim >= 4
+              auto kernel = cu::copy_g<InType, OutType, IdxT>;
+              auto [num_blocks, block_dims] =
+                  get_launch_args(kernel, out, large());
+              encoder.add_kernel_node(
+                  kernel,
+                  num_blocks,
+                  block_dims,
+                  in_ptr,
+                  out_ptr,
+                  out.size(),
+                  const_param(shape),
+                  const_param(strides_in),
+                  ndim);
+            }
+          });
    });
  });
 }
--- a/mlx/backend/cuda/device.cpp
+++ b/mlx/backend/cuda/device.cpp
@@ -2,38 +2,28 @@

 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/worker.h"
-#include "mlx/backend/metal/metal.h"
+#include "mlx/utils.h"

 #include <fmt/format.h>
 #include <nvtx3/nvtx3.hpp>
 #include <future>
+#include <unordered_set>

 namespace mlx::core {

+// Default number of graph nodes after which maybe_commit() commits the graph.
+// Can be tuned with MLX_MAX_OPS_PER_BUFFER.
+// This should be less than 255
+constexpr int default_max_nodes_per_graph = 20;
+
+// Threshold on the number of cached graph executables: commit() clears the
+// cache once its size exceeds this value. Read once from
+// MLX_CUDA_GRAPH_CACHE_SIZE (fallback 100) and reused on later calls.
+int cuda_graph_cache_size() {
+  static int limit = env::get_var("MLX_CUDA_GRAPH_CACHE_SIZE", 100);
+  return limit;
+}
+
 namespace cu {

-DeviceStream::DeviceStream(Device& device) : device_(device), stream_(device) {}
-
-void DeviceStream::synchronize() {
-  cudaStreamSynchronize(stream_);
-}
-
-cudaStream_t DeviceStream::schedule_cuda_stream() {
-  // TODO: Return a stream that maximizes parallelism.
-  return stream_;
-}
-
-cudaStream_t DeviceStream::last_cuda_stream() {
-  return stream_;
-}
-
-CommandEncoder& DeviceStream::get_encoder() {
-  if (!encoder_) {
-    encoder_ = std::make_unique<CommandEncoder>(*this);
-  }
-  return *encoder_;
-}
-
 Device::Device(int device) : device_(device) {
  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
      &compute_capability_major_, cudaDevAttrComputeCapabilityMajor, device_));
@@ -67,49 +57,262 @@ void Device::make_current() {
  }
 }

-DeviceStream& Device::get_stream(Stream s) {
-  auto it = streams_.find(s.index);
-  if (it == streams_.end()) {
-    it = streams_.try_emplace(s.index, *this).first;
+// Return the command encoder stored for stream |s|, constructing it from this
+// device the first time the stream index is seen.
+CommandEncoder& Device::get_command_encoder(Stream s) {
+  // try_emplace leaves an existing entry untouched and returns its iterator.
+  return encoders_.try_emplace(s.index, *this).first->second;
 }

-CommandEncoder::CommandEncoder(DeviceStream& s)
-    : device_(s.device()), stream_(s) {}
+// Begin capturing work submitted to the encoder's stream (global capture
+// mode). No graph is created here: cudaStreamEndCapture in the destructor
+// returns the captured graph through |graph|, so a graph created up front
+// would be leaked as soon as that handle is overwritten.
+CommandEncoder::CaptureContext::CaptureContext(CommandEncoder& enc) : enc(enc) {
+  CHECK_CUDA_ERROR(
+      cudaStreamBeginCapture(enc.stream(), cudaStreamCaptureModeGlobal));
+}
+
+// End the stream capture, splice the captured work into the encoder's graph,
+// and destroy the captured graph.
+CommandEncoder::CaptureContext::~CaptureContext() {
+  CHECK_CUDA_ERROR(cudaStreamEndCapture(enc.stream(), &graph));
+  // First call passes NULL for the node array to obtain only the node count.
+  size_t num_nodes;
+  CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, NULL, &num_nodes));
+  if (num_nodes == 1) {
+    // Exactly one captured node: read its kernel parameters and add an
+    // equivalent kernel node directly to the encoder's graph.
+    // NOTE(review): assumes the single captured node is a kernel node - confirm.
+    cudaGraphNode_t captured_node;
+    CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, &captured_node, &num_nodes));
+    CUDA_KERNEL_NODE_PARAMS params;
+    CHECK_CUDA_ERROR(cuGraphKernelNodeGetParams(captured_node, &params));
+    cudaGraphNode_t node;
+    CHECK_CUDA_ERROR(cuGraphAddKernelNode(&node, enc.graph_, NULL, 0, &params));
+    enc.insert_graph_dependencies(GraphNode{node, 'K'});
+  } else {
+    // Any other node count: embed the captured graph as a child graph node.
+    cudaGraphNode_t node;
+    CHECK_CUDA_ERROR(
+        cudaGraphAddChildGraphNode(&node, enc.graph_, NULL, 0, graph));
+    enc.insert_graph_dependencies(GraphNode{node, 'G'});
+  }
+  CHECK_CUDA_ERROR(cudaGraphDestroy(graph));
+}
+
+// Mark the encoder as being inside a concurrent section; while the flag is
+// set, insert_graph_dependencies(GraphNode) appends nodes to
+// concurrent_nodes_ instead of inserting their dependencies immediately.
+CommandEncoder::ConcurrentContext::ConcurrentContext(CommandEncoder& enc)
+    : enc(enc) {
+  enc.in_concurrent_ = true;
+}
+
+// Close the concurrent section: add an empty node that every collected
+// concurrent node feeds into, wire the pending inputs to the concurrent nodes,
+// and make the empty node the recorded producer of the section's outputs.
+CommandEncoder::ConcurrentContext::~ConcurrentContext() {
+  enc.in_concurrent_ = false;
+
+  // Use an empty graph node for synchronization
+  CommandEncoder::GraphNode empty{NULL, 'E', std::to_string(enc.node_count_++)};
+  enc.empty_node_count_++;
+  CHECK_CUDA_ERROR(cudaGraphAddEmptyNode(&empty.node, enc.graph_, NULL, 0));
+
+  // Insert the concurrent -> empty node dependencies
+  for (auto& from : enc.concurrent_nodes_) {
+    enc.from_nodes_.push_back(from.node);
+    enc.to_nodes_.push_back(empty.node);
+    enc.graph_key_ += from.id;
+    enc.graph_key_ += from.node_type;
+    enc.graph_key_ += empty.id;
+    enc.graph_key_ += empty.node_type;
+  }
+
+  // Insert the input -> concurrent node dependencies without updating output
+  // nodes
+  // NOTE(review): relies on the moved-from active_outputs_ and
+  // concurrent_nodes_ vectors being empty afterwards - confirm.
+  auto outputs = std::move(enc.active_outputs_);
+  enc.insert_graph_dependencies(std::move(enc.concurrent_nodes_));
+
+  // Update output node to be the empty node
+  for (auto o : outputs) {
+    enc.node_map_.emplace(o, empty).first->second = empty;
+  }
+}
+
+// Assign |node| its sequential id, count it, and either defer it (inside a
+// concurrent section) or insert its dependencies right away.
+void CommandEncoder::insert_graph_dependencies(GraphNode node) {
+  const bool is_subgraph = node.node_type == 'G';
+  if (is_subgraph) {
+    ++graph_node_count_;
+  }
+  // The id is the node's position in the order nodes were added.
+  node.id = std::to_string(node_count_);
+  ++node_count_;
+
+  if (in_concurrent_) {
+    // Deferred until the concurrent section is closed.
+    concurrent_nodes_.emplace_back(std::move(node));
+    return;
+  }
+
+  std::vector<GraphNode> single;
+  single.emplace_back(std::move(node));
+  insert_graph_dependencies(std::move(single));
+}
+
+// Connect |nodes| to the nodes that last produced the currently registered
+// dependency buffers, and record |nodes| as the producers of the currently
+// registered output buffers. Consumes active_deps_ and active_outputs_.
+void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) {
+  // Gather each producer once, in the order its buffer was registered, so the
+  // resulting edge list (and graph key) is built consistently.
+  std::vector<GraphNode> producers;
+  std::unordered_set<cudaGraphNode_t> seen;
+  for (auto buffer_id : active_deps_) {
+    auto found = node_map_.find(buffer_id);
+    if (found == node_map_.end()) {
+      continue;
+    }
+    if (seen.insert(found->second.node).second) {
+      producers.push_back(found->second);
+    }
+  }
+  active_deps_.clear();
+
+  // Each output buffer now maps to the new node(s); the last one assigned wins.
+  for (auto buffer_id : active_outputs_) {
+    for (auto& produced : nodes) {
+      node_map_.insert_or_assign(buffer_id, produced);
+    }
+  }
+  active_outputs_.clear();
+
+  // Record one edge per (producer, new node) pair and extend the graph key.
+  for (auto& src : producers) {
+    for (auto& dst : nodes) {
+      from_nodes_.push_back(src.node);
+      to_nodes_.push_back(dst.node);
+      graph_key_.append(src.id);
+      graph_key_.push_back(src.node_type);
+      graph_key_.append(dst.id);
+      graph_key_.push_back(dst.node_type);
+    }
+  }
+}
+
+// Bind the encoder to device |d|, construct its stream from |d|, and create
+// the initially empty graph that nodes are added to.
+CommandEncoder::CommandEncoder(Device& d) : device_(d), stream_(d) {
+  CHECK_CUDA_ERROR(cudaGraphCreate(&graph_, 0));
+}
+
+// Destroy every cached executable graph in |graphs| and leave the map empty.
+void clear_graphs(std::unordered_map<std::string, cudaGraphExec_t>& graphs) {
+  for (auto it = graphs.begin(); it != graphs.end(); ++it) {
+    CHECK_CUDA_ERROR(cudaGraphExecDestroy(it->second));
+  }
+  graphs.clear();
+}
+
+// Destroy all cached executable graphs.
+// NOTE(review): graph_ (created in the constructor and in commit()) is not
+// destroyed here - confirm whether that is intentional.
+CommandEncoder::~CommandEncoder() {
+  clear_graphs(graph_cache_);
+}

 void CommandEncoder::add_completed_handler(std::function<void()> task) {
+  // Hand |task| over to this encoder's worker.
  worker_.add_task(std::move(task));
 }

-void CommandEncoder::end_encoding() {
-  if (!temporaries_.empty()) {
-    add_completed_handler([temporaries = std::move(temporaries_)]() {});
-  }
+// Record the address of |arr|'s buffer in active_deps_; the list is consumed
+// the next time graph dependencies are inserted.
+void CommandEncoder::set_input_array(const array& arr) {
+  active_deps_.push_back(
+      reinterpret_cast<std::uintptr_t>(arr.buffer().ptr()));
+}

-  // There is no kernel running, run completion handlers immediately.
-  if (!has_gpu_work_) {
-    worker_.consume_in_this_thread();
-    return;
-  }
-  has_gpu_work_ = false;
+// Record the address of |arr|'s buffer both as an output and as a dependency:
+// the next inserted node(s) become its producer and are ordered after its
+// previously recorded producer.
+void CommandEncoder::set_output_array(const array& arr) {
+  const auto key = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
+  active_outputs_.push_back(key);
+  active_deps_.push_back(key);
+}

-  // Put completion handlers in a batch.
-  worker_.end_batch();
-
-  // Signaling kernel completion is expensive, delay until enough batches.
-  // TODO: This number is arbitrarily picked, profile for a better stragety.
-  if (worker_.uncommited_batches() > 8) {
+// Commit the pending graph once the number of recorded nodes reaches the
+// value returned by env::max_ops_per_buffer, which is given
+// default_max_nodes_per_graph as its argument.
+void CommandEncoder::maybe_commit() {
+  if (node_count_ >= env::max_ops_per_buffer(default_max_nodes_per_graph)) {
    commit();
  }
 }

+// Add a kernel node to the pending graph using the CUDA runtime API.
+// |func| is the kernel, |grid_dim| / |block_dim| the launch dimensions and
+// |params| the array of pointers to the kernel arguments. All other node
+// parameters are left zero-initialized.
+void CommandEncoder::add_kernel_node(
+    void* func,
+    dim3 grid_dim,
+    dim3 block_dim,
+    void** params) {
+  cudaKernelNodeParams kernel_params = {0};
+  kernel_params.func = func;
+  kernel_params.gridDim = grid_dim;
+  kernel_params.blockDim = block_dim;
+  kernel_params.kernelParams = params;
+  // The node is added without edges; dependencies are recorded separately.
+  cudaGraphNode_t node;
+  CHECK_CUDA_ERROR(
+      cudaGraphAddKernelNode(&node, graph_, NULL, 0, &kernel_params));
+  insert_graph_dependencies(GraphNode{node, 'K'});
+}
+
+// Add a kernel node to the pending graph using the CUDA driver API.
+// |func| is a driver-API function handle, |grid_dim| / |block_dim| the launch
+// dimensions and |params| the array of pointers to the kernel arguments. All
+// other node parameters are left zero-initialized.
+void CommandEncoder::add_kernel_node(
+    CUfunction func,
+    dim3 grid_dim,
+    dim3 block_dim,
+    void** params) {
+  CUDA_KERNEL_NODE_PARAMS kernel_params = {0};
+  kernel_params.func = func;
+  kernel_params.gridDimX = grid_dim.x;
+  kernel_params.gridDimY = grid_dim.y;
+  kernel_params.gridDimZ = grid_dim.z;
+  kernel_params.blockDimX = block_dim.x;
+  kernel_params.blockDimY = block_dim.y;
+  kernel_params.blockDimZ = block_dim.z;
+  kernel_params.kernelParams = params;
+  // The node is added without edges; dependencies are recorded separately.
+  CUgraphNode node;
+  CHECK_CUDA_ERROR(
+      cuGraphAddKernelNode(&node, graph_, NULL, 0, &kernel_params));
+  insert_graph_dependencies(GraphNode{node, 'K'});
+}
+
 void CommandEncoder::commit() {
-  worker_.commit(stream_.last_cuda_stream());
+  // Submit the pending graph (if any) to the stream and hand the queued
+  // completion handlers to the worker.
+
+  // Keep temporaries alive until the completion handlers run.
+  if (!temporaries_.empty()) {
+    add_completed_handler([temporaries = std::move(temporaries_)]() {});
+  }
+  if (node_count_ > 0) {
+    // Nodes were added without edges; attach all recorded edges at once.
+    if (!from_nodes_.empty()) {
+      CHECK_CUDA_ERROR(cudaGraphAddDependencies(
+          graph_, from_nodes_.data(), to_nodes_.data(), from_nodes_.size()));
+    }
+
+    // The cache key is the edge list followed by the per-graph node counts.
+    graph_key_ += ".";
+    graph_key_ += std::to_string(node_count_);
+    graph_key_ += ".";
+    graph_key_ += std::to_string(graph_node_count_);
+    graph_key_ += ".";
+    graph_key_ += std::to_string(empty_node_count_);
+
+    // operator[] inserts a null entry when the key has not been seen before.
+    cudaGraphExec_t& graph_exec = graph_cache_[graph_key_];
+
+    if (graph_exec != nullptr) {
+      // Try to update the cached executable in place with the new graph.
+      cudaGraphExecUpdateResult update_result;
+#if CUDART_VERSION >= 12000
+      cudaGraphExecUpdateResultInfo info;
+      cudaGraphExecUpdate(graph_exec, graph_, &info);
+      update_result = info.result;
+#else
+      cudaGraphNode_t error_node;
+      cudaGraphExecUpdate(graph_exec, graph_, &error_node, &update_result);
+#endif // CUDART_VERSION >= 12000
+      if (update_result != cudaGraphExecUpdateSuccess) {
+        // Update failed: drop the stale executable and instantiate below.
+        cudaGetLastError(); // reset error
+        CHECK_CUDA_ERROR(cudaGraphExecDestroy(graph_exec));
+        graph_exec = nullptr;
+      }
+    }
+    if (graph_exec == nullptr) {
+      CHECK_CUDA_ERROR(
+          cudaGraphInstantiate(&graph_exec, graph_, NULL, NULL, 0));
+    }
+    device_.make_current();
+    CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
+
+    // TODO smarter cache policy
+    if (graph_cache_.size() > cuda_graph_cache_size()) {
+      clear_graphs(graph_cache_);
+    }
+
+    // Reset state
+    node_count_ = 0;
+    graph_node_count_ = 0;
+    // empty_node_count_ is part of the cache key, so it must restart with the
+    // other per-graph counters or identical graphs would get different keys.
+    empty_node_count_ = 0;
+    from_nodes_.clear();
+    to_nodes_.clear();
+    graph_key_.clear();
+    node_map_.clear();
+    CHECK_CUDA_ERROR(cudaGraphDestroy(graph_));
+    CHECK_CUDA_ERROR(cudaGraphCreate(&graph_, 0));
+  }
+
+  // Put completion handlers in a batch.
+  worker_.end_batch();
+  worker_.commit(stream_);
 }

 void CommandEncoder::synchronize() {
-  stream().synchronize();
+  cudaStreamSynchronize(stream_);
  auto p = std::make_shared<std::promise<void>>();
  std::future<void> f = p->get_future();
  add_completed_handler([p = std::move(p)]() { p->set_value(); });
@@ -127,12 +330,8 @@ Device& device(mlx::core::Device device) {
  return it->second;
 }

-DeviceStream& get_stream(Stream s) {
-  return device(s.device).get_stream(s);
-}
-
 CommandEncoder& get_command_encoder(Stream s) {
-  return get_stream(s).get_encoder();
+  // Look up the device for |s| and return its command encoder for the stream.
+  return device(s.device).get_command_encoder(s);
 }

 } // namespace cu
--- a/mlx/backend/cuda/device.h
+++ b/mlx/backend/cuda/device.h
@@ -7,41 +7,109 @@
 #include "mlx/stream.h"

 #include <cublasLt.h>
+#include <cuda.h>
 #include <thrust/execution_policy.h>

 #include <unordered_map>

 namespace mlx::core::cu {

-class Device;
-class CommandEncoder;
-
-class DeviceStream {
+// Records work for one stream into a graph and submits it on commit().
+class CommandEncoder {
 public:
-  explicit DeviceStream(Device& device);
+  // Scoped stream capture: the destructor adds the captured work to the
+  // encoder's graph.
+  struct CaptureContext {
+    CaptureContext(CommandEncoder& enc);
+    ~CaptureContext();
+    cudaGraph_t graph;
+    CommandEncoder& enc;
+  };
+  // Scoped concurrent section: nodes added while it is alive are collected and
+  // joined by an empty node when it is destroyed.
+  struct ConcurrentContext {
+    ConcurrentContext(CommandEncoder& enc);
+    ~ConcurrentContext();
+    CommandEncoder& enc;
+  };

-  DeviceStream(const DeviceStream&) = delete;
-  DeviceStream& operator=(const DeviceStream&) = delete;
+  explicit CommandEncoder(Device& d);
+  ~CommandEncoder();

-  // Wait until kernels in the stream complete.
-  void synchronize();
+  CommandEncoder(const CommandEncoder&) = delete;
+  CommandEncoder& operator=(const CommandEncoder&) = delete;

-  // Return a cuda stream for launching kernels.
-  cudaStream_t schedule_cuda_stream();
-
-  // Return the last cuda stream used.
-  cudaStream_t last_cuda_stream();
-
-  CommandEncoder& get_encoder();
-
-  Device& device() {
-    return device_;
+  CaptureContext capture_context() {
+    return CaptureContext{*this};
+  }
+  ConcurrentContext concurrent_context() {
+    return ConcurrentContext{*this};
  }

+  // Register the buffers read / written by the next node(s) added.
+  void set_input_array(const array& arr);
+  void set_output_array(const array& arr);
+
+  // Add a kernel node for |func|. The address of each argument in |params| is
+  // collected into a pointer array and forwarded to the void* overload.
+  template <typename F, typename... Params>
+  void
+  add_kernel_node(F* func, dim3 grid_dim, dim3 block_dim, Params&&... params) {
+    constexpr size_t num = sizeof...(Params);
+    void* ptrs[num];
+    size_t i = 0;
+    ([&](auto&& p) { ptrs[i++] = static_cast<void*>(&p); }(
+         std::forward<Params>(params)),
+     ...);
+    add_kernel_node((void*)func, grid_dim, block_dim, ptrs);
+  }
+
+  void add_kernel_node(
+      CUfunction func,
+      dim3 grid_dim,
+      dim3 block_dim,
+      void** params);
+
+  void
+  add_kernel_node(void* func, dim3 grid_dim, dim3 block_dim, void** params);
+
+  // Keep |arr|'s data alive; the list is released through a completion handler
+  // on commit().
+  void add_temporary(const array& arr) {
+    temporaries_.push_back(arr.data_shared_ptr());
+  }
+
+  void add_completed_handler(std::function<void()> task);
+  void maybe_commit();
+  void commit();
+
+  CudaStream& stream() {
+    return stream_;
+  }
+
+  // Wait until kernels and completion handlers are finished
+  void synchronize();
+
 private:
+  struct GraphNode {
+    cudaGraphNode_t node;
+    // K = kernel
+    // E = empty
+    // G = subgraph
+    char node_type;
+    std::string id;
+  };
+
+  void insert_graph_dependencies(GraphNode node);
+  void insert_graph_dependencies(std::vector<GraphNode> nodes);
+
  Device& device_;
  CudaStream stream_;
-  std::unique_ptr<CommandEncoder> encoder_;
+  cudaGraph_t graph_;
+  Worker worker_;
+  // Per-graph node counters. Stored as int rather than char: plain char may be
+  // signed, which would wrap at 127 and break the comparison against the
+  // configured node limit in maybe_commit().
+  int node_count_{0};
+  int graph_node_count_{0};
+  int empty_node_count_{0};
+  bool in_concurrent_{false};
+  // Parallel arrays describing the edges to add to graph_ on commit().
+  std::vector<cudaGraphNode_t> from_nodes_;
+  std::vector<cudaGraphNode_t> to_nodes_;
+  // Key into graph_cache_, built up while edges are recorded.
+  std::string graph_key_;
+  std::vector<GraphNode> concurrent_nodes_;
+  std::vector<std::shared_ptr<array::Data>> temporaries_;
+  std::unordered_map<std::string, cudaGraphExec_t> graph_cache_;
+  // Buffer addresses registered for the next node(s).
+  std::vector<std::uintptr_t> active_deps_;
+  std::vector<std::uintptr_t> active_outputs_;
+  // Maps a buffer address to the node that last produced it.
+  std::unordered_map<std::uintptr_t, GraphNode> node_map_;
 };

 class Device {
@@ -55,7 +123,7 @@ class Device {
  // Make this device the current cuda device, required by some cuda calls.
  void make_current();

-  DeviceStream& get_stream(Stream s);
+  CommandEncoder& get_command_encoder(Stream s);

  int cuda_device() const {
    return device_;
@@ -75,67 +143,10 @@ class Device {
  int compute_capability_major_;
  int compute_capability_minor_;
  cublasLtHandle_t lt_;
-  std::unordered_map<int, DeviceStream> streams_;
-};
-
-class CommandEncoder {
- public:
-  explicit CommandEncoder(DeviceStream& stream);
-
-  CommandEncoder(const CommandEncoder&) = delete;
-  CommandEncoder& operator=(const CommandEncoder&) = delete;
-
-  void set_input_array(const array& arr) {}
-  void set_output_array(const array& arr) {}
-
-  void add_temporary(const array& arr) {
-    temporaries_.push_back(arr.data_shared_ptr());
-  }
-
-  void add_completed_handler(std::function<void()> task);
-  void end_encoding();
-  void commit();
-
-  // Schedule a cuda stream for |fun| to launch kernels, and check error
-  // afterwards.
-  template <typename F>
-  void launch_kernel(F&& fun) {
-    launch_kernel(stream_.schedule_cuda_stream(), std::forward<F>(fun));
-  }
-
-  template <typename F>
-  void launch_kernel(cudaStream_t stream, F&& fun) {
-    device_.make_current();
-    fun(stream);
-    check_cuda_error("kernel launch", cudaGetLastError());
-    has_gpu_work_ = true;
-  }
-
-  Device& device() {
-    return device_;
-  }
-
-  DeviceStream& stream() {
-    return stream_;
-  }
-
-  bool has_gpu_work() const {
-    return has_gpu_work_;
-  }
-
-  // Wait until kernels and completion handlers are finished
-  void synchronize();
-
- private:
-  Device& device_;
-  DeviceStream& stream_;
-  Worker worker_;
-  bool has_gpu_work_{false};
-  std::vector<std::shared_ptr<array::Data>> temporaries_;
+  std::unordered_map<int, CommandEncoder> encoders_;
 };

 Device& device(mlx::core::Device device);
-DeviceStream& get_stream(Stream s);
 CommandEncoder& get_command_encoder(Stream s);

 // Return an execution policy that does not sync for result.
--- a/mlx/backend/cuda/device/atomic_ops.cuh
+++ b/mlx/backend/cuda/device/atomic_ops.cuh
@@ -2,7 +2,7 @@

 #pragma once

-#include "mlx/backend/cuda/device/cucomplex_math.cuh"
+#include "mlx/backend/cuda/device/complex.cuh"
 #include "mlx/backend/cuda/device/fp16_math.cuh"

 #include <cuda/atomic>
@@ -48,7 +48,7 @@ inline __device__ void atomic_add(__half* out, __half val) {
  atomicAdd(out, val);
 }

-inline __device__ void atomic_add(cuComplex* out, cuComplex val) {
+inline __device__ void atomic_add(complex64_t* out, complex64_t val) {
 #if __CUDA_ARCH__ < 900
  atomic_add_general(out, val);
 #else
@@ -58,12 +58,7 @@ inline __device__ void atomic_add(cuComplex* out, cuComplex val) {

 inline __device__ void atomic_add(__nv_bfloat16* out, __nv_bfloat16 val) {
 #if __CUDA_ARCH__ < 800
-#if CCCL_VERSION >= 2008000
  atomic_add_general(out, val);
-#else
-  bool cccl_version_too_old_for_bfloat16_atomic_add = false;
-  assert(cccl_version_too_old_for_bfloat16_atomic_add);
-#endif
 #else
  atomicAdd(out, val);
 #endif
--- a/mlx/backend/cuda/device/binary_ops.cuh
+++ b/mlx/backend/cuda/device/binary_ops.cuh
@@ -1,10 +1,7 @@
 // Copyright © 2025 Apple Inc.

-#include "mlx/backend/cuda/device/cucomplex_math.cuh"
-#include "mlx/backend/cuda/device/fp16_math.cuh"
-#include "mlx/backend/cuda/device/utils.cuh"
+#include "mlx/backend/cuda/device/unary_ops.cuh"

-#include <cuComplex.h>
 #include <cuda/std/array>

 namespace mlx::core::cu {
@@ -47,7 +44,7 @@ struct Remainder {
      } else {
        return x % y;
      }
-    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+    } else if constexpr (is_complex_v<T>) {
      return x % y;
    } else {
      T r = fmod(x, y);
@@ -69,14 +66,12 @@ struct Equal {
 struct NaNEqual {
  template <typename T>
  __device__ bool operator()(T x, T y) {
-    if constexpr (std::is_same_v<T, cuComplex>) {
+    if constexpr (is_complex_v<T>) {
      return x == y ||
-          (isnan(cuCrealf(x)) && isnan(cuCrealf(y)) && isnan(cuCimagf(x)) &&
-           isnan(cuCimagf(y))) ||
-          (cuCrealf(x) == cuCrealf(y) && isnan(cuCimagf(x)) &&
-           isnan(cuCimagf(y))) ||
-          (isnan(cuCrealf(x)) && isnan(cuCrealf(y)) &&
-           cuCimagf(x) == cuCimagf(y));
+          (isnan(x.real()) && isnan(y.real()) && isnan(x.imag()) &&
+           isnan(y.imag())) ||
+          (x.real() == y.real() && isnan(x.imag()) && isnan(y.imag())) ||
+          (isnan(x.real()) && isnan(y.real()) && x.imag() == y.imag());
    } else {
      return x == y || (isnan(x) && isnan(y));
    }
@@ -114,36 +109,38 @@ struct LessEqual {
 struct LogAddExp {
  template <typename T>
  __device__ T operator()(T x, T y) {
-    if (isnan(x) || isnan(y)) {
-      return cuda::std::numeric_limits<T>::quiet_NaN();
+    if constexpr (is_complex_v<T>) { // log(exp(x) + exp(y)) for complex
+      if (isnan(x.real()) || isnan(x.imag()) || isnan(y.real()) ||
+          isnan(y.imag())) {
+        return {
+            cuda::std::numeric_limits<float>::quiet_NaN(),
+            cuda::std::numeric_limits<float>::quiet_NaN()};
+      }
+      auto max = x.real() > y.real() ? x : y; // operand with larger real
+      auto min = x.real() > y.real() ? y : x; // always the other operand
+      auto min_real = min.real();
+      auto max_real = max.real();
+      if (!isfinite(min_real) && (min_real == max_real)) { // same +/-inf
+        if (min_real < 0) {
+          return min;
+        } else {
+          return Log{}(Exp{}(min) + Exp{}(max));
+        }
+      } else {
+        return Log1p{}(Exp{}(min - max)) + max; // max + log1p(exp(min-max))
+      }
+    } else {
+      if (isnan(x) || isnan(y)) {
+        return cuda::std::numeric_limits<T>::quiet_NaN();
+      }
+      T maxval = max(x, y);
+      T minval = min(x, y);
+      return (minval == -cuda::std::numeric_limits<T>::infinity() ||
+              maxval == cuda::std::numeric_limits<T>::infinity())
+          ? maxval
+          : T(float(maxval) + log1p(expf(minval - maxval)));
    }
-    T maxval = max(x, y);
-    T minval = min(x, y);
-    return (minval == -cuda::std::numeric_limits<T>::infinity() ||
-            maxval == cuda::std::numeric_limits<T>::infinity())
-        ? maxval
-        : T(float(maxval) + log1p(expf(minval - maxval)));
  };
-
-  __device__ cuComplex operator()(cuComplex x, cuComplex y) {
-    if (isnan(cuCrealf(x)) || isnan(cuCimagf(x)) || isnan(cuCrealf(y)) ||
-        isnan(cuCimagf(y))) {
-      return {
-          cuda::std::numeric_limits<float>::quiet_NaN(),
-          cuda::std::numeric_limits<float>::quiet_NaN()};
-    }
-    float inf = cuda::std::numeric_limits<float>::infinity();
-    auto maxval = x > y ? x : y;
-    auto minval = x < y ? x : y;
-    if (cuCrealf(minval) == -inf || cuCrealf(maxval) == inf)
-      return maxval;
-    float m = exp(cuCrealf(minval) - cuCrealf(maxval));
-    cuComplex dexp{
-        m * cos(cuCimagf(minval) - cuCimagf(maxval)),
-        m * sin(cuCimagf(minval) - cuCimagf(maxval)),
-    };
-    return maxval + log1p(dexp);
-  }
 };

 struct Maximum {
@@ -151,8 +148,8 @@ struct Maximum {
  __device__ T operator()(T x, T y) {
    if constexpr (cuda::std::is_integral_v<T>) {
      return max(x, y);
-    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      if (isnan(cuCrealf(x)) || isnan(cuCimagf(x))) {
+    } else if constexpr (is_complex_v<T>) {
+      if (isnan(x.real()) || isnan(x.imag())) {
        return x;
      }
      return x > y ? x : y;
@@ -170,8 +167,8 @@ struct Minimum {
  __device__ T operator()(T x, T y) {
    if constexpr (cuda::std::is_integral_v<T>) {
      return min(x, y);
-    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      if (isnan(cuCrealf(x)) || isnan(cuCimagf(x))) {
+    } else if constexpr (is_complex_v<T>) {
+      if (isnan(x.real()) || isnan(x.imag())) {
        return x;
      }
      return x < y ? x : y;
@@ -194,8 +191,8 @@ struct Multiply {
 struct NotEqual {
  template <typename T>
  __device__ bool operator()(T x, T y) {
-    if constexpr (std::is_same_v<T, cuComplex>) {
-      return cuCrealf(x) != cuCrealf(y) || cuCimagf(x) != cuCimagf(y);
+    if constexpr (is_complex_v<T>) {
+      return x.real() != y.real() || x.imag() != y.imag();
    } else {
      return x != y;
    }
@@ -215,19 +212,8 @@ struct Power {
        base *= base;
      }
      return res;
-    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      if (base.y == 0 && base.x == 0) {
-        if (isnan(exp.x) || isnan(exp.y)) {
-          auto nan = cuda::std::numeric_limits<float>::quiet_NaN();
-          return make_cuFloatComplex(nan, nan);
-        }
-        return make_cuFloatComplex(0.0, 0.0);
-      }
-      auto x_theta = atan2f(base.y, base.x);
-      auto x_ln_r = 0.5 * logf(base.x * base.x + base.y * base.y);
-      auto mag = expf(exp.x * x_ln_r - exp.y * x_theta);
-      auto phase = exp.y * x_ln_r + exp.x * x_theta;
-      return make_cuFloatComplex(mag * cosf(phase), mag * sinf(phase));
+    } else if constexpr (is_complex_v<T>) {
+      return pow(base, exp);
    } else {
      return powf(base, exp);
    }
--- a/mlx/backend/cuda/device/cast_op.cuh
+++ b/mlx/backend/cuda/device/cast_op.cuh
@@ -2,7 +2,10 @@

 #pragma once

-#include <cuComplex.h>
+#include "mlx/backend/cuda/device/complex.cuh"
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
 #include <thrust/iterator/transform_iterator.h>

 namespace mlx::core::cu {
@@ -17,34 +20,48 @@ struct CastOp {
  }
 };

-// Converting a complex number to real number discards the imaginary part.
-template <typename DstT>
-struct CastOp<
-    cuComplex,
-    DstT,
-    cuda::std::enable_if_t<!cuda::std::is_same_v<cuComplex, DstT>>> {
-  static constexpr bool is_castable = cuda::std::is_convertible_v<float, DstT>;
+// Castings between complex and boolean (nonzero <=> true; true -> 1 + 0i).
+template <typename T>
+struct CastOp<complex_t<T>, bool> {
+  static constexpr bool is_castable = true;

-  __device__ DstT operator()(cuComplex x) {
-    static_assert(!cuda::std::is_same_v<cuComplex, DstT>);
-    return static_cast<DstT>(cuCrealf(x));
+  __device__ bool operator()(complex_t<T> x) {
+    return x.real() != 0 || x.imag() != 0; // true if either part is nonzero
+  }
+};
+
+template <typename T>
+struct CastOp<bool, complex_t<T>> {
+  static constexpr bool is_castable = true;
+
+  __device__ complex_t<T> operator()(bool x) {
+    return x ? complex_t<T>{1, 0} : complex_t<T>{0, 0}; // imag stays 0
+  }
+};
+
+// Converting a complex number to real number discards the imaginary part.
+template <typename T, typename DstT>
+struct CastOp<complex_t<T>, DstT, cuda::std::enable_if_t<!is_complex_v<DstT>>> {
+  static constexpr bool is_castable = cuda::std::is_convertible_v<T, DstT>;
+
+  __device__ DstT operator()(complex_t<T> x) {
+    static_assert(!is_complex_v<DstT>);
+    return static_cast<DstT>(x.real());
  }
 };

 // Allow converting a real number to complex number.
-template <typename SrcT>
-struct CastOp<
-    SrcT,
-    cuComplex,
-    cuda::std::enable_if_t<!cuda::std::is_same_v<SrcT, cuComplex>>> {
-  static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, float>;
+template <typename SrcT, typename T>
+struct CastOp<SrcT, complex_t<T>, cuda::std::enable_if_t<!is_complex_v<SrcT>>> {
+  static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, T>;

-  __device__ cuComplex operator()(SrcT x) {
-    static_assert(!cuda::std::is_same_v<SrcT, cuComplex>);
-    return cuComplex{static_cast<float>(x), 0};
+  __device__ complex_t<T> operator()(SrcT x) {
+    static_assert(!is_complex_v<SrcT>);
+    return complex_t<T>{static_cast<T>(x), 0};
  }
 };

+// Do nothing when no casting is needed.
 template <typename SrcT, typename DstT>
 struct CastOp<
    SrcT,
@@ -57,9 +74,51 @@ struct CastOp<
  }
 };

+// In CUDA 11 the half types do not define conversions between some types,
+// provide fallbacks here.
+#if CUDART_VERSION < 12000
+template <typename SrcT, typename DstT>
+struct CastOp<
+    SrcT,
+    DstT,
+    cuda::std::enable_if_t<
+        !cuda::std::is_convertible_v<SrcT, DstT> && !is_complex_v<SrcT> &&
+        (cuda::std::is_same_v<DstT, __half> ||
+         cuda::std::is_same_v<DstT, __nv_bfloat16>)>> {
+  static constexpr bool is_castable = true;
+
+  __device__ DstT operator()(SrcT x) {
+    return DstT(static_cast<float>(x));
+  }
+};
+
+template <typename SrcT, typename DstT>
+struct CastOp<
+    SrcT,
+    DstT,
+    cuda::std::enable_if_t<
+        !cuda::std::is_convertible_v<SrcT, DstT> && !is_complex_v<SrcT> &&
+        !cuda::std::is_same_v<DstT, __half> &&
+        !cuda::std::is_same_v<DstT, __nv_bfloat16> &&
+        (cuda::std::is_same_v<SrcT, __half> ||
+         cuda::std::is_same_v<SrcT, __nv_bfloat16>)>> {
+  static constexpr bool is_castable = true;
+
+  __device__ DstT operator()(SrcT x) {
+    return DstT(static_cast<float>(x));
+  }
+};
+#endif // CUDART_VERSION < 12000
+
+// Helper to deduce the SrcT.
+template <typename DstT, typename SrcT>
+inline __host__ __device__ auto cast_to(SrcT x) {
+  return CastOp<SrcT, DstT>{}(x);
+}
+
 // Return an iterator that cast the value to DstT using CastOp.
 template <typename DstT, typename Iterator>
-__host__ __device__ auto make_cast_iterator(Iterator it) {
+inline __host__ __device__ auto make_cast_iterator(Iterator it) {
  using SrcT = typename cuda::std::iterator_traits<Iterator>::value_type;
  if constexpr (std::is_same_v<SrcT, DstT>) {
    return it;
--- a/mlx/backend/cuda/device/complex.cuh
+++ b/mlx/backend/cuda/device/complex.cuh
@@ -0,0 +1,60 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+// Make multiplication and division faster.
+#define LIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS
+
+#include <cuda/std/complex>
+#include <cuda/std/type_traits>
+
+namespace mlx::core::cu {
+
+// TODO: Consider using a faster implementation as cuda::std::complex has to
+// conform to C++ standard.
+template <typename T>
+using complex_t = cuda::std::complex<T>;
+
+using complex64_t = complex_t<float>;
+using complex128_t = complex_t<double>;
+
+template <typename T>
+struct is_complex : cuda::std::false_type {};
+
+template <typename T>
+struct is_complex<cuda::std::complex<T>> : cuda::std::true_type {};
+
+template <typename T>
+inline constexpr bool is_complex_v = is_complex<T>::value;
+
+// cuda::std::complex is missing some operators.
+template <typename T>
+inline __host__ __device__ complex_t<T> operator%(
+    complex_t<T> a,
+    complex_t<T> b) {
+  T r = a.real() - floor(a.real() / b.real()) * b.real();
+  T i = a.imag() - floor(a.imag() / b.imag()) * b.imag();
+  return complex_t<T>{r, i};
+}
+
+template <typename T>
+inline __host__ __device__ bool operator>(complex_t<T> a, complex_t<T> b) {
+  return (a.real() > b.real()) || (a.real() == b.real() && a.imag() > b.imag());
+}
+
+template <typename T>
+inline __host__ __device__ bool operator<(complex_t<T> a, complex_t<T> b) {
+  return operator>(b, a);
+}
+
+template <typename T>
+inline __host__ __device__ bool operator<=(complex_t<T> a, complex_t<T> b) {
+  return !(a > b);
+}
+
+template <typename T>
+inline __host__ __device__ bool operator>=(complex_t<T> a, complex_t<T> b) {
+  return !(a < b);
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/cucomplex_math.cuh
+++ b/mlx/backend/cuda/device/cucomplex_math.cuh
@@ -1,240 +0,0 @@
-// Copyright © 2025 Apple Inc.
-// Copyright © 2017-2024 The Simons Foundation, Inc.
-//
-// FINUFFT is licensed under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance with the
-// License.  You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Forked from
-// https://github.com/flatironinstitute/finufft/blob/main/include/cufinufft/contrib/helper_math.h
-
-#pragma once
-
-#include <cuComplex.h>
-
-// This header provides some helper functions for cuComplex types.
-// It mainly wraps existing CUDA implementations to provide operator overloads
-// e.g. cuAdd, cuSub, cuMul, cuDiv, cuCreal, cuCimag, cuCabs, cuCarg, cuConj are
-// all provided by CUDA
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator+(const cuDoubleComplex& a, const cuDoubleComplex& b) {
-  return cuCadd(a, b);
-}
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator-(const cuDoubleComplex& a, const cuDoubleComplex& b) {
-  return cuCsub(a, b);
-}
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator*(const cuDoubleComplex& a, const cuDoubleComplex& b) {
-  return cuCmul(a, b);
-}
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator/(const cuDoubleComplex& a, const cuDoubleComplex& b) {
-  return cuCdiv(a, b);
-}
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator%(const cuDoubleComplex& a, const cuDoubleComplex& b) {
-  double r = cuCreal(a) - (floorf(cuCreal(a) / cuCreal(b)) * cuCreal(b));
-  double i = cuCimag(a) - (floorf(cuCimag(a) / cuCimag(b)) * cuCimag(b));
-  return make_cuDoubleComplex(r, i);
-}
-
-__forceinline__ __host__ __device__ bool operator==(
-    const cuDoubleComplex& a,
-    const cuDoubleComplex& b) {
-  return cuCreal(a) == cuCreal(b) && cuCimag(a) == cuCimag(b);
-}
-
-__forceinline__ __host__ __device__ bool operator!=(
-    const cuDoubleComplex& a,
-    const cuDoubleComplex& b) {
-  return !(a == b);
-}
-
-__forceinline__ __host__ __device__ bool operator>(
-    const cuDoubleComplex& a,
-    const cuDoubleComplex& b) {
-  double mag_a = sqrt(cuCreal(a) * cuCreal(a) + cuCimag(a) * cuCimag(a));
-  double mag_b = sqrt(cuCreal(b) * cuCreal(b) + cuCimag(b) * cuCimag(b));
-  return mag_a > mag_b;
-}
-
-__forceinline__ __host__ __device__ bool operator>=(
-    const cuDoubleComplex& a,
-    const cuDoubleComplex& b) {
-  return a > b || a == b;
-}
-
-__forceinline__ __host__ __device__ bool operator<(
-    const cuDoubleComplex& a,
-    const cuDoubleComplex& b) {
-  return b > a;
-}
-
-__forceinline__ __host__ __device__ bool operator<=(
-    const cuDoubleComplex& a,
-    const cuDoubleComplex& b) {
-  return b > a || a == b;
-}
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator+(const cuDoubleComplex& a, double b) {
-  return make_cuDoubleComplex(cuCreal(a) + b, cuCimag(a));
-}
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator+(double a, const cuDoubleComplex& b) {
-  return make_cuDoubleComplex(a + cuCreal(b), cuCimag(b));
-}
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator-(const cuDoubleComplex& a, double b) {
-  return make_cuDoubleComplex(cuCreal(a) - b, cuCimag(a));
-}
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator-(double a, const cuDoubleComplex& b) {
-  return make_cuDoubleComplex(a - cuCreal(b), -cuCimag(b));
-}
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator*(const cuDoubleComplex& a, double b) {
-  return make_cuDoubleComplex(cuCreal(a) * b, cuCimag(a) * b);
-}
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator*(double a, const cuDoubleComplex& b) {
-  return make_cuDoubleComplex(a * cuCreal(b), a * cuCimag(b));
-}
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator/(const cuDoubleComplex& a, double b) {
-  return make_cuDoubleComplex(cuCreal(a) / b, cuCimag(a) / b);
-}
-
-__forceinline__ __host__ __device__ cuDoubleComplex
-operator/(double a, const cuDoubleComplex& b) {
-  double denom = cuCreal(b) * cuCreal(b) + cuCimag(b) * cuCimag(b);
-  return make_cuDoubleComplex(
-      (a * cuCreal(b)) / denom, (-a * cuCimag(b)) / denom);
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator+(const cuFloatComplex& a, const cuFloatComplex& b) {
-  return cuCaddf(a, b);
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator-(const cuFloatComplex& a, const cuFloatComplex& b) {
-  return cuCsubf(a, b);
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator*(const cuFloatComplex& a, const cuFloatComplex& b) {
-  return cuCmulf(a, b);
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator/(const cuFloatComplex& a, const cuFloatComplex& b) {
-  return cuCdivf(a, b);
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator%(const cuFloatComplex& a, const cuFloatComplex& b) {
-  float r = cuCrealf(a) - (floorf(cuCrealf(a) / cuCrealf(b)) * cuCrealf(b));
-  float i = cuCimagf(a) - (floorf(cuCimagf(a) / cuCimagf(b)) * cuCimagf(b));
-  return make_cuFloatComplex(r, i);
-}
-
-__forceinline__ __host__ __device__ bool operator==(
-    const cuFloatComplex& a,
-    const cuFloatComplex& b) {
-  return cuCrealf(a) == cuCrealf(b) && cuCimagf(a) == cuCimagf(b);
-}
-
-__forceinline__ __host__ __device__ bool operator!=(
-    const cuFloatComplex& a,
-    const cuFloatComplex& b) {
-  return !(a == b);
-}
-
-__forceinline__ __host__ __device__ bool operator>(
-    const cuFloatComplex& a,
-    const cuFloatComplex& b) {
-  float mag_a = sqrt(cuCrealf(a) * cuCrealf(a) + cuCimagf(a) * cuCimagf(a));
-  float mag_b = sqrt(cuCrealf(b) * cuCrealf(b) + cuCimagf(b) * cuCimagf(b));
-  return mag_a > mag_b;
-}
-
-__forceinline__ __host__ __device__ bool operator>=(
-    const cuFloatComplex& a,
-    const cuFloatComplex& b) {
-  return a > b || a == b;
-}
-
-__forceinline__ __host__ __device__ bool operator<(
-    const cuFloatComplex& a,
-    const cuFloatComplex& b) {
-  return b > a;
-}
-
-__forceinline__ __host__ __device__ bool operator<=(
-    const cuFloatComplex& a,
-    const cuFloatComplex& b) {
-  return b > a || a == b;
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator+(const cuFloatComplex& a, float b) {
-  return make_cuFloatComplex(cuCrealf(a) + b, cuCimagf(a));
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator+(float a, const cuFloatComplex& b) {
-  return make_cuFloatComplex(a + cuCrealf(b), cuCimagf(b));
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator-(const cuFloatComplex& a, float b) {
-  return make_cuFloatComplex(cuCrealf(a) - b, cuCimagf(a));
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator-(float a, const cuFloatComplex& b) {
-  return make_cuFloatComplex(a - cuCrealf(b), -cuCimagf(b));
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator*(const cuFloatComplex& a, float b) {
-  return make_cuFloatComplex(cuCrealf(a) * b, cuCimagf(a) * b);
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator*(float a, const cuFloatComplex& b) {
-  return make_cuFloatComplex(a * cuCrealf(b), a * cuCimagf(b));
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator/(const cuFloatComplex& a, float b) {
-  return make_cuFloatComplex(cuCrealf(a) / b, cuCimagf(a) / b);
-}
-
-__forceinline__ __host__ __device__ cuFloatComplex
-operator/(float a, const cuFloatComplex& b) {
-  float denom = cuCrealf(b) * cuCrealf(b) + cuCimagf(b) * cuCimagf(b);
-  return make_cuFloatComplex(
-      (a * cuCrealf(b)) / denom, (-a * cuCimagf(b)) / denom);
-}
--- a/mlx/backend/cuda/device/unary_ops.cuh
+++ b/mlx/backend/cuda/device/unary_ops.cuh
@@ -14,8 +14,6 @@ struct Abs {
  __device__ T operator()(T x) {
    if constexpr (cuda::std::is_unsigned_v<T>) {
      return x;
-    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      return {sqrt(cuCrealf(x) * cuCrealf(x) + cuCimagf(x) * cuCimagf(x)), 0};
    } else {
      return abs(x);
    }
@@ -27,8 +25,6 @@ struct ArcCos {
  __device__ T operator()(T x) {
    return acos(x);
  }
-
-  __device__ cuComplex operator()(cuComplex x);
 };

 struct ArcCosh {
@@ -43,8 +39,6 @@ struct ArcSin {
  __device__ T operator()(T x) {
    return asin(x);
  }
-
-  __device__ cuComplex operator()(cuComplex x);
 };

 struct ArcSinh {
@@ -59,8 +53,6 @@ struct ArcTan {
  __device__ T operator()(T x) {
    return atan(x);
  }
-
-  __device__ cuComplex operator()(cuComplex x);
 };

 struct ArcTanh {
@@ -82,6 +74,8 @@ struct Ceil {
  __device__ T operator()(T x) {
    if constexpr (cuda::std::is_integral_v<T>) {
      return x;
+    } else if constexpr (is_complex_v<T>) {
+      return T{ceil(x.real()), ceil(x.imag())};
    } else {
      return ceil(x);
    }
@@ -89,34 +83,23 @@ struct Ceil {
 };

 struct Conjugate {
-  __device__ cuComplex operator()(cuComplex x) {
-    return {cuCrealf(x), -cuCimagf(x)};
+  template <typename T>
+  __device__ complex_t<T> operator()(complex_t<T> x) {
+    return conj(x);
  }
 };

 struct Cos {
  template <typename T>
  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      return {
-          cos(cuCrealf(x)) * cosh(cuCimagf(x)),
-          -sin(cuCrealf(x)) * sinh(cuCimagf(x))};
-    } else {
-      return cos(x);
-    }
+    return cos(x);
  }
 };

 struct Cosh {
  template <typename T>
  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      return {
-          cosh(cuCrealf(x)) * cos(cuCimagf(x)),
-          sinh(cuCrealf(x)) * sin(cuCimagf(x))};
-    } else {
-      return cosh(x);
-    }
+    return cosh(x);
  }
 };

@@ -149,12 +132,7 @@ struct ErfInv {
 struct Exp {
  template <typename T>
  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      auto m = exp(cuCrealf(x));
-      return {m * cos(cuCimagf(x)), m * sinh(cuCimagf(x))};
-    } else {
-      return exp(x);
-    }
+    return exp(x);
  }
 };

@@ -176,6 +154,8 @@ struct Floor {
  __device__ T operator()(T x) {
    if constexpr (cuda::std::is_integral_v<T>) {
      return x;
+    } else if constexpr (is_complex_v<T>) {
+      return T{floor(x.real()), floor(x.imag())};
    } else {
      return floor(x);
    }
@@ -183,30 +163,25 @@ struct Floor {
 };

 struct Imag {
-  __device__ float operator()(cuComplex x) {
-    return cuCimagf(x);
+  template <typename T>
+  __device__ auto operator()(complex_t<T> x) {
+    return x.imag();
  }
 };

 struct Log {
  template <typename T>
  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      auto r = log(cuCrealf(Abs{}(x)));
-      auto i = atan2f(cuCimagf(x), cuCrealf(x));
-      return {r, i};
-    } else {
-      return log(x);
-    }
+    return log(x);
  }
 };

 struct Log2 {
  template <typename T>
  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+    if constexpr (is_complex_v<T>) {
      auto y = Log{}(x);
-      return {cuCrealf(y) / CUDART_LN2_F, cuCimagf(y) / CUDART_LN2_F};
+      return {y.real() / CUDART_LN2_F, y.imag() / CUDART_LN2_F};
    } else {
      return log2(x);
    }
@@ -216,20 +191,31 @@ struct Log2 {
 struct Log10 {
  template <typename T>
  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      auto y = Log{}(x);
-      return {cuCrealf(y) / CUDART_LNT_F, cuCimagf(y) / CUDART_LNT_F};
-      return y;
-    } else {
-      return log10(x);
-    }
+    return log10(x);
  }
 };

 struct Log1p {
  template <typename T>
-  __device__ T operator()(T x) {
-    return log1p(x);
+  __device__ T operator()(T z) {
+    if constexpr (is_complex_v<T>) {
+      float x = z.real(); // intermediates are float whatever T is
+      float y = z.imag();
+      float zabs = Abs{}(z).real(); // |z| as computed by Abs
+      float theta = atan2f(y, x + 1); // arg(1 + z): imaginary part of result
+      if (zabs < 0.5f) {
+        float r = x * (2 + x) + y * y; // equals |1 + z|^2 - 1
+        if (r == 0) { // handle underflow
+          return {x, theta};
+        }
+        return {0.5f * log1pf(r), theta}; // 0.5 * log(|1 + z|^2)
+      } else {
+        float z0 = hypotf(x + 1, y); // |1 + z|
+        return {logf(z0), theta};
+      }
+    } else {
+      return log1p(z);
+    }
  }
 };

@@ -242,8 +228,8 @@ struct LogicalNot {
 struct Negative {
  template <typename T>
  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      return 0 - x;
+    if constexpr (is_complex_v<T>) {
+      return T{0, 0} - x;
    } else {
      return -x;
    }
@@ -251,16 +237,17 @@ struct Negative {
 };

 struct Real {
-  __device__ float operator()(cuComplex x) {
-    return cuCrealf(x);
+  template <typename T>
+  __device__ auto operator()(complex_t<T> x) {
+    return x.real();
  }
 };

 struct Round {
  template <typename T>
  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      return {rint(cuCrealf(x)), rint(cuCimagf(x))};
+    if constexpr (is_complex_v<T>) {
+      return {rint(x.real()), rint(x.imag())};
    } else {
      return rint(x);
    }
@@ -280,8 +267,8 @@ struct Sign {
  __device__ T operator()(T x) {
    if constexpr (cuda::std::is_unsigned_v<T>) {
      return x != 0;
-    } else if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      if (cuCrealf(x) == 0 && cuCimagf(x) == 0) {
+    } else if constexpr (is_complex_v<T>) {
+      if (x.real() == 0 && x.imag() == 0) {
        return x;
      } else {
        return x / Abs()(x);
@@ -297,26 +284,14 @@ struct Sign {
 struct Sin {
  template <typename T>
  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      return {
-          sin(cuCrealf(x)) * cosh(cuCimagf(x)),
-          cos(cuCrealf(x)) * sinh(cuCimagf(x))};
-    } else {
-      return sin(x);
-    }
+    return sin(x);
  }
 };

 struct Sinh {
  template <typename T>
  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      return {
-          sinh(cuCrealf(x)) * cos(cuCimagf(x)),
-          cosh(cuCrealf(x)) * sin(cuCimagf(x))};
-    } else {
-      return sinh(x);
-    }
+    return sinh(x);
  }
 };

@@ -332,77 +307,31 @@ struct Sqrt {
  __device__ T operator()(T x) {
    return sqrt(x);
  }
-
-  __device__ cuComplex operator()(cuComplex x) {
-    auto xr = cuCrealf(x);
-    auto xi = cuCimagf(x);
-    if (xr == 0.0f && xi == 0.0f) {
-      return {0.0f, 0.0f};
-    }
-    auto r = cuCrealf(Abs{}(x));
-    auto a = sqrt((r + xr) / 2.0f);
-    auto b_abs = sqrt((r - xr) / 2.0f);
-    auto b = copysign(b_abs, xi);
-    return {a, b};
-  }
 };

 struct Rsqrt {
  template <typename T>
  __device__ T operator()(T x) {
-    return rsqrt(x);
-  }
-  __device__ cuComplex operator()(cuComplex x) {
-    return 1.0f / Sqrt{}(x);
+    if constexpr (is_complex_v<T>) {
+      return 1.0f / Sqrt{}(x);
+    } else {
+      return rsqrt(x);
+    }
  }
 };

 struct Tan {
  template <typename T>
  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      float tan_a = tan(cuCrealf(x));
-      float tanh_b = tanh(cuCimagf(x));
-      float t1 = tan_a * tanh_b;
-      float denom = 1. + t1 * t1;
-      return {(tan_a - tanh_b * t1) / denom, (tanh_b + tan_a * t1) / denom};
-    } else {
-      return tan(x);
-    }
+    return tan(x);
  }
 };

 struct Tanh {
  template <typename T>
  __device__ T operator()(T x) {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
-      float tanh_a = tanh(cuCrealf(x));
-      float tan_b = tan(cuCimagf(x));
-      float t1 = tanh_a * tan_b;
-      float denom = 1. + t1 * t1;
-      return {(tanh_a + tan_b * t1) / denom, (tan_b - tanh_a * t1) / denom};
-    } else {
-      return tanh(x);
-    }
+    return tanh(x);
  }
 };

-__device__ cuComplex ArcCos::operator()(cuComplex x) {
-  auto i = cuComplex{0.0, 1.0};
-  auto y = Log{}(x + i * Sqrt{}(1.0 - x * x));
-  return {cuCimagf(y), -cuCrealf(y)};
-};
-
-__device__ cuComplex ArcSin::operator()(cuComplex x) {
-  auto i = cuComplex{0.0f, 1.0f};
-  auto y = Log{}(i * x + Sqrt{}(1.0f - x * x));
-  return {cuCimagf(y), -cuCrealf(y)};
-};
-
-__device__ cuComplex ArcTan::operator()(cuComplex x) {
-  auto i = cuComplex{0.0f, 1.0f};
-  auto ix = i * x;
-  return (1.0f / cuComplex{0.0f, 2.0f}) * Log{}((1.0f + ix) / (1.0f - ix));
-};
-
 } // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/utils.cuh
+++ b/mlx/backend/cuda/device/utils.cuh
@@ -8,9 +8,9 @@

 #pragma once

+#include "mlx/backend/cuda/device/complex.cuh"
 #include "mlx/backend/cuda/device/config.h"

-#include <cuComplex.h>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #include <cuda/std/array>
@@ -28,6 +28,27 @@ namespace mlx::core::cu {
 using Shape = cuda::std::array<int32_t, MAX_NDIM>;
 using Strides = cuda::std::array<int64_t, MAX_NDIM>;

+// Vectorized load/store.
+template <typename T, int N>
+struct alignas(sizeof(T) * N) AlignedVector {
+  T val[N];
+};
+
+template <int N, typename T>
+inline __device__ AlignedVector<T, N> load_vector(
+    const T* ptr,
+    uint32_t offset) {
+  auto* from = reinterpret_cast<const AlignedVector<T, N>*>(ptr);
+  return from[offset];
+}
+
+template <int N, typename T>
+inline __device__ void
+store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) {
+  auto* to = reinterpret_cast<AlignedVector<T, N>*>(ptr);
+  to[offset] = vec;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Type limits utils
 ///////////////////////////////////////////////////////////////////////////////
@@ -78,20 +99,20 @@ struct Limits<
    return cuda::std::numeric_limits<T>::infinity();
  }
  static constexpr __host__ __device__ T min() {
-#if defined(__CUDA_ARCH__) || CUDART_VERSION >= 12000
-    return -cuda::std::numeric_limits<T>::infinity();
-#else
+#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
    return -cuda::std::numeric_limits<float>::infinity();
+#else
+    return -cuda::std::numeric_limits<T>::infinity();
 #endif
  }
  static constexpr __host__ __device__ T finite_max() {
    return cuda::std::numeric_limits<T>::max();
  }
  static constexpr __host__ __device__ T finite_min() {
-#if defined(__CUDA_ARCH__) || CUDART_VERSION >= 12000
-    return cuda::std::numeric_limits<T>::lowest();
-#else
+#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
    return cuda::std::numeric_limits<float>::lowest();
+#else
+    return cuda::std::numeric_limits<T>::lowest();
 #endif
  }
 };
@@ -106,13 +127,13 @@ struct Limits<bool> {
  }
 };

-template <>
-struct Limits<cuComplex> {
-  static constexpr __host__ __device__ cuComplex max() {
-    return {Limits<float>::max(), Limits<float>::max()};
+template <typename T>
+struct Limits<complex_t<T>> {
+  static constexpr __host__ __device__ complex_t<T> max() {
+    return {Limits<T>::max(), Limits<T>::max()};
  }
-  static constexpr __host__ __device__ cuComplex min() {
-    return {Limits<float>::min(), Limits<float>::min()};
+  static constexpr __host__ __device__ complex_t<T> min() {
+    return {Limits<T>::min(), Limits<T>::min()};
  }
 };

@@ -338,21 +359,4 @@ struct LoopedElemToLoc<1, false, OffsetT> {
  }
 };

-inline __device__ cuComplex log1p(cuComplex in) {
-  float x = cuCrealf(in);
-  float y = cuCimagf(in);
-  float zabs = sqrt(x * x + y * y);
-  float theta = atan2f(y, x + 1);
-  if (zabs < 0.5f) {
-    float r = x * (2 + x) + y * y;
-    if (r == 0) { // handle underflow
-      return {x, theta};
-    }
-    return {0.5f * log1pf(r), theta};
-  } else {
-    auto z0 = sqrt((x + 1) * (x + 1) + y * y);
-    return {log(z0), theta};
-  }
-}
-
 } // namespace mlx::core::cu
--- a/mlx/backend/cuda/eval.cpp
+++ b/mlx/backend/cuda/eval.cpp
@@ -37,22 +37,20 @@ void eval(array& arr) {
  }

  auto& encoder = cu::get_command_encoder(arr.primitive().stream());
-  if (encoder.has_gpu_work()) {
-    // Keep used buffers alive until kernel finishes running.
-    std::unordered_set<std::shared_ptr<array::Data>> buffers;
-    for (auto& in : arr.inputs()) {
-      buffers.insert(in.data_shared_ptr());
-    }
-    for (auto& s : arr.siblings()) {
-      buffers.insert(s.data_shared_ptr());
-    }
-    // Remove the output if it was donated to by an input.
-    if (auto it = buffers.find(arr.data_shared_ptr()); it != buffers.end()) {
-      buffers.erase(it);
-    }
-    encoder.add_completed_handler([buffers = std::move(buffers)]() {});
+  // Keep used buffers alive until kernel finishes running.
+  std::unordered_set<std::shared_ptr<array::Data>> buffers;
+  for (auto& in : arr.inputs()) {
+    buffers.insert(in.data_shared_ptr());
  }
-  encoder.end_encoding();
+  for (auto& s : arr.siblings()) {
+    buffers.insert(s.data_shared_ptr());
+  }
+  // Remove the output if it was donated to by an input.
+  if (auto it = buffers.find(arr.data_shared_ptr()); it != buffers.end()) {
+    buffers.erase(it);
+  }
+  encoder.add_completed_handler([buffers = std::move(buffers)]() {});
+  encoder.maybe_commit();
 }

 void finalize(Stream s) {
--- a/mlx/backend/cuda/event.cu
+++ b/mlx/backend/cuda/event.cu
@@ -61,7 +61,9 @@ void CudaEvent::wait(Stream s) {
  if (s.device == mlx::core::Device::cpu) {
    scheduler::enqueue(s, [*this]() mutable { wait(); });
  } else {
-    wait(cu::get_stream(s).last_cuda_stream());
+    auto& enc = cu::get_command_encoder(s);
+    enc.commit();
+    wait(enc.stream());
  }
 }

@@ -74,7 +76,9 @@ void CudaEvent::record(Stream s) {
  if (s.device == mlx::core::Device::cpu) {
    throw std::runtime_error("CudaEvent can not wait on cpu stream.");
  } else {
-    record(cu::get_stream(s).last_cuda_stream());
+    auto& enc = cu::get_command_encoder(s);
+    enc.commit();
+    record(enc.stream());
  }
 }

@@ -86,8 +90,6 @@ bool CudaEvent::completed() const {
 // SharedEvent implementations
 ///////////////////////////////////////////////////////////////////////////////

-namespace {
-
 __host__ __device__ void event_wait(SharedEvent::Atomic* ac, uint64_t value) {
  uint64_t current;
  while ((current = ac->load()) < value) {
@@ -108,8 +110,6 @@ __global__ void event_signal_kernel(SharedEvent::Atomic* ac, uint64_t value) {
  event_signal(ac, value);
 }

-} // namespace
-
 SharedEvent::SharedEvent() {
  // Allocate cuda::atomic on managed memory.
  Atomic* ac;
@@ -136,11 +136,9 @@ void SharedEvent::wait(Stream s, uint64_t value) {
    scheduler::enqueue(s, [*this, value]() mutable { wait(value); });
  } else {
    auto& encoder = get_command_encoder(s);
-    encoder.launch_kernel(
-        encoder.stream().last_cuda_stream(),
-        [this, value](cudaStream_t stream) { wait(stream, value); });
+    encoder.commit();
+    wait(encoder.stream(), value);
    encoder.add_completed_handler([ac = ac_]() {});
-    encoder.end_encoding();
  }
 }

@@ -162,11 +160,9 @@ void SharedEvent::signal(Stream s, uint64_t value) {
    scheduler::enqueue(s, [*this, value]() mutable { signal(stream, value); });
  } else {
    auto& encoder = get_command_encoder(s);
-    encoder.launch_kernel(
-        encoder.stream().last_cuda_stream(),
-        [this, value](cudaStream_t stream) { signal(stream, value); });
+    encoder.commit();
+    signal(encoder.stream(), value);
    encoder.add_completed_handler([ac = ac_]() {});
-    encoder.end_encoding();
  }
 }

--- a/mlx/backend/cuda/indexing.cpp
+++ b/mlx/backend/cuda/indexing.cpp
@@ -3,13 +3,16 @@
 #include "mlx/backend/common/compiled.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/jit_module.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/backend/gpu/copy.h"
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"

 #include "cuda_jit_sources.h"

+#include <cuda.h>
 #include <fmt/format.h>
+#include <nvrtc.h>
 #include <nvtx3/nvtx3.hpp>

 #include <cassert>
@@ -22,7 +25,7 @@ namespace {
 constexpr const char* g_scatter_ops[] = {"Max", "Min", "Sum", "Prod", "Assign"};

 void append_indices_arg(
-    cu::JitModule& mod,
+    cu::KernelArgs& args,
    const std::vector<array>& inputs,
    int nidx,
    int idx_ndim) {
@@ -30,7 +33,7 @@ void append_indices_arg(
  for (int i = 0; i < nidx; ++i) {
    indices[i] = inputs[i + 1].data<void>();
  }
-  mod.append_arg(std::move(indices));
+  args.append(std::move(indices));
  std::vector<int32_t> indices_shape(nidx * idx_ndim);
  for (int i = 0; i < nidx; ++i) {
    std::copy_n(
@@ -38,7 +41,7 @@ void append_indices_arg(
        idx_ndim,
        indices_shape.data() + i * idx_ndim);
  }
-  mod.append_arg(std::move(indices_shape));
+  args.append(std::move(indices_shape));
  std::vector<int64_t> indices_strides(nidx * idx_ndim);
  for (int i = 0; i < nidx; ++i) {
    std::copy_n(
@@ -46,7 +49,7 @@ void append_indices_arg(
        idx_ndim,
        indices_strides.data() + i * idx_ndim);
  }
-  mod.append_arg(std::move(indices_strides));
+  args.append(std::move(indices_strides));
 }

 } // namespace
@@ -94,20 +97,21 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
    return std::make_pair(jit_source_gather, std::move(kernel_names));
  });

-  mod.append_arg(src);
-  mod.append_arg(out);
+  cu::KernelArgs args;
+  args.append(src);
+  args.append(out);
  if (large) {
-    mod.append_arg<int64_t>(out.size());
+    args.append<int64_t>(out.size());
  } else {
-    mod.append_arg<int32_t>(out.size());
+    args.append<int32_t>(out.size());
  }
-  mod.append_ndim_arg(src.shape());
-  mod.append_ndim_arg(src.strides());
-  mod.append_arg<int32_t>(src.ndim());
-  mod.append_ndim_arg(slice_sizes_);
-  mod.append_arg(slice_size);
-  mod.append_arg(axes_);
-  append_indices_arg(mod, inputs, nidx, idx_ndim);
+  args.append_ndim(src.shape());
+  args.append_ndim(src.strides());
+  args.append<int32_t>(src.ndim());
+  args.append_ndim(slice_sizes_);
+  args.append(slice_size);
+  args.append(axes_);
+  append_indices_arg(args, inputs, nidx, idx_ndim);

  std::string kernel_name = fmt::format(
      "mlx::core::cu::gather<{}, {}, {}, {}, {}>",
@@ -122,9 +126,10 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    mod.launch_kernel(stream, kernel_name, out, large);
-  });
+
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }

 void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -187,26 +192,27 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
    return std::make_pair(jit_source_scatter, std::move(kernel_names));
  });

-  mod.append_arg(upd);
-  mod.append_arg(out);
+  cu::KernelArgs args;
+  args.append(upd);
+  args.append(out);
  if (large) {
-    mod.append_arg<int64_t>(upd.size());
+    args.append<int64_t>(upd.size());
  } else {
-    mod.append_arg<int32_t>(upd.size());
+    args.append<int32_t>(upd.size());
  }
-  mod.append_ndim_arg(upd.shape());
-  mod.append_ndim_arg(upd.strides());
-  mod.append_arg<int32_t>(upd.ndim());
+  args.append_ndim(upd.shape());
+  args.append_ndim(upd.strides());
+  args.append<int32_t>(upd.ndim());
  if (large) {
-    mod.append_arg<int64_t>(upd_post_idx_size);
+    args.append<int64_t>(upd_post_idx_size);
  } else {
-    mod.append_arg<int32_t>(upd_post_idx_size);
+    args.append<int32_t>(upd_post_idx_size);
  }
-  mod.append_ndim_arg(out.shape());
-  mod.append_ndim_arg(out.strides());
-  mod.append_arg<int32_t>(out.ndim());
-  mod.append_arg(axes_);
-  append_indices_arg(mod, inputs, nidx, idx_ndim);
+  args.append_ndim(out.shape());
+  args.append_ndim(out.strides());
+  args.append<int32_t>(out.ndim());
+  args.append(axes_);
+  append_indices_arg(args, inputs, nidx, idx_ndim);

  std::string kernel_name = fmt::format(
      "mlx::core::cu::scatter<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}>",
@@ -222,9 +228,9 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    mod.launch_kernel(stream, kernel_name, upd, large);
-  });
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, upd, large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }

 void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -275,25 +281,26 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  }
  size_t idx_size_axis = idx.shape(axis_);

-  mod.append_arg(src);
-  mod.append_arg(idx);
-  mod.append_arg(out);
+  cu::KernelArgs args;
+  args.append(src);
+  args.append(idx);
+  args.append(out);
  if (large) {
-    mod.append_arg<int64_t>(idx_size_pre);
-    mod.append_arg<int64_t>(idx_size_axis);
-    mod.append_arg<int64_t>(idx_size_post);
+    args.append<int64_t>(idx_size_pre);
+    args.append<int64_t>(idx_size_axis);
+    args.append<int64_t>(idx_size_post);
  } else {
-    mod.append_arg<int32_t>(idx_size_pre);
-    mod.append_arg<int32_t>(idx_size_axis);
-    mod.append_arg<int32_t>(idx_size_post);
+    args.append<int32_t>(idx_size_pre);
+    args.append<int32_t>(idx_size_axis);
+    args.append<int32_t>(idx_size_post);
  }
-  mod.append_arg(remove_index(idx.shape(), axis_));
-  mod.append_arg(remove_index(src.strides(), axis_));
-  mod.append_arg(remove_index(idx.strides(), axis_));
-  mod.append_arg<int32_t>(axis_);
-  mod.append_arg(src.shape(axis_));
-  mod.append_arg(src.strides(axis_));
-  mod.append_arg(idx.strides(axis_));
+  args.append(remove_index(idx.shape(), axis_));
+  args.append(remove_index(src.strides(), axis_));
+  args.append(remove_index(idx.strides(), axis_));
+  args.append<int32_t>(axis_);
+  args.append(src.shape(axis_));
+  args.append(src.strides(axis_));
+  args.append(idx.strides(axis_));

  std::string kernel_name = fmt::format(
      "mlx::core::cu::gather_axis<{}, {}, {}, {}, {}, {}>",
@@ -309,9 +316,9 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    mod.launch_kernel(stream, kernel_name, idx, large);
-  });
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, idx, large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }

 void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -377,25 +384,26 @@ void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  }
  size_t idx_size_axis = idx.shape(axis_);

-  mod.append_arg(upd);
-  mod.append_arg(idx);
-  mod.append_arg(out);
+  cu::KernelArgs args;
+  args.append(upd);
+  args.append(idx);
+  args.append(out);
  if (large) {
-    mod.append_arg<int64_t>(idx_size_pre);
-    mod.append_arg<int64_t>(idx_size_axis);
-    mod.append_arg<int64_t>(idx_size_post);
+    args.append<int64_t>(idx_size_pre);
+    args.append<int64_t>(idx_size_axis);
+    args.append<int64_t>(idx_size_post);
  } else {
-    mod.append_arg<int32_t>(idx_size_pre);
-    mod.append_arg<int32_t>(idx_size_axis);
-    mod.append_arg<int32_t>(idx_size_post);
+    args.append<int32_t>(idx_size_pre);
+    args.append<int32_t>(idx_size_axis);
+    args.append<int32_t>(idx_size_post);
  }
-  mod.append_arg(remove_index(idx.shape(), axis_));
-  mod.append_arg(remove_index(upd.strides(), axis_));
-  mod.append_arg(remove_index(idx.strides(), axis_));
-  mod.append_arg<int32_t>(axis_);
-  mod.append_arg(out.shape(axis_));
-  mod.append_arg(upd.strides(axis_));
-  mod.append_arg(idx.strides(axis_));
+  args.append(remove_index(idx.shape(), axis_));
+  args.append(remove_index(upd.strides(), axis_));
+  args.append(remove_index(idx.strides(), axis_));
+  args.append<int32_t>(axis_);
+  args.append(out.shape(axis_));
+  args.append(upd.strides(axis_));
+  args.append(idx.strides(axis_));

  std::string kernel_name = fmt::format(
      "mlx::core::cu::scatter_axis<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}, {}>",
@@ -412,9 +420,9 @@ void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    mod.launch_kernel(stream, kernel_name, idx, large);
-  });
+  auto kernel = mod.get_kernel(kernel_name);
+  auto [num_blocks, block_dims] = get_launch_args(kernel, idx, large);
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/jit_module.cpp
+++ b/mlx/backend/cuda/jit_module.cpp
@@ -2,6 +2,7 @@

 #include "mlx/backend/cuda/jit_module.h"
 #include "mlx/backend/cuda/device.h"
+#include "mlx/version.h"

 #include "cuda_jit_sources.h"

@@ -12,6 +13,7 @@

 #include <fmt/format.h>
 #include <nvrtc.h>
+#include <unistd.h>

 namespace mlx::core::cu {

@@ -26,16 +28,6 @@ void check_nvrtc_error(const char* name, nvrtcResult err) {
  }
 }

-#define CHECK_CU_ERROR(cmd) check_cu_error(#cmd, (cmd))
-
-void check_cu_error(const char* name, CUresult err) {
-  if (err != CUDA_SUCCESS) {
-    const char* err_str = "Unknown error";
-    cuGetErrorString(err, &err_str);
-    throw std::runtime_error(fmt::format("{} failed: {}", name, err_str));
-  }
-}
-
 // Return the location of the CUDA toolkit.
 const std::string& cuda_home() {
  static std::string home = []() -> std::string {
@@ -59,14 +51,25 @@ const std::string& cuda_home() {
  return home;
 }

+// Set |out| to the include flag for bundled CCCL headers; false if not found.
+bool get_cccl_include(std::string* out) {
+  auto cccl_headers = current_binary_dir().parent_path() / "include" / "cccl";
+  if (!std::filesystem::exists(cccl_headers)) {
+    return false;
+  }
+  *out = fmt::format("--include-path={}", cccl_headers.string());
+  return true;
+}
+
 // Get the cache directory for storing compiled results.
 const std::filesystem::path& ptx_cache_dir() {
  static std::filesystem::path cache = []() -> std::filesystem::path {
    std::filesystem::path cache;
-    if (auto c = std::getenv("MLX_PTX_CACHE"); c) {
+    if (auto c = std::getenv("MLX_PTX_CACHE_DIR"); c) {
      cache = c;
    } else {
-      cache = std::filesystem::temp_directory_path() / "mlx" / "ptx";
+      cache =
+          std::filesystem::temp_directory_path() / "mlx" / version() / "ptx";
    }
    if (!std::filesystem::exists(cache)) {
      std::error_code error;
@@ -170,7 +173,7 @@ constexpr const char* g_include_names[] = {
    INCLUDE_PREFIX "binary_ops.cuh",
    INCLUDE_PREFIX "cast_op.cuh",
    INCLUDE_PREFIX "config.h",
-    INCLUDE_PREFIX "cucomplex_math.cuh",
+    INCLUDE_PREFIX "complex.cuh",
    INCLUDE_PREFIX "fp16_math.cuh",
    INCLUDE_PREFIX "indexing.cuh",
    INCLUDE_PREFIX "scatter_ops.cuh",
@@ -186,7 +189,7 @@ constexpr const char* g_headers[] = {
    jit_source_binary_ops,
    jit_source_cast_op,
    jit_source_config,
-    jit_source_cucomplex_math,
+    jit_source_complex,
    jit_source_fp16_math,
    jit_source_indexing,
    jit_source_scatter_ops,
@@ -223,16 +226,23 @@ JitModule::JitModule(
    }

    // Compile program.
+    std::vector<const char*> args;
    bool use_sass = compiler_supports_device_sass(device);
    std::string compute = fmt::format(
        "--gpu-architecture={}_{}{}",
        use_sass ? "sm" : "compute",
        device.compute_capability_major(),
        device.compute_capability_minor());
-    std::string include = fmt::format("--include-path={}/include", cuda_home());
-    const char* args[] = {compute.c_str(), include.c_str()};
+    args.push_back(compute.c_str());
+    std::string cccl_include;
+    if (get_cccl_include(&cccl_include)) {
+      args.push_back(cccl_include.c_str());
+    }
+    std::string cuda_include =
+        fmt::format("--include-path={}/include", cuda_home());
+    args.push_back(cuda_include.c_str());
    nvrtcResult compile_result =
-        nvrtcCompileProgram(prog, std::size(args), args);
+        nvrtcCompileProgram(prog, args.size(), args.data());
    if (compile_result != NVRTC_SUCCESS) {
      size_t log_size;
      CHECK_NVRTC_ERROR(nvrtcGetProgramLogSize(prog, &log_size));
@@ -280,60 +290,13 @@ JitModule::JitModule(
  // Load kernels.
  for (const auto& [name, mangled] : ptx_kernels) {
    CUfunction kernel;
-    CHECK_CU_ERROR(cuModuleGetFunction(&kernel, module_, mangled.c_str()));
+    CHECK_CUDA_ERROR(cuModuleGetFunction(&kernel, module_, mangled.c_str()));
    kernels_[name] = kernel;
  }
 }

 JitModule::~JitModule() {
-  CHECK_CU_ERROR(cuModuleUnload(module_));
-}
-
-void JitModule::launch_kernel(
-    CUstream stream,
-    const std::string& kernel_name,
-    const array& arr,
-    bool large,
-    int work_per_thread) {
-  CUfunction kernel = get_kernel(kernel_name);
-  size_t nthreads = cuda::ceil_div(arr.size(), work_per_thread);
-  int _, block_dim;
-  CHECK_CU_ERROR(
-      cuOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel, 0, 0, 0));
-  if (block_dim > nthreads) {
-    block_dim = nthreads;
-  }
-  Dims num_blocks{1, 1, 1};
-  if (large) {
-    num_blocks =
-        get_2d_grid_dims_common(arr.shape(), arr.strides(), work_per_thread);
-    std::get<0>(num_blocks) =
-        (std::get<0>(num_blocks) + block_dim - 1) / block_dim;
-  } else {
-    std::get<0>(num_blocks) = (nthreads + block_dim - 1) / block_dim;
-  }
-  launch_kernel(stream, kernel, num_blocks, Dims{block_dim, 1, 1});
-}
-
-void JitModule::launch_kernel(
-    CUstream stream,
-    CUfunction kernel,
-    Dims num_blocks,
-    Dims block_dims) {
-  CHECK_CU_ERROR(cuLaunchKernel(
-      kernel,
-      std::get<0>(num_blocks),
-      std::get<1>(num_blocks),
-      std::get<2>(num_blocks),
-      std::get<0>(block_dims),
-      std::get<1>(block_dims),
-      std::get<2>(block_dims),
-      0,
-      stream,
-      args_.data(),
-      nullptr));
-  args_.clear();
-  storage_.clear();
+  CHECK_CUDA_ERROR(cuModuleUnload(module_));
 }

 CUfunction JitModule::get_kernel(const std::string& kernel_name) {
@@ -345,10 +308,6 @@ CUfunction JitModule::get_kernel(const std::string& kernel_name) {
  return it->second;
 }

-void JitModule::append_ptr_arg(const void* v) {
-  args_.push_back(const_cast<void*>(v));
-}
-
 JitModule& get_jit_module(
    const mlx::core::Device& device,
    const std::string& name,
--- a/mlx/backend/cuda/jit_module.h
+++ b/mlx/backend/cuda/jit_module.h
@@ -4,6 +4,7 @@

 #include "mlx/array.h"
 #include "mlx/backend/common/utils.h"
+#include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/device/config.h"

 #include <deque>
@@ -23,72 +24,48 @@ using KernelBuilderResult = std::pair<
    /* kernel names */ std::vector<std::string>>;
 using KernelBuilder = std::function<KernelBuilderResult()>;

-class JitModule {
- public:
-  JitModule(
-      Device& device,
-      const std::string& module_name,
-      const KernelBuilder& builder);
-  ~JitModule();
+struct KernelArgs {
+  void** args() {
+    return args_.data();
+  }

-  JitModule(const JitModule&) = delete;
-  JitModule& operator=(const JitModule&) = delete;
-
-  void append_arg(const array& a) {
-    append_arg(reinterpret_cast<CUdeviceptr>(a.data<void>()));
+  void append(const array& a) {
+    append(reinterpret_cast<CUdeviceptr>(a.data<void>()));
  }

  template <typename T>
-  void append_arg(T val) {
+  void append(T val) {
    storage_.emplace_back(val);
-    append_ptr_arg(&storage_.back());
+    append_ptr(&storage_.back());
  }

  template <typename T>
-  void append_arg(std::vector<T> vec) {
+  void append(std::vector<T> vec) {
    if (vec.empty()) {
      // The nullptr can not be used as arg, pass something not null.
-      append_arg(std::monostate{});
+      append(std::monostate{});
    } else {
-      append_ptr_arg(vec.data());
+      append_ptr(vec.data());
      storage_.emplace_back(std::move(vec));
    }
  }

  // Make sure the arg is copied to an array with size of NDIM.
  template <size_t NDIM = MAX_NDIM, typename T>
-  void append_ndim_arg(const std::vector<T>& vec) {
+  void append_ndim(std::vector<T> vec) {
    if (vec.size() > NDIM) {
      throw std::runtime_error(
          fmt::format("ndim can not be larger than {}.", NDIM));
    }
-    std::vector<T> copied(NDIM);
-    std::copy(vec.begin(), vec.end(), copied.data());
-    append_arg(std::move(copied));
+    vec.resize(NDIM);
+    append(std::move(vec));
  }

-  // Launch kernel with |kernel_name| that each thread works on
-  // |work_per_thread| elements of |arr|.
-  void launch_kernel(
-      CUstream stream,
-      const std::string& kernel_name,
-      const array& arr,
-      bool large,
-      int work_per_thread = 1);
-
-  void launch_kernel(
-      CUstream stream,
-      CUfunction kernel,
-      Dims num_blocks,
-      Dims block_dims);
-
-  CUfunction get_kernel(const std::string& kernel_name);
+  void append_ptr(const void* v) {
+    args_.push_back(const_cast<void*>(v));
+  }

 private:
-  void append_ptr_arg(const void* v);
-
-  CUmodule module_{nullptr};
-  std::unordered_map<std::string, CUfunction> kernels_;
  std::vector<void*> args_;

  // The cuLaunchKernel API requires passing pointers to arguments so store
@@ -105,6 +82,23 @@ class JitModule {
  std::deque<Arg> storage_;
 };

+class JitModule {
+ public:
+  JitModule(
+      Device& device,
+      const std::string& module_name,
+      const KernelBuilder& builder);
+  ~JitModule(); // Unloads module_.
+
+  JitModule(const JitModule&) = delete;
+  JitModule& operator=(const JitModule&) = delete;
+  CUfunction get_kernel(const std::string& kernel_name);
+
+ private:
+  CUmodule module_{nullptr}; // Released with cuModuleUnload in ~JitModule().
+  std::unordered_map<std::string, CUfunction> kernels_; // Kernels by name.
+};
+
 JitModule& get_jit_module(
    const mlx::core::Device& device,
    const std::string& name,
--- a/mlx/backend/cuda/kernel_utils.cuh
+++ b/mlx/backend/cuda/kernel_utils.cuh
@@ -11,7 +11,7 @@
 #include "mlx/array.h"
 #include "mlx/backend/cuda/device/utils.cuh"

-#include <cuComplex.h>
+#include <cuda.h>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #include <fmt/format.h>
@@ -78,7 +78,7 @@ struct CTypeToCudaType<bfloat16_t> {

 template <>
 struct CTypeToCudaType<complex64_t> {
-  using type = cuComplex;
+  using type = cu::complex64_t;
 };

 template <typename T>
@@ -90,10 +90,14 @@ inline constexpr bool is_floating_v =
    cuda::std::is_same_v<T, float> || cuda::std::is_same_v<T, double> ||
    cuda::std::is_same_v<T, float16_t> || cuda::std::is_same_v<T, bfloat16_t>;

+// Type traits for detecting complex numbers.
+template <typename T>
+inline constexpr bool is_complex_v = cuda::std::is_same_v<T, complex64_t> ||
+    cuda::std::is_same_v<T, complex128_t>;
+
 // Type traits for detecting complex or real floating point numbers.
 template <typename T>
-inline constexpr bool is_inexact_v =
-    is_floating_v<T> || cuda::std::is_same_v<T, complex64_t>;
+inline constexpr bool is_inexact_v = is_floating_v<T> || is_complex_v<T>;

 // Utility to copy data from vector to array in host.
 template <int NDIM = MAX_NDIM, typename T = int32_t>
@@ -120,7 +124,13 @@ std::pair<dim3, dim3> get_grid_and_block(int dim0, int dim1, int dim2);
 template <typename T>
 inline uint max_occupancy_block_dim(T kernel) {
  int _, block_dim;
-  CHECK_CUDA_ERROR(cudaOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel));
+  if constexpr (std::is_same_v<T, CUfunction>) { // Driver API kernel handle.
+    CHECK_CUDA_ERROR(
+        cuOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel, 0, 0, 0));
+  } else { // Otherwise use the runtime API query.
+    CHECK_CUDA_ERROR(
+        cudaOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel));
+  }
  return block_dim;
 }

--- a/mlx/backend/cuda/layer_norm.cu
+++ b/mlx/backend/cuda/layer_norm.cu
@@ -258,23 +258,23 @@ void LayerNorm::eval_gpu(
  encoder.set_input_array(w);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_float_types(out.dtype(), "layernorm", [&](auto type_tag) {
-      constexpr uint32_t N_READS = 4;
-      dispatch_block_dim(
-          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-            auto kernel = cu::layer_norm<DataType, block_dim(), N_READS>;
-            kernel<<<n_rows, block_dim(), 0, stream>>>(
-                x.data<DataType>(),
-                w.data<DataType>(),
-                b.data<DataType>(),
-                out.data<DataType>(),
-                eps_,
-                axis_size,
-                w_stride,
-                b_stride);
-          });
+  dispatch_float_types(out.dtype(), "layernorm", [&](auto type_tag) {
+    constexpr uint32_t N_READS = 4;
+    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      auto kernel = cu::layer_norm<DataType, block_dim(), N_READS>;
+      encoder.add_kernel_node(
+          kernel,
+          n_rows,
+          block_dim(),
+          x.data<DataType>(),
+          w.data<DataType>(),
+          b.data<DataType>(),
+          out.data<DataType>(),
+          eps_,
+          axis_size,
+          w_stride,
+          b_stride);
    });
  });
 }
@@ -289,21 +289,25 @@ void LayerNormVJP::eval_gpu(
  // Ensure row contiguity. We could relax this step by checking that the array
  // is contiguous (no broadcasts or holes) and that the input strides are the
  // same as the cotangent strides but for now this is simpler.
-  auto check_input = [&s](const array& x) -> std::pair<array, bool> {
+  auto check_input = [&s](const array& x, bool& copied) {
    if (x.flags().row_contiguous) {
-      return {x, false};
+      copied = false;
+      return x;
    }
+    copied = true;
    array x_copy(x.shape(), x.dtype(), nullptr, {});
    copy_gpu(x, x_copy, CopyType::General, s);
-    return {x_copy, true};
+    return x_copy;
  };
  bool donate_x = inputs[0].is_donatable();
  bool donate_g = inputs[3].is_donatable();
-  auto [x, copied] = check_input(inputs[0]);
+  bool copied;
+  auto x = check_input(inputs[0], copied);
  donate_x |= copied;
  const array& w = inputs[1];
  const array& b = inputs[2];
-  auto [g, g_copied] = check_input(inputs[3]);
+  bool g_copied;
+  auto g = check_input(inputs[3], g_copied);
  donate_g |= g_copied;
  array& gx = outputs[0];
  array& gw = outputs[1];
@@ -334,8 +338,10 @@ void LayerNormVJP::eval_gpu(
  // gradient accumulators.
  array gw_temp =
      (has_w) ? array({n_rows, x.shape().back()}, gw.dtype(), nullptr, {}) : w;
+  bool g_in_gw = false;
  if (has_w) {
    if (!g_in_gx && donate_g) {
+      g_in_gw = true;
      gw_temp.copy_shared_buffer(g);
    } else {
      gw_temp.set_data(allocator::malloc(gw_temp.nbytes()));
@@ -343,41 +349,47 @@ void LayerNormVJP::eval_gpu(
    }
  }

-  // Finish with the gradient for b in case we had a b.
-  if (gb.ndim() == 1 && gb.size() == axis_size) {
+  // The gradient for b in case we had a b.
+  bool has_gb = (gb.ndim() == 1 && gb.size() == axis_size);
+  if (has_gb) {
    ReductionPlan plan(
        ReductionOpType::ContiguousStridedReduce, {n_rows}, {axis_size});
    col_reduce(encoder, g, gb, Reduce::ReduceType::Sum, {0}, plan);
  }

+  // g's buffer may be reused for gx/gw_temp: order this after the gb reduction.
+  if ((g_in_gx || g_in_gw) && has_gb) {
+    encoder.set_input_array(gb);
+  }
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(g);
  encoder.set_output_array(gx);
  encoder.set_output_array(gw_temp);
-  encoder.launch_kernel([&, x = x, g = g](cudaStream_t stream) {
-    dispatch_float_types(gx.dtype(), "layernorm_vjp", [&](auto type_tag) {
-      dispatch_bool(has_w, [&](auto has_w_constant) {
-        constexpr int N_READS = 4;
-        dispatch_block_dim(
-            cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-              using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-              auto kernel = cu::layer_norm_vjp<
-                  DataType,
-                  has_w_constant(),
-                  block_dim(),
-                  N_READS>;
-              kernel<<<n_rows, block_dim(), 0, stream>>>(
-                  x.data<DataType>(),
-                  w.data<DataType>(),
-                  g.data<DataType>(),
-                  gx.data<DataType>(),
-                  gw_temp.data<DataType>(),
-                  eps_,
-                  axis_size,
-                  w_stride);
-            });
-      });
+  dispatch_float_types(gx.dtype(), "layernorm_vjp", [&](auto type_tag) {
+    dispatch_bool(has_w, [&](auto has_w_constant) {
+      constexpr int N_READS = 4;
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+            auto kernel = cu::layer_norm_vjp<
+                DataType,
+                has_w_constant.value,
+                block_dim(),
+                N_READS>;
+            encoder.add_kernel_node(
+                kernel,
+                n_rows,
+                block_dim(),
+                x.data<DataType>(),
+                w.data<DataType>(),
+                g.data<DataType>(),
+                gx.data<DataType>(),
+                gw_temp.data<DataType>(),
+                eps_,
+                axis_size,
+                w_stride);
+          });
    });
  });

--- a/mlx/backend/cuda/logsumexp.cu
+++ b/mlx/backend/cuda/logsumexp.cu
@@ -143,16 +143,18 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {

  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_float_types(out.dtype(), "logsumexp", [&](auto type_tag) {
-      constexpr int N_READS = 4;
-      dispatch_block_dim(
-          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-            auto kernel = cu::logsumexp<DataType, float, block_dim(), N_READS>;
-            kernel<<<n_rows, block_dim(), 0, stream>>>(
-                in.data<DataType>(), out.data<DataType>(), axis_size);
-          });
+  dispatch_float_types(out.dtype(), "logsumexp", [&](auto type_tag) {
+    constexpr int N_READS = 4;
+    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      auto kernel = cu::logsumexp<DataType, float, block_dim(), N_READS>;
+      encoder.add_kernel_node(
+          kernel,
+          n_rows,
+          block_dim(),
+          in.data<DataType>(),
+          out.data<DataType>(),
+          axis_size);
    });
  });
 }
--- a/mlx/backend/cuda/matmul.cpp
+++ b/mlx/backend/cuda/matmul.cpp
@@ -42,7 +42,8 @@ class MatMul {
      int64_t ldb,
      int32_t batch_count,
      int64_t a_batch_stride,
-      int64_t b_batch_stride) {
+      int64_t b_batch_stride)
+      : handle_(device.lt_handle()) {
    heuristic_.state = CUBLAS_STATUS_NOT_INITIALIZED;

    auto scale_type = dtype_to_cuda_type(dtype);
@@ -147,7 +148,7 @@ class MatMul {
    if (heuristic_.state != CUBLAS_STATUS_SUCCESS) {
      int ret = 0;
      CHECK_CUBLAS_ERROR(cublasLtMatmulAlgoGetHeuristic(
-          encoder.device().lt_handle(),
+          handle_,
          matmul_desc_,
          a_desc_,
          b_desc_,
@@ -172,25 +173,24 @@ class MatMul {
      workspace_ptr = workspace.data<void>();
    }

-    encoder.launch_kernel([&](cudaStream_t stream) {
-      CHECK_CUBLAS_ERROR(cublasLtMatmul(
-          encoder.device().lt_handle(),
-          matmul_desc_,
-          &alpha,
-          a,
-          a_desc_,
-          b,
-          b_desc_,
-          &beta,
-          c ? c : out,
-          c ? c_desc_ : out_desc_,
-          out,
-          out_desc_,
-          &heuristic_.algo,
-          workspace_ptr,
-          heuristic_.workspaceSize,
-          stream));
-    });
+    auto capture = encoder.capture_context();
+    CHECK_CUBLAS_ERROR(cublasLtMatmul(
+        handle_,
+        matmul_desc_,
+        &alpha,
+        a,
+        a_desc_,
+        b,
+        b_desc_,
+        &beta,
+        c ? c : out,
+        c ? c_desc_ : out_desc_,
+        out,
+        out_desc_,
+        &heuristic_.algo,
+        workspace_ptr,
+        heuristic_.workspaceSize,
+        encoder.stream()));
  }

 private:
@@ -259,6 +259,7 @@ class MatMul {
    return desc;
  }

+  cublasLtHandle_t handle_{nullptr};
  cublasLtMatmulDesc_t matmul_desc_{nullptr};
  cublasLtMatmulPreference_t pref_{nullptr};
  cublasLtMatrixLayout_t a_desc_{nullptr};
@@ -273,7 +274,7 @@ class MatMul {
 namespace {

 std::tuple<bool, int64_t, array>
-check_transpose(std::vector<array>& copies, const Stream& s, const array& arr) {
+check_transpose(cu::CommandEncoder& enc, const Stream& s, const array& arr) {
  auto stx = arr.strides()[arr.ndim() - 2];
  auto sty = arr.strides()[arr.ndim() - 1];
  if (sty == 1 && stx == arr.shape(-1)) {
@@ -283,7 +284,7 @@ check_transpose(std::vector<array>& copies, const Stream& s, const array& arr) {
  } else {
    array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
    copy_gpu(arr, arr_copy, CopyType::General, s);
-    copies.push_back(arr_copy);
+    enc.add_temporary(arr_copy);
    return std::make_tuple(false, arr.shape(-1), arr_copy);
  }
 }
@@ -317,13 +318,8 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {

  // Keep a vector with copies to be cleared in the completed buffer to release
  // the arrays
-  std::vector<array> copies;
-  auto [a_transposed, lda, a] = check_transpose(copies, s, a_pre);
-  auto [b_transposed, ldb, b] = check_transpose(copies, s, b_pre);
-
-  for (auto& temp : copies) {
-    encoder.add_temporary(temp);
-  }
+  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
+  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);

  /////////////////////////////////////////////////////////////////////////////
  // Check and collapse batch dimensions
@@ -348,7 +344,7 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
  // Invoke cublasLt

  cu::MatMul matmul(
-      encoder.device(),
+      cu::device(s.device),
      a.dtype(),
      a_transposed,
      M,
@@ -373,6 +369,7 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {

  ContiguousIterator a_it(batch_shape, a_batch_strides, batch_shape.size() - 1);
  ContiguousIterator b_it(batch_shape, b_batch_strides, batch_shape.size() - 1);
+  auto concurrent = encoder.concurrent_context();
  for (size_t i = 0; i < nbatch; ++i) {
    matmul.run(
        encoder,
@@ -405,14 +402,9 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {

  // Keep a vector with copies to be cleared in the completed buffer to release
  // the arrays
-  std::vector<array> copies;
-  auto [a_transposed, lda, a] = check_transpose(copies, s, a_pre);
-  auto [b_transposed, ldb, b] = check_transpose(copies, s, b_pre);
-  auto [c_transposed, ldc, c] = check_transpose(copies, s, c_pre);
-
-  for (auto& temp : copies) {
-    encoder.add_temporary(temp);
-  }
+  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
+  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);
+  auto [c_transposed, ldc, c] = check_transpose(encoder, s, c_pre);

  /////////////////////////////////////////////////////////////////////////////
  // Check and collapse batch dimensions
@@ -440,7 +432,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  // Invoke cublasLt

  cu::MatMul matmul(
-      encoder.device(),
+      cu::device(s.device),
      a.dtype(),
      a_transposed,
      M,
@@ -478,6 +470,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  ContiguousIterator a_it(batch_shape, a_batch_strides, batch_shape.size() - 1);
  ContiguousIterator b_it(batch_shape, b_batch_strides, batch_shape.size() - 1);
  ContiguousIterator c_it(batch_shape, c_batch_strides, batch_shape.size() - 1);
+  auto concurrent = encoder.concurrent_context();
  for (size_t i = 0; i < nbatch; ++i) {
    matmul.run(
        encoder,
--- a/mlx/backend/cuda/primitives.cu
+++ b/mlx/backend/cuda/primitives.cu
@@ -24,23 +24,21 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
  if (out.size() == 0) {
    return;
  }
-  auto& s = stream();
-  auto& encoder = cu::get_command_encoder(s);
+  auto& encoder = cu::get_command_encoder(stream());
  encoder.set_output_array(out);
-  encoder.launch_kernel([&, this](cudaStream_t stream) {
-    dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
-      using CTYPE = MLX_GET_TYPE(type_tag);
-      using OutType = cuda_type_t<CTYPE>;
-      CTYPE step =
-          static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
-      thrust::transform(
-          cu::thrust_policy(stream),
-          thrust::counting_iterator<uint32_t>(0),
-          thrust::counting_iterator<uint32_t>(out.data_size()),
-          thrust::device_pointer_cast(out.data<OutType>()),
-          cu::Arange<OutType>{
-              static_cast<OutType>(start_), static_cast<OutType>(step)});
-    });
+  auto capture = encoder.capture_context();
+  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
+    using CTYPE = MLX_GET_TYPE(type_tag);
+    using OutType = cuda_type_t<CTYPE>;
+    CTYPE step =
+        static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
+    thrust::transform(
+        cu::thrust_policy(encoder.stream()),
+        thrust::counting_iterator<uint32_t>(0),
+        thrust::counting_iterator<uint32_t>(out.data_size()),
+        thrust::device_pointer_cast(out.data<OutType>()),
+        cu::Arange<OutType>{
+            static_cast<OutType>(start_), static_cast<OutType>(step)});
  });
 }

@@ -84,7 +82,7 @@ NO_GPU(Load)
 NO_GPU_MULTI(LUF)
 NO_GPU_MULTI(QRF)
 NO_GPU(QuantizedMatmul)
-NO_GPU(Scan)
+NO_GPU(SegmentedMM)
 NO_GPU_MULTI(SVD)
 NO_GPU(Inverse)
 NO_GPU(Cholesky)
@@ -93,7 +91,6 @@ NO_GPU_MULTI(Eigh)

 namespace fast {
 NO_GPU(ScaledDotProductAttention)
-NO_GPU_MULTI(AffineQuantize)
 NO_GPU_MULTI(CustomKernel)
 } // namespace fast

--- a/mlx/backend/cuda/quantized.cu
+++ b/mlx/backend/cuda/quantized.cu
@@ -0,0 +1,387 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/backend/gpu/copy.h"
+#include "mlx/dtype_utils.h"
+#include "mlx/fast_primitives.h"
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core {
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+template <int bits, int wsize = 8>
+inline constexpr __device__ short get_pack_factor() { // values per pack
+  return (bits == 3 || bits == 5) ? 8 : (bits == 6 ? 4 : wsize / bits);
+} // 3- and 5-bit -> 8 values, 6-bit -> 4 values, otherwise wsize / bits
+
+template <int bits, int wsize = 8>
+inline constexpr __device__ short get_bytes_per_pack() { // bytes per pack
+  constexpr int power_of_2_bits = (bits & (bits - 1)) == 0;
+  return power_of_2_bits ? (wsize / 8) : (bits == 5 ? 5 : 3);
+} // power-of-two bits -> wsize / 8 bytes; 5-bit -> 5 bytes; otherwise 3 bytes
+
+// Affine-quantizes `w` in groups of `group_size` values to `bits` bits each.
+//
+// Every thread loads group_size / WARP_SIZE consecutive values, and a
+// warp-wide min/max reduction produces the scale and bias, so one warp covers
+// one group. Packed values are written to `out` as bytes; the thread whose
+// first element starts a group writes that group's entry in `scales` and
+// `biases`. `size` is the number of elements in `w`; threads whose first
+// element lies at or past `size` exit immediately.
+template <typename T, int group_size, int bits>
+__global__ void
+affine_quantize(const T* w, uint8_t* out, T* scales, T* biases, size_t size) {
+  auto block_size = cg::this_thread_block().dim_threads();
+  auto block_idx = cg::this_thread_block().group_index();
+  auto idx_in_block = cg::this_thread_block().thread_index();
+
+  // Global thread coordinates along x and y.
+  auto tidx = block_idx.x * block_size.x + idx_in_block.x;
+  auto tidy = block_idx.y * block_size.y + idx_in_block.y;
+
+  // Total number of threads along x. It is the row stride used below to
+  // flatten (tidx, tidy) into a linear offset, so it must be the same for
+  // every block rather than depend on the calling block's index.
+  auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x;
+  constexpr float eps = 1e-7;
+  constexpr int simd_size = WARP_SIZE;
+  constexpr float n_bins = (1 << bits) - 1;
+  constexpr int pack_factor = get_pack_factor<bits, 8>();
+  constexpr int bytes_per_pack = get_bytes_per_pack<bits>();
+  constexpr int values_per_reduce = group_size / simd_size;
+  constexpr int writes_per_reduce = pack_factor / values_per_reduce;
+  constexpr int writes_per_pack =
+      writes_per_reduce > 1 ? 1 : values_per_reduce / pack_factor;
+  constexpr int power_of_2_bits = (bits & (bits - 1)) == 0;
+
+  size_t offset = tidx + grid_dim_x * size_t(tidy);
+  size_t in_index = offset * values_per_reduce;
+  if (in_index >= size) {
+    return;
+  }
+  size_t out_index = power_of_2_bits
+      ? offset * writes_per_pack
+      : offset * bytes_per_pack / writes_per_reduce;
+
+  float w_thread[values_per_reduce];
+  float w_min = Limits<float>::max();
+  float w_max = 0;
+
+  // Load this thread's values and track their local min and max.
+#pragma clang loop unroll(full)
+  for (int i = 0; i < values_per_reduce; i++) {
+    float val = w[in_index + i];
+    w_thread[i] = val;
+    w_min = min(w_min, val);
+    w_max = max(w_max, val);
+  }
+
+  cg::greater<float> max_op;
+  cg::less<float> min_op;
+  auto warp = cg::tiled_partition<WARP_SIZE>(cg::this_thread_block());
+
+  // Combine the per-thread extrema across the warp.
+  w_min = cg::reduce(warp, w_min, min_op);
+  w_max = cg::reduce(warp, w_max, max_op);
+
+  // Derive the scale and bias from the extremum with the larger magnitude;
+  // the scale is re-derived from the rounded edge unless that rounds to zero.
+  float scale = max((w_max - w_min) / n_bins, eps);
+  bool side = abs(w_min) > abs(w_max);
+  scale = side ? scale : -scale;
+  float edge = side ? w_min : w_max;
+  float q0 = round(edge / scale);
+  bool at_zero = q0 == 0.0f;
+  scale = at_zero ? scale : edge / q0;
+  float bias = at_zero ? 0 : edge;
+
+  // Write out the scales and biases
+  size_t gindex = in_index / group_size;
+  if (in_index % group_size == 0) {
+    scales[gindex] = static_cast<T>(scale);
+    biases[gindex] = static_cast<T>(bias);
+  }
+
+  using OutType = std::conditional_t<bits == 5, uint64_t, uint32_t>;
+  OutType output = 0;
+
+  // Quantize each value and pack it into `output`. When a pack spans several
+  // threads, the neighbouring lanes' values are pulled in with shfl_down.
+#pragma clang loop unroll(full)
+  for (int i = 0; i < values_per_reduce; i++) {
+    uint8_t val = min(round((w_thread[i] - bias) / scale), n_bins);
+    if (bits == 8) {
+      output = val;
+    } else {
+      output |= val << (bits * (i % pack_factor));
+    }
+
+    if (pack_factor < values_per_reduce && i % pack_factor == pack_factor - 1) {
+      out[out_index + i / pack_factor] = output;
+      output = 0;
+    } else {
+#pragma clang loop unroll(full)
+      for (int j = 1; j < writes_per_reduce; j++) {
+        uint8_t sval = warp.shfl_down(val, j);
+        output |= static_cast<OutType>(sval)
+            << (bits * (j * values_per_reduce + i));
+      }
+    }
+  }
+  // Store the packed word byte by byte: 3 bytes for 3- and 6-bit packs,
+  // 5 bytes for 5-bit packs, and a single byte otherwise.
+  if constexpr (bits == 3 || bits == 6) {
+    if (in_index % pack_factor == 0 && out_index % bytes_per_pack == 0) {
+      out[out_index] = output & 0xff;
+      out[out_index + 1] = (output & 0xff00) >> 8;
+      out[out_index + 2] = (output & 0xff0000) >> 16;
+    }
+  } else if constexpr (bits == 5) {
+    if (in_index % pack_factor == 0 && out_index % bytes_per_pack == 0) {
+      out[out_index] = output & 0xff;
+      out[out_index + 1] = (output & 0xff00) >> 8;
+      out[out_index + 2] = (output & 0xff0000) >> 16;
+      out[out_index + 3] = (output & 0xff000000) >> 24;
+      out[out_index + 4] = (output & 0xff00000000) >> 32;
+    }
+  } else {
+    if constexpr (writes_per_reduce > 0) {
+      if (out_index % writes_per_reduce == 0) {
+        out[out_index / writes_per_reduce] = output;
+      }
+    }
+  }
+}
+
+// Dequantizes the packed bytes in `w` into `out` using per-group `scales`
+// and `biases`.
+//
+// Each thread decodes one pack: it reads get_bytes_per_pack<bits>() bytes (a
+// single byte for 2/4/8 bits) and writes get_pack_factor<bits, 8>() values
+// computed as q * scale + bias. `size` is the number of elements in `out`;
+// threads whose first output index lies at or past `size` exit immediately.
+template <typename T, int group_size, int bits>
+__global__ void affine_dequantize(
+    const uint8_t* w,
+    const T* scales,
+    const T* biases,
+    T* out,
+    size_t size) {
+  auto block_size = cg::this_thread_block().dim_threads();
+  auto block_idx = cg::this_thread_block().group_index();
+  auto idx_in_block = cg::this_thread_block().thread_index();
+
+  // Global thread coordinates along x and y.
+  auto tidx = block_idx.x * block_size.x + idx_in_block.x;
+  auto tidy = block_idx.y * block_size.y + idx_in_block.y;
+
+  // Total number of threads along x. It is the row stride used below to
+  // flatten (tidx, tidy) into a linear offset, so it must be the same for
+  // every block rather than depend on the calling block's index.
+  auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x;
+
+  constexpr int pack_factor = get_pack_factor<bits, 8>();
+  constexpr int bytes_per_pack = get_bytes_per_pack<bits>();
+
+  size_t offset = tidx + grid_dim_x * size_t(tidy);
+  size_t oindex = offset * pack_factor;
+
+  if (oindex >= size) {
+    return;
+  }
+
+  // All values of one pack use the scale and bias of the group that the
+  // pack's first output element belongs to.
+  size_t gindex = oindex / group_size;
+  T scale = scales[gindex];
+  T bias = biases[gindex];
+  out += oindex;
+
+  if constexpr (bits == 3) {
+    // 8 values in 3 bytes; values 2 and 5 straddle a byte boundary.
+    w += offset * bytes_per_pack;
+    out[0] = static_cast<T>(w[0] & 0x7) * scale + bias;
+    out[1] = static_cast<T>((w[0] & 0x38) >> 3) * scale + bias;
+    out[2] = (static_cast<T>((w[0] & 0xc0) >> 6) +
+              static_cast<T>((w[1] & 0x1) << 2)) *
+            scale +
+        bias;
+    out[3] = static_cast<T>((w[1] & 0xe) >> 1) * scale + bias;
+    out[4] = static_cast<T>((w[1] & 0x70) >> 4) * scale + bias;
+    out[5] = (static_cast<T>((w[1] & 0x80) >> 7) +
+              static_cast<T>((w[2] & 0x3) << 1)) *
+            scale +
+        bias;
+    out[6] = static_cast<T>((w[2] & 0x1c) >> 2) * scale + bias;
+    out[7] = static_cast<T>((w[2] & 0xe0) >> 5) * scale + bias;
+  } else if constexpr (bits == 5) {
+    // 8 values in 5 bytes; values 1, 3, 4 and 6 straddle a byte boundary.
+    w += offset * bytes_per_pack;
+    out[0] = static_cast<T>(w[0] & 0x1f) * scale + bias;
+    out[1] = (static_cast<T>((w[0] & 0xe0) >> 5) +
+              static_cast<T>((w[1] & 0x3) << 3)) *
+            scale +
+        bias;
+    out[2] = static_cast<T>((w[1] & 0x7c) >> 2) * scale + bias;
+    out[3] = (static_cast<T>((w[1] & 0x80) >> 7) +
+              static_cast<T>((w[2] & 0xf) << 1)) *
+            scale +
+        bias;
+    out[4] = (static_cast<T>((w[2] & 0xf0) >> 4) +
+              static_cast<T>((w[3] & 0x1) << 4)) *
+            scale +
+        bias;
+    out[5] = static_cast<T>((w[3] & 0x3e) >> 1) * scale + bias;
+    out[6] = (static_cast<T>((w[3] & 0xc0) >> 6) +
+              static_cast<T>((w[4] & 0x7) << 2)) *
+            scale +
+        bias;
+    out[7] = static_cast<T>((w[4] & 0xf8) >> 3) * scale + bias;
+  } else if constexpr (bits == 6) {
+    // 4 values in 3 bytes; values 1 and 2 straddle a byte boundary.
+    w += offset * bytes_per_pack;
+    out[0] = static_cast<T>(w[0] & 0x3f) * scale + bias;
+    out[1] = (static_cast<T>((w[0] >> 6) & 0x03) +
+              static_cast<T>((w[1] & 0x0f) << 2)) *
+            scale +
+        bias;
+    out[2] = (static_cast<T>((w[1] >> 4) & 0x0f) +
+              static_cast<T>((w[2] & 0x03) << 4)) *
+            scale +
+        bias;
+    out[3] = static_cast<T>((w[2] >> 2) & 0x3f) * scale + bias;
+  } else {
+    // Power-of-two widths: every value sits inside a single byte.
+    uint val = w[offset];
+#pragma clang loop unroll(full)
+    for (int i = 0; i < pack_factor; i++) {
+      uint8_t d;
+      if (bits == 2) {
+        d = (val >> (bits * i)) & 0x03;
+      } else if (bits == 4) {
+        d = (val >> (bits * i)) & 0x0f;
+      } else if (bits == 8) {
+        d = val;
+      }
+      out[i] = scale * static_cast<T>(d) + bias;
+    }
+  }
+}
+
+} // namespace cu
+namespace {
+
+inline array ensure_row_contiguous( // returns x, or a copy made via copy_gpu
+    const array& x,
+    cu::CommandEncoder& enc,
+    const Stream& s) {
+  if (!x.flags().row_contiguous) {
+    array x_copy(x.shape(), x.dtype(), nullptr, {});
+    copy_gpu(x, x_copy, CopyType::General, s); // general copy on stream s
+    enc.add_temporary(x_copy); // register the copy as an encoder temporary
+    return x_copy;
+  } else {
+    return x; // already row contiguous: hand back the input unchanged
+  }
+}
+
+} // namespace
+
+template <typename F>
+void dispatch_groups(int group_size, F&& f) { // runtime value -> constant tag
+  switch (group_size) { // only 32, 64 and 128 are handled
+    case 32:
+      f(std::integral_constant<int, 32>{});
+      break;
+    case 64:
+      f(std::integral_constant<int, 64>{});
+      break;
+    case 128:
+      f(std::integral_constant<int, 128>{});
+      break;
+  } // any other group_size matches no case, so f is never called
+}
+
+template <typename F>
+void dispatch_bits(int bits, F&& f) { // runtime value -> constant tag
+  switch (bits) { // only 2, 3, 4, 5, 6 and 8 are handled
+    case 2:
+      f(std::integral_constant<int, 2>{});
+      break;
+    case 3:
+      f(std::integral_constant<int, 3>{});
+      break;
+    case 4:
+      f(std::integral_constant<int, 4>{});
+      break;
+    case 5:
+      f(std::integral_constant<int, 5>{});
+      break;
+    case 6:
+      f(std::integral_constant<int, 6>{});
+      break;
+    case 8:
+      f(std::integral_constant<int, 8>{});
+      break;
+  } // any other bit width matches no case, so f is never called
+}
+
+// Encodes the affine quantize kernel, or the dequantize kernel when
+// `dequantize_` is set, on this primitive's stream.
+//
+// Quantize:   inputs  = {w},                 outputs = {out, scales, biases}
+// Dequantize: inputs  = {w, scales, biases}, outputs = {out}
+//
+// Inputs that are not row contiguous are copied first, and the kernels are
+// always given the row-contiguous arrays because they index linearly.
+void fast::AffineQuantize::eval_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  auto& w_pre = inputs[0];
+  auto& out = outputs[0];
+  out.set_data(allocator::malloc(out.nbytes()));
+
+  auto& s = stream();
+  auto& d = cu::device(s.device);
+  auto& enc = d.get_command_encoder(s);
+
+  auto w = ensure_row_contiguous(w_pre, enc, s);
+  enc.set_input_array(w);
+
+  // Keep the row-contiguous scales/biases at function scope so that the
+  // dequantize kernel below reads them rather than the raw inputs. In
+  // quantize mode these are plain handles to outputs[1]/outputs[2] and the
+  // kernel arguments are taken from the outputs directly.
+  array scales =
+      dequantize_ ? ensure_row_contiguous(inputs[1], enc, s) : outputs[1];
+  array biases =
+      dequantize_ ? ensure_row_contiguous(inputs[2], enc, s) : outputs[2];
+  if (dequantize_) {
+    enc.set_input_array(scales);
+    enc.set_input_array(biases);
+    enc.set_output_array(out);
+  } else {
+    auto& scales_out = outputs[1];
+    auto& biases_out = outputs[2];
+    scales_out.set_data(allocator::malloc(scales_out.nbytes()));
+    biases_out.set_data(allocator::malloc(biases_out.nbytes()));
+    enc.set_output_array(out);
+    enc.set_output_array(scales_out);
+    enc.set_output_array(biases_out);
+  }
+
+  auto dtype = dequantize_ ? outputs[0].dtype() : inputs[0].dtype();
+
+  // Treat uint32 as uint8 in kernel
+  int uint8_per_uint32 = 4;
+  int packs_per_int = (bits_ == 3 || bits_ == 5) ? 8
+      : bits_ == 6                               ? 4
+                                                 : 8 / bits_;
+  // Elements handled by one thread, and the resulting number of threads.
+  int per_thread = dequantize_ ? packs_per_int : group_size_ / WARP_SIZE;
+  size_t size =
+      dequantize_ ? out.size() / packs_per_int : w.size() / per_thread;
+
+  bool large = size > UINT_MAX;
+  auto grid_shape = w.shape();
+
+  if (dequantize_) {
+    grid_shape.back() *= uint8_per_uint32;
+  } else {
+    grid_shape.back() /= per_thread;
+  }
+
+  dispatch_float_types(dtype, "affine_quantize", [&](auto type_tag) {
+    dispatch_groups(group_size_, [&](auto group_size) {
+      dispatch_bits(bits_, [&](auto bits) {
+        using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        if (dequantize_) {
+          auto kernel =
+              cu::affine_dequantize<DataType, group_size.value, bits.value>;
+          auto [num_blocks, block_dims] =
+              get_launch_args(kernel, size, grid_shape, w.strides(), large);
+          enc.add_kernel_node(
+              kernel,
+              num_blocks,
+              block_dims,
+              w.data<uint8_t>(),
+              scales.data<DataType>(),
+              biases.data<DataType>(),
+              out.data<DataType>(),
+              out.size());
+        } else {
+          auto kernel =
+              cu::affine_quantize<DataType, group_size.value, bits.value>;
+          auto [num_blocks, block_dims] =
+              get_launch_args(kernel, size, grid_shape, w.strides(), large);
+          enc.add_kernel_node(
+              kernel,
+              num_blocks,
+              block_dims,
+              w.data<DataType>(),
+              out.data<uint8_t>(),
+              outputs[1].data<DataType>(),
+              outputs[2].data<DataType>(),
+              w.size());
+        }
+      });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/random.cu
+++ b/mlx/backend/cuda/random.cu
@@ -156,34 +156,39 @@ void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(keys);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dim3 grid_dims{num_keys, half_size + odd};
-    int64_t total = grid_dims.x * grid_dims.y;
-    int32_t threads_y = 1;
-    while ((total / threads_y) >= (1U << 31)) {
-      threads_y *= 2;
-    }
-    int32_t threads_x = cuda::ceil_div(total, threads_y);
-    auto [grid, block] = get_grid_and_block(threads_x, threads_y, 1);
-    if (keys.flags().row_contiguous) {
-      cu::rbitsc<<<grid, block, 0, stream>>>(
-          keys.data<uint32_t>(),
-          out.data<uint8_t>(),
-          grid_dims,
-          odd,
-          bytes_per_key);
-    } else {
-      cu::rbits<<<grid, block, 0, stream>>>(
-          keys.data<uint32_t>(),
-          out.data<uint8_t>(),
-          grid_dims,
-          odd,
-          bytes_per_key,
-          keys.ndim(),
-          const_param(keys.shape()),
-          const_param(keys.strides()));
-    }
-  });
+  dim3 grid_dims{num_keys, half_size + odd};
+  int64_t total = grid_dims.x * grid_dims.y;
+  int32_t threads_y = 1;
+  while ((total / threads_y) >= (1U << 31)) {
+    threads_y *= 2;
+  }
+  int32_t threads_x = cuda::ceil_div(total, threads_y);
+  auto [grid, block] = get_grid_and_block(threads_x, threads_y, 1);
+  auto& stream = encoder.stream();
+  if (keys.flags().row_contiguous) {
+    encoder.add_kernel_node(
+        cu::rbitsc,
+        grid,
+        block,
+        keys.data<uint32_t>(),
+        out.data<uint8_t>(),
+        grid_dims,
+        odd,
+        bytes_per_key);
+  } else {
+    encoder.add_kernel_node(
+        cu::rbits,
+        grid,
+        block,
+        keys.data<uint32_t>(),
+        out.data<uint8_t>(),
+        grid_dims,
+        odd,
+        bytes_per_key,
+        keys.ndim(),
+        const_param(keys.shape()),
+        const_param(keys.strides()));
+  }
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/reduce/all_reduce.cu
+++ b/mlx/backend/cuda/reduce/all_reduce.cu
@@ -37,15 +37,15 @@ __global__ void all_reduce(T* in, U* out, size_t block_step, size_t size) {
  for (; i + block.size() * N <= check; i += block.size() * N) {
    cub::LoadDirectBlockedVectorized<T, N>(block.thread_rank(), in + i, vals);
    for (int j = 0; j < N; j++) {
-      accs[0] = op(accs[0], __cast<U, T>(vals[j]));
+      accs[0] = op(accs[0], cast_to<U>(vals[j]));
    }
  }

  if (i < check) {
    cub::LoadDirectBlocked(
-        block.thread_rank(), in + i, vals, check - i, __cast<T, U>(init));
+        block.thread_rank(), in + i, vals, check - i, cast_to<T>(init));
    for (int i = 0; i < N; i++) {
-      accs[0] = op(accs[0], __cast<U, T>(vals[i]));
+      accs[0] = op(accs[0], cast_to<U>(vals[i]));
    }
  }

@@ -110,19 +110,20 @@ void all_reduce(
    intermediate.set_data(allocator::malloc(intermediate.nbytes()));
    encoder.add_temporary(intermediate);
    encoder.set_output_array(intermediate);
-    encoder.launch_kernel([&](cudaStream_t stream) {
-      dispatch_all_types(dt, [&](auto type_tag) {
-        dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
-          using OP = MLX_GET_TYPE(reduce_type_tag);
-          using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-          using U = typename cu::ReduceResult<OP, T>::type;
-          auto kernel = cu::all_reduce<T, U, OP, N_READS>;
-          kernel<<<blocks, threads, 0, stream>>>(
-              static_cast<T*>(indata),
-              intermediate.data<U>(),
-              block_step,
-              insize);
-        });
+    dispatch_all_types(dt, [&](auto type_tag) {
+      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+        using OP = MLX_GET_TYPE(reduce_type_tag);
+        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        using U = typename cu::ReduceResult<OP, T>::type;
+        auto kernel = cu::all_reduce<T, U, OP, N_READS>;
+        encoder.add_kernel_node(
+            kernel,
+            blocks,
+            threads,
+            static_cast<T*>(indata),
+            intermediate.data<U>(),
+            block_step,
+            insize);
      });
    });

@@ -135,16 +136,20 @@ void all_reduce(
  }

  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(dt, [&](auto type_tag) {
-      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
-        using OP = MLX_GET_TYPE(reduce_type_tag);
-        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-        using U = typename cu::ReduceResult<OP, T>::type;
-        auto kernel = cu::all_reduce<T, U, OP, N_READS>;
-        kernel<<<blocks, threads, 0, stream>>>(
-            static_cast<T*>(indata), out.data<U>(), block_step, insize);
-      });
+  dispatch_all_types(dt, [&](auto type_tag) {
+    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+      using OP = MLX_GET_TYPE(reduce_type_tag);
+      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      using U = typename cu::ReduceResult<OP, T>::type;
+      auto kernel = cu::all_reduce<T, U, OP, N_READS>;
+      encoder.add_kernel_node(
+          kernel,
+          blocks,
+          threads,
+          static_cast<T*>(indata),
+          out.data<U>(),
+          block_step,
+          insize);
    });
  });
 }
--- a/mlx/backend/cuda/reduce/col_reduce.cu
+++ b/mlx/backend/cuda/reduce/col_reduce.cu
@@ -3,7 +3,6 @@
 #include <numeric>

 #include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/cast_op.cuh"
 #include "mlx/backend/cuda/reduce/reduce.cuh"

 #include <cooperative_groups.h>
@@ -128,7 +127,7 @@ col_reduce_looped(T* in, U* out, const __grid_constant__ ColReduceArgs args) {
        T vals[N_READS];
        cub::LoadDirectBlockedVectorized(thread_x, in + loop.location(), vals);
        for (int i = 0; i < N_READS; i++) {
-          totals[i] = op(totals[i], __cast<U, T>(vals[i]));
+          totals[i] = op(totals[i], cast_to<U>(vals[i]));
        }
        loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
      }
@@ -137,7 +136,7 @@ col_reduce_looped(T* in, U* out, const __grid_constant__ ColReduceArgs args) {
        T vals[N_READS];
        cub::LoadDirectBlocked(thread_x, in + loop.location(), vals);
        for (int i = 0; i < N_READS; i++) {
-          totals[i] = op(totals[i], __cast<U, T>(vals[i]));
+          totals[i] = op(totals[i], cast_to<U>(vals[i]));
        }
        loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
      }
@@ -150,9 +149,9 @@ col_reduce_looped(T* in, U* out, const __grid_constant__ ColReduceArgs args) {
          in + loop.location(),
          vals,
          args.reduction_stride - tile_x * BN,
-          __cast<T, U>(ReduceInit<Op, T>::value()));
+          cast_to<T>(ReduceInit<Op, T>::value()));
      for (int i = 0; i < N_READS; i++) {
-        totals[i] = op(totals[i], __cast<U, T>(vals[i]));
+        totals[i] = op(totals[i], cast_to<U>(vals[i]));
      }
      loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
    }
@@ -214,26 +213,24 @@ void col_reduce_looped(

  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(in.dtype(), [&](auto type_tag) {
-      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
-        dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
-          using OP = MLX_GET_TYPE(reduce_type_tag);
-          using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-          using U = typename cu::ReduceResult<OP, T>::type;
+  dispatch_all_types(in.dtype(), [&](auto type_tag) {
+    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+      dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
+        using OP = MLX_GET_TYPE(reduce_type_tag);
+        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        using U = typename cu::ReduceResult<OP, T>::type;
+        // Cub doesn't like const pointers for vectorized loads. (sigh)
+        T* indata = const_cast<T*>(in.data<T>());

-          // Cub doesn't like const pointers for vectorized loads. (sigh)
-          T* indata = const_cast<T*>(in.data<T>());
-
-          constexpr int N_READS = 4;
-          constexpr int BM = 32;
-          constexpr int BN = 32;
-          dim3 grid = output_grid_for_col_reduce(out, args, BN);
-          int blocks = BM * BN / N_READS;
-          auto kernel =
-              cu::col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS>;
-          kernel<<<grid, blocks, 0, stream>>>(indata, out.data<U>(), args);
-        });
+        constexpr int N_READS = 4;
+        constexpr int BM = 32;
+        constexpr int BN = 32;
+        dim3 grid = output_grid_for_col_reduce(out, args, BN);
+        int blocks = BM * BN / N_READS;
+        auto kernel =
+            cu::col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS>;
+        encoder.add_kernel_node(
+            kernel, grid, blocks, indata, out.data<U>(), args);
      });
    });
  });
--- a/mlx/backend/cuda/reduce/init_reduce.cu
+++ b/mlx/backend/cuda/reduce/init_reduce.cu
@@ -32,18 +32,16 @@ void init_reduce(
  }

  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(in.dtype(), [&](auto type_tag) {
-      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
-        using OP = MLX_GET_TYPE(reduce_type_tag);
-        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-        using U = typename cu::ReduceResult<OP, T>::type;
-        auto kernel = cu::init_reduce<T, U, OP>;
-        dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
-        dim3 block(grid.x < 1024 ? grid.x : 1024, 1, 1);
-        grid.x = (grid.x + 1023) / 1024;
-        kernel<<<grid, block, 0, stream>>>(out.data<U>(), out.size());
-      });
+  dispatch_all_types(in.dtype(), [&](auto type_tag) {
+    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+      using OP = MLX_GET_TYPE(reduce_type_tag);
+      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      using U = typename cu::ReduceResult<OP, T>::type;
+      auto kernel = cu::init_reduce<T, U, OP>;
+      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
+      dim3 block(grid.x < 1024 ? grid.x : 1024, 1, 1);
+      grid.x = (grid.x + 1023) / 1024;
+      encoder.add_kernel_node(kernel, grid, block, out.data<U>(), out.size());
    });
  });
 }
--- a/mlx/backend/cuda/reduce/reduce.cuh
+++ b/mlx/backend/cuda/reduce/reduce.cuh
@@ -3,7 +3,6 @@
 #include <type_traits>

 #include "mlx/backend/common/reduce.h"
-#include "mlx/backend/cuda/device/cucomplex_math.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/backend/cuda/reduce/reduce_ops.cuh"
 #include "mlx/dtype_utils.h"
--- a/mlx/backend/cuda/reduce/reduce_ops.cuh
+++ b/mlx/backend/cuda/reduce/reduce_ops.cuh
@@ -2,6 +2,8 @@

 #pragma once

+#include "mlx/backend/cuda/device/atomic_ops.cuh"
+#include "mlx/backend/cuda/device/cast_op.cuh"
 #include "mlx/backend/cuda/device/utils.cuh"
 #include "mlx/backend/cuda/reduce/reduce_utils.cuh"

@@ -40,15 +42,15 @@ struct Sum {
  }

  __device__ void atomic_update(__nv_bfloat16* x, __nv_bfloat16 y) {
-    atomicAdd(x, y);
+    atomic_add(x, y);
  }

  __device__ void atomic_update(int* x, int y) {
-    atomicAdd(x, y);
+    atomic_add(x, y);
  }

  __device__ void atomic_update(float* x, float y) {
-    atomicAdd(x, y);
+    atomic_add(x, y);
  }
 };

@@ -67,6 +69,18 @@ struct Prod {
 struct Min {
  template <typename T>
  __device__ __forceinline__ T operator()(T a, T b) {
+    if constexpr (is_complex_v<T>) { // complex: return the operand with a NaN part
+      if (isnan(a.real()) || isnan(a.imag())) {
+        return a;
+      }
+      if (isnan(b.real()) || isnan(b.imag())) {
+        return b;
+      }
+    } else if constexpr (!cuda::std::is_integral_v<T>) { // other non-integral T
+      if (isnan(a) || isnan(b)) {
+        return cuda::std::numeric_limits<float>::quiet_NaN(); // NaN wins
+      }
+    }
    return a < b ? a : b;
  }

@@ -79,6 +93,18 @@ struct Min {
 struct Max {
  template <typename T>
  __device__ __forceinline__ T operator()(T a, T b) {
+    if constexpr (is_complex_v<T>) { // complex: return the operand with a NaN part
+      if (isnan(a.real()) || isnan(a.imag())) {
+        return a;
+      }
+      if (isnan(b.real()) || isnan(b.imag())) {
+        return b;
+      }
+    } else if constexpr (!cuda::std::is_integral_v<T>) { // other non-integral T
+      if (isnan(a) || isnan(b)) {
+        return cuda::std::numeric_limits<float>::quiet_NaN(); // NaN wins
+      }
+    }
    return a > b ? a : b;
  }

@@ -149,10 +175,10 @@ struct ReduceInit<Or, T> {
 template <typename T>
 struct ReduceInit<Sum, T> {
  static constexpr __host__ __device__ auto value() {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+    if constexpr (is_complex_v<T>) {
      return T{0, 0};
    } else {
-      return typename ReduceResult<Sum, T>::type{0};
+      return cast_to<typename ReduceResult<Sum, T>::type>(0);
    }
  }
 };
@@ -160,10 +186,10 @@ struct ReduceInit<Sum, T> {
 template <typename T>
 struct ReduceInit<Prod, T> {
  static constexpr __host__ __device__ auto value() {
-    if constexpr (cuda::std::is_same_v<T, cuComplex>) {
+    if constexpr (is_complex_v<T>) {
      return T{1, 0};
    } else {
-      return typename ReduceResult<Prod, T>::type{1};
+      return cast_to<typename ReduceResult<Prod, T>::type>(1);
    }
  }
 };
--- a/mlx/backend/cuda/reduce/reduce_utils.cuh
+++ b/mlx/backend/cuda/reduce/reduce_utils.cuh
@@ -4,6 +4,7 @@

 #include <numeric>

+#include "mlx/backend/common/utils.h"
 #include "mlx/backend/cuda/device/utils.cuh"

 #include <cooperative_groups.h>
@@ -55,22 +56,6 @@ __device__ void atomic_reduce(T* x, T y) {
  }
 }

-// TODO: Should make a custom complex type
-template <typename U, typename T>
-inline __device__ U __cast(T x) {
-  return static_cast<U>(x);
-}
-
-template <>
-inline __device__ bool __cast<bool, cuComplex>(cuComplex x) {
-  return x.x != 0 && x.y != 0;
-}
-
-template <>
-inline __device__ cuComplex __cast<cuComplex, bool>(bool x) {
-  return x ? make_cuFloatComplex(1, 1) : make_cuFloatComplex(0, 0);
-}
-
 template <typename T, int N, typename Block, typename Warp, typename Op>
 inline __device__ void
 block_reduce(Block block, Warp warp, T (&vals)[N], T* smem, Op op, T init) {
--- a/mlx/backend/cuda/reduce/row_reduce.cu
+++ b/mlx/backend/cuda/reduce/row_reduce.cu
@@ -3,7 +3,6 @@
 #include <numeric>

 #include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/cast_op.cuh"
 #include "mlx/backend/cuda/reduce/reduce.cuh"

 #include <cooperative_groups.h>
@@ -113,7 +112,7 @@ __global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
            in + k * size + r * (block.size() * N),
            vals[k]);
        for (int j = 0; j < N; j++) {
-          accs[k] = op(accs[k], __cast<U, T>(vals[k][j]));
+          accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
        }
      }
    }
@@ -125,7 +124,7 @@ __global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
            in + k * size + r * (block.size() * N),
            vals[k]);
        for (int j = 0; j < N; j++) {
-          accs[k] = op(accs[k], __cast<U, T>(vals[k][j]));
+          accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
        }
      }
    }
@@ -138,9 +137,9 @@ __global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
          in + k * size + final_offset,
          vals[k],
          size,
-          __cast<T, U>(init));
+          cast_to<T>(init));
      for (int j = 0; j < N; j++) {
-        accs[k] = op(accs[k], __cast<U, T>(vals[k][j]));
+        accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
      }
    }
  }
@@ -199,7 +198,7 @@ __global__ void row_reduce_looped(
          in + loop.location() + r * BLOCK_DIM * N_READS,
          vals);
      for (int i = 0; i < N_READS; i++) {
-        total[0] = op(total[0], __cast<U, T>(vals[i]));
+        total[0] = op(total[0], cast_to<U>(vals[i]));
      }
    }
    if (final_offset < args.row_size) {
@@ -209,9 +208,9 @@ __global__ void row_reduce_looped(
          in + loop.location() + final_offset,
          vals,
          args.row_size - final_offset,
-          __cast<T, U>(init));
+          cast_to<T>(init));
      for (int i = 0; i < N_READS; i++) {
-        total[0] = op(total[0], __cast<U, T>(vals[i]));
+        total[0] = op(total[0], cast_to<U>(vals[i]));
      }
    }
    // TODO: Maybe block.sync() here?
@@ -245,34 +244,32 @@ void row_reduce_simple(
  //       2 passes. Something like 32 * out.size() and then do a warp reduce.
  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(in.dtype(), [&](auto type_tag) {
-      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
-        using OP = MLX_GET_TYPE(reduce_type_tag);
-        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-        using U = typename cu::ReduceResult<OP, T>::type;
+  dispatch_all_types(in.dtype(), [&](auto type_tag) {
+    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+      using OP = MLX_GET_TYPE(reduce_type_tag);
+      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      using U = typename cu::ReduceResult<OP, T>::type;

-        // Cub doesn't like const pointers for vectorized loads. (sigh)
-        T* indata = const_cast<T*>(in.data<T>());
+      // Cub doesn't like const pointers for vectorized loads. (sigh)
+      T* indata = const_cast<T*>(in.data<T>());

-        // Calculate the grid and block dims
-        size_t reductions = (plan.shape.back() + N_READS - 1) / N_READS;
-        dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
-        int threads = std::min(1024UL, reductions);
-        threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
-        dim3 block(threads, 1, 1);
+      // Calculate the grid and block dims
+      size_t reductions = (plan.shape.back() + N_READS - 1) / N_READS;
+      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
+      int threads = std::min(1024UL, reductions);
+      threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+      dim3 block(threads, 1, 1);

-        // Pick the kernel
-        auto kernel = cu::row_reduce_simple<T, U, OP, N_READS>;
-        if (grid.x >= 1024) {
-          grid.x = (grid.x + 1) / 2;
-          kernel = cu::row_reduce_simple<T, U, OP, N_READS, 2>;
-        }
+      // Pick the kernel
+      auto kernel = cu::row_reduce_simple<T, U, OP, N_READS>;
+      if (grid.x >= 1024) {
+        grid.x = (grid.x + 1) / 2;
+        kernel = cu::row_reduce_simple<T, U, OP, N_READS, 2>;
+      }

-        // Launch
-        kernel<<<grid, block, 0, stream>>>(
-            indata, out.data<U>(), out.size(), plan.shape.back());
-      });
+      int size = plan.shape.back();
+      encoder.add_kernel_node(
+          kernel, grid, block, indata, out.data<U>(), out.size(), size);
    });
  });
 }
@@ -293,43 +290,39 @@ void row_reduce_looped(

  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(in.dtype(), [&](auto type_tag) {
-      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
-        using OP = MLX_GET_TYPE(reduce_type_tag);
-        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-        using U = typename cu::ReduceResult<OP, T>::type;
+  dispatch_all_types(in.dtype(), [&](auto type_tag) {
+    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
+      using OP = MLX_GET_TYPE(reduce_type_tag);
+      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      using U = typename cu::ReduceResult<OP, T>::type;
+      // Cub doesn't like const pointers for vectorized loads. (sigh)
+      T* indata = const_cast<T*>(in.data<T>());

-        // Cub doesn't like const pointers for vectorized loads. (sigh)
-        T* indata = const_cast<T*>(in.data<T>());
+      // Calculate the grid and block dims
+      args.sort_access_pattern(in, axes);
+      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
+      size_t reductions = (args.row_size + N_READS - 1) / N_READS;
+      int threads = std::min(1024UL, reductions);
+      threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+      dim3 block(threads, 1, 1);

-        // Calculate the grid and block dims
-        args.sort_access_pattern(in, axes);
-        dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
-        size_t reductions = (args.row_size + N_READS - 1) / N_READS;
-        int threads = std::min(1024UL, reductions);
-        threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
-        dim3 block(threads, 1, 1);
-
-        // Pick the kernel
-        auto kernel = cu::row_reduce_looped<T, U, OP, 1, 32, N_READS>;
-        dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
-          dispatch_block_dim(threads, [&](auto threads_constant) {
-            kernel = cu::row_reduce_looped<
-                T,
-                U,
-                OP,
-                reduce_ndim(),
-                threads_constant(),
-                N_READS>;
-            block.x = threads_constant();
-          });
+      // Pick the kernel
+      auto kernel = cu::row_reduce_looped<T, U, OP, 1, 32, N_READS>;
+      dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
+        dispatch_block_dim(threads, [&](auto threads_constant) {
+          kernel = cu::row_reduce_looped<
+              T,
+              U,
+              OP,
+              reduce_ndim.value,
+              threads_constant.value,
+              N_READS>;
+          block.x = threads_constant.value;
        });
-
-        // Launch
-        kernel<<<grid, block, 0, stream>>>(
-            indata, out.data<U>(), out.size(), args);
      });
+
+      encoder.add_kernel_node(
+          kernel, grid, block, indata, out.data<U>(), out.size(), args);
    });
  });
 }
--- a/mlx/backend/cuda/rms_norm.cu
+++ b/mlx/backend/cuda/rms_norm.cu
@@ -74,7 +74,7 @@ __global__ void rms_norm(
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    T xn[N_READS];
-    cub::LoadDirectBlocked(index, x, xn, axis_size, 0);
+    cub::LoadDirectBlocked(index, x, xn, axis_size, cast_to<T>(0));
    for (int i = 0; i < N_READS; ++i) {
      float t = static_cast<float>(xn[i]);
      normalizer += t * t;
@@ -130,7 +130,7 @@ __global__ void rms_norm_vjp(
    T wn[N_READS] = {};
    T gn[N_READS] = {};
    auto index = r * BLOCK_DIM + block.thread_rank();
-    cub::LoadDirectBlocked(index, x, xn, axis_size, 0);
+    cub::LoadDirectBlocked(index, x, xn, axis_size, cast_to<T>(0));
    cub::LoadDirectBlocked(index, g, gn, axis_size);
    cub::LoadDirectBlocked(index, strided_iterator(w, w_stride), wn, axis_size);
    for (int i = 0; i < N_READS; i++) {
@@ -224,21 +224,21 @@ void RMSNorm::eval_gpu(
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_float_types(out.dtype(), "rms_norm", [&](auto type_tag) {
-      constexpr uint32_t N_READS = 4;
-      dispatch_block_dim(
-          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-            auto kernel = cu::rms_norm<DataType, block_dim(), N_READS>;
-            kernel<<<n_rows, block_dim(), 0, stream>>>(
-                x.data<DataType>(),
-                w.data<DataType>(),
-                out.data<DataType>(),
-                eps_,
-                axis_size,
-                w_stride);
-          });
+  dispatch_float_types(out.dtype(), "rms_norm", [&](auto type_tag) {
+    constexpr uint32_t N_READS = 4;
+    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      auto kernel = cu::rms_norm<DataType, block_dim(), N_READS>;
+      encoder.add_kernel_node(
+          kernel,
+          n_rows,
+          block_dim(),
+          x.data<DataType>(),
+          w.data<DataType>(),
+          out.data<DataType>(),
+          eps_,
+          axis_size,
+          w_stride);
    });
  });
 }
@@ -253,20 +253,24 @@ void RMSNormVJP::eval_gpu(
  // Ensure row contiguity. We could relax this step by checking that the array
  // is contiguous (no broadcasts or holes) and that the input strides are the
  // same as the cotangent strides but for now this is simpler.
-  auto check_input = [&s](const array& x) -> std::pair<array, bool> {
+  auto check_input = [&s](const array& x, bool& copied) {
    if (x.flags().row_contiguous) {
-      return {x, false};
+      copied = false;
+      return x;
    }
+    copied = true;
    array x_copy(x.shape(), x.dtype(), nullptr, {});
    copy_gpu(x, x_copy, CopyType::General, s);
-    return {x_copy, true};
+    return x_copy;
  };
  bool donate_x = inputs[0].is_donatable();
  bool donate_g = inputs[2].is_donatable();
-  auto [x, copied] = check_input(inputs[0]);
+  bool copied;
+  auto x = check_input(inputs[0], copied);
  donate_x |= copied;
  const array& w = inputs[1];
-  auto [g, g_copied] = check_input(inputs[2]);
+  bool g_copied;
+  auto g = check_input(inputs[2], g_copied);
  donate_g |= g_copied;
  array& gx = outputs[0];
  array& gw = outputs[1];
@@ -310,30 +314,31 @@ void RMSNormVJP::eval_gpu(
  encoder.set_input_array(g);
  encoder.set_output_array(gx);
  encoder.set_output_array(gw_temp);
-  encoder.launch_kernel([&, x = x, g = g](cudaStream_t stream) {
-    dispatch_float_types(gx.dtype(), "rms_norm_vjp", [&](auto type_tag) {
-      dispatch_bool(has_w, [&](auto has_w_constant) {
-        constexpr int N_READS = 4;
-        dispatch_block_dim(
-            cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-              using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-              constexpr int N_READS = 4;
-              auto kernel = cu::rms_norm_vjp<
-                  DataType,
-                  has_w_constant(),
-                  block_dim(),
-                  N_READS>;
-              kernel<<<n_rows, block_dim(), 0, stream>>>(
-                  x.data<DataType>(),
-                  w.data<DataType>(),
-                  g.data<DataType>(),
-                  gx.data<DataType>(),
-                  gw_temp.data<DataType>(),
-                  eps_,
-                  axis_size,
-                  w_stride);
-            });
-      });
+  dispatch_float_types(gx.dtype(), "rms_norm_vjp", [&](auto type_tag) {
+    dispatch_bool(has_w, [&](auto has_w_constant) {
+      constexpr int N_READS = 4;
+      dispatch_block_dim(
+          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+            constexpr int N_READS = 4;
+            auto kernel = cu::rms_norm_vjp<
+                DataType,
+                has_w_constant.value,
+                block_dim(),
+                N_READS>;
+            encoder.add_kernel_node(
+                kernel,
+                n_rows,
+                block_dim(),
+                x.data<DataType>(),
+                w.data<DataType>(),
+                g.data<DataType>(),
+                gx.data<DataType>(),
+                gw_temp.data<DataType>(),
+                eps_,
+                axis_size,
+                w_stride);
+          });
    });
  });

--- a/mlx/backend/cuda/rope.cu
+++ b/mlx/backend/cuda/rope.cu
@@ -308,74 +308,89 @@ void RoPE::eval_gpu(
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(donated ? out : in);
  encoder.set_input_array(offset);
+  if (with_freqs) {
+    encoder.set_input_array(inputs[2]);
+  }
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_float_types(out.dtype(), "rope", [&](auto type_tag) {
-      dispatch_bool(traditional_, [&](auto traditional) {
-        dispatch_bool(forward_, [&](auto forward) {
-          using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-          if (single && !with_freqs) {
-            auto kernel = cu::rope_single<DataType, traditional(), forward()>;
-            uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
-            auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
-            kernel<<<grid, block, 0, stream>>>(
-                (donated ? out : in).data<DataType>(),
-                out.data<DataType>(),
-                offset.data<int32_t>(),
-                scale_,
-                std::log2(base_),
-                mat_size,
-                dims);
-          } else if (single) {
-            auto kernel =
-                cu::rope_single_freqs<DataType, traditional(), forward()>;
-            uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
-            auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
-            kernel<<<grid, block, 0, stream>>>(
-                (donated ? out : in).data<DataType>(),
-                out.data<DataType>(),
-                offset.data<int32_t>(),
-                inputs[2].data<float>(),
-                scale_,
-                mat_size,
-                dims,
-                inputs[2].strides(0));
-          } else if (with_freqs) {
-            auto kernel = cu::rope_freqs<DataType, traditional(), forward()>;
-            uint3 dims =
-                make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
-            dims.z = (dims.z + 3) / 4;
-            auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
-            kernel<<<grid, block, 0, stream>>>(
-                (donated ? out : in).data<DataType>(),
-                out.data<DataType>(),
-                offset.data<int32_t>(),
-                inputs[2].data<float>(),
-                scale_,
-                std::log2(base_),
-                strides,
-                out_strides,
-                in.size() / mat_size,
-                dims,
-                inputs[2].strides(0));
-          } else {
-            auto kernel = cu::rope<DataType, traditional(), forward()>;
-            uint3 dims =
-                make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
-            dims.z = (dims.z + 3) / 4;
-            auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
-            kernel<<<grid, block, 0, stream>>>(
-                (donated ? out : in).data<DataType>(),
-                out.data<DataType>(),
-                offset.data<int32_t>(),
-                scale_,
-                std::log2(base_),
-                strides,
-                out_strides,
-                in.size() / mat_size,
-                dims);
-          }
-        });
+  dispatch_float_types(out.dtype(), "rope", [&](auto type_tag) {
+    dispatch_bool(traditional_, [&](auto traditional) {
+      dispatch_bool(forward_, [&](auto forward) {
+        using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        if (single && !with_freqs) {
+          auto kernel =
+              cu::rope_single<DataType, traditional.value, forward.value>;
+          uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
+          auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
+          encoder.add_kernel_node(
+              kernel,
+              grid,
+              block,
+              (donated ? out : in).data<DataType>(),
+              out.data<DataType>(),
+              offset.data<int32_t>(),
+              scale_,
+              std::log2(base_),
+              mat_size,
+              dims);
+        } else if (single) {
+          auto kernel =
+              cu::rope_single_freqs<DataType, traditional.value, forward.value>;
+          uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
+          auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
+          encoder.add_kernel_node(
+              kernel,
+              grid,
+              block,
+              (donated ? out : in).data<DataType>(),
+              out.data<DataType>(),
+              offset.data<int32_t>(),
+              inputs[2].data<float>(),
+              scale_,
+              mat_size,
+              dims,
+              inputs[2].strides(0));
+        } else if (with_freqs) {
+          auto kernel =
+              cu::rope_freqs<DataType, traditional.value, forward.value>;
+          uint3 dims =
+              make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
+          dims.z = (dims.z + 3) / 4;
+          auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
+          encoder.add_kernel_node(
+              kernel,
+              grid,
+              block,
+              (donated ? out : in).data<DataType>(),
+              out.data<DataType>(),
+              offset.data<int32_t>(),
+              inputs[2].data<float>(),
+              scale_,
+              std::log2(base_),
+              strides,
+              out_strides,
+              in.size() / mat_size,
+              dims,
+              inputs[2].strides(0));
+        } else {
+          auto kernel = cu::rope<DataType, traditional.value, forward.value>;
+          uint3 dims =
+              make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
+          dims.z = (dims.z + 3) / 4;
+          auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
+          encoder.add_kernel_node(
+              kernel,
+              grid,
+              block,
+              (donated ? out : in).data<DataType>(),
+              out.data<DataType>(),
+              offset.data<int32_t>(),
+              scale_,
+              std::log2(base_),
+              strides,
+              out_strides,
+              in.size() / mat_size,
+              dims);
+        }
      });
    });
  });
--- a/mlx/backend/cuda/scan.cu
+++ b/mlx/backend/cuda/scan.cu
@@ -0,0 +1,467 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/binary_ops.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/backend/cuda/reduce/reduce_ops.cuh"
+#include "mlx/backend/gpu/copy.h"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/scan.h>
+#include <nvtx3/nvtx3.hpp>
+
+#include <cassert>
+
+namespace mlx::core {
+
+namespace cu {
+
+namespace cg = cooperative_groups;
+
+// Maps a scan operator `Op` and an input element type `T` to the element type
+// used for the scan's running values and its output. By default the result
+// type is the input type itself.
+template <typename Op, typename T>
+struct ScanResult {
+  using type = T;
+};
+
+// A Sum scan over bool uses int32_t as its result type instead of bool.
+template <>
+struct ScanResult<Sum, bool> {
+  using type = int32_t;
+};
+
+// Initial value used when scanning with LogAddExp: whatever Limits<T>::min()
+// reports for the element type.
+// NOTE(review): presumably this is -inf for floating-point T, i.e. the
+// identity of log(exp(a) + exp(b)) -- confirm against the Limits definition.
+template <typename T>
+struct ReduceInit<LogAddExp, T> {
+  static constexpr __host__ __device__ T value() {
+    return Limits<T>::min();
+  }
+};
+
+// Loads the `index`-th chunk of N_READS elements of a row into `values`,
+// converting each element from T to U with cast_to<U>.
+//
+//   index  - chunk number within the row (chunk k covers N_READS elements).
+//   in     - pointer to the first element of the row.
+//   values - destination array, filled in scan order.
+//   size   - number of elements in the row.
+//   init   - value stored in slots that fall outside the row.
+//
+// When `reverse` is false, values[k] is element (index * N_READS + k).
+// When `reverse` is true, chunks are counted from the end of the row and
+// values[k] is element (size - 1 - (index * N_READS + k)), so values[] is
+// still ordered in the direction the scan proceeds.
+template <bool reverse, typename T, typename U, int N_READS>
+inline __device__ void
+load_values(int index, const T* in, U (&values)[N_READS], int size, U init) {
+  // Elements left in the row from the start of this chunk (may be <= 0).
+  int remaining = size - index * N_READS;
+  if constexpr (reverse) {
+    // Point at the lowest address of the chunk; in[i] maps to
+    // values[N_READS - i - 1].
+    in += remaining - N_READS;
+    if (remaining < N_READS) {
+      // Partial chunk: only read slots that lie inside the row.
+      for (int i = 0; i < N_READS; ++i) {
+        values[N_READS - i - 1] =
+            (N_READS - i - 1 < remaining) ? cast_to<U>(in[i]) : init;
+      }
+    } else {
+      for (int i = 0; i < N_READS; ++i) {
+        values[N_READS - i - 1] = cast_to<U>(in[i]);
+      }
+    }
+  } else {
+    in += index * N_READS;
+    if (remaining < N_READS) {
+      // Partial chunk: pad the tail with init.
+      for (int i = 0; i < N_READS; ++i) {
+        values[i] = (i < remaining) ? cast_to<U>(in[i]) : init;
+      }
+    } else {
+      for (int i = 0; i < N_READS; ++i) {
+        values[i] = cast_to<U>(in[i]);
+      }
+    }
+  }
+}
+
+// Stores the `index`-th chunk of N_READS scanned values back into a row,
+// skipping any slot that would fall outside the row.
+//
+//   index  - chunk number within the row.
+//   out    - pointer to the first element of the output row.
+//   values - values to store, ordered in scan direction.
+//   size   - number of elements in the row.
+//
+// `offset` shifts the destination by that many positions in the scan
+// direction: values[k] goes to position (index * N_READS + offset + k) when
+// `reverse` is false, and to (size - 1 - (index * N_READS + offset + k)) when
+// `reverse` is true.
+template <bool reverse, int offset, typename T, int N_READS>
+inline __device__ void
+store_values(int index, T* out, T (&values)[N_READS], int size) {
+  int start = index * N_READS + offset;
+  // Output slots left in the row from `start` (may be <= 0).
+  int remaining = size - start;
+  if constexpr (reverse) {
+    // Point at the lowest address of the chunk; out[i] receives
+    // values[N_READS - i - 1].
+    out += remaining - N_READS;
+    if (remaining < N_READS) {
+      for (int i = 0; i < N_READS; ++i) {
+        if (N_READS - i - 1 < remaining) {
+          out[i] = values[N_READS - i - 1];
+        }
+      }
+    } else {
+      for (int i = 0; i < N_READS; ++i) {
+        out[i] = values[N_READS - i - 1];
+      }
+    }
+  } else {
+    out += start;
+    if (remaining < N_READS) {
+      for (int i = 0; i < N_READS; ++i) {
+        if (i < remaining) {
+          out[i] = values[i];
+        }
+      }
+    } else {
+      for (int i = 0; i < N_READS; ++i) {
+        out[i] = values[i];
+      }
+    }
+  }
+}
+
+// Scans rows whose elements are contiguous along the scan axis.
+//
+// Each thread block handles the row selected by grid.block_rank(). The row is
+// consumed in rounds of block.size() * N_READS elements; `prefix` carries the
+// running total from one round into the next.
+//
+// Template parameters:
+//   T         - input element type.
+//   U         - element type of the running values and of the output.
+//   Op        - binary scan operator; ReduceInit<Op, T> gives its init value.
+//   N_READS   - consecutive elements handled by each thread per round.
+//   inclusive - when false, results are stored one position later in scan
+//               direction and the first position receives the init value.
+//   reverse   - when true, the row is scanned from its last element backwards.
+//
+// Parameters:
+//   in        - input data, rows of `axis_size` elements laid out back to back.
+//   out       - output data with the same layout.
+//   axis_size - number of elements in each row.
+template <
+    typename T,
+    typename U,
+    typename Op,
+    int N_READS,
+    bool inclusive,
+    bool reverse>
+__global__ void contiguous_scan(const T* in, U* out, int32_t axis_size) {
+  auto grid = cg::this_grid();
+  auto block = cg::this_thread_block();
+  auto warp = cg::tiled_partition<WARP_SIZE>(block);
+
+  // Move to the row owned by this block.
+  in += grid.block_rank() * axis_size;
+  out += grid.block_rank() * axis_size;
+
+  // One slot per warp of the block; slot 0 is also reused to broadcast the
+  // prefix at the end of every round.
+  __shared__ U warp_sums[WARP_SIZE];
+
+  Op op;
+  U init = ReduceInit<Op, T>::value();
+  U prefix = init;
+
+  // Scan per block.
+  for (int r = 0; r < cuda::ceil_div(axis_size, block.size() * N_READS); ++r) {
+    // Chunk number of this thread within the row.
+    int32_t index = r * block.size() + block.thread_rank();
+    U values[N_READS];
+    load_values<reverse>(index, in, values, axis_size, init);
+
+    // Compute an inclusive scan per thread.
+    for (int i = 1; i < N_READS; ++i) {
+      values[i] = op(values[i], values[i - 1]);
+    }
+
+    // Compute exclusive scan of thread sums.
+    U prev_thread_sum = cg::exclusive_scan(warp, values[N_READS - 1], op);
+    if (warp.thread_rank() == 0) {
+      // Lane 0 has no predecessor, so force its value to init.
+      prev_thread_sum = init;
+    }
+
+    // Write warp's sum to shared memory.
+    if (warp.thread_rank() == WARP_SIZE - 1) {
+      warp_sums[warp.meta_group_rank()] =
+          op(prev_thread_sum, values[N_READS - 1]);
+    }
+    block.sync();
+
+    // Compute exclusive scan of warp sums.
+    if (warp.meta_group_rank() == 0) {
+      U prev_warp_sum =
+          cg::exclusive_scan(warp, warp_sums[warp.thread_rank()], op);
+      if (warp.thread_rank() == 0) {
+        prev_warp_sum = init;
+      }
+      warp_sums[warp.thread_rank()] = prev_warp_sum;
+    }
+    block.sync();
+
+    // Compute the output.
+    // Each value is combined with the total of earlier rounds, of earlier
+    // warps in this round, and of earlier lanes in this warp.
+    for (int i = 0; i < N_READS; ++i) {
+      values[i] = op(values[i], prefix);
+      values[i] = op(values[i], warp_sums[warp.meta_group_rank()]);
+      values[i] = op(values[i], prev_thread_sum);
+    }
+
+    // Write the values.
+    if (inclusive) {
+      store_values<reverse, 0>(index, out, values, axis_size);
+    } else {
+      // Exclusive scan: store shifted by one and seed the first position in
+      // scan direction with init.
+      store_values<reverse, 1>(index, out, values, axis_size);
+      if (reverse) {
+        if (block.thread_rank() == 0 && index == 0) {
+          out[axis_size - 1] = init;
+        }
+      } else {
+        if (block.thread_rank() == 0 && index == 0) {
+          out[0] = init;
+        }
+      }
+    }
+    // All reads of warp_sums must finish before slot 0 is overwritten below.
+    block.sync();
+
+    // Share the prefix.
+    // The last lane of the last warp holds the running total of this round.
+    if ((warp.meta_group_rank() == warp.meta_group_size() - 1) &&
+        (warp.thread_rank() == WARP_SIZE - 1)) {
+      warp_sums[0] = values[N_READS - 1];
+    }
+    block.sync();
+    prefix = warp_sums[0];
+  }
+}
+
+// Scans along an axis whose elements are `stride` apart in memory.
+//
+// Each thread block owns BN adjacent columns (positions along the contiguous
+// stride dimension) of one slab of axis_size * stride elements and walks the
+// scan axis BM rows at a time. Every tile is staged in shared memory: threads
+// load N_READS adjacent columns of one row each, then every warp scans
+// n_scans columns with its lanes mapped to the rows of the tile, and the
+// scanned tile is written back with the same mapping used for loading.
+// `prefix` carries the per-column running total from one tile to the next.
+//
+// Template parameters:
+//   T         - input element type.
+//   U         - element type of the running values and of the output.
+//   Op        - binary scan operator; ReduceInit<Op, T> gives its init value.
+//   N_READS   - adjacent columns read/written by each thread.
+//   BM, BN    - tile height (rows along the scan axis) and width (columns).
+//   inclusive - when false, results are stored one row later in scan
+//               direction and the first row receives the init value.
+//   reverse   - when true, the axis is scanned from its last row backwards.
+//
+// Parameters:
+//   in            - input data.
+//   out           - output data, same layout as `in`.
+//   axis_size     - number of elements along the scan axis.
+//   stride        - distance in elements between consecutive scan positions.
+//   stride_blocks - number of BN-wide blocks used to cover `stride` columns.
+template <
+    typename T,
+    typename U,
+    typename Op,
+    int N_READS,
+    int BM,
+    int BN,
+    bool inclusive,
+    bool reverse>
+__global__ void strided_scan(
+    const T* in,
+    U* out,
+    int32_t axis_size,
+    int64_t stride,
+    int64_t stride_blocks) {
+  auto grid = cg::this_grid();
+  auto block = cg::this_thread_block();
+  auto warp = cg::tiled_partition<WARP_SIZE>(block);
+
+  // Row pitch of the shared tile, padded by 16 bytes worth of elements.
+  // NOTE(review): the padding presumably avoids shared-memory bank conflicts,
+  // and the pitch is derived from WARP_SIZE rather than BN -- confirm both.
+  constexpr int BN_pad = WARP_SIZE + 16 / sizeof(U);
+  constexpr int n_warps = BN / N_READS;
+  // Columns scanned by each warp.
+  constexpr int n_scans = BN / n_warps;
+
+  __shared__ U read_buffer[BM * BN_pad];
+
+  Op op;
+  U init = ReduceInit<Op, T>::value();
+  U values[n_scans];
+  U prefix[n_scans];
+  for (int i = 0; i < n_scans; ++i) {
+    prefix[i] = init;
+  }
+
+  // Compute offsets.
+  // `offset` selects the slab, `global_index_x` the first column of this
+  // block inside the slab.
+  int64_t offset = (grid.block_rank() / stride_blocks) * axis_size * stride;
+  int64_t global_index_x = (grid.block_rank() % stride_blocks) * BN;
+  // Tile coordinates this thread loads from / stores to global memory.
+  uint read_offset_y = (block.thread_rank() * N_READS) / BN;
+  uint read_offset_x = (block.thread_rank() * N_READS) % BN;
+  // Tile coordinates this thread scans: lane -> row, warp -> column group.
+  uint scan_offset_y = warp.thread_rank();
+  uint scan_offset_x = warp.meta_group_rank() * n_scans;
+
+  // Number of valid columns counted from this block's first column.
+  uint stride_limit = stride - global_index_x;
+  in += offset + global_index_x + read_offset_x;
+  out += offset + global_index_x + read_offset_x;
+  U* read_into = read_buffer + read_offset_y * BN_pad + read_offset_x;
+  U* read_from = read_buffer + scan_offset_y * BN_pad + scan_offset_x;
+
+  for (uint j = 0; j < axis_size; j += BM) {
+    // Calculate the indices for the current thread.
+    // `check_index_y` counts rows in scan order and is used for bounds
+    // checks; `index_y` is the matching row in memory.
+    uint index_y = j + read_offset_y;
+    uint check_index_y = index_y;
+    if (reverse) {
+      index_y = axis_size - 1 - index_y;
+    }
+
+    // Read in SM.
+    if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
+      for (int i = 0; i < N_READS; ++i) {
+        read_into[i] = in[index_y * stride + i];
+      }
+    } else {
+      // Boundary path: check every element and pad with init.
+      for (int i = 0; i < N_READS; ++i) {
+        if (check_index_y < axis_size && (read_offset_x + i) < stride_limit) {
+          read_into[i] = in[index_y * stride + i];
+        } else {
+          read_into[i] = init;
+        }
+      }
+    }
+    block.sync();
+
+    // Read strided into registers.
+    for (int i = 0; i < n_scans; ++i) {
+      values[i] = read_from[i];
+    }
+
+    // Perform the scan.
+    // The warp-wide scan runs over the rows of the tile; the last lane's
+    // result becomes the prefix for the next tile.
+    for (int i = 0; i < n_scans; ++i) {
+      values[i] = cg::inclusive_scan(warp, values[i], op);
+      values[i] = op(values[i], prefix[i]);
+      prefix[i] = warp.shfl(values[i], WARP_SIZE - 1);
+    }
+
+    // Write to SM.
+    for (int i = 0; i < n_scans; ++i) {
+      read_from[i] = values[i];
+    }
+    block.sync();
+
+    // Write to device memory.
+    if (!inclusive) {
+      // Exclusive scan: the first row in scan order receives init ...
+      if (check_index_y == 0) {
+        if ((read_offset_x + N_READS) < stride_limit) {
+          for (int i = 0; i < N_READS; ++i) {
+            out[index_y * stride + i] = init;
+          }
+        } else {
+          for (int i = 0; i < N_READS; ++i) {
+            if ((read_offset_x + i) < stride_limit) {
+              out[index_y * stride + i] = init;
+            }
+          }
+        }
+      }
+      // ... and every result is stored one row later in scan direction.
+      if (reverse) {
+        index_y -= 1;
+        check_index_y += 1;
+      } else {
+        index_y += 1;
+        check_index_y += 1;
+      }
+    }
+    if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
+      for (int i = 0; i < N_READS; ++i) {
+        out[index_y * stride + i] = read_into[i];
+      }
+    } else {
+      for (int i = 0; i < N_READS; ++i) {
+        if (check_index_y < axis_size && (read_offset_x + i) < stride_limit) {
+          out[index_y * stride + i] = read_into[i];
+        }
+      }
+    }
+  }
+}
+
+} // namespace cu
+
+// Translates the runtime scan operation into its device functor type and
+// invokes `f` with a type_identity tag carrying that type.
+//
+// Throws std::invalid_argument when `scan_op` is not one of the handled
+// values.
+template <typename F>
+void dispatch_scan_ops(Scan::ReduceType scan_op, F&& f) {
+  switch (scan_op) {
+    case Scan::ReduceType::Max:
+      f(type_identity<cu::Max>{});
+      break;
+    case Scan::ReduceType::Min:
+      f(type_identity<cu::Min>{});
+      break;
+    case Scan::ReduceType::Sum:
+      f(type_identity<cu::Sum>{});
+      break;
+    case Scan::ReduceType::Prod:
+      f(type_identity<cu::Prod>{});
+      break;
+    case Scan::ReduceType::LogAddExp:
+      f(type_identity<cu::LogAddExp>{});
+      break;
+    default:
+      throw std::invalid_argument("Unknown reduce type.");
+  }
+}
+
+// Returns the display name of the scan functor type `Op`; used when building
+// error messages.
+//
+// Throws std::invalid_argument when `Op` is not a known scan functor.
+template <typename Op>
+const char* op_to_string() {
+  if (cuda::std::is_same_v<Op, cu::Max>) {
+    return "Max";
+  }
+  if (cuda::std::is_same_v<Op, cu::Min>) {
+    return "Min";
+  }
+  if (cuda::std::is_same_v<Op, cu::Sum>) {
+    return "Sum";
+  }
+  if (cuda::std::is_same_v<Op, cu::Prod>) {
+    return "Prod";
+  }
+  if (cuda::std::is_same_v<Op, cu::LogAddExp>) {
+    return "LogAddExp";
+  }
+  throw std::invalid_argument("Unknown op.");
+}
+
+// Compile-time predicate: can the scan functor `Op` be applied to elements of
+// type `T`?
+//
+// LogAddExp is only supported for inexact (floating-point or complex) element
+// types; every other functor accepts all element types.
+//
+// The functor is named as cu::LogAddExp, matching the tags produced by
+// dispatch_scan_ops. At this namespace level an unqualified `LogAddExp`
+// cannot refer to the device functor, so the comparison would never match.
+template <typename Op, typename T>
+constexpr bool supports_scan_op() {
+  if constexpr (cuda::std::is_same_v<Op, cu::LogAddExp>) {
+    return is_inexact_v<T>;
+  } else {
+    return true;
+  }
+}
+
+// Evaluates the scan primitive on the GPU.
+//
+// inputs - must contain exactly one array, the array to scan.
+// out    - receives the scanned result along `axis_`.
+//
+// The output buffer is prepared first (reusing the input buffer when it can
+// be donated, otherwise allocating; non-contiguous or broadcast inputs are
+// copied to a fresh array first). One of two kernels is then enqueued:
+// cu::contiguous_scan when the scan axis has stride 1, cu::strided_scan
+// otherwise.
+//
+// Throws std::runtime_error when the scan operation is not supported for the
+// input element type (see supports_scan_op).
+void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("Scan::eval_gpu");
+  assert(inputs.size() == 1);
+  auto in = inputs[0];
+  auto& s = stream();
+
+  if (in.flags().contiguous && in.strides()[axis_] != 0) {
+    if (in.is_donatable() && in.itemsize() == out.itemsize()) {
+      // Scan in place on the donated input buffer.
+      out.copy_shared_buffer(in);
+    } else {
+      out.set_data(
+          allocator::malloc(in.data_size() * out.itemsize()),
+          in.data_size(),
+          in.strides(),
+          in.flags());
+    }
+  } else {
+    // Copy the input into a fresh array and scan that copy in place.
+    array arr_copy(in.shape(), in.dtype(), nullptr, {});
+    copy_gpu(in, arr_copy, CopyType::General, s);
+    in = std::move(arr_copy);
+    out.copy_shared_buffer(in);
+  }
+
+  constexpr int N_READS = 4;
+  int32_t axis_size = in.shape(axis_);
+  bool contiguous = in.strides()[axis_] == 1;
+
+  auto& encoder = cu::get_command_encoder(s);
+  encoder.set_input_array(in);
+  encoder.set_output_array(out);
+
+  dispatch_all_types(in.dtype(), [&](auto type_tag) {
+    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+    dispatch_scan_ops(reduce_type_, [&](auto scan_op_tag) {
+      using Op = MLX_GET_TYPE(scan_op_tag);
+      // supports_scan_op is a function template and has to be *called*.
+      // Without the parentheses the condition would test the function itself
+      // instead of the predicate's result, making the else branch unreachable.
+      if constexpr (supports_scan_op<Op, T>()) {
+        using U = typename cu::ScanResult<Op, T>::type;
+        dispatch_bool(inclusive_, [&](auto inclusive) {
+          dispatch_bool(reverse_, [&](auto reverse) {
+            if (contiguous) {
+              auto kernel = cu::contiguous_scan<
+                  T,
+                  U,
+                  Op,
+                  N_READS,
+                  inclusive.value,
+                  reverse.value>;
+              // One block per row: enough threads to cover the row with
+              // N_READS elements each, rounded up to whole warps and capped
+              // at WARP_SIZE warps.
+              int block_dim = cuda::ceil_div(axis_size, N_READS);
+              block_dim = cuda::ceil_div(block_dim, WARP_SIZE) * WARP_SIZE;
+              block_dim = std::min(block_dim, WARP_SIZE * WARP_SIZE);
+              encoder.add_kernel_node(
+                  kernel,
+                  in.data_size() / axis_size,
+                  block_dim,
+                  in.data<T>(),
+                  out.data<U>(),
+                  axis_size);
+            } else {
+              constexpr int BM = WARP_SIZE;
+              constexpr int BN = WARP_SIZE;
+              auto kernel = cu::strided_scan<
+                  T,
+                  U,
+                  Op,
+                  N_READS,
+                  BM,
+                  BN,
+                  inclusive.value,
+                  reverse.value>;
+              int64_t stride = in.strides()[axis_];
+              int64_t stride_blocks = cuda::ceil_div(stride, BN);
+              dim3 num_blocks = get_2d_grid_dims(
+                  in.shape(), in.strides(), axis_size * stride);
+              // Multiply the grid by the number of column blocks, using the
+              // y dimension when x would exceed UINT32_MAX.
+              if (num_blocks.x * stride_blocks <= UINT32_MAX) {
+                num_blocks.x *= stride_blocks;
+              } else {
+                num_blocks.y *= stride_blocks;
+              }
+              // One warp per group of N_READS columns of the BN-wide tile.
+              int block_dim = (BN / N_READS) * WARP_SIZE;
+              encoder.add_kernel_node(
+                  kernel,
+                  num_blocks,
+                  block_dim,
+                  in.data<T>(),
+                  out.data<U>(),
+                  axis_size,
+                  stride,
+                  stride_blocks);
+            }
+          });
+        });
+      } else {
+        throw std::runtime_error(fmt::format(
+            "Can not do scan op {} on inputs of {} with result of {}.",
+            op_to_string<Op>(),
+            dtype_to_string(in.dtype()),
+            dtype_to_string(out.dtype())));
+      }
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/softmax.cu
+++ b/mlx/backend/cuda/softmax.cu
@@ -43,7 +43,7 @@ __global__ void softmax(const T* in, T* out, int axis_size) {
  // Thread reduce.
  AccT prevmax;
  AccT maxval = Limits<AccT>::finite_min();
-  AccT normalizer = 0;
+  AccT normalizer = cast_to<AccT>(0);
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
    AccT vals[N_READS];
    cub::LoadDirectBlocked(
@@ -141,19 +141,21 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_float_types(out.dtype(), "softmax", [&](auto type_tag) {
-      constexpr int N_READS = 4;
-      dispatch_block_dim(
-          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
-            using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-            auto kernel = cu::softmax<DataType, DataType, block_dim(), N_READS>;
-            if (precise) {
-              kernel = cu::softmax<DataType, float, block_dim(), N_READS>;
-            }
-            kernel<<<n_rows, block_dim(), 0, stream>>>(
-                in.data<DataType>(), out.data<DataType>(), axis_size);
-          });
+  dispatch_float_types(out.dtype(), "softmax", [&](auto type_tag) {
+    constexpr int N_READS = 4;
+    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
+      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+      auto kernel = cu::softmax<DataType, DataType, block_dim(), N_READS>;
+      if (precise) {
+        kernel = cu::softmax<DataType, float, block_dim(), N_READS>;
+      }
+      encoder.add_kernel_node(
+          kernel,
+          n_rows,
+          block_dim(),
+          in.data<DataType>(),
+          out.data<DataType>(),
+          axis_size);
    });
  });
 }
--- a/mlx/backend/cuda/sort.cu
+++ b/mlx/backend/cuda/sort.cu
@@ -50,32 +50,6 @@ array swapaxes_in_eval(const array& in, int axis1, int axis2) {
  return out;
 }

-template <typename... Args>
-void segmented_sort_pairs(cu::CommandEncoder& encoder, Args&&... args) {
-  // Allocate temporary storage.
-  size_t size;
-  CHECK_CUDA_ERROR(
-      cub::DeviceSegmentedSort::StableSortPairs(nullptr, size, args...));
-  array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
-  encoder.add_temporary(temp);
-  // Run op.
-  CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
-      temp.data<void>(), size, args...));
-}
-
-template <typename... Args>
-void segmented_sort(cu::CommandEncoder& encoder, Args&&... args) {
-  // Allocate temporary storage.
-  size_t size;
-  CHECK_CUDA_ERROR(
-      cub::DeviceSegmentedSort::StableSortKeys(nullptr, size, args...));
-  array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
-  encoder.add_temporary(temp);
-  // Run op.
-  CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
-      temp.data<void>(), size, args...));
-}
-
 struct OffsetTransform {
  int nsort;

@@ -113,57 +87,94 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {

  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(in.dtype(), [&](auto type_tag) {
-      using CTYPE = MLX_GET_TYPE(type_tag);
-      if constexpr (!std::is_same_v<CTYPE, complex64_t>) {
-        using Type = cuda_type_t<CTYPE>;
-        auto offsets = thrust::make_transform_iterator(
-            thrust::make_counting_iterator(0), OffsetTransform{nsort});
-        if (argsort) {
-          // Indices in the sorted dimension.
-          array indices(
-              allocator::malloc(out.nbytes()), in.shape(), out.dtype());
-          encoder.add_temporary(indices);
-          thrust::transform(
-              cu::thrust_policy(stream),
-              thrust::counting_iterator<uint32_t>(0),
-              thrust::counting_iterator<uint32_t>(indices.data_size()),
-              thrust::device_pointer_cast(indices.data<uint32_t>()),
-              ModOp<uint32_t>{static_cast<uint32_t>(nsort)});
+  dispatch_all_types(in.dtype(), [&](auto type_tag) {
+    using CTYPE = MLX_GET_TYPE(type_tag);
+    auto& stream = encoder.stream();
+    if constexpr (!std::is_same_v<CTYPE, complex64_t>) {
+      using Type = cuda_type_t<CTYPE>;
+      auto offsets = thrust::make_transform_iterator(
+          thrust::make_counting_iterator(0), OffsetTransform{nsort});
+      if (argsort) {
+        // Indices in the sorted dimension.
+        array indices(allocator::malloc(out.nbytes()), in.shape(), out.dtype());
+        encoder.add_temporary(indices);

-          // In argsort though we don't need the result of sorted values, the
-          // API requires us to provide an array to store it.
-          array discard(allocator::malloc(in.nbytes()), in.shape(), in.dtype());
-          encoder.add_temporary(discard);
+        // Although argsort does not need the sorted values themselves, the
+        // API still requires an output array to store them.
+        array discard(allocator::malloc(in.nbytes()), in.shape(), in.dtype());
+        encoder.add_temporary(discard);

-          segmented_sort_pairs(
-              encoder,
-              in.data<Type>(),
-              discard.data<Type>(),
-              indices.data<uint32_t>(),
-              out.data<uint32_t>(),
-              in.data_size(),
-              in.data_size() / nsort,
-              offsets,
-              offsets + 1,
-              stream);
-        } else {
-          segmented_sort(
-              encoder,
-              in.data<Type>(),
-              out.data<Type>(),
-              in.data_size(),
-              in.data_size() / nsort,
-              offsets,
-              offsets + 1,
-              stream);
-        }
+        size_t size;
+        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
+            nullptr,
+            size,
+            in.data<Type>(),
+            discard.data<Type>(),
+            indices.data<uint32_t>(),
+            out.data<uint32_t>(),
+            in.data_size(),
+            in.data_size() / nsort,
+            offsets,
+            offsets + 1,
+            stream));
+
+        array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
+        encoder.add_temporary(temp);
+
+        // Start capturing after allocations
+        auto capture = encoder.capture_context();
+        thrust::transform(
+            cu::thrust_policy(stream),
+            thrust::counting_iterator<uint32_t>(0),
+            thrust::counting_iterator<uint32_t>(indices.data_size()),
+            thrust::device_pointer_cast(indices.data<uint32_t>()),
+            ModOp<uint32_t>{static_cast<uint32_t>(nsort)});
+
+        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
+            temp.data<void>(),
+            size,
+            in.data<Type>(),
+            discard.data<Type>(),
+            indices.data<uint32_t>(),
+            out.data<uint32_t>(),
+            in.data_size(),
+            in.data_size() / nsort,
+            offsets,
+            offsets + 1,
+            stream));
      } else {
-        throw std::runtime_error(
-            "CUDA backend does not support sorting complex numbers");
+        size_t size;
+        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
+            nullptr,
+            size,
+            in.data<Type>(),
+            out.data<Type>(),
+            in.data_size(),
+            in.data_size() / nsort,
+            offsets,
+            offsets + 1,
+            stream));
+
+        array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
+        encoder.add_temporary(temp);
+
+        // Start capturing after allocations
+        auto capture = encoder.capture_context();
+        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
+            temp.data<void>(),
+            size,
+            in.data<Type>(),
+            out.data<Type>(),
+            in.data_size(),
+            in.data_size() / nsort,
+            offsets,
+            offsets + 1,
+            stream));
      }
-    });
+    } else {
+      throw std::runtime_error(
+          "CUDA backend does not support sorting complex numbers");
+    }
  });

  if (!is_segmented_sort) {
--- a/mlx/backend/cuda/ternary.cu
+++ b/mlx/backend/cuda/ternary.cu
@@ -15,12 +15,27 @@ namespace cu {

 namespace cg = cooperative_groups;

-template <typename Op, typename T, typename IdxT>
+// Contiguous ternary kernel: every thread covers N_READS consecutive elements
+// and writes out[i] = Op{}(a[i], b[i], c[i]). A thread whose chunk would run
+// past `size` handles whatever is left (possibly nothing) one element at a
+// time; all other threads use vectorized loads and a vectorized store.
+template <typename Op, typename T, typename IdxT, int N_READS>
 __global__ void
 ternary_v(const bool* a, const T* b, const T* c, T* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[index], b[index], c[index]);
+
+  // Do the bounds arithmetic in 64 bits. IdxT may be a 32-bit unsigned type
+  // for sizes up to UINT32_MAX, in which case (index + 1) * N_READS evaluated
+  // in IdxT can wrap around to a small value for the last chunk and wrongly
+  // select the vectorized path, touching elements past `size`.
+  const int64_t first = static_cast<int64_t>(index) * N_READS;
+  const int64_t total = static_cast<int64_t>(size);
+
+  if (first + N_READS > total) {
+    // Partial (or empty) chunk at the end of the buffer.
+    for (int64_t i = first; i < total; ++i) {
+      out[i] = Op{}(a[i], b[i], c[i]);
+    }
+  } else {
+    auto a_vec = load_vector<N_READS>(a, index);
+    auto b_vec = load_vector<N_READS>(b, index);
+    auto c_vec = load_vector<N_READS>(c, index);
+
+    AlignedVector<T, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a_vec.val[i], b_vec.val[i], c_vec.val[i]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
  }
 }

@@ -91,73 +106,87 @@ void ternary_op_gpu_inplace(
  encoder.set_input_array(b);
  encoder.set_input_array(c);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(out.dtype(), [&](auto type_tag) {
-      using DType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+  dispatch_all_types(out.dtype(), [&](auto type_tag) {
+    using DType = cuda_type_t<MLX_GET_TYPE(type_tag)>;

-      auto topt = get_ternary_op_type(a, b, c);
-      if (topt == TernaryOpType::General) {
-        dispatch_bool(
-            a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
-                c.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
-            [&](auto large) {
-              using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-              Shape shape;
-              std::vector<Strides> strides;
-              std::tie(shape, strides) = collapse_contiguous_dims(a, b, c, out);
-              auto& a_strides = strides[0];
-              auto& b_strides = strides[1];
-              auto& c_strides = strides[2];
-              int ndim = shape.size();
-              if (ndim <= 3) {
-                dispatch_1_2_3(ndim, [&](auto dims_constant) {
-                  auto kernel =
-                      cu::ternary_g_nd<Op, DType, IdxT, dims_constant()>;
-                  auto [num_blocks, block_dims] =
-                      get_launch_args(kernel, out, large());
-                  kernel<<<num_blocks, block_dims, 0, stream>>>(
-                      a.data<bool>(),
-                      b.data<DType>(),
-                      c.data<DType>(),
-                      out.data<DType>(),
-                      out.size(),
-                      const_param<dims_constant()>(shape),
-                      const_param<dims_constant()>(a_strides),
-                      const_param<dims_constant()>(b_strides),
-                      const_param<dims_constant()>(c_strides));
-                });
-              } else {
-                auto kernel = cu::ternary_g<Op, DType, IdxT>;
+    auto topt = get_ternary_op_type(a, b, c);
+    if (topt == TernaryOpType::General) {
+      dispatch_bool(
+          a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+              c.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
+          [&](auto large) {
+            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
+            Shape shape;
+            std::vector<Strides> strides;
+            std::tie(shape, strides) = collapse_contiguous_dims(a, b, c, out);
+            auto& a_strides = strides[0];
+            auto& b_strides = strides[1];
+            auto& c_strides = strides[2];
+            int ndim = shape.size();
+            if (ndim <= 3) {
+              dispatch_1_2_3(ndim, [&](auto dims_constant) {
+                auto kernel =
+                    cu::ternary_g_nd<Op, DType, IdxT, dims_constant()>;
                auto [num_blocks, block_dims] =
                    get_launch_args(kernel, out, large());
-                kernel<<<num_blocks, block_dims, 0, stream>>>(
+                encoder.add_kernel_node(
+                    kernel,
+                    num_blocks,
+                    block_dims,
                    a.data<bool>(),
                    b.data<DType>(),
                    c.data<DType>(),
                    out.data<DType>(),
-                    out.data_size(),
-                    const_param(shape),
-                    const_param(a_strides),
-                    const_param(b_strides),
-                    const_param(c_strides),
-                    ndim);
-              }
-            });
-      } else {
-        dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
-          using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-          auto kernel = cu::ternary_v<Op, DType, IdxT>;
-          auto [num_blocks, block_dims] = get_launch_args(
-              kernel, out.data_size(), out.shape(), out.strides(), large());
-          kernel<<<num_blocks, block_dims, 0, stream>>>(
-              a.data<bool>(),
-              b.data<DType>(),
-              c.data<DType>(),
-              out.data<DType>(),
-              out.data_size());
-        });
-      }
-    });
+                    out.size(),
+                    const_param<dims_constant()>(shape),
+                    const_param<dims_constant()>(a_strides),
+                    const_param<dims_constant()>(b_strides),
+                    const_param<dims_constant()>(c_strides));
+              });
+            } else {
+              auto kernel = cu::ternary_g<Op, DType, IdxT>;
+              auto [num_blocks, block_dims] =
+                  get_launch_args(kernel, out, large());
+              encoder.add_kernel_node(
+                  kernel,
+                  num_blocks,
+                  block_dims,
+                  a.data<bool>(),
+                  b.data<DType>(),
+                  c.data<DType>(),
+                  out.data<DType>(),
+                  out.data_size(),
+                  const_param(shape),
+                  const_param(a_strides),
+                  const_param(b_strides),
+                  const_param(c_strides),
+                  ndim);
+            }
+          });
+    } else {
+      dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
+        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+        // TODO: Choose optimized value based on type size.
+        constexpr int N_READS = 4;
+        auto kernel = cu::ternary_v<Op, DType, IdxT, N_READS>;
+        auto [num_blocks, block_dims] = get_launch_args(
+            kernel,
+            out.data_size(),
+            out.shape(),
+            out.strides(),
+            large(),
+            N_READS);
+        encoder.add_kernel_node(
+            kernel,
+            num_blocks,
+            block_dims,
+            a.data<bool>(),
+            b.data<DType>(),
+            c.data<DType>(),
+            out.data<DType>(),
+            out.data_size());
+      });
+    }
  });
 }

--- a/mlx/backend/cuda/unary.cu
+++ b/mlx/backend/cuda/unary.cu
@@ -2,21 +2,57 @@

 #include "mlx/backend/common/unary.h"
 #include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/cucomplex_math.cuh"
 #include "mlx/backend/cuda/device/unary_ops.cuh"
 #include "mlx/backend/cuda/iterators/general_iterator.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"

+#include <cooperative_groups.h>
 #include <nvtx3/nvtx3.hpp>
-#include <thrust/device_ptr.h>
-#include <thrust/transform.h>

 namespace mlx::core {

 namespace cu {

+namespace cg = cooperative_groups;
+
+// Contiguous unary kernel: every thread covers N_READS consecutive elements
+// and writes out[i] = Op{}(in[i]). A thread whose chunk would run past `size`
+// handles whatever is left (possibly nothing) one element at a time; all
+// other threads use a vectorized load and a vectorized store.
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
+__global__ void unary_v(const In* in, Out* out, IdxT size) {
+  IdxT index = cg::this_grid().thread_rank();
+
+  // Do the bounds arithmetic in 64 bits. The host selects a 32-bit unsigned
+  // IdxT for any size up to UINT32_MAX, so (index + 1) * N_READS evaluated in
+  // IdxT can wrap around to a small value for the last chunk and wrongly
+  // select the vectorized path, touching elements past `size`.
+  const int64_t first = static_cast<int64_t>(index) * N_READS;
+  const int64_t total = static_cast<int64_t>(size);
+
+  if (first + N_READS > total) {
+    // Partial (or empty) chunk at the end of the buffer.
+    for (int64_t i = first; i < total; ++i) {
+      out[i] = Op{}(in[i]);
+    }
+  } else {
+    auto in_vec = load_vector<N_READS>(in, index);
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(in_vec.val[i]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
+  }
+}
+
+// General unary kernel for inputs that are not laid out contiguously. Each
+// thread whose grid rank is below `size` reads the input element at the
+// location returned by elem_to_loc_4d(rank, shape, strides, ndim) and writes
+// Op{}(value) to out[rank].
+template <typename Op, typename In, typename Out, typename IdxT>
+__global__ void unary_g(
+    const In* in,
+    Out* out,
+    IdxT size,
+    const __grid_constant__ Shape shape,
+    const __grid_constant__ Strides strides,
+    int ndim) {
+  IdxT rank = cg::this_grid().thread_rank();
+  // Threads ranked at or beyond `size` have nothing to do.
+  if (!(rank < size)) {
+    return;
+  }
+  auto src = elem_to_loc_4d(rank, shape.data(), strides.data(), ndim);
+  out[rank] = Op{}(in[src]);
+}
+
 template <typename Op, typename In, typename Out>
 constexpr bool supports_unary_op() {
  if (std::is_same_v<Op, Abs> || std::is_same_v<Op, Negative> ||
@@ -34,10 +70,10 @@ constexpr bool supports_unary_op() {
        !std::is_same_v<In, bool>;
  }
  if (std::is_same_v<Op, Ceil> || std::is_same_v<Op, Floor>) {
-    return std::is_same_v<In, Out> && !std::is_same_v<In, complex64_t>;
+    return std::is_same_v<In, Out> && !mlx::core::is_complex_v<In>;
  }
  if (std::is_same_v<Op, Conjugate>) {
-    return std::is_same_v<In, Out> && std::is_same_v<In, complex64_t>;
+    return std::is_same_v<In, Out> && mlx::core::is_complex_v<In>;
  }
  if (std::is_same_v<Op, ArcCos> || std::is_same_v<Op, ArcSin> ||
      std::is_same_v<Op, ArcTan> || std::is_same_v<Op, Cos> ||
@@ -51,7 +87,7 @@ constexpr bool supports_unary_op() {
    return std::is_same_v<In, Out> && is_inexact_v<In>;
  }
  if (std::is_same_v<Op, Imag> || std::is_same_v<Op, Real>) {
-    return std::is_same_v<In, complex64_t> && std::is_same_v<Out, float>;
+    return mlx::core::is_complex_v<In> && std::is_same_v<Out, float>;
  }
  if (std::is_same_v<Op, LogicalNot>) {
    return std::is_same_v<In, Out> && std::is_same_v<In, bool>;
@@ -65,44 +101,74 @@ template <typename Op>
 void unary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
-    const std::string& op,
+    const char* op,
    const Stream& s) {
  auto& in = inputs[0];
  if (in.size() == 0) {
    return;
  }
+  bool contig = in.flags().contiguous;
+  bool large;
+  if (!contig) {
+    large = in.data_size() > INT32_MAX || out.size() > INT32_MAX;
+  } else {
+    large = in.data_size() > UINT32_MAX;
+  }

  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
-  encoder.launch_kernel([&](cudaStream_t stream) {
-    dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
-      dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
-        using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
-        using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
-        if constexpr (cu::supports_unary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
+    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
+      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
+      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
+      if constexpr (cu::supports_unary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
+        dispatch_bool(large, [&](auto large) {
          using InType = cuda_type_t<CTYPE_IN>;
          using OutType = cuda_type_t<CTYPE_OUT>;
-          auto policy = cu::thrust_policy(stream);
-          auto in_ptr = thrust::device_pointer_cast(in.data<InType>());
-          auto out_ptr = thrust::device_pointer_cast(out.data<OutType>());
-          if (in.flags().contiguous) {
-            thrust::transform(
-                policy, in_ptr, in_ptr + in.data_size(), out_ptr, Op());
+          if (contig) {
+            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
+            // TODO: Choose optimized value based on type size.
+            constexpr int N_READS = 4;
+            auto kernel = cu::unary_v<Op, InType, OutType, IdxT, N_READS>;
+            auto [num_blocks, block_dims] = get_launch_args(
+                kernel,
+                out.data_size(),
+                out.shape(),
+                out.strides(),
+                large,
+                N_READS);
+            encoder.add_kernel_node(
+                kernel,
+                num_blocks,
+                block_dims,
+                in.data<InType>(),
+                out.data<OutType>(),
+                out.data_size());
          } else {
+            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
            auto [shape, strides] = collapse_contiguous_dims(in);
-            auto [in_begin, in_end] = cu::make_general_iterators<int64_t>(
-                in_ptr, in.size(), shape, strides);
-            thrust::transform(policy, in_begin, in_end, out_ptr, Op());
+            auto kernel = cu::unary_g<Op, InType, OutType, IdxT>;
+            auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
+            encoder.add_kernel_node(
+                kernel,
+                num_blocks,
+                block_dims,
+                in.data<InType>(),
+                out.data<OutType>(),
+                out.data_size(),
+                const_param(shape),
+                const_param(strides),
+                shape.size());
          }
-        } else {
-          throw std::runtime_error(fmt::format(
-              "Can not do unary op {} on input of {} with output of {}.",
-              op,
-              dtype_to_string(in.dtype()),
-              dtype_to_string(out.dtype())));
-        }
-      });
+        });
+      } else {
+        throw std::runtime_error(fmt::format(
+            "Can not do unary op {} on input of {} with output of {}.",
+            op,
+            dtype_to_string(in.dtype()),
+            dtype_to_string(out.dtype())));
+      }
    });
  });
 }
@@ -111,17 +177,17 @@ template <typename Op>
 void unary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
-    const std::string& op,
+    const char* op,
    const Stream& s) {
  set_unary_output_data(inputs[0], out);
  unary_op_gpu_inplace<Op>(inputs, out, op, s);
 }

-#define UNARY_GPU(func)                                                 \
-  void func::eval_gpu(const std::vector<array>& inputs, array& out) {   \
-    nvtx3::scoped_range r(#func "::eval_gpu");                          \
-    auto& s = out.primitive().stream();                                 \
-    unary_op_gpu<cu::func>(inputs, out, get_primitive_string(this), s); \
+#define UNARY_GPU(func)                                               \
+  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
+    nvtx3::scoped_range r(#func "::eval_gpu");                        \
+    auto& s = out.primitive().stream();                               \
+    unary_op_gpu<cu::func>(inputs, out, name(), s);                   \
  }

 UNARY_GPU(Abs)
@@ -157,16 +223,15 @@ UNARY_GPU(Tanh)
 void Log::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Log::eval_gpu");
  auto& s = out.primitive().stream();
-  auto op = get_primitive_string(this);
  switch (base_) {
    case Base::e:
-      unary_op_gpu<cu::Log>(inputs, out, op, s);
+      unary_op_gpu<cu::Log>(inputs, out, name(), s);
      break;
    case Base::two:
-      unary_op_gpu<cu::Log2>(inputs, out, op, s);
+      unary_op_gpu<cu::Log2>(inputs, out, name(), s);
      break;
    case Base::ten:
-      unary_op_gpu<cu::Log10>(inputs, out, op, s);
+      unary_op_gpu<cu::Log10>(inputs, out, name(), s);
      break;
  }
 }
@@ -177,7 +242,7 @@ void Round::eval_gpu(const std::vector<array>& inputs, array& out) {
  const auto& in = inputs[0];
  auto& s = out.primitive().stream();
  if (issubdtype(in.dtype(), inexact)) {
-    unary_op_gpu<cu::Round>(inputs, out, get_primitive_string(this), s);
+    unary_op_gpu<cu::Round>(inputs, out, name(), s);
  } else {
    // No-op integer types
    out.copy_shared_buffer(in);
--- a/mlx/backend/cuda/utils.cpp
+++ b/mlx/backend/cuda/utils.cpp
@@ -24,6 +24,14 @@ void check_cuda_error(const char* name, cudaError_t err) {
  }
 }

+// Driver-API overload: throws std::runtime_error with the failing call's name
+// and the driver's description of `err` whenever `err` is not CUDA_SUCCESS.
+void check_cuda_error(const char* name, CUresult err) {
+  if (err != CUDA_SUCCESS) {
+    // cuGetErrorString sets the output pointer to NULL for error codes it
+    // does not recognize, so a default assigned before the call would be
+    // overwritten. Apply the fallback after the call so a null C string is
+    // never handed to the formatter.
+    const char* err_str = nullptr;
+    if (cuGetErrorString(err, &err_str) != CUDA_SUCCESS || !err_str) {
+      err_str = "Unknown error";
+    }
+    throw std::runtime_error(fmt::format("{} failed: {}", name, err_str));
+  }
+}
+
 const char* dtype_to_cuda_type(const Dtype& dtype) {
  switch (dtype) {
    case bool_:
@@ -53,7 +61,7 @@ const char* dtype_to_cuda_type(const Dtype& dtype) {
    case float64:
      return "double";
    case complex64:
-      return "cuComplex";
+      return "complex64_t";
    default:
      return "unknown";
  }
--- a/mlx/backend/cuda/utils.h
+++ b/mlx/backend/cuda/utils.h
@@ -4,6 +4,7 @@

 #pragma once

+#include <cuda.h>
 #include <cuda_runtime.h>

 namespace mlx::core {
@@ -33,6 +34,7 @@ class CudaStream {

 // Throw exception if the cuda API does not succeed.
 void check_cuda_error(const char* name, cudaError_t err);
+void check_cuda_error(const char* name, CUresult err);

 // The macro version that prints the command that failed.
 #define CHECK_CUDA_ERROR(cmd) check_cuda_error(#cmd, (cmd))
--- a/mlx/backend/metal/CMakeLists.txt
+++ b/mlx/backend/metal/CMakeLists.txt
@@ -63,6 +63,7 @@ if(MLX_METAL_JIT)
  make_jit_source(steel/gemm/kernels/steel_gemm_masked kernels/steel/defines.h)
  make_jit_source(steel/gemm/kernels/steel_gemm_gather)
  make_jit_source(steel/gemm/kernels/steel_gemm_splitk)
+  make_jit_source(steel/gemm/kernels/steel_gemm_segmented)
  make_jit_source(
    steel/conv/conv
    kernels/steel/utils.h
--- a/mlx/backend/metal/binary.cpp
+++ b/mlx/backend/metal/binary.cpp
@@ -7,20 +7,20 @@

 #define BINARY_GPU(func)                                              \
  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
-    binary_op_gpu(inputs, out, get_primitive_string(this));           \
+    binary_op_gpu(inputs, out, name());                               \
  }

 #define BINARY_GPU_MULTI(func)                                         \
  void func::eval_gpu(                                                 \
      const std::vector<array>& inputs, std::vector<array>& outputs) { \
-    binary_op_gpu(inputs, outputs, get_primitive_string(this));        \
+    binary_op_gpu(inputs, outputs, name());                            \
  }

 namespace mlx::core {

 std::string get_kernel_name(
    BinaryOpType bopt,
-    const std::string& op,
+    const char* op,
    const array& a,
    bool large,
    int ndim,
@@ -65,7 +65,7 @@ std::string get_kernel_name(
 void binary_op_gpu_inplace(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::string& op,
+    const char* op,
    const Stream& s) {
  auto& a = inputs[0];
  auto& b = inputs[1];
@@ -165,7 +165,7 @@ void binary_op_gpu_inplace(
 void binary_op_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::string& op,
+    const char* op,
    const Stream& s) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
@@ -179,7 +179,7 @@ void binary_op_gpu(
 void binary_op_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::string& op) {
+    const char* op) {
  auto& s = outputs[0].primitive().stream();
  binary_op_gpu(inputs, outputs, op, s);
 }
@@ -187,7 +187,7 @@ void binary_op_gpu(
 void binary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
-    const std::string& op,
+    const char* op,
    const Stream& s) {
  std::vector<array> outputs = {out};
  binary_op_gpu_inplace(inputs, outputs, op, s);
@@ -196,7 +196,7 @@ void binary_op_gpu_inplace(
 void binary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
-    const std::string& op,
+    const char* op,
    const Stream& s) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
@@ -209,7 +209,7 @@ void binary_op_gpu(
 void binary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
-    const std::string& op) {
+    const char* op) {
  auto& s = out.primitive().stream();
  binary_op_gpu(inputs, out, op, s);
 }
@@ -237,19 +237,19 @@ BINARY_GPU(Subtract)
 void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
  switch (op_) {
    case BitwiseBinary::And:
-      binary_op_gpu(inputs, out, get_primitive_string(this));
+      binary_op_gpu(inputs, out, name());
      break;
    case BitwiseBinary::Or:
-      binary_op_gpu(inputs, out, get_primitive_string(this));
+      binary_op_gpu(inputs, out, name());
      break;
    case BitwiseBinary::Xor:
-      binary_op_gpu(inputs, out, get_primitive_string(this));
+      binary_op_gpu(inputs, out, name());
      break;
    case BitwiseBinary::LeftShift:
-      binary_op_gpu(inputs, out, get_primitive_string(this));
+      binary_op_gpu(inputs, out, name());
      break;
    case BitwiseBinary::RightShift:
-      binary_op_gpu(inputs, out, get_primitive_string(this));
+      binary_op_gpu(inputs, out, name());
      break;
  }
 }
--- a/mlx/backend/metal/binary.h
+++ b/mlx/backend/metal/binary.h
@@ -9,25 +9,25 @@ namespace mlx::core {
 void binary_op_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::string& op,
+    const char* op,
    const Stream& s);

 void binary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
-    const std::string& op,
+    const char* op,
    const Stream& s);

 void binary_op_gpu_inplace(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
-    const std::string& op,
+    const char* op,
    const Stream& s);

 void binary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
-    const std::string& op,
+    const char* op,
    const Stream& s);

 } // namespace mlx::core
--- a/mlx/backend/metal/compiled.cpp
+++ b/mlx/backend/metal/compiled.cpp
@@ -212,9 +212,7 @@ inline void build_kernel(
          get_type_string(x.dtype()),
          namer.get_name(x.inputs()[0]));
    } else {
-      std::ostringstream ss;
-      x.primitive().print(ss);
-      os += ss.str();
+      os += x.primitive().name();
      os += "()(";
      for (int i = 0; i < x.inputs().size() - 1; i++) {
        os += fmt::format("tmp_{0}, ", namer.get_name(x.inputs()[i]));
--- a/mlx/backend/metal/copy.cpp
+++ b/mlx/backend/metal/copy.cpp
@@ -86,7 +86,7 @@ void copy_gpu_inplace(
    }
  } else {
    work_per_thread = get_work_per_thread(out.dtype(), out.data_size());
-    if (work_per_thread > 1) {
+    if (!large && work_per_thread > 1) {
      kernel_name += "n";
    }
  }
--- a/mlx/backend/metal/device.cpp
+++ b/mlx/backend/metal/device.cpp
@@ -1,20 +1,18 @@
 // Copyright © 2023-2024 Apple Inc.

 #include <cstdlib>
-#include <filesystem>
 #include <sstream>

 #define NS_PRIVATE_IMPLEMENTATION
 #define CA_PRIVATE_IMPLEMENTATION
 #define MTL_PRIVATE_IMPLEMENTATION

+#include "mlx/backend/common/utils.h"
 #include "mlx/backend/metal/device.h"
 #include "mlx/backend/metal/metal.h"
 #include "mlx/backend/metal/utils.h"
 #include "mlx/utils.h"

-namespace fs = std::filesystem;
-
 namespace mlx::core::metal {

 namespace {
@@ -80,12 +78,7 @@ MTL::Library* try_load_bundle(
 std::pair<MTL::Library*, NS::Error*> load_colocated_library(
    MTL::Device* device,
    const std::string& relative_path) {
-  std::string binary_dir = get_binary_directory();
-  if (binary_dir.size() == 0) {
-    return {nullptr, nullptr};
-  }
-
-  auto path = fs::path(binary_dir) / relative_path;
+  auto path = current_binary_dir() / relative_path;
  if (!path.has_extension()) {
    path.replace_extension(".metallib");
  }
@@ -197,7 +190,7 @@ MTL::Library* load_library(

  std::ostringstream msg;
  msg << "Failed to load the metallib " << lib_name << ".metallib. "
-      << "We attempted to load it from <" << get_binary_directory() << "/"
+      << "We attempted to load it from <" << current_binary_dir() << "/"
      << lib_name << ".metallib" << ">";
 #ifdef SWIFTPM_BUNDLE
  msg << " and from the Swift PM bundle.";
--- a/mlx/backend/metal/device.h
+++ b/mlx/backend/metal/device.h
@@ -3,8 +3,6 @@
 #pragma once

 #include <Metal/Metal.hpp>
-#include <dlfcn.h>
-#include <filesystem>
 #include <functional>
 #include <mutex>
 #include <shared_mutex>
@@ -15,22 +13,8 @@
 #include "mlx/array.h"
 #include "mlx/device.h"

-namespace fs = std::filesystem;
-
 namespace mlx::core::metal {

-// Note, this function must be left inline in a header so that it is not
-// dynamically linked.
-inline std::string get_binary_directory() {
-  Dl_info info;
-  std::string directory;
-  int success = dladdr((void*)get_binary_directory, &info);
-  if (success) {
-    directory = fs::path(info.dli_fname).remove_filename().c_str();
-  }
-  return directory;
-}
-
 using MTLFCList =
    std::vector<std::tuple<const void*, MTL::DataType, NS::UInteger>>;

--- a/mlx/backend/metal/indexing.cpp
+++ b/mlx/backend/metal/indexing.cpp
@@ -575,9 +575,17 @@ void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  compute_encoder.set_output_array(out, 2);

  // Set source info
-  compute_encoder.set_vector_bytes(remove_index(idx.shape(), axis_), 3);
-  compute_encoder.set_vector_bytes(remove_index(upd.strides(), axis_), 4);
-  compute_encoder.set_vector_bytes(remove_index(idx.strides(), axis_), 5);
+  if (ndim > 1) {
+    compute_encoder.set_vector_bytes(remove_index(idx.shape(), axis_), 3);
+    compute_encoder.set_vector_bytes(remove_index(upd.strides(), axis_), 4);
+    compute_encoder.set_vector_bytes(remove_index(idx.strides(), axis_), 5);
+  } else {
+    // The following will be ignored in the kernel but we still have to set
+    // some value so that metal validation passes.
+    compute_encoder.set_vector_bytes(idx.shape(), 3);
+    compute_encoder.set_vector_bytes(upd.strides(), 4);
+    compute_encoder.set_vector_bytes(idx.strides(), 5);
+  }
  compute_encoder.set_bytes(ndim - 1, 6);
  compute_encoder.set_bytes(axis_, 7);
  compute_encoder.set_bytes(out.shape(axis_), 8);
--- a/mlx/backend/metal/jit/includes.h
+++ b/mlx/backend/metal/jit/includes.h
@@ -34,6 +34,7 @@ const char* steel_gemm_fused();
 const char* steel_gemm_masked();
 const char* steel_gemm_splitk();
 const char* steel_gemm_gather();
+const char* steel_gemm_segmented();
 const char* conv();
 const char* steel_conv();
 const char* steel_conv_general();
--- a/mlx/backend/metal/jit_kernels.cpp
+++ b/mlx/backend/metal/jit_kernels.cpp
@@ -8,12 +8,6 @@ using namespace fmt::literals;

 namespace mlx::core {

-std::string op_name(const array& arr) {
-  std::ostringstream op_t;
-  arr.primitive().print(op_t);
-  return op_t.str();
-}
-
 MTL::ComputePipelineState* get_arange_kernel(
    metal::Device& d,
    const std::string& kernel_name,
@@ -33,7 +27,7 @@ MTL::ComputePipelineState* get_unary_kernel(
    const std::string& kernel_name,
    Dtype in_type,
    Dtype out_type,
-    const std::string op) {
+    const char* op) {
  std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto lib = d.get_library(lib_name, [&]() {
    auto in_t = get_type_string(in_type);
@@ -58,10 +52,10 @@ MTL::ComputePipelineState* get_unary_kernel(
 }

 void append_binary_kernels(
-    const std::string lib_name,
+    const std::string& lib_name,
    Dtype in_type,
    Dtype out_type,
-    const std::string op,
+    const char* op,
    std::string& kernel_source) {
  const std::array<std::pair<std::string, std::string>, 7> kernel_types = {{
      {"ss", "binary_ss"},
@@ -112,7 +106,7 @@ MTL::ComputePipelineState* get_binary_kernel(
    const std::string& kernel_name,
    Dtype in_type,
    Dtype out_type,
-    const std::string op) {
+    const char* op) {
  std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto lib = d.get_library(lib_name, [&]() {
    std::string kernel_source;
@@ -129,7 +123,7 @@ MTL::ComputePipelineState* get_binary_two_kernel(
    const std::string& kernel_name,
    Dtype in_type,
    Dtype out_type,
-    const std::string op) {
+    const char* op) {
  std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto lib = d.get_library(lib_name, [&]() {
    std::string kernel_source = metal::utils();
@@ -144,7 +138,7 @@ MTL::ComputePipelineState* get_ternary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype type,
-    const std::string op) {
+    const char* op) {
  std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto lib = d.get_library(lib_name, [&]() {
    auto t_str = get_type_string(type);
@@ -652,6 +646,43 @@ MTL::ComputePipelineState* get_steel_gemm_gather_kernel(
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
 }

+MTL::ComputePipelineState* get_steel_gemm_segmented_kernel( // JIT path: generate the segmented_mm source for this configuration and return its pipeline.
+    metal::Device& d,
+    const std::string& kernel_name,
+    const std::string& hash_name,
+    const metal::MTLFCList& func_consts,
+    const array& out,
+    bool transpose_a,
+    bool transpose_b,
+    int bm,
+    int bn,
+    int bk,
+    int wm,
+    int wn) {
+  const auto& lib_name = kernel_name; // The library is keyed by the kernel name itself.
+  auto lib = d.get_library(lib_name, [&]() { // The lambda supplies the source text for the library.
+    std::string kernel_source;
+    concatenate(
+        kernel_source,
+        metal::utils(),
+        metal::gemm(),
+        metal::steel_gemm_segmented(),
+        get_template_definition( // Explicit instantiation of segmented_mm<T, bm, bn, bk, wm, wn, transpose_a, transpose_b>.
+            lib_name,
+            "segmented_mm",
+            get_type_string(out.dtype()), // Element type is taken from the output array.
+            bm,
+            bn,
+            bk,
+            wm,
+            wn,
+            transpose_a,
+            transpose_b));
+    return kernel_source;
+  });
+  return d.get_kernel(kernel_name, lib, hash_name, func_consts); // hash_name and func_consts select the function-constant specialization.
+}
+
 MTL::ComputePipelineState* get_gemv_masked_kernel(
    metal::Device& d,
    const std::string& kernel_name,
--- a/mlx/backend/metal/kernels.h
+++ b/mlx/backend/metal/kernels.h
@@ -19,27 +19,27 @@ MTL::ComputePipelineState* get_unary_kernel(
    const std::string& kernel_name,
    Dtype in_type,
    Dtype out_type,
-    const std::string op);
+    const char* op);

 MTL::ComputePipelineState* get_binary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype in_type,
    Dtype out_type,
-    const std::string op);
+    const char* op);

 MTL::ComputePipelineState* get_binary_two_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype in_type,
    Dtype out_type,
-    const std::string op);
+    const char* op);

 MTL::ComputePipelineState* get_ternary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype type,
-    const std::string op);
+    const char* op);

 MTL::ComputePipelineState* get_copy_kernel(
    metal::Device& d,
@@ -175,6 +175,20 @@ MTL::ComputePipelineState* get_steel_gemm_gather_kernel(
    int wn,
    bool rhs);

+MTL::ComputePipelineState* get_steel_gemm_segmented_kernel( // Returns the pipeline state for the steel segmented_mm kernel.
+    metal::Device& d,
+    const std::string& kernel_name,
+    const std::string& hash_name, // Passed to d.get_kernel together with func_consts.
+    const metal::MTLFCList& func_consts,
+    const array& out, // The JIT implementation reads out.dtype() for the kernel element type.
+    bool transpose_a,
+    bool transpose_b,
+    int bm, // bm, bn, bk, wm, wn: template arguments of the generated kernel (JIT implementation).
+    int bn,
+    int bk,
+    int wm,
+    int wn);
+
 MTL::ComputePipelineState* get_steel_conv_kernel(
    metal::Device& d,
    const std::string& kernel_name,
@@ -243,8 +257,10 @@ MTL::ComputePipelineState* get_gather_qmm_kernel(

 // Create a GPU kernel template definition for JIT compilation
 template <typename... Args>
-std::string
-get_template_definition(std::string name, std::string func, Args... args) {
+std::string get_template_definition(
+    std::string_view name,
+    std::string_view func,
+    Args... args) {
  std::ostringstream s;
  s << func << "<";
  bool first = true;
--- a/mlx/backend/metal/kernels/CMakeLists.txt
+++ b/mlx/backend/metal/kernels/CMakeLists.txt
@@ -71,6 +71,7 @@ set(STEEL_HEADERS
    steel/gemm/kernels/steel_gemm_fused.h
    steel/gemm/kernels/steel_gemm_gather.h
    steel/gemm/kernels/steel_gemm_masked.h
+    steel/gemm/kernels/steel_gemm_segmented.h
    steel/gemm/kernels/steel_gemm_splitk.h
    steel/utils/type_traits.h
    steel/utils/integral_constant.h)
@@ -120,6 +121,7 @@ if(NOT MLX_METAL_JIT)
  build_kernel(steel/gemm/kernels/steel_gemm_gather ${STEEL_HEADERS})
  build_kernel(steel/gemm/kernels/steel_gemm_masked ${STEEL_HEADERS})
  build_kernel(steel/gemm/kernels/steel_gemm_splitk ${STEEL_HEADERS})
+  build_kernel(steel/gemm/kernels/steel_gemm_segmented ${STEEL_HEADERS})
  build_kernel(gemv_masked steel/utils.h)
 endif()

--- a/mlx/backend/metal/kernels/cexpf.h
+++ b/mlx/backend/metal/kernels/cexpf.h
@@ -0,0 +1,134 @@
+// Copyright © 2025 Apple Inc.
+// Copyright © 2008-2013 NVIDIA Corporation
+// Copyright © 2013 Filipe RNC Maia
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Forked from
+// https://github.com/NVIDIA/cccl/blob/main/thrust/thrust/detail/complex/cexpf.h
+
+// NOTE: The upstream TODO about switching to thrust::exp only concerns CUDA
+// JIT builds; this copy is the Metal port and relies on metal::exp/cos/sin.
+
+#pragma once
+
+#include <metal_math>
+
+using ieee_float_shape_type = union { // Overlays a float and a 32-bit word on the same storage.
+  float value; // Floating-point view.
+  uint32_t word; // Raw 32-bit integer view.
+};
+
+inline void get_float_word(thread uint32_t& i, float d) { // Write the 32-bit word that overlays d into i.
+  ieee_float_shape_type gf_u;
+  gf_u.value = (d); // Store through the float member...
+  (i) = gf_u.word; // ...and read the same storage back as an integer.
+}
+
+inline void get_float_word(thread int32_t& i, float d) { // Signed overload: same as above, result stored in an int32_t.
+  ieee_float_shape_type gf_u;
+  gf_u.value = (d); // Store through the float member...
+  (i) = gf_u.word; // ...then convert the uint32_t word to the signed output.
+}
+
+inline void set_float_word(thread float& d, uint32_t i) { // Inverse of get_float_word: build a float from the 32-bit word i.
+  ieee_float_shape_type sf_u;
+  sf_u.word = (i); // Store through the integer member...
+  (d) = sf_u.value; // ...and read the same storage back as a float.
+}
+
+inline float frexp_expf(float x, thread int* expt) { // Returns exp(x) split as (returned value) * 2^(*expt).
+  const uint32_t k = 235; // Reduction constant: exp(x) is evaluated as exp(x - k*ln2) * 2^k.
+  const float kln2 = 162.88958740F; // Approximately k * ln(2).
+
+  float exp_x;
+  uint32_t hx;
+
+  exp_x = metal::exp(x - kln2); // Shifted argument keeps exp() in range for large x.
+  get_float_word(hx, exp_x);
+  *expt = (hx >> 23) - (0x7f + 127) + k; // Biased exponent of exp_x, minus the exponent forced below, plus k.
+  set_float_word(exp_x, (hx & 0x7fffff) | ((0x7f + 127) << 23)); // Keep the mantissa, force the biased exponent to 0x7f + 127.
+  return exp_x;
+}
+
+inline complex64_t ldexp_cexpf(complex64_t z, int expt) { // Computes exp(z) * 2^expt using the scaled exponential from frexp_expf.
+  float x, y, exp_x, scale1, scale2;
+  int ex_expt, half_expt;
+
+  x = z.real;
+  y = z.imag;
+  exp_x = frexp_expf(x, &ex_expt); // exp(x) == exp_x * 2^ex_expt.
+  expt += ex_expt; // Total power of two still to be applied.
+
+  half_expt = expt / 2; // Split the exponent across two factors.
+  set_float_word(scale1, (0x7f + half_expt) << 23); // scale1 = 2^half_expt (zero mantissa, biased exponent).
+  half_expt = expt - half_expt; // Remaining part of the exponent.
+  set_float_word(scale2, (0x7f + half_expt) << 23); // scale2 = 2^(expt - expt / 2).
+
+  return complex64_t{
+      metal::cos(y) * exp_x * scale1 * scale2, // Real part.
+      metal::sin(y) * exp_x * scale1 * scale2}; // Imaginary part.
+}
+
+inline complex64_t cexpf(const thread complex64_t& z) { // Complex exponential exp(z) with explicit handling of special values.
+  float x, y, exp_x;
+  uint32_t hx, hy;
+
+  const uint32_t exp_ovfl = 0x42b17218, cexp_ovfl = 0x43400074; // Bit-pattern bounds of the x range that takes the scaled path below.
+
+  x = z.real;
+  y = z.imag;
+
+  get_float_word(hy, y);
+  hy &= 0x7fffffff; // Clear the sign bit so hy is the bit pattern of |y|.
+
+  /* cexp(x + I 0) = exp(x) + I 0 */
+  if (hy == 0) {
+    return complex64_t{metal::exp(x), y}; // y is returned as-is, so a signed zero is preserved.
+  }
+  get_float_word(hx, x);
+  /* cexp(0 + I y) = cos(y) + I sin(y) */
+  if ((hx & 0x7fffffff) == 0) {
+    return complex64_t{metal::cos(y), metal::sin(y)};
+  }
+  if (hy >= 0x7f800000) { // Exponent bits all ones: y is Inf or NaN.
+    if ((hx & 0x7fffffff) != 0x7f800000) {
+      /* cexp(finite|NaN +- I Inf|NaN) = NaN + I NaN */
+      return complex64_t{y - y, y - y};
+    } else if (hx & 0x80000000) { // Sign bit set: x is -Inf.
+      /* cexp(-Inf +- I Inf|NaN) = 0 + I 0 */
+      return complex64_t{0.0, 0.0};
+    } else {
+      /* cexp(+Inf +- I Inf|NaN) = Inf + I NaN */
+      return complex64_t{x, y - y};
+    }
+  }
+
+  if (hx >= exp_ovfl && hx <= cexp_ovfl) { // hx keeps its sign bit, so negative x never satisfies the upper bound.
+    /*
+     * x is between 88.7 and 192, so we must scale to avoid
+     * overflow in expf(x).
+     */
+    return ldexp_cexpf(z, 0);
+  } else {
+    /*
+     * Cases covered here:
+     *  -  x < exp_ovfl and exp(x) won't overflow (common case)
+     *  -  x > cexp_ovfl, so exp(x) * s overflows for all s > 0
+     *  -  x = +-Inf (generated by exp())
+     *  -  x = NaN (spurious inexact exception from y)
+     */
+    exp_x = metal::exp(x);
+    return complex64_t{exp_x * metal::cos(y), exp_x * metal::sin(y)};
+  }
+}
--- a/mlx/backend/metal/kernels/layer_norm.metal
+++ b/mlx/backend/metal/kernels/layer_norm.metal
@@ -31,6 +31,7 @@ inline void threadgroup_sum(
  for (int i = 0; i < N; i++) {
    x[i] = simd_sum(x[i]);
  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (simd_lane_id == 0) {
    for (int i = 0; i < N; i++) {
      xs[N * simd_group_id + i] = x[i];
--- a/mlx/backend/metal/kernels/quantized.h
+++ b/mlx/backend/metal/kernels/quantized.h
@@ -643,14 +643,14 @@ struct QuantizedBlockLoader {
      return;
    }

-    if (reduction_dim == 1 && bi >= src_tile_dim.y) {
+    if (reduction_dim == 1 && bi >= src_tile_dim.x) {
      for (int i = 0; i < n_reads * pack_factor; i++) {
        dst[i] = T(0);
      }
      return;
    }

-    if (reduction_dim == 0 && bi >= src_tile_dim.x) {
+    if (reduction_dim == 0 && bi >= src_tile_dim.y) {
      for (int i = 0; i < n_reads * pack_factor; i++) {
        dst[i] = T(0);
      }
--- a/mlx/backend/metal/kernels/reduction/ops.h
+++ b/mlx/backend/metal/kernels/reduction/ops.h
@@ -164,7 +164,15 @@ struct Min {
  DEFINE_SIMD_REDUCE()

  template <typename T>
-  T simd_reduce_impl(T val) {
+  metal::enable_if_t<metal::is_integral_v<T>, T> simd_reduce_impl(T val) {
+    return simd_min(val);
+  }
+
+  template <typename T>
+  metal::enable_if_t<!metal::is_integral_v<T>, T> simd_reduce_impl(T val) {
+    if (simd_any(val != val)) {
+      return static_cast<T>(NAN);
+    }
    return simd_min(val);
  }

@@ -176,17 +184,52 @@ struct Min {
  }

  // Operator
-  U operator()(U a, U b) {
+  template <typename T>
+  metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T a, T b) {
    return a < b ? a : b;
  }
-};

+  template <typename T>
+  metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T a, T b) {
+    if (metal::isnan(a) || metal::isnan(b)) {
+      return static_cast<T>(NAN);
+    } else {
+      return a < b ? a : b;
+    }
+  }
+
+  template <>
+  complex64_t operator()(complex64_t a, complex64_t b) {
+    bool real_is_nan = metal::isnan(a.real) || metal::isnan(b.real);
+    bool imag_is_nan = metal::isnan(a.imag) || metal::isnan(b.imag);
+
+    if (!real_is_nan && !imag_is_nan) {
+      return a < b ? a : b;
+    } else if (real_is_nan && !imag_is_nan) {
+      return complex64_t(
+          static_cast<float>(NAN), a.imag < b.imag ? a.imag : b.imag);
+    } else if (!real_is_nan && imag_is_nan) {
+      return complex64_t(
+          a.real < b.real ? a.real : b.real, static_cast<float>(NAN));
+    } else {
+      return complex64_t(static_cast<float>(NAN), static_cast<float>(NAN));
+    }
+  };
+};
 template <typename U>
 struct Max {
  DEFINE_SIMD_REDUCE()

  template <typename T>
-  T simd_reduce_impl(T val) {
+  metal::enable_if_t<metal::is_integral_v<T>, T> simd_reduce_impl(T val) {
+    return simd_max(val);
+  }
+
+  template <typename T>
+  metal::enable_if_t<!metal::is_integral_v<T>, T> simd_reduce_impl(T val) {
+    if (simd_any(val != val)) {
+      return static_cast<T>(NAN);
+    }
    return simd_max(val);
  }

@@ -198,7 +241,35 @@ struct Max {
  }

  // Operator
-  U operator()(U a, U b) {
+  template <typename T>
+  metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T a, T b) {
    return a > b ? a : b;
  }
+
+  template <typename T>
+  metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T a, T b) {
+    if (metal::isnan(a) || metal::isnan(b)) {
+      return static_cast<T>(NAN);
+    } else {
+      return a > b ? a : b;
+    }
+  }
+
+  template <>
+  complex64_t operator()(complex64_t a, complex64_t b) {
+    bool real_is_nan = metal::isnan(a.real) || metal::isnan(b.real);
+    bool imag_is_nan = metal::isnan(a.imag) || metal::isnan(b.imag);
+
+    if (!real_is_nan && !imag_is_nan) {
+      return a > b ? a : b;
+    } else if (real_is_nan && !imag_is_nan) {
+      return complex64_t(
+          static_cast<float>(NAN), a.imag > b.imag ? a.imag : b.imag);
+    } else if (!real_is_nan && imag_is_nan) {
+      return complex64_t(
+          a.real > b.real ? a.real : b.real, static_cast<float>(NAN));
+    } else {
+      return complex64_t(static_cast<float>(NAN), static_cast<float>(NAN));
+    }
+  }
 };
--- a/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h
+++ b/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h
@@ -0,0 +1,266 @@
+// Copyright © 2025 Apple Inc.
+
+using namespace mlx::steel;
+
+constant bool segments_contiguous [[function_constant(199)]];
+constant bool align_M [[function_constant(200)]];
+constant bool align_N [[function_constant(201)]];
+
+template < // Steel GEMM over one K-range ("segment") per tid.z; each threadgroup writes one BM x BN tile of C.
+    typename T,
+    int BM,
+    int BN,
+    int BK,
+    int WM,
+    int WN,
+    bool transpose_a,
+    bool transpose_b,
+    typename AccumType = float>
+[[kernel, max_total_threads_per_threadgroup(WM* WN * 32)]] void segmented_mm(
+    const device T* A [[buffer(0)]],
+    const device T* B [[buffer(1)]],
+    const device uint32_t* segments [[buffer(2)]], // (k_start, k_end) bounds along K, indexed by tid.z.
+    device T* C [[buffer(3)]],
+    const constant GEMMParams* params [[buffer(4)]],
+    uint simd_lane_id [[thread_index_in_simdgroup]],
+    uint simd_group_id [[simdgroup_index_in_threadgroup]],
+    uint3 tid [[threadgroup_position_in_grid]]) {
+  using gemm_kernel = GEMMKernel<
+      T,
+      T,
+      BM,
+      BN,
+      BK,
+      WM,
+      WN,
+      transpose_a,
+      transpose_b,
+      true,
+      true,
+      AccumType>;
+
+  using loader_a_t = typename gemm_kernel::loader_a_t;
+  using loader_b_t = typename gemm_kernel::loader_b_t;
+  using mma_t = typename gemm_kernel::mma_t;
+
+  if (params->tiles_n <= static_cast<int>(tid.x) || // Threadgroups outside the tile grid have nothing to do.
+      params->tiles_m <= static_cast<int>(tid.y)) {
+    return;
+  }
+
+  // Prepare threadgroup memory
+  threadgroup T As[gemm_kernel::tgp_mem_size_a];
+  threadgroup T Bs[gemm_kernel::tgp_mem_size_b];
+
+  // Find the block in A, B, C
+  const int c_row = tid.y * BM;
+  const int c_col = tid.x * BN;
+  const size_t c_row_long = size_t(c_row);
+  const size_t c_col_long = size_t(c_col);
+
+  // Prepare threadgroup bounds
+  const short tgp_bm = align_M ? BM : short(min(BM, params->M - c_row));
+  const short tgp_bn = align_N ? BN : short(min(BN, params->N - c_col));
+
+  // Move the pointers to the output tile
+  A += transpose_a ? c_row_long : c_row_long * params->lda;
+  B += transpose_b ? c_col_long * params->ldb : c_col_long;
+  C += c_row_long * params->ldd + c_col_long;
+
+  // Move the pointers to the start of the segment
+  uint32_t k_start, k_end;
+  if (segments_contiguous) {
+    k_start = segments[2 * tid.z];
+    k_end = segments[2 * tid.z + 1];
+  } else {
+    // We accept either contiguous (above) or weird strides where the beginning
+    // of the next one is the previous one. Basically the last two strides are
+    // both 1!
+    k_start = segments[tid.z];
+    k_end = segments[tid.z + 1];
+  }
+  A += transpose_a ? size_t(k_start) * params->lda : size_t(k_start); // Widen first: a 32-bit k_start * lda product can wrap.
+  B += transpose_b ? size_t(k_start) : size_t(k_start) * params->ldb; // Same widening as the tile offsets above.
+  C += tid.z * params->batch_stride_d;
+
+  // Prepare threadgroup mma operation
+  thread mma_t mma_op(simd_group_id, simd_lane_id);
+
+  // Prepare threadgroup loading operations
+  thread loader_a_t loader_a(A, params->lda, As, simd_group_id, simd_lane_id);
+  thread loader_b_t loader_b(B, params->ldb, Bs, simd_group_id, simd_lane_id);
+
+  // Matrix level alignment so only check K
+  if (align_M && align_N) {
+    uint32_t k = k_start + BK;
+    for (; k <= k_end; k += BK) { // Full BK-wide steps that fit inside [k_start, k_end).
+      threadgroup_barrier(mem_flags::mem_threadgroup);
+
+      // Load elements into threadgroup
+      loader_a.load_unsafe();
+      loader_b.load_unsafe();
+
+      threadgroup_barrier(mem_flags::mem_threadgroup);
+
+      // Multiply and accumulate threadgroup elements
+      mma_op.mma(As, Bs);
+
+      // Prepare for next iteration
+      loader_a.next();
+      loader_b.next();
+    }
+    short k_remain = BK - short(k - k_end); // Leftover K elements after the full steps.
+    const short2 tile_dims_A =
+        transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
+    const short2 tile_dims_B =
+        transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);
+    if (k_remain > 0) {
+      threadgroup_barrier(mem_flags::mem_threadgroup);
+      loader_a.load_safe(tile_dims_A);
+      loader_b.load_safe(tile_dims_B);
+      threadgroup_barrier(mem_flags::mem_threadgroup);
+      mma_op.mma(As, Bs);
+    }
+    mma_op.store_result(C, params->ldd);
+  } else {
+    // Tile aligned do the same as above
+    if ((align_M || tgp_bm == BM) && (align_N || tgp_bn == BN)) {
+      uint32_t k = k_start + BK;
+      for (; k <= k_end; k += BK) {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // Load elements into threadgroup
+        loader_a.load_unsafe();
+        loader_b.load_unsafe();
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // Multiply and accumulate threadgroup elements
+        mma_op.mma(As, Bs);
+
+        // Prepare for next iteration
+        loader_a.next();
+        loader_b.next();
+      }
+      short k_remain = BK - short(k - k_end);
+      const short2 tile_dims_A =
+          transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
+      const short2 tile_dims_B =
+          transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);
+      if (k_remain > 0) {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        loader_a.load_safe(tile_dims_A);
+        loader_b.load_safe(tile_dims_B);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        mma_op.mma(As, Bs);
+      }
+      mma_op.store_result(C, params->ldd);
+    }
+
+    // Tile partially aligned check rows
+    else if (align_N || tgp_bn == BN) {
+      uint32_t k = k_start + BK;
+      for (; k <= k_end; k += BK) {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // Load elements into threadgroup
+        loader_a.load_safe(
+            transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm));
+        loader_b.load_unsafe();
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // Multiply and accumulate threadgroup elements
+        mma_op.mma(As, Bs);
+
+        // Prepare for next iteration
+        loader_a.next();
+        loader_b.next();
+      }
+      short k_remain = BK - short(k - k_end);
+      const short2 tile_dims_A =
+          transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
+      const short2 tile_dims_B =
+          transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);
+      if (k_remain > 0) {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        loader_a.load_safe(tile_dims_A);
+        loader_b.load_safe(tile_dims_B);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        mma_op.mma(As, Bs);
+      }
+      mma_op.store_result_safe(C, params->ldd, short2(tgp_bn, tgp_bm));
+    }
+
+    // Tile partially aligned check cols
+    else if (align_M || tgp_bm == BM) {
+      uint32_t k = k_start + BK;
+      for (; k <= k_end; k += BK) {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // Load elements into threadgroup
+        loader_a.load_unsafe();
+        loader_b.load_safe(
+            transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK));
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // Multiply and accumulate threadgroup elements
+        mma_op.mma(As, Bs);
+
+        // Prepare for next iteration
+        loader_a.next();
+        loader_b.next();
+      }
+      short k_remain = BK - short(k - k_end);
+      const short2 tile_dims_A =
+          transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
+      const short2 tile_dims_B =
+          transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);
+      if (k_remain > 0) {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        loader_a.load_safe(tile_dims_A);
+        loader_b.load_safe(tile_dims_B);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        mma_op.mma(As, Bs);
+      }
+      mma_op.store_result_safe(C, params->ldd, short2(tgp_bn, tgp_bm));
+    }
+
+    // Nothing aligned so check both rows and cols
+    else {
+      uint32_t k = k_start + BK;
+      for (; k <= k_end; k += BK) {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // Load elements into threadgroup
+        loader_a.load_safe(
+            transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm));
+        loader_b.load_safe(
+            transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK));
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // Multiply and accumulate threadgroup elements
+        mma_op.mma(As, Bs);
+
+        // Prepare for next iteration
+        loader_a.next();
+        loader_b.next();
+      }
+      short k_remain = BK - short(k - k_end);
+      const short2 tile_dims_A =
+          transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
+      const short2 tile_dims_B =
+          transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);
+      if (k_remain > 0) {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        loader_a.load_safe(tile_dims_A);
+        loader_b.load_safe(tile_dims_B);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        mma_op.mma(As, Bs);
+      }
+      mma_op.store_result_safe(C, params->ldd, short2(tgp_bn, tgp_bm));
+    }
+  }
+}
--- a/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.metal
+++ b/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.metal
@@ -0,0 +1,43 @@
+// Copyright © 2024 Apple Inc.
+
+// clang-format off
+#include "mlx/backend/metal/kernels/utils.h"
+#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
+#include "mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h"
+
+#define instantiate_segmented_mm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
+  instantiate_kernel(                                                         \
+      "steel_segmented_mm_" #tname "_" #iname "_" #oname "_bm" #bm "_bn" #bn  \
+      "_bk" #bk "_wm" #wm "_wn" #wn,                                          \
+      segmented_mm,                                                           \
+      itype,                                                                  \
+      bm,                                                                     \
+      bn,                                                                     \
+      bk,                                                                     \
+      wm,                                                                     \
+      wn,                                                                     \
+      trans_a,                                                                \
+      trans_b,                                                                \
+      float)
+
+#define instantiate_segmented_mm_transpose_helper(iname, itype, oname, otype, bm, bn, bk, wm, wn) \
+  instantiate_segmented_mm(nn, false, false, iname, itype, oname, otype, bm, bn, bk, wm, wn)      \
+  instantiate_segmented_mm(nt, false, true , iname, itype, oname, otype, bm, bn, bk, wm, wn)      \
+  instantiate_segmented_mm(tn, true , false, iname, itype, oname, otype, bm, bn, bk, wm, wn)      \
+  instantiate_segmented_mm(tt, true , true , iname, itype, oname, otype, bm, bn, bk, wm, wn)
+
+#define instantiate_segmented_mm_shapes_helper(iname, itype, oname, otype)                 \
+  instantiate_segmented_mm_transpose_helper(iname, itype, oname, otype, 64, 64, 16, 2, 2)  \
+  instantiate_segmented_mm_transpose_helper(iname, itype, oname, otype, 64, 64, 16, 1, 2)  \
+  instantiate_segmented_mm_transpose_helper(iname, itype, oname, otype, 64, 32, 32, 2, 2)  \
+  instantiate_segmented_mm_transpose_helper(iname, itype, oname, otype, 32, 64, 16, 1, 2)  \
+  instantiate_segmented_mm_transpose_helper(iname, itype, oname, otype, 32, 32, 16, 2, 2)
+// clang-format on
+
+instantiate_segmented_mm_shapes_helper(float16, half, float16, half); // Each call expands to 5 tile shapes x 4 transpose variants, accumulating in float.
+instantiate_segmented_mm_shapes_helper(
+    bfloat16,
+    bfloat16_t,
+    bfloat16,
+    bfloat16_t);
+instantiate_segmented_mm_shapes_helper(float32, float, float32, float); // The otype argument is forwarded but not used by instantiate_segmented_mm.
--- a/mlx/backend/metal/kernels/unary_ops.h
+++ b/mlx/backend/metal/kernels/unary_ops.h
@@ -5,6 +5,7 @@
 #include <metal_integer>
 #include <metal_math>

+#include "mlx/backend/metal/kernels/cexpf.h"
 #include "mlx/backend/metal/kernels/erf.h"
 #include "mlx/backend/metal/kernels/expm1f.h"

@@ -178,8 +179,7 @@ struct Exp {
    return metal::precise::exp(x);
  };
  complex64_t operator()(complex64_t x) {
-    auto m = metal::precise::exp(x.real);
-    return {m * metal::precise::cos(x.imag), m * metal::precise::sin(x.imag)};
+    return cexpf(x);
  }
 };

--- a/mlx/backend/metal/matmul.cpp
+++ b/mlx/backend/metal/matmul.cpp
@@ -1864,4 +1864,166 @@ void GatherMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  gather_mm(a, b, lhs_indices, rhs_indices, out, M, N, K, d, s);
 }

+void segmented_mm( // Encodes the steel segmented_mm kernel for a_, b_ and segments_ into out on stream s.
+    const array& a_,
+    const array& b_,
+    const array& segments_,
+    array& out,
+    int M,
+    int N,
+    int K,
+    metal::Device& d,
+    const Stream& s) {
+  // Nothing to compute for an empty output; this also keeps the
+  // out.size() / M / N division below away from M == 0 or N == 0.
+  if (out.size() == 0) {
+    return;
+  }
+
+  auto check_segments_layout = [&d, &s](const array& x) { // Returns (segments_contiguous, array to bind).
+    // Contiguous so return early
+    if (x.flags().row_contiguous) {
+      return std::make_tuple(true, x);
+    }
+
+    bool rc = true; // True when the leading dims are densely packed over the second-to-last dim.
+    for (int i = 0; i + 2 < x.ndim(); i++) { // "i + 2 <" avoids wraparound of ndim() - 2 if ndim() is unsigned.
+      rc &= (x.strides(i + 1) * x.shape(i + 1) == x.strides(i)) ||
+          (x.shape(i) == 1); // Dense packing: stride(i) == stride(i + 1) * shape(i + 1).
+    }
+    rc &= x.strides(x.ndim() - 1) == 1;
+    if (x.ndim() > 1) {
+      rc &= x.strides(x.ndim() - 2) == 1; // Overlapping layout: the last two strides are both 1.
+    }
+
+    if (rc) {
+      return std::make_tuple(false, x);
+    }
+
+    array x_copy(x.shape(), x.dtype(), nullptr, {}); // Any other layout: make a copy the kernel can index.
+    copy_gpu(x, x_copy, CopyType::General, s);
+    d.add_temporary(x_copy, s.index);
+    return std::make_tuple(true, x_copy);
+  };
+
+  // Copy if needed
+  std::vector<array> copies;
+  auto [transpose_a, lda, a] = check_transpose(copies, s, a_, false);
+  auto [transpose_b, ldb, b] = check_transpose(copies, s, b_, false);
+  auto [segments_contiguous, segments] = check_segments_layout(segments_);
+  d.add_temporaries(std::move(copies), s.index);
+
+  // Determine dispatch kernel
+  int bm = 64, bn = 64, bk = 16;
+  int wm = 2, wn = 2;
+  size_t batch_size_out = out.size() / M / N; // Number of M x N output matrices.
+
+  char devc = d.get_architecture().back();
+  GEMM_TPARAM_MACRO(devc)
+
+  const bool align_M = (M % bm) == 0;
+  const bool align_N = (N % bn) == 0;
+
+  // Define the kernel name
+  std::string base_name;
+  base_name.reserve(128);
+  concatenate(
+      base_name,
+      "steel_segmented_mm_",
+      transpose_a ? 't' : 'n',
+      transpose_b ? 't' : 'n',
+      "_",
+      type_to_name(a),
+      "_",
+      type_to_name(out),
+      "_bm",
+      bm,
+      "_bn",
+      bn,
+      "_bk",
+      bk,
+      "_wm",
+      wm,
+      "_wn",
+      wn);
+
+  metal::MTLFCList func_consts = { // Indices match the function_constant declarations in the kernel header.
+      {&segments_contiguous, MTL::DataType::DataTypeBool, 199},
+      {&align_M, MTL::DataType::DataTypeBool, 200},
+      {&align_N, MTL::DataType::DataTypeBool, 201},
+  };
+
+  // And the kernel hash that includes the function constants
+  std::string hash_name;
+  hash_name.reserve(128);
+  concatenate(
+      hash_name,
+      base_name,
+      "_segments_contiguous_",
+      segments_contiguous ? 't' : 'n',
+      "_align_M_",
+      align_M ? 't' : 'n',
+      "_align_N_",
+      align_N ? 't' : 'n');
+
+  // Get and set the kernel
+  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto kernel = get_steel_gemm_segmented_kernel(
+      d,
+      base_name,
+      hash_name,
+      func_consts,
+      out,
+      transpose_a,
+      transpose_b,
+      bm,
+      bn,
+      bk,
+      wm,
+      wn);
+  compute_encoder.set_compute_pipeline_state(kernel);
+
+  // Prepare the matmul params
+  steel::GEMMParams params{
+      /* const int M = */ M,
+      /* const int N = */ N,
+      /* const int K = */ K,
+      /* const int lda = */ static_cast<int>(lda),
+      /* const int ldb = */ static_cast<int>(ldb),
+      /* const int ldd = */ N,
+      /* const int tiles_n = */ (N + bn - 1) / bn,
+      /* const int tiles_m = */ (M + bm - 1) / bm,
+      /* const int64_t batch_stride_a = */ 0,
+      /* const int64_t batch_stride_b = */ 0,
+      /* const int64_t batch_stride_d = */ static_cast<int64_t>(M) * N, // Widen before multiplying to avoid int overflow.
+      /* const int swizzle_log = */ 0,
+      /* const int gemm_k_iterations_aligned = */ 0,
+      /* const int batch_ndim = */ 0};
+
+  // Prepare the grid
+  MTL::Size group_dims = MTL::Size(32, wn, wm);
+  MTL::Size grid_dims =
+      MTL::Size(params.tiles_n, params.tiles_m, batch_size_out); // One threadgroup per output tile per segment.
+
+  // Launch kernel
+  compute_encoder.set_input_array(a, 0);
+  compute_encoder.set_input_array(b, 1);
+  compute_encoder.set_input_array(segments, 2);
+  compute_encoder.set_output_array(out, 3);
+  compute_encoder.set_bytes(params, 4);
+  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
+}
+
+void SegmentedMM::eval_gpu(const std::vector<array>& inputs, array& out) { // GPU evaluation: allocate out and dispatch segmented_mm.
+  auto& s = stream();
+  auto& d = metal::device(s.device);
+
+  auto& a = inputs[0]; // Left operand.
+  auto& b = inputs[1]; // Right operand.
+  auto& segments = inputs[2]; // Segment bounds forwarded to the kernel.
+
+  out.set_data(allocator::malloc(out.nbytes())); // Allocate the output buffer before encoding.
+
+  // Extract shapes from inputs.
+  int M = a.shape(-2);
+  int N = b.shape(-1);
+  int K = a.shape(-1);
+
+  segmented_mm(a, b, segments, out, M, N, K, d, s);
+}
+
 } // namespace mlx::core
--- a/mlx/backend/metal/nojit_kernels.cpp
+++ b/mlx/backend/metal/nojit_kernels.cpp
@@ -18,7 +18,7 @@ MTL::ComputePipelineState* get_unary_kernel(
    const std::string& kernel_name,
    Dtype,
    Dtype,
-    const std::string) {
+    const char*) {
  return d.get_kernel(kernel_name);
 }

@@ -27,7 +27,7 @@ MTL::ComputePipelineState* get_binary_kernel(
    const std::string& kernel_name,
    Dtype,
    Dtype,
-    const std::string) {
+    const char*) {
  return d.get_kernel(kernel_name);
 }

@@ -36,7 +36,7 @@ MTL::ComputePipelineState* get_binary_two_kernel(
    const std::string& kernel_name,
    Dtype,
    Dtype,
-    const std::string) {
+    const char*) {
  return d.get_kernel(kernel_name);
 }

@@ -44,7 +44,7 @@ MTL::ComputePipelineState* get_ternary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype,
-    const std::string) {
+    const char*) {
  return d.get_kernel(kernel_name);
 }

@@ -210,6 +210,22 @@ MTL::ComputePipelineState* get_steel_gemm_gather_kernel(
  return d.get_kernel(kernel_name, hash_name, func_consts);
 }

+MTL::ComputePipelineState* get_steel_gemm_segmented_kernel(
+    metal::Device& d,
+    const std::string& kernel_name,
+    const std::string& hash_name,
+    const metal::MTLFCList& func_consts,
+    const array& /* out */,
+    bool /* transpose_a */,
+    bool /* transpose_b */,
+    int /* bm */,
+    int /* bn */,
+    int /* bk */,
+    int /* wm */,
+    int /* wn */) {
+  return d.get_kernel(kernel_name, hash_name, func_consts); // unnamed args unused
+}
+
 MTL::ComputePipelineState* get_gemv_masked_kernel(
    metal::Device& d,
    const std::string& kernel_name,
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Awni Hannun	b2273733ea	Test with CUDA 12.2 (#2375 ) * Test with CUDA 12.0 * try older image * fix cpu sort	2025-07-16 13:00:37 -07:00
Awni Hannun	f409b229a4	fix ring distributed test (#2380 )	2025-07-16 11:25:24 -07:00
Cheng	30571e2326	Rename the copy util in cpu/copy.h to copy_cpu (#2378 )	2025-07-16 07:34:24 -07:00
Awni Hannun	d7734edd9f	fix complex reduce + nan propagation in min and max (#2377 )	2025-07-15 18:19:47 -07:00
Awni Hannun	2ba69bc8fa	lower memory uniform sampling (#2361 ) * lower memory uniform * use fp32 * fix	2025-07-15 14:22:07 -07:00
Cheng	cb349a291c	[CUDA] Use cuda::std::complex in place of cuComplex (#2372 )	2025-07-15 00:36:13 -07:00
Awni Hannun	f0a0b077a0	Install linux with mlx[cuda] and mlx[cpu] (#2356 ) * install linux with mlx[cuda] and mlx[cpu] * temp for testing * cleanup circle, fix cuda repair * update circle * update circle * decouple python bindings from core libraries	2025-07-14 17:17:33 -07:00
Awni Hannun	49114f28ab	fix flaky test (#2371 )	2025-07-14 17:16:18 -07:00
Awni Hannun	e7d2ebadd2	[CUDA] Affine quantize (#2354 ) * affine quantize and dequantize kernels * format * fix * format	2025-07-14 15:45:44 -07:00
Awni Hannun	e569803d7c	update linux build (#2370 )	2025-07-14 15:13:56 -07:00
Cheng	d34f887abc	Add Primitive::name and remove Primitive::print (#2365 )	2025-07-14 14:06:35 -07:00
Angelos Katharopoulos	5201df5030	Fix imag() vjp (#2367 )	2025-07-14 13:11:16 -07:00
Cheng	2d3c26c565	[CUDA] Do not put kernels in annoymous namespace (#2362 )	2025-07-12 14:24:45 -07:00
Cheng	6325f60d52	[CUDA] Bundle CCCL for JIT compilation (#2357 ) * Ship CCCL for JIT compilation * Remove cexpf	2025-07-11 18:45:37 -07:00
Awni Hannun	42cc9cfbc7	fix copy dispatch (#2360 )	2025-07-11 10:59:35 -07:00
Cheng	8347575ba1	[CUDA] Implement Scan kernel (#2347 ) * Contiguous scan * Strided scan * Enable tests * Fix failing logaddexp test * Use cexpf in Metal	2025-07-10 16:54:12 -07:00
Angelos Katharopoulos	b6eec20260	Fix edge check in qmm_n QuantizedLoader (#2355 )	2025-07-10 16:28:50 -07:00
Angelos Katharopoulos	0eb035b4b1	Fix type promotion in Adam with bias correction (#2350 )	2025-07-10 11:14:42 -07:00
Cheng	afb9817599	[CUDA] Put version in ptx cache dir path (#2352 )	2025-07-10 07:24:21 -07:00
Cheng	8fb3e7a26c	[CUDA] Set current device before cudaGraphLaunch (#2351 )	2025-07-10 07:24:02 -07:00
jhavukainen	8c7bc30ce4	Align mlx::core::min op nan propagation with NumPy (#2346 )	2025-07-10 06:20:43 -07:00
Cheng	85873cb162	[CUDA] Do vectorized store/load in contiguous elementwise ops (#2342 ) * Do vectorized store/load in unary ops * Do vectorized store/load in binary_two ops * Do vectorized store/load in copy ops * Do vectorized store/load in ternary ops * Use int32_t for IdxT * binary => binary_two in binary_two.cu * Fix tests on large arrays * Use uint as index type * Contig uses uint as index and non-contig uses int	2025-07-09 18:48:43 -07:00
Awni Hannun	e14ee12491	add zero for argsort vjp (#2345 )	2025-07-09 14:37:14 -07:00
jhavukainen	8b9a3f3cea	Align mlx::core::max op nan propagation with NumPy (#2339 ) * Make max op NaN propagation rules align with numpy * Adding benchmarks and testing for max op nanpropagation * Pre-commit formatting * Fix max complex64 nan propagation and add test * Improve the cpp unittest * Only check nans on non-integral types in simd_reduce_impl. * Cleanup using namespace alias * Add cpu Max nanpropagation. Fix a small fib in cpu max dispatch data types for int8/int16. * Make the max nanpropagation test more meaningful for integer types * Remove tuple unpacking syntax to comply with earlier python versions. Add cuda skip to nanpropagation tests, fix cuda implementation in a separate PR.	2025-07-09 11:26:27 -07:00
Awni Hannun	fb4e8b896b	patch bump (#2343 )	2025-07-08 14:26:07 -07:00
Cheng	2ca533b279	Fix compilation with CUDA 11 (#2331 )	2025-07-07 20:00:43 -07:00
Angelos Katharopoulos	4a9b29a875	MoE backward improvements (#2335 )	2025-07-07 17:59:53 -07:00
Awni Hannun	a4fcc893cd	auto build linux release (#2341 )	2025-07-07 09:29:23 -07:00
Cheng	9d10239af7	[CUDA] Do vectorized store/load in binary ops (#2330 )	2025-07-07 08:44:14 -07:00
Cheng	19facd4b20	Build with all cpu cores by default (#2336 )	2025-07-07 06:06:45 -07:00
Angelos Katharopoulos	f5299f72cd	Fix layernorm race condition (#2340 )	2025-07-07 06:06:01 -07:00
Cheng	0e0d9ac522	[CUDA] Add MLX_CUDA_GRAPH_CACHE_SIZE env for setting graph cache size (#2329 )	2025-07-05 08:33:29 -07:00
Awni Hannun	8917022deb	fix graphs for older cuda (#2328 )	2025-07-02 19:37:58 -07:00
Awni Hannun	ec0d5db67b	[CUDA] Switch to CUDA graphs (#2317 ) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment	2025-07-02 15:59:13 -07:00
Cheng	e76e9b87f0	Fix compilation error from integral_constant (#2326 )	2025-07-02 06:04:38 -07:00
Awni Hannun	cfb6a244ea	allow parameters to be deleted (#2325 )	2025-07-01 21:27:23 -07:00