fix

remove use of cuda pool, use cuda free async
2025-12-16 01:49:05 +08:00 · 2025-11-03 16:43:19 -08:00 · 2025-11-03 15:07:01 -08:00 · 2025-11-03 09:14:17 -08:00 · 2025-11-01 13:18:57 -07:00 · 2025-10-31 14:12:15 -07:00
178 changed files with 2498 additions and 11233 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -0,0 +1,579 @@
+version: 2.1
+
+orbs:
+  apple: ml-explore/pr-approval@0.1.0
+
+parameters:
+  nightly_build:
+    type: boolean
+    default: false
+  test_release:
+    type: boolean
+    default: false
+
+jobs:
+  build_documentation:
+    parameters:
+      upload-docs:
+        type: boolean
+        default: false
+    macos:
+      xcode: "26.0.0"
+    resource_class: m4pro.medium
+    steps:
+      - checkout
+      - run:
+          name: Install
+          command: |
+            xcodebuild -downloadComponent MetalToolchain
+            brew install python@3.10
+            brew install doxygen
+            python3.10 -m venv env
+            source env/bin/activate
+            pip install --upgrade pip
+            pip install --upgrade cmake
+            pip install -r docs/requirements.txt
+            pip install . -v
+      - when:
+          condition:
+            not: << parameters.upload-docs >>
+          steps:
+            - run:
+               name: Build documentation
+               command: |
+                 source env/bin/activate
+                 cd docs && doxygen && make html O=-W
+      - when:
+          condition: << parameters.upload-docs >>
+          steps:
+            - add_ssh_keys:
+                fingerprints:
+                  - "SHA256:OhcVVMovbT0pkgMeiVRyxMnjV9R2t+hKBsNcuxq9h+0"
+            - run:
+               name: Upload documentation
+               command: |
+                 source env/bin/activate
+                 git config user.email "mlx@group.apple.com"
+                 git config user.name "CircleCI Docs"
+                 git checkout gh-pages
+                 git rebase main
+                 cd docs
+                 git rm -rf build/html
+                 doxygen && make html O=-W
+                 git add -f build/html
+                 git commit -m "rebase"
+                 git push -f origin gh-pages
+
+  linux_build_and_test:
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
+    steps:
+      - checkout
+      - run:
+          name: Run style checks
+          command: |
+            pip install pre-commit
+            pre-commit run --all
+            if ! git diff --quiet; then echo 'Style checks failed, please install pre-commit and run pre-commit run --all and push the change'; exit 1; fi
+      # Installs the BLAS/LAPACK and OpenMPI system packages, then the uv installer.
+      # Every apt-get install passes -y so apt never stops for a confirmation prompt.
+      - run:
+          name: Install dependencies
+          command: |
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            sudo apt-get update
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev
+            curl -LsSf https://astral.sh/uv/install.sh | sh
+      - run:
+          name: Install Python package
+          command: |
+            uv venv
+            uv pip install cmake
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+              uv pip install -e ".[dev]" -v
+      - run:
+          name: Generate package stubs
+          command: |
+            uv pip install typing_extensions
+            uv run --no-project setup.py generate_stubs
+      # Runs the unittest suite, the 8-process MPI test and the 8-process ring test.
+      # stderr of the ring test is appended to stderr.log (and still echoed to stderr).
+      - run:
+          name: Run Python tests
+          command: |
+            source .venv/bin/activate
+            python -m unittest discover python/tests -v
+            mpirun --bind-to none -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+            # Fail the step if the ring test logged any "[WARN]" line. grep itself is
+            # the condition (-q: status only, -F: literal match); wrapping it in $(...)
+            # would run grep's output as a command and never take this branch.
+            if grep -Fq '[WARN]' stderr.log; then echo "Distributed ring test failed"; exit 1; fi
+      - run:
+          name: Build CPP only
+          command: |
+            source .venv/bin/activate
+            mkdir -p build && cd build
+            cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
+            make -j `nproc`
+      - run:
+          name: Run CPP tests
+          command: ./build/tests/tests
+
+  mac_build_and_test:
+    parameters:
+      xcode_version:
+        type: string
+        default: "26.0.0"
+      macosx_deployment_target:
+        type: string
+        default: ""
+    macos:
+      xcode: << parameters.xcode_version >>
+    environment:
+      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
+    resource_class: m4pro.medium
+    steps:
+      - checkout
+      - run:
+          name: Install dependencies
+          command: |
+            xcodebuild -downloadComponent MetalToolchain
+            HOMEBREW_NO_AUTO_UPDATE=1 HOMEBREW_NO_INSTALL_CLEANUP=1 \
+              brew install openmpi uv
+      - run:
+          name: Install Python package
+          command: |
+            uv venv --python 3.10
+            uv pip install \
+              nanobind==2.4.0 \
+              cmake \
+              numpy \
+              torch \
+              tensorflow \
+              unittest-xml-reporting
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+              uv pip install -e . -v
+      - run:
+          name: Generate package stubs
+          command: |
+            uv pip install typing_extensions
+            uv run --no-project setup.py generate_stubs
+      # Runs the Python tests on CPU and GPU via xmlrunner (results under test-results/),
+      # then the 8-process MPI test and the 8-process ring test.
+      - run:
+          name: Run Python tests
+          command: |
+            source .venv/bin/activate
+            LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
+            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
+            mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+            # Fail the step if the ring test logged any "[WARN]" line. grep itself is
+            # the condition (-q: status only, -F: literal match); wrapping it in $(...)
+            # would run grep's output as a command and never take this branch.
+            if grep -Fq '[WARN]' stderr.log; then echo "Distributed ring test failed"; exit 1; fi
+      - run:
+          name: Build example extension
+          command: |
+            source .venv/bin/activate
+            cd examples/extensions
+            uv pip install -r requirements.txt
+            uv run --no-project setup.py build_ext --inplace
+            uv run --no-project python test.py
+      - store_test_results:
+          path: test-results
+      - run:
+          name: Build CPP only
+          command: |
+            source .venv/bin/activate
+            mkdir -p build && cd build && cmake .. && make -j `sysctl -n hw.ncpu`
+      - run:
+          name: Run CPP tests
+          command: |
+            DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./build/tests/tests
+      - run:
+          name: Build small binary
+          command: |
+            source .venv/bin/activate
+            cd build/
+            cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
+              -DBUILD_SHARED_LIBS=ON \
+              -DMLX_BUILD_CPU=OFF \
+              -DMLX_BUILD_SAFETENSORS=OFF \
+              -DMLX_BUILD_GGUF=OFF \
+              -DMLX_METAL_JIT=ON
+            make -j `sysctl -n hw.ncpu`
+      - run:
+          name: Run Python tests with JIT
+          command: |
+            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
+              uv pip install -e . -v
+            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
+              METAL_DEBUG_ERROR_MODE=0 \
+              uv run --no-project python -m xmlrunner discover \
+                -v python/tests \
+                -o test-results/gpu_jit
+
+  cuda_build_and_test:
+    parameters:
+      image_date:
+        type: string
+        default: "2023.11.1"
+    machine:
+      image: "linux-cuda-12:<< parameters.image_date >>"
+      resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - restore_cache:
+          keys:
+            - cuda-<< parameters.image_date >>-{{ arch }}-
+      # Installs cuDNN, BLAS/LAPACK and NCCL packages, a prebuilt ccache 4.11.3
+      # binary (moved to /usr/bin/ccache), and the uv installer.
+      # Every apt-get install passes -y so apt never stops for a confirmation prompt.
+      - run:
+          name: Install dependencies
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y libcudnn9-dev-cuda-12
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y libnccl2 libnccl-dev
+            curl -sL https://github.com/ccache/ccache/releases/download/v4.11.3/ccache-4.11.3-linux-x86_64.tar.xz | tar xJf -
+            sudo mv ccache-4.11.3-linux-x86_64/ccache /usr/bin/ccache
+            rm -rf ccache-4.11.3-linux-x86_64
+            curl -LsSf https://astral.sh/uv/install.sh | sh
+      - run:
+          name: Set CCache size
+          command: ccache --max-size 1G
+      - run:
+          name: Install Python package
+          command: |
+            uv venv
+            uv pip install cmake
+            DEBUG=1 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              uv pip install -e ".[dev]" -v
+      - run:
+          name: Run Python tests
+          command: |
+            source .venv/bin/activate
+            LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
+            LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
+      - run:
+          name: Build CPP only
+          command: |
+            source .venv/bin/activate
+            cmake . -B build \
+              -DMLX_BUILD_CUDA=ON \
+              -DCMAKE_CUDA_COMPILER=`which nvcc` \
+              -DCMAKE_BUILD_TYPE=DEBUG
+            cmake --build build -j `nproc`
+      - run:
+          name: Run CPP tests
+          command: ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
+      - run:
+          name: CCache report
+          command: |
+            ccache --show-stats
+            ccache --zero-stats
+            ccache --cleanup
+      - save_cache:
+          key: cuda-<< parameters.image_date >>-{{ arch }}-{{ epoch }}
+          paths:
+            - /home/circleci/.cache/ccache
+
+  build_release:
+    parameters:
+      python_version:
+        type: string
+        default: "3.10"
+      xcode_version:
+        type: string
+        default: "26.0.0"
+      build_env:
+        type: string
+        default: ""
+      macosx_deployment_target:
+        type: string
+        default: ""
+    macos:
+      xcode: << parameters.xcode_version >>
+    resource_class: m4pro.medium
+    environment:
+      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
+    steps:
+      - checkout
+      - run:
+          name: Install dependencies
+          command: |
+            xcodebuild -downloadComponent MetalToolchain
+            mkdir -p ~/miniconda3
+            curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o ~/miniconda3/miniconda.sh
+            bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
+            rm ~/miniconda3/miniconda.sh
+            source ~/miniconda3/bin/activate
+            conda init --all
+            conda create -n env python=<< parameters.python_version >> -y
+            conda activate env
+            pip install --upgrade cmake
+            pip install nanobind==2.4.0
+            pip install --upgrade setuptools
+            pip install numpy
+            pip install twine
+            pip install build
+      - run:
+          name: Install Python package
+          command: |
+            conda activate env
+            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
+              pip install . -v
+      - run:
+          name: Generate package stubs
+          command: |
+            conda activate env
+            pip install typing_extensions
+            python setup.py generate_stubs
+      - run:
+          name: Build Python package
+          command: |
+            conda activate env
+            python setup.py clean --all
+            << parameters.build_env >> MLX_BUILD_STAGE=1 python -m build -w
+      - when:
+          condition:
+            equal: ["3.10", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  conda activate env
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 python -m build -w
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload package
+                command: |
+                  conda activate env
+                  twine upload dist/*
+      - store_artifacts:
+          path: dist/
+
+  build_linux_release:
+    parameters:
+      python_version:
+        type: string
+        default: "3.10"
+      build_env:
+        type: string
+        default: ""
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
+    steps:
+      - checkout
+      - run:
+          name: Build wheel
+          command: |
+            PYTHON=python<< parameters.python_version >>
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            sudo apt-get update
+            TZ=Etc/UTC sudo apt-get -y install tzdata
+            sudo add-apt-repository -y ppa:deadsnakes/ppa
+            sudo apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            $PYTHON -m venv env
+            source env/bin/activate
+            pip install --upgrade pip
+            pip install --upgrade cmake
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            << parameters.build_env >> pip install ".[dev]" -v
+            pip install typing_extensions
+            python setup.py generate_stubs
+            python setup.py clean --all
+            MLX_BUILD_STAGE=1 << parameters.build_env >> python -m build -w
+            bash python/scripts/repair_linux.sh
+      - when:
+          condition:
+            equal: ["3.10", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  source env/bin/activate
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 \
+                    python -m build -w
+                  auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_x86_64
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload packages
+                command: |
+                  source env/bin/activate
+                  twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+
+  build_cuda_release:
+    parameters:
+      build_env:
+        type: string
+        default: ""
+    machine:
+      image: ubuntu-2204:current
+      resource_class: xlarge
+    steps:
+      - checkout
+      # Installs the CUDA 12.9 toolkit and build tooling, builds the stage-2 wheel
+      # with CUDA enabled, then runs python/scripts/repair_cuda.sh on the result.
+      # Every apt-get install passes -y so apt never stops for a confirmation prompt.
+      - run:
+          name: Build wheel
+          command: |
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            # NOTE(review): this keyring comes from the ubuntu2404 repo while the job's
+            # machine image is ubuntu-2204 -- confirm the pairing is intended.
+            wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
+            sudo dpkg -i cuda-keyring_1.1-1_all.deb
+            sudo apt-get update
+            sudo apt-get install -y cuda-toolkit-12-9 libcudnn9-dev-cuda-12
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install -y zip
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            # Put the CUDA toolchain first on PATH so `which nvcc` below resolves to it.
+            export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
+            export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+            << parameters.build_env >> MLX_BUILD_STAGE=2 \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              python -m build -w
+            bash python/scripts/repair_cuda.sh
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload package
+                command: |
+                  twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+
+workflows:
+  # Default CI workflow: runs the macOS, Linux and CUDA test jobs plus the
+  # build_documentation job. Active only for branch names made of word
+  # characters and hyphens that do not start with "pull/", and only when
+  # neither the nightly_build nor the test_release pipeline parameter is set.
+  build_and_test:
+    when:
+      and:
+        - matches:
+            pattern: "^(?!pull/)[-\\w]+$"
+            value: << pipeline.git.branch >>
+        - not: << pipeline.parameters.nightly_build >>
+        - not: << pipeline.parameters.test_release >>
+    jobs:
+      - mac_build_and_test:
+          matrix:
+            parameters:
+              macosx_deployment_target: ["13.5", "15.0"]
+      - linux_build_and_test
+      - cuda_build_and_test:
+          matrix:
+            parameters:
+              image_date: ["2023.11.1", "2025.05.1"]
+      - build_documentation
+
+  build_pypi_release:
+    when:
+      and:
+        - not: << pipeline.parameters.nightly_build >>
+        - not: << pipeline.parameters.test_release >>
+    jobs:
+      - build_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
+              build_env: ["PYPI_RELEASE=1"]
+              xcode_version: ["26.0.0"]
+      - build_documentation:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          upload-docs: true
+      - build_linux_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+              build_env: ["PYPI_RELEASE=1"]
+      - build_cuda_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              build_env: ["PYPI_RELEASE=1"]
+
+  prb:
+    when:
+      matches:
+        pattern: "^pull/\\d+(/head)?$"
+        value: << pipeline.git.branch >>
+    jobs:
+      - hold:
+          type: approval
+      - apple/authenticate:
+          context: pr-approval
+      - mac_build_and_test:
+          requires: [ hold ]
+          matrix:
+            parameters:
+              macosx_deployment_target: ["13.5", "15.0"]
+      - linux_build_and_test:
+          requires: [ hold ]
+      - cuda_build_and_test:
+          requires: [ hold ]
+          matrix:
+            parameters:
+              image_date: ["2023.11.1", "2025.05.1"]
+  nightly_build:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.nightly_build >>
+    jobs:
+      - build_release:
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
+              xcode_version: ["26.0.0"]
+      - build_linux_release:
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+      - build_cuda_release
+
+  build_dev_release:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.test_release >>
+    jobs:
+      - build_release:
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
+              build_env: ["DEV_RELEASE=1"]
+              xcode_version: ["26.0.0"]
+      - build_linux_release:
+          matrix:
+            parameters:
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+              build_env: ["DEV_RELEASE=1"]
+      - build_cuda_release:
+          matrix:
+            parameters:
+              build_env: ["DEV_RELEASE=1"]
--- a/.github/actions/build-cuda-release/action.yml
+++ b/.github/actions/build-cuda-release/action.yml
@@ -2,8 +2,8 @@ name: 'Build CUDA wheel'
 description: 'Build CUDA wheel'

 inputs:
-  toolkit:
-    description: 'The CUDA toolkit'
+  nvcc-location:
+    description: 'Location of nvcc compiler'
    required: true

 runs:
@@ -12,9 +12,13 @@ runs:
    - name: Build package
      shell: bash
      env:
-        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc
+        MLX_BUILD_STAGE: 2
+        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
      run: |
        pip install auditwheel build patchelf setuptools
        python setup.py clean --all
-        MLX_BUILD_STAGE=2 python -m build -w
-        bash python/scripts/repair_cuda.sh
+        python -m build -w
+
+        if [ -f "python/scripts/repair_cuda.sh" ]; then
+          bash python/scripts/repair_cuda.sh
+        fi
--- a/.github/actions/build-cuda/action.yml
+++ b/.github/actions/build-cuda/action.yml
@@ -2,9 +2,19 @@ name: 'Build and Test with CUDA'
 description: 'Build and test MLX with CUDA'

 inputs:
-  toolkit:
-    description: 'The CUDA toolkit'
+  build-type:
+    description: 'Build type (debug, release)'
+    required: false
+    default: 'debug'
+  run-tests:
+    description: 'Whether to run tests'
+    required: false
+    default: 'true'
+  nvcc-location:
+    description: 'Location of nvcc compiler'
    required: true
+    default: '/usr/local/cuda-12.9/bin/nvcc'
+    # this value is dependent on the CUDA tools installed in the setup-linux workflow

 runs:
  using: "composite"
@@ -13,14 +23,46 @@ runs:
      shell: bash
      env:
        DEBUG: 1
-        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc
-      run: pip install --no-build-isolation -e ".[dev]" -v
+        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
+      run: pip install -e ".[dev]" -v
+
+    - name: Check if build actually worked
+      shell: bash
+      run: python -c "import mlx.core"
+
+    - name: Run Python tests - CPU
+      if: inputs.run-tests == 'true'
+      shell: bash
+      env:
+        LOW_MEMORY: 1
+        DEVICE: cpu
+      run: python -m unittest discover python/tests -v
+
+    - name: Run Python tests - GPU
+      if: inputs.run-tests == 'true'
+      shell: bash
+      env:
+        LOW_MEMORY: 1
+        DEVICE: gpu
+      run: python -m tests discover python/tests -v

    - name: Build CPP only
+      if: inputs.build-type == 'debug'
      shell: bash
      run: |
        cmake . -B build \
          -DMLX_BUILD_CUDA=ON \
-          -DCMAKE_CUDA_COMPILER=/usr/local/${{ inputs.toolkit }}/bin/nvcc \
+          -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }} \
          -DCMAKE_BUILD_TYPE=DEBUG
        cmake --build build -j $(nproc)
+    
+    - name: Run CPP tests
+      if: ${{ inputs.build-type == 'debug' && inputs.run-tests == 'true' }}
+      shell: bash
+      run: ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
+
+    - name: Build Python package
+      if: inputs.build-type == 'release'
+      uses: ./.github/actions/build-cuda-release
+      with:
+        nvcc-location: ${{ inputs.nvcc-location }}
--- a/.github/actions/build-docs/action.yml
+++ b/.github/actions/build-docs/action.yml
@@ -1,19 +1,19 @@
 name: 'Build Documentation'
-description: 'Build documentation'
+description: 'Build documentation on a mac'

 runs:
  using: "composite"
  steps:
    - name: Setup machine
-      uses: ./.github/actions/setup-linux
+      uses: ./.github/actions/setup-macos

    - name: Install dependencies
-      shell: bash
+      shell: sh
      run: |
-        sudo apt-get install -y doxygen
-        source .venv/bin/activate
-        pip install -r docs/requirements.txt
-        pip install . -v
+        brew install doxygen
+        uv pip install --upgrade pip cmake
+        uv pip install -r docs/requirements.txt
+        uv pip install . -v
  
    - name: Build documentation
      shell: bash
@@ -24,8 +24,8 @@ runs:
        make html O=-W
    
    - name: Create artifact tar
-      shell: bash
-      run: tar -cf artifact.tar -C docs --dereference build/html index.html
+      shell: sh
+      run: tar -cf artifact.tar --cd docs/build/html -L .

    # Do it manually because upload-pages-artifact requires gtar
    - name: Upload artifact
@@ -35,4 +35,4 @@ runs:
        name: github-pages
        path: artifact.tar
        retention-days: 1
-        if-no-files-found: error
+        if-no-files-found: error
--- a/.github/actions/build-linux-release/action.yml
+++ b/.github/actions/build-linux-release/action.yml
@@ -1,40 +0,0 @@
-name: 'Build Linux wheel'
-description: 'Build Linux wheel'
-
-inputs:
-  build-backend:
-    description: 'Build the backend mlx-cpu package'
-    type: boolean
-    required: false
-    default: false
-  arch:
-    description: 'Platform architecture tag'
-    required: true
-    type: choice
-    options:
-      - x86_64
-      - aarch64
-
-runs:
-  using: "composite"
-  steps:
-    - name: Generate package stubs
-      shell: bash
-      run: |
-        pip install -e ".[dev]" -v
-        pip install typing_extensions
-        python setup.py generate_stubs
-    - name: Build Python package
-      shell: bash
-      run: |
-        pip install auditwheel patchelf build
-        python setup.py clean --all
-        MLX_BUILD_STAGE=1 python -m build -w
-        bash python/scripts/repair_linux.sh ${{ inputs.arch }}
-    - name: Build backend package
-      if: ${{ inputs.build-backend }}
-      shell: bash
-      run: |
-        python setup.py clean --all
-        MLX_BUILD_STAGE=2 python -m build -w
-        auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_${{ inputs.arch }}
--- a/.github/actions/build-linux/action.yml
+++ b/.github/actions/build-linux/action.yml
@@ -1,25 +1,78 @@
 name: 'Build and Test on Linux'
 description: 'Build and test MLX on Linux'

+inputs:
+  build-type:
+    description: 'Build type'
+    required: false
+    default: 'debug'
+    type: choice
+    options:
+      - debug
+      - release
+  run-tests:
+    description: 'Whether to run tests'
+    required: false
+    default: 'true'
+    type: boolean
+
 runs:
  using: "composite"
  steps:
+    - name: Set DEBUG
+      shell: sh
+      if: inputs.build-type == 'debug'
+      run: echo "DEBUG=1" >> $GITHUB_ENV
+
    - name: Install Python package
      shell: sh
      env:
        CMAKE_ARGS: "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
-        DEBUG: 1
-      run: pip install --no-build-isolation -e ".[dev]" -v
-
+      run: pip install -e ".[dev]" -v
+    
    - name: Generate package stubs
      shell: sh
      run: |
        pip install typing_extensions
        python setup.py generate_stubs
    
+    - name: Run Python tests
+      if: inputs.run-tests == 'true'
+      shell: bash
+      run: |
+        python -m unittest discover python/tests -v
+        mpirun --bind-to none --allow-run-as-root -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
+        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+        if grep -Fq '[WARN]' stderr.log ; then
+          grep -F '[WARN]' stderr.log
+          echo "Distributed ring test failed";
+          exit 1;
+        fi
+    
    - name: Build CPP only
+      if: inputs.build-type == 'debug'
      shell: bash
      run: |
        mkdir -p build && cd build
        cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
        make -j $(nproc)
+    
+    - name: Run CPP tests
+      if: ${{ inputs.build-type == 'debug' && inputs.run-tests == 'true' }}
+      shell: sh
+      run: ./build/tests/tests
+    
+    - name: Build Python package
+      if: inputs.build-type == 'release'
+      shell: bash
+      run: |
+        pip install auditwheel patchelf build
+        python setup.py clean --all
+        MLX_BUILD_STAGE=1 python -m build -w
+        if [ -f "python/scripts/repair_linux.sh" ]; then
+          bash python/scripts/repair_linux.sh
+        fi
+
+        python setup.py clean --all
+        MLX_BUILD_STAGE=2 python -m build -w
+        auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_x86_64
--- a/.github/actions/build-macos-release/action.yml
+++ b/.github/actions/build-macos-release/action.yml
@@ -6,25 +6,17 @@ inputs:
    description: 'macOS build target'
    required: false
    default: '15.0'
-  build-backend:
-    description: 'Build the backend mlx-metal package'
-    type: boolean
-    required: false
-    default: false

 runs:
  using: "composite"
  steps:
-    - name: Build Python package
-      shell: bash -l {0}
+    - name: Build Python package(s)
+      shell: bash
+      env:
+        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
      run: |
-        pip install build
-        python setup.py clean --all
-        MLX_BUILD_STAGE=1 python -m build -w
-
-    - name: Build backend package
-      if: ${{ inputs.build-backend }}
-      shell: bash -l {0}
-      run: |
-        python setup.py clean --all
-        MLX_BUILD_STAGE=2 python -m build -w
+        uv pip install build
+        uv run --no-project setup.py clean --all
+        MLX_BUILD_STAGE=1 uv run -m build -w
+        uv run --no-project setup.py clean --all
+        MLX_BUILD_STAGE=2 uv run -m build -w
--- a/.github/actions/build-macos/action.yml
+++ b/.github/actions/build-macos/action.yml
@@ -1,51 +1,68 @@
 name: 'Build and Test on macOS'
 description: 'Build and test MLX on macOS'

+inputs:
+  build-type:
+    description: 'Build type (debug, release)'
+    required: false
+    default: 'debug'
+    type: choice
+    options:
+      - debug
+      - release
+  run-tests:
+    description: 'Whether to run tests'
+    required: false
+    default: 'true'
+  build-jit:
+    description: 'Whether to build with JIT'
+    required: false
+    default: 'true'
+
 runs:
  using: "composite"
  steps:
    - name: Install dependencies
+      shell: sh
      env:
        DEBUG: 1
-        CMAKE_ARGS: "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
-      shell: bash -l {0}
+        DEV_RELEASE: 1
      run: |
-        pip install --upgrade pip
-        pip install cmake setuptools nanobind==2.4.0
-        pip install -e . -v
+        uv pip install --upgrade pip cmake setuptools
+        uv pip install nanobind==2.4.0 \
+          numpy torch tensorflow unittest-xml-reporting
+        uv pip install -e . -v

    - name: Generate package stubs
-      shell: bash -l {0}
+      shell: bash
      run: |
-        pip install typing_extensions
-        python setup.py generate_stubs
-
-    - name: Install tests dependencies
-      shell: bash -l {0}
-      run: |
-        pip install numpy torch tensorflow unittest-xml-reporting
+        uv pip install typing_extensions
+        uv run --no-project setup.py generate_stubs

    - name: Run Python tests
-      shell: bash -l {0}
+      if: inputs.run-tests == 'true'
+      shell: bash
      env:
        LOW_MEMORY: 1
      run: |
-        DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
-        DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
+        DEVICE=cpu uv run -m xmlrunner discover -v python/tests -o test-results/cpu
+        DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 uv run -m xmlrunner discover -v python/tests -o test-results/gpu
        mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
-        if $(grep "\[WARN\]" stderr.log); then echo "Distributed ring test failed"; exit 1; fi
+        # Fail the step if the ring test logged any "[WARN]" line. grep itself is
+        # the condition (-q: status only, -F: literal match); wrapping it in $(...)
+        # would run grep's output as a command and never take this branch.
+        if grep -Fq '[WARN]' stderr.log; then echo "Distributed ring test failed"; exit 1; fi
    
    - name: Build example extension
-      shell: bash -l {0}
+      if: inputs.run-tests == 'true'
+      shell: bash
      run: |
        cd examples/extensions
-        pip install -r requirements.txt
-        python setup.py build_ext --inplace
-        python test.py
+        uv pip install -r requirements.txt
+        uv run --no-project setup.py build_ext --inplace
+        uv run --no-project test.py
    
    - name: Build CPP only
-      shell: bash -l {0}
+      if: inputs.build-type == 'debug'
+      shell: bash
      run: |
        mkdir -p build
        cd build
@@ -53,7 +70,8 @@ runs:
        make -j $(sysctl -n hw.ncpu)
    
    - name: Run CPP tests
-      shell: bash -l {0}
+      if: ${{ inputs.build-type == 'debug' && inputs.run-tests == 'true' }}
+      shell: bash
      env:
        DEVICE: gpu
        METAL_DEVICE_WRAPPER_TYPE: 1
@@ -61,7 +79,8 @@ runs:
      run: ./build/tests/tests
    
    - name: Build small binary with JIT
-      shell: bash -l {0}
+      if: inputs.build-jit == 'true'
+      shell: bash
      run: |
        mkdir -p build
        cd build
@@ -74,7 +93,8 @@ runs:
        make -j $(sysctl -n hw.ncpu)
    
    - name: Run Python tests with JIT
-      shell: bash -l {0}
+      if: ${{ inputs.build-jit == 'true' && inputs.run-tests == 'true' }}
+      shell: bash
      env:
        LOW_MEMORY: 1
        DEVICE: gpu
@@ -82,7 +102,23 @@ runs:
        METAL_DEBUG_ERROR_MODE: 0
      run: |
        CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
-          pip install -e . -v
-        python -m xmlrunner discover \
+          uv pip install -e . -v
+        uv run -m xmlrunner discover \
            -v python/tests \
            -o test-results/gpu_jit
+
+    - name: Build macOS 13 package
+      if: inputs.build-type == 'release'
+      uses: ./.github/actions/build-macos-release
+      with:
+        macos-target: '13.0'  # quoted: a bare 13.0 is a YAML float, not the string "13.0"
+    - name: Build macOS 14 package
+      if: inputs.build-type == 'release'
+      uses: ./.github/actions/build-macos-release
+      with:
+        macos-target: '14.0'
+    - name: Build macOS 15 package
+      if: inputs.build-type == 'release'
+      uses: ./.github/actions/build-macos-release
+      with:
+        macos-target: '15.0'
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@@ -2,10 +2,14 @@ name: 'Setup Linux Environment'
 description: 'Install dependencies for Linux builds'

 inputs:
-  toolkit:
-    description: 'Which toolkit to install'
+  runner-type:
+    description: 'Whether to set this up as a linux or CUDA runner'
    required: false
-    default: 'cpu'
+    default: 'linux'
+    type: choice
+    options:
+      - linux
+      - cuda
  python-version:
    description: 'Version of python to set up'
    required: false
@@ -14,62 +18,56 @@ inputs:
 runs:
  using: "composite"
  steps:
-    - name: Use ccache
-      uses: hendrikmuhs/ccache-action@v1.2
-      with:
-        key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}-py${{ inputs.python-version }}
-        max-size: 1GB
+    - name: Free disk space
+      shell: sh
+      if: inputs.runner-type == 'linux'
+      run: sudo rm -rf "$AGENT_TOOLSDIRECTORY"

    - name: Install common dependencies
+      env:
+        TZ: Etc/UTC
      shell: bash
      run: |
        sudo apt-get update
-        sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev zip
+        sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev tzdata zip
+        sudo apt autoremove -y

    - uses: actions/setup-python@v6
      with:
        python-version: ${{ inputs.python-version }}
+        cache: 'pip'

-    - name: Setup Python venv
+    - name: setup python venv
      shell: bash
      run: |
        python -m venv .venv
        source .venv/bin/activate
-        pip install setuptools cmake nanobind==2.4.0
        echo PATH=$PATH >> $GITHUB_ENV
-        # Make cmake search .venv for nanobind
-        echo PYTHONPATH=`python -c 'import sys; print(sys.path[-1])'` >> $GITHUB_ENV
+        pip install --upgrade pip cmake

    - name: Install MPI
+      if: inputs.runner-type == 'linux'
      shell: bash
      run: sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev

-    - name: Install CUDA toolkit
-      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
-      shell: bash
+    - name: Network CUDA installation from packages
+      id: install-cuda
+      if: inputs.runner-type == 'cuda'
      env:
-        # Note: the CI machine does not meet CUDA 13's driver requirement.
-        # Compatibility matrix:
-        # https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
-        # The `nvcc` is installed into `/usr/local/cuda-VERSION/bin/nvcc` - but
-        # it's *not* on the default toolkit path.
-        PACKAGES: |
-          {
-            "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-toolkit-12-6",
-            "cuda-12.9": "libcudnn9-dev-cuda-12 cuda-toolkit-12-9",
-            "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-toolkit-13-0"
-          }
+        TZ: Etc/UTC
+      shell: bash ## Specific to Ubuntu 22.04 & Architecture x86_64
      run: |
-        export ARCH=${{ runner.arch == 'arm64' && 'arm64' || 'x86_64' }}
-        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb
+        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
        sudo dpkg -i cuda-keyring_1.1-1_all.deb
        sudo apt-get update
-        sudo apt-get install -y \
-            libnccl2 libnccl-dev \
-            ${{ fromJson(env.PACKAGES)[inputs.toolkit] }}
+        sudo apt-get install -y libcudnn9-dev-cuda-12 libnccl2 libnccl-dev cuda-toolkit-12-9
+      # Note: This installs CUDA 12.9, which is the latest supported by cuDNN 9.x and works with the NVIDIA 570 drivers.
+      # cuda-toolkit by itself installs version 13 or later and requires updated drivers (580+), which require a reboot to function properly.
+      # Compatibility matrix: https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
+      # This also drops `nvcc` into `/usr/local/cuda-12.9/bin/nvcc` - but it's *not* on the default PATH

-    - name: CUDA packages and driver report
-      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
+    - name: Package and Driver Report
+      if: inputs.runner-type == 'cuda'
      shell: bash
      run: |
        sudo apt-get install -y ubuntu-drivers-common dkms
--- a/.github/actions/setup-macos/action.yml
+++ b/.github/actions/setup-macos/action.yml
@@ -2,6 +2,11 @@ name: 'Setup macOS Environment'
 description: 'Install dependencies for macOS builds'

 inputs:
+  install-mpi:
+    description: 'Whether to install MPI'
+    required: false
+    default: 'true'
+    type: boolean
  python-version:
    description: 'Python version to use'
    required: false
@@ -12,13 +17,15 @@ runs:
  steps:
    - name: Install Homebrew packages
      shell: sh
+      if: inputs.install-mpi == 'true'
      run: /opt/homebrew/bin/brew install openmpi
    
    - name: Verify MetalToolchain installed
      shell: bash
      run: xcodebuild -showComponent MetalToolchain
-
-    - uses: conda-incubator/setup-miniconda@v3
+    
+    - name: Setup uv
+      uses: astral-sh/setup-uv@v6
      with:
-        miniconda-version: "latest"
-        python-version: ${{ inputs.python-version }}
+          python-version: ${{ inputs.python-version }}
+          activate-environment: true
--- a/.github/actions/test-linux/action.yml
+++ b/.github/actions/test-linux/action.yml
@@ -1,69 +0,0 @@
-name: 'Run Linux tests'
-
-inputs:
-  cpu-only:
-    description: 'Skip GPU tests'
-    required: false
-    default: false
-
-runs:
-  using: "composite"
-  steps:
-    - name: Run MPI tests
-      shell: bash
-      run: |
-        echo "::group::MPI tests"
-        mpirun --bind-to none --allow-run-as-root -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
-        echo "::endgroup::"
-
-    - name: Run distributed tests
-      if: ${{ inputs.cpu-only == 'true' }}
-      shell: bash
-      run: |
-        echo "::group::Distributed tests"
-        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
-        if grep -Fq '[WARN]' stderr.log ; then
-          grep -F '[WARN]' stderr.log
-          echo "Distributed ring test failed";
-          exit 1;
-        fi
-        echo "::endgroup::"
-
-    - name: Run Python tests - CPU
-      if: ${{ inputs.cpu-only == 'true' }}
-      shell: bash
-      env:
-        DEVICE: cpu
-      run: |
-        echo "::group::Python tests - CPU"
-        python -m unittest discover python/tests -v
-        echo "::endgroup::"
-
-    - name: Run Python tests - GPU
-      if: ${{ inputs.cpu-only == 'false' }}
-      shell: bash
-      env:
-        DEVICE: gpu
-      run: |
-        echo "::group::Python tests - GPU"
-        python -m tests discover python/tests -v
-        echo "::endgroup::"
-
-    - name: Run CPP tests - CPU
-      shell: bash
-      env:
-        DEVICE: cpu
-      run: |
-        echo "::group::CPP tests - CPU"
-        ./build/tests/tests
-        echo "::endgroup::"
-
-    - name: Run CPP tests - GPU
-      if: ${{ inputs.cpu-only == 'false' }}
-      shell: bash
-      env:
-        DEVICE: gpu
-      run: |
-        echo "::group::CPP tests - GPU"
-        ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
-        echo "::endgroup::"
--- a/.github/scripts/setup+build-cpp-linux-fedora-container.sh
+++ b/.github/scripts/setup+build-cpp-linux-fedora-container.sh
@@ -1,27 +0,0 @@
-#!/bin/bash
-set -ex
-
-# [Setup] Install dependencies inside the container.
-dnf update -y
-dnf install -y \
-  blas-devel \
-  lapack-devel \
-  openblas-devel \
-  make \
-  cmake \
-  clang \
-  git
-dnf clean all
-
-# [C++] CI Build Sanity Check: Verifies code compilation, not for release.
-export CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
-export DEBUG=1
-export CMAKE_C_COMPILER=/usr/bin/clang
-export CMAKE_CXX_COMPILER=/usr/bin/clang++
-
-mkdir -p build
-pushd build
-cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
-make -j $(nproc)
-./tests/tests
-popd
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -8,7 +8,7 @@ permissions:

 jobs:
  build:
-    runs-on: ubuntu-22.04
+    runs-on: [self-hosted, macos]
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/build-docs
@@ -25,4 +25,4 @@ jobs:
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
-        uses: actions/deploy-pages@v4
+        uses: actions/deploy-pages@v4
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -18,10 +18,10 @@ jobs:
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
-      - uses: ./.github/actions/build-linux-release
+      - uses: ./.github/actions/build-linux
        with:
-          build-backend: ${{ matrix.python-version == '3.10' }}
-          arch: "x86_64"
+          build-type: release
+          run-tests: false
      - name: Upload mlx artifacts
        uses: actions/upload-artifact@v5
        with:
@@ -35,31 +35,27 @@ jobs:
          name: mlx-cpu
          path: wheelhouse/mlx_cpu-*.whl
          retention-days: 7
-
+  
  build_linux_with_tests:
    strategy:
      fail-fast: false
      matrix:
-        python_version: ["3.11", "3.12", "3.13", "3.14"]
-        runner:
-          - ubuntu-22.04
-          - ubuntu-22.04-arm
-    runs-on: ${{ matrix.runner }}
+        python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
          python-version: ${{ matrix.python_version }}
      - uses: ./.github/actions/build-linux
-      - uses: ./.github/actions/test-linux
-        with:
-          cpu-only: true

  build_mac_release:
-    if: github.repository == 'ml-explore/mlx'
    strategy:
      matrix:
        python-version: ["3.10", "3.13"]
+        # TODO: 3.14 had issues finding a compatible tensorflow
+    env:
+      MACOSX_DEPLOYMENT_TARGET: "15.0"
    runs-on: [self-hosted, macos]
    steps:
      - uses: actions/checkout@v5
@@ -67,32 +63,31 @@ jobs:
        with:
          python-version: ${{ matrix.python-version }}
      - uses: ./.github/actions/build-macos
-      - name: Build macOS 15 package
-        uses: ./.github/actions/build-macos-release
+
+  build_cuda_with_tests:
+    runs-on: gpu-t4-4-core
+    steps:
+      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-linux
        with:
-          macos-target: 15.0
-          build-backend: ${{ matrix.python-version == '3.10' }}
-      - name: Build macOS 14 package
-        uses: ./.github/actions/build-macos-release
-        with:
-          macos-target: 14.0
-          build-backend: ${{ matrix.python-version == '3.10' }}
+          runner-type: 'cuda'
+      - uses: ./.github/actions/build-cuda

  build_cuda_release:
-    if: github.repository == 'ml-explore/mlx'
    runs-on: ubuntu-22-large
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
-          toolkit: 'cuda-12.9'
+          runner-type: 'cuda'
      - name: Build Python package
        uses: ./.github/actions/build-cuda-release
        with:
-          toolkit: 'cuda-12.9'
+          nvcc-location: '/usr/local/cuda-12.9/bin/nvcc'
      - name: Upload artifacts
        uses: actions/upload-artifact@v5
        with:
          name: mlx-cuda
          path: wheelhouse/mlx_cuda-*.whl
          retention-days: 7
+
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -1,52 +1,27 @@
 name: Build and Test

-on:
-  pull_request:
-  push:
-    branches:
-      - main
-      # For testing CI without starting a pull request:
-      - test/*
+on: pull_request  

 permissions:
  contents: read

-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/head/main' }}
-
 jobs:
  check_lint:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v5
+      - uses: ./.github/actions/setup-linux
      - uses: pre-commit/action@v3.0.1

  linux_build_and_test:
-    needs: check_lint
-    strategy:
-      matrix:
-        runner:
-          - ubuntu-22.04
-          - ubuntu-22.04-arm
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
+    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
      - uses: ./.github/actions/build-linux
-      - uses: ./.github/actions/test-linux
-        with:
-          cpu-only: true

  mac_build_and_test:
-    if: github.repository == 'ml-explore/mlx'
-    strategy:
-      matrix:
-        macos-target: ["14.0", "15.0"]
    runs-on: [self-hosted, macos]
-    env:
-      MACOSX_DEPLOYMENT_TARGET: ${{ matrix.macos-target }}
    needs: check_lint
    steps:
      - uses: actions/checkout@v5
@@ -54,50 +29,18 @@ jobs:
      - uses: ./.github/actions/build-macos

  cuda_build_and_test:
-    if: github.repository == 'ml-explore/mlx'
-    strategy:
-      fail-fast: false
-      matrix:
-        toolkit: ['cuda-12.6', 'cuda-12.9']
    runs-on: gpu-t4-4-core
    needs: check_lint
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
-          toolkit: ${{ matrix.toolkit }}
+          runner-type: 'cuda'
      - uses: ./.github/actions/build-cuda
-        with:
-          toolkit: ${{ matrix.toolkit }}
-      - uses: ./.github/actions/test-linux

  build_documentation:
-    if: github.repository == 'ml-explore/mlx'
-    runs-on: ubuntu-22.04
+    runs-on: [self-hosted, macos]
    needs: check_lint
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/build-docs
-
-  linux_fedora_build_cpp:
-    name: Linux Fedora CPP Build (${{ matrix.arch }})
-    needs: check_lint
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - host: ubuntu-22.04
-            arch: x86_64
-          - host: ubuntu-22.04-arm
-            arch: aarch64
-
-    runs-on: ${{ matrix.host }}
-    container:
-      image: fedora:42
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v5
-
-      - name: CPP Build Test - No Release
-        run: |
-          bash ./.github/scripts/setup+build-cpp-linux-fedora-container.sh
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -5,24 +5,12 @@ on:
    tags:
      - 'v*'
  workflow_dispatch:
-    inputs:
-      dev_release:
-        description: "Do a dev release or regular release"
-        required: true
-        default: "false"

 permissions:
  contents: read

 jobs:
-  setup:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Set publishing variables
-        run: echo "Publishing setup complete"
-
  build_documentation:
-    if: github.repository == 'ml-explore/mlx'
    runs-on: [self-hosted, macos]
    steps:
      - uses: actions/checkout@v5
@@ -43,115 +31,84 @@ jobs:
        uses: actions/deploy-pages@v4

  build_linux_release:
-    if: github.repository == 'ml-explore/mlx'
    strategy:
      matrix:
        python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
-        arch: ['x86_64', 'aarch64']
-    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
+    runs-on: ubuntu-22.04
    env:
      PYPI_RELEASE: 1
-      DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }}
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
          python-version: ${{ matrix.python_version }}
-      - uses: ./.github/actions/build-linux-release
+      - uses: ./.github/actions/build-linux
        with:
-          build-backend: ${{ matrix.python-version == '3.10' }}
-          arch: ${{ matrix.arch }}
+          build-type: release
+          run-tests: false
      - name: Upload MLX artifacts
        uses: actions/upload-artifact@v5
        with:
-          overwrite: true
          name: linux-wheels-${{ matrix.python_version }}
          path: wheelhouse/mlx-*.whl
      - name: Upload CPU artifacts
        if: matrix.python_version == '3.10'
        uses: actions/upload-artifact@v5
        with:
-          overwrite: true
          name: mlx-cpu
          path: wheelhouse/mlx_cpu-*.whl
  
  build_mac_release:
-    if: github.repository == 'ml-explore/mlx'
    strategy:
      matrix:
-        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
+        # TODO: 3.14 had issues finding a compatible tensorflow
    runs-on: [self-hosted, macos]
    env:
      PYPI_RELEASE: 1
-      DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }}
-
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-macos
        with:
          python-version: ${{ matrix.python-version }}
-
-      - name: Install dependencies
-        shell: bash -l {0}
-        run: |
-          pip install --upgrade pip
-          pip install cmake setuptools nanobind==2.4.0
-          pip install -e . -v
-      - name: Generate package stubs
-        shell: bash -l {0}
-        run: |
-          pip install typing_extensions
-          python setup.py generate_stubs
-      - name: Build macOS 14 package
-        uses: ./.github/actions/build-macos-release
+      - uses: ./.github/actions/build-macos
        with:
-          macos-target: 14.0
-          build-backend: ${{ matrix.python-version == '3.10' }}
-      - name: Build macOS 15 package
-        uses: ./.github/actions/build-macos-release
-        with:
-          macos-target: 15.0
-          build-backend: ${{ matrix.python-version == '3.10' }}
+          build-type: release
      - name: Upload MLX artifacts
        uses: actions/upload-artifact@v5
        with:
-          overwrite: true
          name: mac-wheels-${{ matrix.python-version }}
          path: dist/mlx-*.whl
      - name: Upload Metal artifacts
        if: matrix.python-version == '3.10'
        uses: actions/upload-artifact@v5
        with:
-          overwrite: true
          name: mlx-metal
          path: dist/mlx_metal-*.whl

  build_cuda_release:
-    if: github.repository == 'ml-explore/mlx'
    runs-on: ubuntu-22-large
    env:
      PYPI_RELEASE: 1
-      DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }}
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
-          toolkit: 'cuda-12.9'
+          runner-type: 'cuda'
      - name: Build Python package
        uses: ./.github/actions/build-cuda-release
        with:
-          toolkit: 'cuda-12.9'
+          nvcc-location: '/usr/local/cuda-12.9/bin/nvcc'
      - name: Upload artifacts
        uses: actions/upload-artifact@v5
        with:
-          overwrite: true
          name: mlx-cuda
          path: wheelhouse/mlx_cuda-*.whl

  pypi-publish:
    name: Upload release to PyPI
    runs-on: ubuntu-latest
-    needs: [setup, build_linux_release, build_mac_release]
+    needs: [build_linux_release, build_mac_release]
    permissions:
      id-token: write
    environment:
@@ -161,24 +118,22 @@ jobs:
      - uses: actions/download-artifact@v6
        with:
          pattern: linux-wheels-*
-          merge-multiple: true
-          path: dist
+          merge-multiple: true  # put every matching artifact directly into `path`
+          path: artifacts
      - uses: actions/download-artifact@v6
        with:
          pattern: mac-wheels-*
-          merge-multiple: true
-          path: dist
+          merge-multiple: true  # put every matching artifact directly into `path`
+          path: artifacts
      - name: Display structure of downloaded files
-        run: ls -R dist
-      - name: Publish package distributions to PyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          repository-url: https://upload.pypi.org/legacy/
+        run: ls -R artifacts
+      # - name: Publish package distributions to PyPI
+      #  uses: pypa/gh-action-pypi-publish@release/v1
  
  pypi-publish-cuda:
    name: Upload CUDA release to PyPI
    runs-on: ubuntu-latest
-    needs: [setup, build_cuda_release]
+    needs: build_cuda_release
    permissions:
      id-token: write
    environment:
@@ -188,18 +143,16 @@ jobs:
      - uses: actions/download-artifact@v6
        with:
          name: mlx-cuda
-          path: dist
+          path: artifacts
      - name: Display structure of downloaded files
-        run: ls -R dist
-      - name: Publish package distributions to PyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          repository-url: https://upload.pypi.org/legacy/
+        run: ls -R artifacts
+      # - name: Publish package distributions to PyPI
+      #  uses: pypa/gh-action-pypi-publish@release/v1

  pypi-publish-cpu:
    name: Upload CPU release to PyPI
    runs-on: ubuntu-latest
-    needs: [setup, build_linux_release]
+    needs: build_linux_release
    permissions:
      id-token: write
    environment:
@@ -209,18 +162,16 @@ jobs:
      - uses: actions/download-artifact@v6
        with:
          name: mlx-cpu
-          path: dist
+          path: artifacts
      - name: Display structure of downloaded files
-        run: ls -R dist
-      - name: Publish package distributions to PyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          repository-url: https://upload.pypi.org/legacy/
+        run: ls -R artifacts
+      # - name: Publish package distributions to PyPI
+      #  uses: pypa/gh-action-pypi-publish@release/v1

  pypi-publish-metal:
    name: Upload Metal release to PyPI
    runs-on: ubuntu-latest
-    needs: [setup, build_mac_release]
+    needs: build_mac_release
    permissions:
      id-token: write
    environment:
@@ -230,10 +181,8 @@ jobs:
      - uses: actions/download-artifact@v6
        with:
          name: mlx-metal
-          path: dist
+          path: artifacts
      - name: Display structure of downloaded files
-        run: ls -R dist
-      - name: Publish package distributions to PyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          repository-url: https://upload.pypi.org/legacy/
+        run: ls -R artifacts
+      # - name: Publish package distributions to PyPI
+      #  uses: pypa/gh-action-pypi-publish@release/v1
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -74,7 +74,6 @@ endif()
 if(MLX_USE_CCACHE)
  find_program(CCACHE_PROGRAM ccache)
  if(CCACHE_PROGRAM)
-    message(STATUS "Found CCache: ${CCACHE_PROGRAM}")
    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
    set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
@@ -128,12 +127,9 @@ if(MLX_BUILD_METAL)
  message(STATUS "Building with macOS SDK version ${MACOS_SDK_VERSION}")

  set(METAL_CPP_URL
-      https://developer.apple.com/metal/cpp/files/metal-cpp_26.zip)
+      https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18.zip)

  if(NOT CMAKE_OSX_DEPLOYMENT_TARGET STREQUAL "")
-    if(${CMAKE_OSX_DEPLOYMENT_TARGET} LESS 14.0)
-      message(FATAL_ERROR "MLX requires macOS >= 14.0")
-    endif()
    set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
  endif()
  execute_process(
@@ -142,6 +138,7 @@ if(MLX_BUILD_METAL)
      "echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal ${XCRUN_FLAGS} -E -x metal -P - | tail -1 | tr -d '\n'"
    OUTPUT_VARIABLE MLX_METAL_VERSION COMMAND_ERROR_IS_FATAL ANY)
  FetchContent_Declare(metal_cpp URL ${METAL_CPP_URL})
+
  FetchContent_MakeAvailable(metal_cpp)
  target_include_directories(
    mlx PUBLIC $<BUILD_INTERFACE:${metal_cpp_SOURCE_DIR}>
--- a/benchmarks/cpp/irregular_strides.cpp
+++ b/benchmarks/cpp/irregular_strides.cpp
@@ -75,7 +75,7 @@ void time_irregular_binary_ops_3D() {

 void time_irregular_binary_ops_4D() {
  auto device = mx::default_device();
-  mx::Shape shape = {8, 8, 512, 512};
+  std::vector<int> shape = {8, 8, 512, 512};
  auto a = mx::random::uniform(shape);
  auto b = mx::random::uniform(shape);

@@ -115,7 +115,7 @@ void time_irregular_binary_ops_4D() {

 void time_irregular_reshape() {
  auto device = mx::default_device();
-  mx::Shape shape;
+  std::vector<int> shape;
  auto reshape_fn = [&shape, device](const mx::array& a) {
    return mx::reshape(a, shape, device);
  };
@@ -170,7 +170,7 @@ void time_irregular_astype_1D() {
 void time_irregular_astype_2D() {
  auto device = mx::default_device();
  int size = 2048;
-  mx::Shape shape = {size, size};
+  std::vector<int> shape = {size, size};

  auto a = mx::random::uniform(shape);
  TIMEM("2D regular", mx::astype, a, mx::int32, device);
--- a/benchmarks/python/blas/bench_gemv.py
+++ b/benchmarks/python/blas/bench_gemv.py
@@ -1,5 +1,6 @@
 # Copyright © 2023 Apple Inc.

+import argparse
 import os
 import subprocess
 import time
--- a/benchmarks/python/masked_scatter.py
+++ b/benchmarks/python/masked_scatter.py
@@ -1,212 +0,0 @@
-import math
-import os
-import subprocess
-import time
-from copy import copy
-from functools import partial
-
-import matplotlib.pyplot as plt
-import mlx.core as mx
-import numpy as np
-import torch
-from matplotlib.ticker import FuncFormatter
-
-RESULTS_DIR = "./results"
-
-
-if not os.path.isdir(RESULTS_DIR):
-    os.mkdir(RESULTS_DIR)
-
-DEVICE_NAME = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
-DEVICE_NAME = DEVICE_NAME.decode("utf-8").strip("\n")
-
-TORCH_DEVICE = torch.device(
-    "mps"
-    if torch.backends.mps.is_available()
-    else ("cuda" if torch.cuda.is_available() else "cpu")
-)
-
-
-N_WARMUP = 5
-N_ITER_BENCH = 50
-N_ITER_FUNC = 20
-
-VECTOR_LENGTHS = [4096 * (2**i) for i in range(10)]
-MASK_DENSITIES = [0.01, 0.1, 0.25, 0.5]
-D_TYPES = ("float32", "float16")
-
-
-def _power_of_two_formatter(value, _position):
-    if value <= 0:
-        return ""
-    exponent = int(round(math.log2(value)))
-    if abs(value - (1 << exponent)) / value > 1e-6:
-        return f"{value:g}"
-    return f"$2^{{{exponent}}}$"
-
-
-def torch_sync():
-    if TORCH_DEVICE.type == "cuda":
-        torch.cuda.synchronize()
-    elif TORCH_DEVICE.type == "mps":
-        torch.mps.synchronize()
-
-
-def masked_scatter_mlx(self_arr, mask_arr, src_arr):
-    outs = []
-    for _ in range(N_ITER_FUNC):
-        out = copy(self_arr)
-        out[mask_arr] = src_arr
-        outs.append(out)
-    mx.eval(outs)
-    return outs
-
-
-@torch.no_grad()
-def masked_scatter_torch(self_tensor, mask_tensor, src_tensor):
-    outs = []
-    for _ in range(N_ITER_FUNC):
-        out = self_tensor.clone()
-        out.masked_scatter_(mask_tensor, src_tensor)
-        outs.append(out)
-    torch_sync()
-    return outs
-
-
-def measure(fn):
-    for _ in range(N_WARMUP):
-        fn()
-    start = time.perf_counter_ns()
-    for _ in range(N_ITER_BENCH):
-        fn()
-    end = time.perf_counter_ns()
-    return (end - start) * 1e-9
-
-
-def bytes_touched(length, true_count, item_size):
-    mask_bytes = length
-    self_bytes = length * item_size * 2  # read + write
-    src_bytes = true_count * item_size
-    return (mask_bytes + self_bytes + src_bytes) * N_ITER_FUNC * N_ITER_BENCH
-
-
-def build_case(length, density, np_dtype, torch_dtype):
-    true_count = max(1, int(round(length * density)))
-
-    rng = np.random.default_rng()
-    self_np = rng.normal(0.0, 1.0, length).astype(np_dtype)
-    mask_np = np.zeros(length, dtype=bool)
-    mask_np[:true_count] = True
-    rng.shuffle(mask_np)
-    src_np = rng.normal(0.0, 1.0, true_count).astype(np_dtype)
-
-    self_mlx = mx.array(self_np)
-    mask_mlx = mx.array(mask_np)
-    src_mlx = mx.array(src_np)
-
-    self_torch = torch.from_numpy(self_np).to(device=TORCH_DEVICE, dtype=torch_dtype)
-    mask_torch = torch.from_numpy(mask_np).to(device=TORCH_DEVICE)
-    src_torch = torch.from_numpy(src_np).to(device=TORCH_DEVICE, dtype=torch_dtype)
-
-    # Correctness check once per configuration
-    mx_out = mx.array(self_np)
-    mx_out[mask_mlx] = src_mlx
-    mx.eval(mx_out)
-    torch_out = self_torch.clone()
-    torch_out.masked_scatter_(mask_torch, src_torch)
-
-    atol = 5e-3 if np_dtype == np.float16 else 1e-5
-    if not np.allclose(np.array(mx_out), torch_out.cpu().numpy(), atol=atol):
-        raise AssertionError("masked_scatter results diverged between MLX and Torch")
-
-    return (self_mlx, mask_mlx, src_mlx, self_torch, mask_torch, src_torch, true_count)
-
-
-def bench_case(length, density, dtype):
-    np_dtype = getattr(np, dtype)
-    torch_dtype = getattr(torch, dtype)
-    (
-        self_mlx,
-        mask_mlx,
-        src_mlx,
-        self_torch,
-        mask_torch,
-        src_torch,
-        true_count,
-    ) = build_case(length, density, np_dtype, torch_dtype)
-
-    time_mlx = measure(partial(masked_scatter_mlx, self_mlx, mask_mlx, src_mlx))
-    time_torch = measure(
-        partial(masked_scatter_torch, self_torch, mask_torch, src_torch)
-    )
-
-    total_bytes = bytes_touched(length, true_count, np_dtype().itemsize)
-    bytes_per_gb = float(1024**3)
-    mlx_gbps = (total_bytes / bytes_per_gb) / time_mlx
-    torch_gbps = (total_bytes / bytes_per_gb) / time_torch
-
-    return time_mlx, time_torch, mlx_gbps, torch_gbps
-
-
-def plot_density(ax_perf, ax_speedup, density, dtype):
-    mlx_gbps = []
-    torch_gbps = []
-    mlx_times = []
-    torch_times = []
-
-    for length in VECTOR_LENGTHS:
-        t_mlx, t_torch, gbps_mlx, gbps_torch = bench_case(length, density, dtype)
-        mlx_gbps.append(gbps_mlx)
-        torch_gbps.append(gbps_torch)
-        mlx_times.append(t_mlx)
-        torch_times.append(t_torch)
-
-    ax_perf.plot(VECTOR_LENGTHS, mlx_gbps, "tab:blue", label="MLX")
-    ax_perf.plot(VECTOR_LENGTHS, torch_gbps, "tab:red", label="Torch")
-    ax_perf.set_xscale("log", base=2)
-    ax_perf.set_xticks(VECTOR_LENGTHS)
-    formatter = FuncFormatter(_power_of_two_formatter)
-    ax_perf.xaxis.set_major_formatter(formatter)
-    ax_perf.set_title(f"density={density:.2f}")
-    ax_perf.set_ylabel("GB/s")
-    ax_perf.grid(True, which="both", linestyle=":", alpha=0.4)
-    ax_perf.legend()
-
-    speedup = np.array(torch_times) / np.array(mlx_times)
-    ax_speedup.plot(VECTOR_LENGTHS, speedup, "tab:green")
-    ax_speedup.axhline(1.0, color="tab:gray", linestyle="--")
-    ax_speedup.set_xscale("log", base=2)
-    ax_speedup.set_xticks(VECTOR_LENGTHS)
-    ax_speedup.xaxis.set_major_formatter(formatter)
-    ax_speedup.set_ylabel("Speedup (Torch_t / MLX_t)")
-    ax_speedup.grid(True, which="both", linestyle=":", alpha=0.4)
-
-
-def main():
-    for dtype in D_TYPES:
-        fig, axs = plt.subplots(
-            len(MASK_DENSITIES),
-            2,
-            figsize=(10, 12),
-            layout="constrained",
-            sharex=True,
-        )
-
-        for i, density in enumerate(MASK_DENSITIES):
-            plot_density(axs[i][0], axs[i][1], density, dtype)
-            axs[i][0].set_xlabel("vector length")
-            axs[i][1].set_xlabel("vector length")
-
-        fig.suptitle(
-            f"{DEVICE_NAME.replace('Apple ', '')} ({TORCH_DEVICE.type}) | dtype={dtype}"
-        )
-        output_path = os.path.join(
-            RESULTS_DIR,
-            f"{DEVICE_NAME.replace(' ', '_')}_masked_scatter_{dtype}.pdf",
-        )
-        fig.savefig(output_path)
-        plt.close(fig)
-
-
-if __name__ == "__main__":
-    main()
--- a/cmake/Findnvpl.cmake
+++ b/cmake/Findnvpl.cmake
@@ -1,3 +0,0 @@
-# This file does nothing but to suppress the cmake warning: "By not providing
-# Findnvpl.cmake in CMAKE_MODULE_PATH...", which is caused by the
-# find_package(nvpl) from cmake's builtin FindLAPACK.cmake module.
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -17,10 +17,11 @@ To install from PyPI your system must meet the following requirements:

 - Using an M series chip (Apple silicon)
 - Using a native Python >= 3.10
-- macOS >= 14.0
+- macOS >= 13.5

 .. note::
-    MLX is only available on devices running macOS >= 14.0 and higher.
+    MLX is only available on devices running macOS >= 13.5.
+    It is highly recommended to use macOS 14 (Sonoma).

 CUDA
 ^^^^
--- a/docs/src/usage/distributed.rst
+++ b/docs/src/usage/distributed.rst
@@ -7,13 +7,12 @@ Distributed Communication

 MLX supports distributed communication operations that allow the computational cost
 of training or inference to be shared across many physical machines. At the
-moment we support three different communication backends:
+moment we support two different communication backends:

 * `MPI <https://en.wikipedia.org/wiki/Message_Passing_Interface>`_ a
  full-featured and mature distributed communications library
-* A **ring** backend of our own that uses native TCP sockets. It should be
-  faster for thunderbolt connections, but it also works over Ethernet.
-* `nccl <https://developer.nvidia.com/nccl>`_, for use in CUDA environments.
+* A **ring** backend of our own that uses native TCP sockets and should be
+  faster for Thunderbolt connections.

 The list of all currently supported operations and their documentation can be
 seen in the :ref:`API docs<distributed>`.
@@ -85,8 +84,9 @@ Selecting Backend
 ^^^^^^^^^^^^^^^^^

 You can select the backend you want to use when calling :func:`init` by passing
-one of ``{'any', 'ring', 'mpi', 'nccl'}``. When passing ``any``, MLX will try all
-available backends. If they all fail then a singleton group is created.
+one of ``{'any', 'ring', 'mpi'}``. When passing ``any``, MLX will try to
+initialize the ``ring`` backend and, if that fails, the ``mpi`` backend. If
+both fail, then a singleton group is created.

 .. note::
   After a distributed backend is successfully initialized :func:`init` will
@@ -220,7 +220,7 @@ print 4 etc.
 Installing MPI
 ^^^^^^^^^^^^^^

-MPI can be installed with Homebrew, pip, using the Anaconda package manager, or
+MPI can be installed with Homebrew, using the Anaconda package manager, or
 compiled from source. Most of our testing is done using ``openmpi`` installed
 with the Anaconda package manager as follows:

@@ -228,16 +228,14 @@ with the Anaconda package manager as follows:

    $ conda install conda-forge::openmpi

-Installing with Homebrew or pip requires specifying the location of ``libmpi.dyld``
+Installing with Homebrew may require specifying the location of ``libmpi.dylib``
 so that MLX can find it and load it at runtime. This can simply be achieved by
 passing the ``DYLD_LIBRARY_PATH`` environment variable to ``mpirun`` and it is
-done automatically by ``mlx.launch``. Some environments use a non-standard
-library filename that can be specified using the ``MPI_LIBNAME`` environment
-variable. This is automatically taken care of by ``mlx.launch`` as well.
+done automatically by ``mlx.launch``.

 .. code:: shell

-    $ mpirun -np 2 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ -x MPI_LIBNAME=libmpi.40.dylib python test.py
+    $ mpirun -np 2 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python test.py
    $ # or simply
    $ mlx.launch -n 2 test.py

--- a/docs/src/usage/indexing.rst
+++ b/docs/src/usage/indexing.rst
@@ -70,8 +70,7 @@ Differences from NumPy

  * Indexing does not perform bounds checking. Indexing out of bounds is
    undefined behavior.
-  * Boolean mask based indexing is supported for assignment only (see
-    :ref:`boolean-mask-assignment`).
+  * Boolean mask based indexing is not yet supported.

 The reason for the lack of bounds checking is that exceptions cannot propagate
 from the GPU. Performing bounds checking for array indices before launching the
@@ -144,51 +143,3 @@ expected. For example:

 In the above ``dfdx`` will have the correct gradient, namely zeros at ``idx``
 and ones elsewhere.
-
-.. _boolean-mask-assignment:
-
-Boolean Mask Assignment
-----------------------
-
-MLX supports boolean indices using NumPy syntax. A mask must already be
-a :class:`bool_` MLX :class:`array` or a NumPy ``ndarray`` with ``dtype=bool``.
-Other index types are routed through the standard scatter code.
-
-.. code-block:: shell
-
-   >>> a = mx.array([1.0, 2.0, 3.0])
-   >>> mask = mx.array([True, False, True])
-   >>> updates = mx.array([5.0, 6.0])
-   >>> a[mask] = updates
-   >>> a
-   array([5.0, 2.0, 6.0], dtype=float32)
-
-Scalar assignments broadcast to every ``True`` entry in ``mask``. For non-scalar
-assignments, ``updates`` must provide at least as many elements as there are
-``True`` entries in ``mask``.
-
-.. code-block:: shell
-
-   >>> a = mx.zeros((2, 3))
-   >>> mask = mx.array([[True, False, True],
-                        [False, False, True]])
-   >>> a[mask] = 1.0
-   >>> a
-   array([[1.0, 0.0, 1.0],
-          [0.0, 0.0, 1.0]], dtype=float32)
-
-Boolean masks follow NumPy semantics:
-
-- The mask shape must match the shape of the axes it indexes exactly. No mask
-  broadcasting occurs.
-- Any axes not covered by the mask are taken in full.
-
-.. code-block:: shell
-
-   >>> a = mx.arange(1000).reshape(10, 10, 10)
-   >>> a[mx.random.randn(10, 10) > 0.0] = 0  # valid: mask covers axes 0 and 1
-
-The mask of shape ``(10, 10)`` applies to the first two axes, so ``a[mask]``
-selects the 1-D slices ``a[i, j, :]`` where ``mask[i, j]`` is ``True``.
-Shapes such as ``(1, 10, 10)`` or ``(10, 10, 1)`` do not match the indexed
-axes and therefore raise errors.
--- a/mlx/array.cpp
+++ b/mlx/array.cpp
@@ -167,7 +167,7 @@ void array::copy_shared_buffer(
    const Strides& strides,
    Flags flags,
    size_t data_size,
-    int64_t offset /* = 0 */) {
+    size_t offset /* = 0 */) {
  array_desc_->data = other.array_desc_->data;
  array_desc_->strides = strides;
  array_desc_->flags = flags;
--- a/mlx/array.h
+++ b/mlx/array.h
@@ -294,11 +294,6 @@ class array {
    return array_desc_->siblings;
  }

-  /** The array's position in the sibling list. */
-  int sibling_position() const {
-    return array_desc_->position;
-  }
-
  void set_siblings(std::vector<array> siblings, uint16_t position) {
    array_desc_->siblings = std::move(siblings);
    array_desc_->position = position;
@@ -439,7 +434,7 @@ class array {
      const Strides& strides,
      Flags flags,
      size_t data_size,
-      int64_t offset = 0);
+      size_t offset = 0);

  void copy_shared_buffer(const array& other);

--- a/mlx/backend/common/slicing.cpp
+++ b/mlx/backend/common/slicing.cpp
@@ -14,13 +14,17 @@ std::tuple<int64_t, Strides> prepare_slice(
    data_offset += start_indices[i] * in.strides()[i];
    inp_strides[i] = in.strides()[i] * strides[i];
  }
+  // Normalize the offset
+  if (data_offset < 0) {
+    data_offset += in.data_size();
+  }
  return std::make_tuple(data_offset, inp_strides);
 }

 void shared_buffer_slice(
    const array& in,
    const Strides& out_strides,
-    int64_t data_offset,
+    size_t data_offset,
    size_t data_size,
    array& out) {
  // Compute row/col contiguity
@@ -47,24 +51,17 @@ void slice(

  // Calculate out strides, initial offset
  auto [data_offset, inp_strides] = prepare_slice(in, start_indices, strides);
-
-  // Get the location of the end based on the inp strides and out.shape()
-  int64_t low_idx = 0;
-  int64_t high_idx = 0;
-  for (int i = 0; i < inp_strides.size(); ++i) {
-    auto delta = inp_strides[i] * (out.shape()[i] - 1);
-    if (inp_strides[i] > 0) {
-      high_idx += delta;
-    } else {
-      low_idx += delta;
+  int64_t data_end = 1;
+  for (int i = 0; i < start_indices.size(); ++i) {
+    if (in.shape()[i] > 1) {
+      auto end_idx = start_indices[i] + out.shape()[i] * strides[i] - 1;
+      data_end += end_idx * in.strides()[i];
    }
  }
-  int64_t data_size = (high_idx - low_idx) + 1;
-  if (data_size < 0) {
-    std::ostringstream msg;
-    msg << "[slice] Computed invalid data size: " << data_size << ".";
-    throw std::runtime_error(msg.str());
+  if (data_end < 0) {
+    data_end += in.data_size();
  }
+  size_t data_size = (data_end - data_offset);
  shared_buffer_slice(in, inp_strides, data_offset, data_size, out);
 }

--- a/mlx/backend/cpu/binary.cpp
+++ b/mlx/backend/cpu/binary.cpp
@@ -14,11 +14,233 @@

 namespace mlx::core {

+namespace {
+// Dispatch Op over a and b through stream's CPU encoder, typed by out.dtype().
+template <typename Op>
+void binary(const array& a, const array& b, array& out, Op op, Stream stream) {
+  auto bopt = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, bopt); // presumably readies out's buffer - TODO confirm
+  // Only the type Op is used below; the op argument itself is never read.
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out);
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    out = array::unsafe_weak_copy(out),
+                    bopt]() mutable { // captures unsafe_weak_copy results, not the caller's references
+    switch (out.dtype()) { // one binary_op<T, Op> call per output dtype
+      case bool_:
+        binary_op<bool, Op>(a, b, out, bopt);
+        break;
+      case uint8:
+        binary_op<uint8_t, Op>(a, b, out, bopt);
+        break;
+      case uint16:
+        binary_op<uint16_t, Op>(a, b, out, bopt);
+        break;
+      case uint32:
+        binary_op<uint32_t, Op>(a, b, out, bopt);
+        break;
+      case uint64:
+        binary_op<uint64_t, Op>(a, b, out, bopt);
+        break;
+      case int8:
+        binary_op<int8_t, Op>(a, b, out, bopt);
+        break;
+      case int16:
+        binary_op<int16_t, Op>(a, b, out, bopt);
+        break;
+      case int32:
+        binary_op<int32_t, Op>(a, b, out, bopt);
+        break;
+      case int64:
+        binary_op<int64_t, Op>(a, b, out, bopt);
+        break;
+      case float16:
+        binary_op<float16_t, Op>(a, b, out, bopt);
+        break;
+      case float32:
+        binary_op<float, Op>(a, b, out, bopt);
+        break;
+      case float64:
+        binary_op<double, Op>(a, b, out, bopt);
+        break;
+      case bfloat16:
+        binary_op<bfloat16_t, Op>(a, b, out, bopt);
+        break;
+      case complex64:
+        binary_op<complex64_t, Op>(a, b, out, bopt);
+        break;
+    }
+  });
+}
+// Dispatch a comparison Op on stream's CPU encoder as binary_op<T, bool, Op>.
+template <typename Op>
+void comparison_op(
+    const array& a,
+    const array& b,
+    array& out,
+    Op op,
+    Stream stream) {
+  auto bopt = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, bopt); // presumably readies out's buffer - TODO confirm
+  // Only the type Op is used below; the op argument itself is never read.
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out);
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    out = array::unsafe_weak_copy(out),
+                    bopt]() mutable { // captures unsafe_weak_copy results, not the caller's references
+    switch (a.dtype()) { // T follows input a, not out; presumably b shares a's dtype - confirm
+      case bool_:
+        binary_op<bool, bool, Op>(a, b, out, bopt);
+        break;
+      case uint8:
+        binary_op<uint8_t, bool, Op>(a, b, out, bopt);
+        break;
+      case uint16:
+        binary_op<uint16_t, bool, Op>(a, b, out, bopt);
+        break;
+      case uint32:
+        binary_op<uint32_t, bool, Op>(a, b, out, bopt);
+        break;
+      case uint64:
+        binary_op<uint64_t, bool, Op>(a, b, out, bopt);
+        break;
+      case int8:
+        binary_op<int8_t, bool, Op>(a, b, out, bopt);
+        break;
+      case int16:
+        binary_op<int16_t, bool, Op>(a, b, out, bopt);
+        break;
+      case int32:
+        binary_op<int32_t, bool, Op>(a, b, out, bopt);
+        break;
+      case int64:
+        binary_op<int64_t, bool, Op>(a, b, out, bopt);
+        break;
+      case float16:
+        binary_op<float16_t, bool, Op>(a, b, out, bopt);
+        break;
+      case float32:
+        binary_op<float, bool, Op>(a, b, out, bopt);
+        break;
+      case float64:
+        binary_op<double, bool, Op>(a, b, out, bopt);
+        break;
+      case bfloat16:
+        binary_op<bfloat16_t, bool, Op>(a, b, out, bopt);
+        break;
+      case complex64:
+        binary_op<complex64_t, bool, Op>(a, b, out, bopt);
+        break;
+    }
+  });
+}
+// Dispatch Op on stream's CPU encoder for float/bfloat16/complex64 outputs.
+template <typename Op>
+void binary_float(
+    const array& a,
+    const array& b,
+    array& out,
+    Op op,
+    Stream stream) {
+  auto bopt = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, bopt); // presumably readies out's buffer - TODO confirm
+  // Only the type Op is used below; the op argument itself is never read.
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out);
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    out = array::unsafe_weak_copy(out),
+                    bopt]() mutable { // captures unsafe_weak_copy results, not the caller's references
+    switch (out.dtype()) { // one binary_op<T, Op> call per supported output dtype
+      case float16:
+        binary_op<float16_t, Op>(a, b, out, bopt);
+        break;
+      case float32:
+        binary_op<float, Op>(a, b, out, bopt);
+        break;
+      case float64:
+        binary_op<double, Op>(a, b, out, bopt);
+        break;
+      case bfloat16:
+        binary_op<bfloat16_t, Op>(a, b, out, bopt);
+        break;
+      case complex64:
+        binary_op<complex64_t, Op>(a, b, out, bopt);
+        break;
+      default: // every other output dtype is rejected
+        throw std::runtime_error(
+            "[binary_float] Only supports floating point types.");
+    }
+  });
+}
+// Dispatch Op over a and b on stream's CPU encoder for bool/integer outputs.
+template <typename Op>
+void binary_int(
+    const array& a,
+    const array& b,
+    array& out,
+    Op op,
+    Stream stream) {
+  auto bopt = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, bopt); // presumably readies out's buffer - TODO confirm
+  // Only the type Op is used below; the op argument itself is never read.
+  auto& encoder = cpu::get_command_encoder(stream);
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_output_array(out);
+  encoder.dispatch([a = array::unsafe_weak_copy(a),
+                    b = array::unsafe_weak_copy(b),
+                    out = array::unsafe_weak_copy(out),
+                    bopt]() mutable { // captures unsafe_weak_copy results, not the caller's references
+    switch (out.dtype()) { // one binary_op<T, Op> call per supported output dtype
+      case bool_:
+        binary_op<bool, Op>(a, b, out, bopt);
+        break; // previously missing: bool fell through and also ran the uint8 kernel
+      case uint8:
+        binary_op<uint8_t, Op>(a, b, out, bopt);
+        break;
+      case uint16:
+        binary_op<uint16_t, Op>(a, b, out, bopt);
+        break;
+      case uint32:
+        binary_op<uint32_t, Op>(a, b, out, bopt);
+        break;
+      case uint64:
+        binary_op<uint64_t, Op>(a, b, out, bopt);
+        break;
+      case int8:
+        binary_op<int8_t, Op>(a, b, out, bopt);
+        break;
+      case int16:
+        binary_op<int16_t, Op>(a, b, out, bopt);
+        break;
+      case int32:
+        binary_op<int32_t, Op>(a, b, out, bopt);
+        break;
+      case int64:
+        binary_op<int64_t, Op>(a, b, out, bopt);
+        break;
+      default: // every other output dtype is rejected; no break needed after throw
+        throw std::runtime_error("[binary_int] Type not supported");
+    }
+  });
+}
+
+} // namespace
+
 void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Add(), stream());
+  binary(a, b, out, detail::Add(), stream());
 }

 void DivMod::eval_cpu(
@@ -102,14 +324,14 @@ void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Divide(), stream());
+  binary(a, b, out, detail::Divide(), stream());
 }

 void Remainder::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Remainder(), stream());
+  binary(a, b, out, detail::Remainder(), stream());
 }

 void Equal::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -150,90 +372,89 @@ void Equal::eval_cpu(const std::vector<array>& inputs, array& out) {
      }
    });
  } else {
-    comparison_op_cpu(a, b, out, detail::Equal(), stream());
+    comparison_op(a, b, out, detail::Equal(), stream());
  }
 }

 void Greater::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op_cpu(inputs[0], inputs[1], out, detail::Greater(), stream());
+  comparison_op(inputs[0], inputs[1], out, detail::Greater(), stream());
 }

 void GreaterEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op_cpu(
-      inputs[0], inputs[1], out, detail::GreaterEqual(), stream());
+  comparison_op(inputs[0], inputs[1], out, detail::GreaterEqual(), stream());
 }

 void Less::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op_cpu(inputs[0], inputs[1], out, detail::Less(), stream());
+  comparison_op(inputs[0], inputs[1], out, detail::Less(), stream());
 }

 void LessEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op_cpu(inputs[0], inputs[1], out, detail::LessEqual(), stream());
+  comparison_op(inputs[0], inputs[1], out, detail::LessEqual(), stream());
 }

 void LogAddExp::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_float_op_cpu(a, b, out, detail::LogAddExp(), stream());
+  binary_float(a, b, out, detail::LogAddExp(), stream());
 }

 void LogicalAnd::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2); // LogicalAnd requires two input arrays
  auto& in1 = inputs[0];
  auto& in2 = inputs[1];
-  binary_op_cpu(in1, in2, out, detail::LogicalAnd(), stream());
+  binary(in1, in2, out, detail::LogicalAnd(), stream());
 }

 void LogicalOr::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2); // LogicalOr requires two input arrays
  auto& in1 = inputs[0];
  auto& in2 = inputs[1];
-  binary_op_cpu(in1, in2, out, detail::LogicalOr(), stream());
+  binary(in1, in2, out, detail::LogicalOr(), stream());
 }

 void Maximum::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Maximum(), stream());
+  binary(a, b, out, detail::Maximum(), stream());
 }

 void Minimum::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Minimum(), stream());
+  binary(a, b, out, detail::Minimum(), stream());
 }

 void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Multiply(), stream());
+  binary(a, b, out, detail::Multiply(), stream());
 }

 void NotEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  comparison_op_cpu(inputs[0], inputs[1], out, detail::NotEqual(), stream());
+  comparison_op(inputs[0], inputs[1], out, detail::NotEqual(), stream());
 }

 void Power::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Power(), stream());
+  binary(a, b, out, detail::Power(), stream());
 }

 void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
-  binary_op_cpu(a, b, out, detail::Subtract(), stream());
+  binary(a, b, out, detail::Subtract(), stream());
 }

 void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -242,19 +463,19 @@ void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& b = inputs[1];
  switch (op_) {
    case BitwiseBinary::And:
-      binary_int_op_cpu(a, b, out, detail::BitwiseAnd(), stream());
+      binary_int(a, b, out, detail::BitwiseAnd(), stream());
      break;
    case BitwiseBinary::Or:
-      binary_int_op_cpu(a, b, out, detail::BitwiseOr(), stream());
+      binary_int(a, b, out, detail::BitwiseOr(), stream());
      break;
    case BitwiseBinary::Xor:
-      binary_int_op_cpu(a, b, out, detail::BitwiseXor(), stream());
+      binary_int(a, b, out, detail::BitwiseXor(), stream());
      break;
    case BitwiseBinary::LeftShift:
-      binary_int_op_cpu(a, b, out, detail::LeftShift(), stream());
+      binary_int(a, b, out, detail::LeftShift(), stream());
      break;
    case BitwiseBinary::RightShift:
-      binary_int_op_cpu(a, b, out, detail::RightShift(), stream());
+      binary_int(a, b, out, detail::RightShift(), stream());
      break;
  }
 }
@@ -263,7 +484,7 @@ void ArcTan2::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  const auto& a = inputs[0];
  const auto& b = inputs[1];
-  binary_float_op_cpu(a, b, out, detail::ArcTan2(), stream());
+  binary_float(a, b, out, detail::ArcTan2(), stream());
 }

 } // namespace mlx::core
--- a/mlx/backend/cpu/binary.h
+++ b/mlx/backend/cpu/binary.h
@@ -7,7 +7,6 @@
 #include "mlx/backend/common/binary.h"
 #include "mlx/backend/common/utils.h"

-#include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/simd/simd.h"

 namespace mlx::core {
@@ -291,227 +290,4 @@ void binary_op(const array& a, const array& b, array& out, BinaryOpType bopt) {
  binary_op<T, T, Op>(a, b, out, bopt);
 }

-template <typename Op>
-void binary_op_cpu(
-    const array& a,
-    const array& b,
-    array& out,
-    Op op,
-    Stream stream) {
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, out, bopt);
-
-  auto& encoder = cpu::get_command_encoder(stream);
-  encoder.set_input_array(a);
-  encoder.set_input_array(b);
-  encoder.set_output_array(out);
-  encoder.dispatch([a = array::unsafe_weak_copy(a),
-                    b = array::unsafe_weak_copy(b),
-                    out = array::unsafe_weak_copy(out),
-                    bopt]() mutable {
-    switch (out.dtype()) {
-      case bool_:
-        binary_op<bool, Op>(a, b, out, bopt);
-        break;
-      case uint8:
-        binary_op<uint8_t, Op>(a, b, out, bopt);
-        break;
-      case uint16:
-        binary_op<uint16_t, Op>(a, b, out, bopt);
-        break;
-      case uint32:
-        binary_op<uint32_t, Op>(a, b, out, bopt);
-        break;
-      case uint64:
-        binary_op<uint64_t, Op>(a, b, out, bopt);
-        break;
-      case int8:
-        binary_op<int8_t, Op>(a, b, out, bopt);
-        break;
-      case int16:
-        binary_op<int16_t, Op>(a, b, out, bopt);
-        break;
-      case int32:
-        binary_op<int32_t, Op>(a, b, out, bopt);
-        break;
-      case int64:
-        binary_op<int64_t, Op>(a, b, out, bopt);
-        break;
-      case float16:
-        binary_op<float16_t, Op>(a, b, out, bopt);
-        break;
-      case float32:
-        binary_op<float, Op>(a, b, out, bopt);
-        break;
-      case float64:
-        binary_op<double, Op>(a, b, out, bopt);
-        break;
-      case bfloat16:
-        binary_op<bfloat16_t, Op>(a, b, out, bopt);
-        break;
-      case complex64:
-        binary_op<complex64_t, Op>(a, b, out, bopt);
-        break;
-    }
-  });
-}
-
-template <typename Op>
-void comparison_op_cpu(
-    const array& a,
-    const array& b,
-    array& out,
-    Op op,
-    Stream stream) {
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, out, bopt);
-
-  auto& encoder = cpu::get_command_encoder(stream);
-  encoder.set_input_array(a);
-  encoder.set_input_array(b);
-  encoder.set_output_array(out);
-  encoder.dispatch([a = array::unsafe_weak_copy(a),
-                    b = array::unsafe_weak_copy(b),
-                    out = array::unsafe_weak_copy(out),
-                    bopt]() mutable {
-    switch (a.dtype()) {
-      case bool_:
-        binary_op<bool, bool, Op>(a, b, out, bopt);
-        break;
-      case uint8:
-        binary_op<uint8_t, bool, Op>(a, b, out, bopt);
-        break;
-      case uint16:
-        binary_op<uint16_t, bool, Op>(a, b, out, bopt);
-        break;
-      case uint32:
-        binary_op<uint32_t, bool, Op>(a, b, out, bopt);
-        break;
-      case uint64:
-        binary_op<uint64_t, bool, Op>(a, b, out, bopt);
-        break;
-      case int8:
-        binary_op<int8_t, bool, Op>(a, b, out, bopt);
-        break;
-      case int16:
-        binary_op<int16_t, bool, Op>(a, b, out, bopt);
-        break;
-      case int32:
-        binary_op<int32_t, bool, Op>(a, b, out, bopt);
-        break;
-      case int64:
-        binary_op<int64_t, bool, Op>(a, b, out, bopt);
-        break;
-      case float16:
-        binary_op<float16_t, bool, Op>(a, b, out, bopt);
-        break;
-      case float32:
-        binary_op<float, bool, Op>(a, b, out, bopt);
-        break;
-      case float64:
-        binary_op<double, bool, Op>(a, b, out, bopt);
-        break;
-      case bfloat16:
-        binary_op<bfloat16_t, bool, Op>(a, b, out, bopt);
-        break;
-      case complex64:
-        binary_op<complex64_t, bool, Op>(a, b, out, bopt);
-        break;
-    }
-  });
-}
-
-template <typename Op>
-void binary_float_op_cpu(
-    const array& a,
-    const array& b,
-    array& out,
-    Op op,
-    Stream stream) {
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, out, bopt);
-
-  auto& encoder = cpu::get_command_encoder(stream);
-  encoder.set_input_array(a);
-  encoder.set_input_array(b);
-  encoder.set_output_array(out);
-  encoder.dispatch([a = array::unsafe_weak_copy(a),
-                    b = array::unsafe_weak_copy(b),
-                    out = array::unsafe_weak_copy(out),
-                    bopt]() mutable {
-    switch (out.dtype()) {
-      case float16:
-        binary_op<float16_t, Op>(a, b, out, bopt);
-        break;
-      case float32:
-        binary_op<float, Op>(a, b, out, bopt);
-        break;
-      case float64:
-        binary_op<double, Op>(a, b, out, bopt);
-        break;
-      case bfloat16:
-        binary_op<bfloat16_t, Op>(a, b, out, bopt);
-        break;
-      case complex64:
-        binary_op<complex64_t, Op>(a, b, out, bopt);
-        break;
-      default:
-        throw std::runtime_error(
-            "[binary_float] Only supports floating point types.");
-    }
-  });
-}
-
-template <typename Op>
-void binary_int_op_cpu(
-    const array& a,
-    const array& b,
-    array& out,
-    Op op,
-    Stream stream) {
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, out, bopt);
-
-  auto& encoder = cpu::get_command_encoder(stream);
-  encoder.set_input_array(a);
-  encoder.set_input_array(b);
-  encoder.set_output_array(out);
-  encoder.dispatch([a = array::unsafe_weak_copy(a),
-                    b = array::unsafe_weak_copy(b),
-                    out = array::unsafe_weak_copy(out),
-                    bopt]() mutable {
-    switch (out.dtype()) {
-      case bool_:
-        binary_op<bool, Op>(a, b, out, bopt);
-      case uint8:
-        binary_op<uint8_t, Op>(a, b, out, bopt);
-        break;
-      case uint16:
-        binary_op<uint16_t, Op>(a, b, out, bopt);
-        break;
-      case uint32:
-        binary_op<uint32_t, Op>(a, b, out, bopt);
-        break;
-      case uint64:
-        binary_op<uint64_t, Op>(a, b, out, bopt);
-        break;
-      case int8:
-        binary_op<int8_t, Op>(a, b, out, bopt);
-        break;
-      case int16:
-        binary_op<int16_t, Op>(a, b, out, bopt);
-        break;
-      case int32:
-        binary_op<int32_t, Op>(a, b, out, bopt);
-        break;
-      case int64:
-        binary_op<int64_t, Op>(a, b, out, bopt);
-        break;
-      default:
-        throw std::runtime_error("[binary_int] Type not supported");
-        break;
-    }
-  });
-}
-
 } // namespace mlx::core
--- a/mlx/backend/cpu/distributed.cpp
+++ b/mlx/backend/cpu/distributed.cpp
@@ -95,9 +95,4 @@ void Recv::eval_cpu(
  distributed::detail::recv(group(), outputs[0], src_, stream());
 }

-void ReduceScatter::eval_cpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  throw std::runtime_error("[ReduceScatter] Not implemented yet.");
-}
 } // namespace mlx::core::distributed
--- a/mlx/backend/cpu/indexing.cpp
+++ b/mlx/backend/cpu/indexing.cpp
@@ -747,108 +747,4 @@ void ScatterAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
  });
 }

-template <typename T>
-void masked_scatter_impl(const array& mask, const array& src, array& out) {
-  ContiguousIterator mask_it(mask);
-  ContiguousIterator src_it(src);
-  ContiguousIterator out_it(out);
-
-  const bool* mask_ptr = mask.data<bool>();
-  const T* src_ptr = src.data<T>();
-  T* dst_ptr = out.data<T>();
-
-  const size_t batch_count = mask.shape(0);
-  const size_t mask_batch_size = mask.size() / batch_count;
-  const size_t src_batch_size = src.size() / batch_count;
-
-  for (uint b = 0; b < batch_count; ++b) {
-    size_t src_consumed = 0;
-    src_it.seek(b * src_batch_size);
-
-    for (size_t i = 0; i < mask_batch_size; ++i) {
-      if (mask_ptr[mask_it.loc]) {
-        if (src_consumed >= src_batch_size) {
-          throw std::runtime_error(
-              "[MaskedScatter::eval_cpu] Source does not have enough elements for mask.");
-        }
-        dst_ptr[out_it.loc] = src_ptr[src_it.loc];
-        src_it.step();
-        ++src_consumed;
-      }
-      mask_it.step();
-      out_it.step();
-    }
-  }
-}
-
-void MaskedScatter::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 3);
-
-  auto& dst = inputs[0];
-  auto& mask = inputs[1];
-  auto& src = inputs[2];
-
-  // Copy src into out (copy allocates memory for out)
-  auto ctype =
-      dst.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-  copy_cpu(dst, out, ctype, stream());
-
-  if (mask.size() == 0) {
-    return;
-  }
-
-  auto& encoder = cpu::get_command_encoder(stream());
-  encoder.set_input_array(mask);
-  encoder.set_input_array(src);
-  encoder.set_output_array(out);
-  encoder.dispatch([mask = array::unsafe_weak_copy(mask),
-                    src = array::unsafe_weak_copy(src),
-                    out = array::unsafe_weak_copy(out)]() mutable {
-    switch (out.dtype()) {
-      case bool_:
-        masked_scatter_impl<bool>(mask, src, out);
-        break;
-      case uint8:
-        masked_scatter_impl<uint8_t>(mask, src, out);
-        break;
-      case uint16:
-        masked_scatter_impl<uint16_t>(mask, src, out);
-        break;
-      case uint32:
-        masked_scatter_impl<uint32_t>(mask, src, out);
-        break;
-      case uint64:
-        masked_scatter_impl<uint64_t>(mask, src, out);
-        break;
-      case int8:
-        masked_scatter_impl<int8_t>(mask, src, out);
-        break;
-      case int16:
-        masked_scatter_impl<int16_t>(mask, src, out);
-        break;
-      case int32:
-        masked_scatter_impl<int32_t>(mask, src, out);
-        break;
-      case int64:
-        masked_scatter_impl<int64_t>(mask, src, out);
-        break;
-      case float16:
-        masked_scatter_impl<float16_t>(mask, src, out);
-        break;
-      case float32:
-        masked_scatter_impl<float>(mask, src, out);
-        break;
-      case float64:
-        masked_scatter_impl<double>(mask, src, out);
-        break;
-      case bfloat16:
-        masked_scatter_impl<bfloat16_t>(mask, src, out);
-        break;
-      case complex64:
-        masked_scatter_impl<complex64_t>(mask, src, out);
-        break;
-    }
-  });
-}
-
 } // namespace mlx::core
--- a/mlx/backend/cpu/matmul.cpp
+++ b/mlx/backend/cpu/matmul.cpp
@@ -2,8 +2,6 @@

 #include <cstring>
 #include "mlx/array.h"
-#include "mlx/backend/cpu/binary.h"
-#include "mlx/backend/cpu/binary_ops.h"
 #include "mlx/backend/cpu/copy.h"
 #include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/gemm.h"
@@ -137,29 +135,15 @@ void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
    return;
  }

-  // Handle empty matrix case (K=0)
-  if (inputs[0].shape(-1) == 0) {
-    auto& c = inputs[2];
-    if (beta_ == 1.0f) {
-      CopyType ctype = c.data_size() == 1
-          ? CopyType::Scalar
-          : (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
-      copy_cpu(c, out, ctype, stream());
-    } else {
-      array beta_scalar = array(beta_, c.dtype());
-      auto& encoder = cpu::get_command_encoder(stream());
-      binary_float_op_cpu(c, beta_scalar, out, detail::Multiply(), stream());
-      encoder.add_temporary(std::move(beta_scalar));
-    }
-    return;
-  }
-
  // Fill output with C
  auto& c = inputs[2];
  CopyType ctype = c.data_size() == 1
      ? CopyType::Scalar
      : (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
  copy_cpu(c, out, ctype, stream());
+  if (inputs[0].shape(-1) == 0) {
+    return;
+  }
  matmul_general(inputs[0], inputs[1], out, stream(), alpha_, beta_);
 }

--- a/mlx/backend/cpu/simd/accelerate_simd.h
+++ b/mlx/backend/cpu/simd/accelerate_simd.h
@@ -217,20 +217,14 @@ Simd<T, N> atan2(Simd<T, N> a, Simd<T, N> b) {

 template <typename T, int N>
 Simd<T, N> maximum(Simd<T, N> a, Simd<T, N> b) {
-  auto out = Simd<T, N>(asd::max(a.value, b.value));
-  if constexpr (!std::is_integral_v<T>) {
-    out = select(isnan(b), b, select(isnan(a), a, out));
-  }
-  return out;
+  // TODO: propagate NaN operands for non-integral T
+  return asd::max(a.value, b.value);
 }

 template <typename T, int N>
 Simd<T, N> minimum(Simd<T, N> a, Simd<T, N> b) {
-  auto out = Simd<T, N>(asd::min(a.value, b.value));
-  if constexpr (!std::is_integral_v<T>) {
-    out = select(isnan(b), b, select(isnan(a), a, out));
-  }
-  return out;
+  // TODO: propagate NaN operands for non-integral T
+  return asd::min(a.value, b.value);
 }

 template <typename T, int N>
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -44,7 +44,6 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/row_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/rms_norm.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/rope.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
@@ -126,11 +125,7 @@ endif()
 # Compute capability >= 7.0 is required for synchronization between CPU/GPU with
 # managed memory.
 if(NOT DEFINED MLX_CUDA_ARCHITECTURES)
-  execute_process(
-    COMMAND bash detect_cuda_arch.sh
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-    OUTPUT_VARIABLE MLX_CUDA_ARCHITECTURES
-    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  set(MLX_CUDA_ARCHITECTURES "native")
 endif()
 message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
 set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
@@ -142,7 +137,6 @@ FetchContent_Declare(
  URL "https://github.com/NVIDIA/cccl/releases/download/v2.8.1/cccl-v2.8.1.zip")
 FetchContent_MakeAvailable(cccl)
 target_include_directories(mlx BEFORE PRIVATE "${cccl_SOURCE_DIR}/include")
-set_target_properties(mlx PROPERTIES CCCL_DIR "${cccl_SOURCE_DIR}/include")

 # Use fixed version of NVTX.
 FetchContent_Declare(
@@ -168,7 +162,7 @@ target_link_libraries(mlx PRIVATE CUDA::nvrtc CUDA::cuda_driver)
 FetchContent_Declare(
  cudnn
  GIT_REPOSITORY https://github.com/NVIDIA/cudnn-frontend.git
-  GIT_TAG v1.16.0
+  GIT_TAG v1.14.0
  GIT_SHALLOW TRUE
  EXCLUDE_FROM_ALL)
 set(CUDNN_FRONTEND_SKIP_JSON_LIB ON)
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -97,35 +97,17 @@ CudaAllocator::CudaAllocator()

  int device_count = 0;
  CHECK_CUDA_ERROR(cudaGetDeviceCount(&device_count));
-  int curr;
-  CHECK_CUDA_ERROR(cudaGetDevice(&curr));
  for (int i = 0; i < device_count; ++i) {
    CHECK_CUDA_ERROR(cudaSetDevice(i));
    cudaStream_t s;
    CHECK_CUDA_ERROR(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));
    free_streams_.push_back(s);
  }
-  CHECK_CUDA_ERROR(cudaSetDevice(curr));
 }

-void copy_to_managed(CudaBuffer& buf) {
-  // TODO maybe make this async on a i/o stream to avoid synchronizing the
-  // device on malloc/and free
-  void* new_data;
-  CHECK_CUDA_ERROR(cudaMallocManaged(&new_data, buf.size));
-  buf.device = -1;
-  CHECK_CUDA_ERROR(cudaMemcpy(new_data, buf.data, buf.size, cudaMemcpyDefault));
-  CHECK_CUDA_ERROR(cudaFree(buf.data));
-  buf.data = new_data;
-}
-
-Buffer
-CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
-  if (size == 0) {
-    return Buffer{new CudaBuffer{nullptr, 0, -1}};
-  }
-
+Buffer CudaAllocator::malloc_impl(size_t size, cudaStream_t stream) {
  // Find available buffer from cache.
+  auto orig_size = size;
  std::unique_lock lock(mutex_);
  if (size <= small_block_size) {
    size = 8;
@@ -135,10 +117,6 @@ CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
    size = page_size * ((size + page_size - 1) / page_size);
  }

-  if (size <= small_block_size || stream == nullptr) {
-    device = -1;
-  }
-
  CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
  if (!buf) {
    // If we have a lot of memory pressure try to reclaim memory from the cache.
@@ -154,6 +132,10 @@ CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
    }
    lock.unlock();
    if (!buf) {
+      int device = -1;
+      if (stream != nullptr) {
+        CHECK_CUDA_ERROR(cudaStreamGetDevice(stream, &device));
+      }
      buf = new CudaBuffer{nullptr, size, device};
      cudaError_t err;
      if (device == -1) {
@@ -168,22 +150,22 @@ CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
    }
    lock.lock();
  }
  active_memory_ += buf->size;
  peak_memory_ = std::max(active_memory_, peak_memory_);

  // Maintain the cache below the requested limit.
  if (get_cache_memory() > max_pool_size_) {
    buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
  }
-  // Copy to managed here if the buffer is not on the right device
-  if (buf->device != device) {
-    copy_to_managed(*buf);
-  }
  return Buffer{buf};
 }

+Buffer CudaAllocator::malloc_async(size_t size, cudaStream_t stream) {
+  return malloc_impl(size, stream);
+}
+
 Buffer CudaAllocator::malloc(size_t size) {
-  return malloc_async(size, -1, nullptr);
+  return malloc_impl(size, nullptr);
 }

 void CudaAllocator::free(Buffer buffer) {
@@ -191,10 +173,6 @@ void CudaAllocator::free(Buffer buffer) {
  if (!buf) {
    return;
  }
-  if (buf->size == 0) {
-    delete buf;
-    return;
-  }

  std::unique_lock lock(mutex_);
  active_memory_ -= buf->size;
@@ -273,9 +251,8 @@ CudaAllocator& allocator() {
  return *allocator_;
 }

-Buffer malloc_async(size_t size, CommandEncoder& encoder) {
-  auto buffer = allocator().malloc_async(
-      size, encoder.device().cuda_device(), encoder.stream());
+Buffer malloc_async(size_t size, cudaStream_t stream) {
+  auto buffer = allocator().malloc_async(size, stream);
  if (size && !buffer.ptr()) {
    std::ostringstream msg;
    msg << "[malloc_async] Unable to allocate " << size << " bytes.";
@@ -298,7 +275,15 @@ void* Buffer::raw_ptr() {
  }
  auto& cbuf = *static_cast<cu::CudaBuffer*>(ptr_);
  if (cbuf.device != -1) {
-    copy_to_managed(cbuf);
+    // TODO maybe make this async on an I/O stream to avoid synchronizing the
+    // device on malloc and free
+    void* new_data;
+    CHECK_CUDA_ERROR(cudaMallocManaged(&new_data, cbuf.size));
+    cbuf.device = -1;
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(new_data, cbuf.data, cbuf.size, cudaMemcpyDefault));
+    CHECK_CUDA_ERROR(cudaFree(cbuf.data));
+    cbuf.data = new_data;
  }
  return cbuf.data;
 }
--- a/mlx/backend/cuda/allocator.h
+++ b/mlx/backend/cuda/allocator.h
@@ -13,8 +13,6 @@

 namespace mlx::core::cu {

-class CommandEncoder;
-
 using allocator::Buffer;

 // Stores cuda-managed unified memory.
@@ -50,7 +48,7 @@ class SmallSizePool {
 class CudaAllocator : public allocator::Allocator {
 public:
  Buffer malloc(size_t size) override;
-  Buffer malloc_async(size_t size, int device, cudaStream_t stream);
+  Buffer malloc_async(size_t size, cudaStream_t stream);
  void free(Buffer buffer) override;
  size_t size(Buffer buffer) const override;

@@ -64,6 +62,7 @@ class CudaAllocator : public allocator::Allocator {
  void clear_cache();

 private:
+  Buffer malloc_impl(size_t size, cudaStream_t stream);
  void cuda_free(CudaBuffer* buf);

  CudaAllocator();
@@ -81,6 +80,6 @@ class CudaAllocator : public allocator::Allocator {

 CudaAllocator& allocator();

-Buffer malloc_async(size_t size, CommandEncoder& encoder);
+Buffer malloc_async(size_t size, cudaStream_t stream);

 } // namespace mlx::core::cu
--- a/mlx/backend/cuda/arange.cu
+++ b/mlx/backend/cuda/arange.cu
@@ -42,7 +42,7 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
    return;
  }
  auto& encoder = cu::get_command_encoder(stream());
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
  encoder.set_output_array(out);

  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
--- a/mlx/backend/cuda/arg_reduce.cu
+++ b/mlx/backend/cuda/arg_reduce.cu
@@ -143,7 +143,7 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));

  // Prepare the shapes, strides and axis arguments.
  Shape shape = remove_index(in.shape(), axis_);
--- a/mlx/backend/cuda/binary/binary.cuh
+++ b/mlx/backend/cuda/binary/binary.cuh
@@ -367,8 +367,9 @@ void binary_op_gpu(
  auto bopt = get_binary_op_type(a, b);
  auto& encoder = cu::get_command_encoder(s);

-  set_binary_op_output_data(
-      a, b, out, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
+  set_binary_op_output_data(a, b, out, bopt, [&](auto n) {
+    return cu::malloc_async(n, encoder.stream());
+  });
  binary_op_gpu_inplace<Op>(inputs, out, op, s);
 }

--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@@ -246,10 +246,12 @@ void binary_two_op_gpu_inplace(
  auto& out_b = outputs[1];
  auto bopt = get_binary_op_type(a, b);
  auto& encoder = cu::get_command_encoder(s);
-  set_binary_op_output_data(
-      a, b, out_a, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
-  set_binary_op_output_data(
-      a, b, out_b, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
+  set_binary_op_output_data(a, b, out_a, bopt, [&](auto n) {
+    return cu::malloc_async(n, encoder.stream());
+  });
+  set_binary_op_output_data(a, b, out_b, bopt, [&](auto n) {
+    return cu::malloc_async(n, encoder.stream());
+  });

  if (out_a.size() == 0) {
    return;
--- a/mlx/backend/cuda/compiled.cpp
+++ b/mlx/backend/cuda/compiled.cpp
@@ -298,7 +298,7 @@ void Compiled::eval_gpu(
  // Put outputs.
  compiled_allocate_outputs(
      inputs, outputs, is_constant_, contiguous, [&](auto n) {
-        return cu::malloc_async(n, encoder);
+        return cu::malloc_async(n, encoder.stream());
      });
  for (auto& x : outputs) {
    args.append(x);
--- a/mlx/backend/cuda/conv.cpp
+++ b/mlx/backend/cuda/conv.cpp
@@ -277,12 +277,11 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
  array in = inputs[0];
  array wt = inputs[1];
  array out = out_;
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
  Dtype dtype = out.dtype();

  // Search cache.
-  BytesKey<ConvCacheKey> cache_key;
-  cache_key.pod = {
+  ConvCacheKey cache_key{
      encoder.device().cuda_device(),
      dtype_to_cudnn_type(dtype),
      vector_key(in.shape()),
--- a/mlx/backend/cuda/conv/gemm_conv.cu
+++ b/mlx/backend/cuda/conv/gemm_conv.cu
@@ -86,7 +86,7 @@ array unfold_inputs_nd(
    int mat_N,
    ConvParams<NDIM>& params) {
  array unfolded({mat_M, mat_K}, in.dtype(), nullptr, {});
-  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder));
+  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder.stream()));
  encoder.add_temporary(unfolded);

  int filter_size = params.C;
--- a/mlx/backend/cuda/conv/gemm_grouped_conv.cu
+++ b/mlx/backend/cuda/conv/gemm_grouped_conv.cu
@@ -89,7 +89,7 @@ array grouped_unfold_transpose_inputs_nd(
    int mat_N,
    ConvParams<NDIM>& params) {
  array unfolded({mat_M, mat_K * params.groups}, in.dtype(), nullptr, {});
-  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder));
+  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder.stream()));
  encoder.add_temporary(unfolded);

  int filter_size = params.C;
--- a/mlx/backend/cuda/copy.cu
+++ b/mlx/backend/cuda/copy.cu
@@ -7,8 +7,9 @@ namespace mlx::core {

 void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
  auto& encoder = cu::get_command_encoder(s);
-  bool donated = set_copy_output_data(
-      in, out, ctype, [&](auto n) { return cu::malloc_async(n, encoder); });
+  bool donated = set_copy_output_data(in, out, ctype, [&](auto n) {
+    return cu::malloc_async(n, encoder.stream());
+  });
  if (donated && in.dtype() == out.dtype()) {
    // If the output has the same type as the input then there is nothing to
    // copy, just use the buffer.
@@ -103,7 +104,7 @@ void fill_gpu(const array& in, array& out, const Stream& s) {
    return;
  }
  auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  copy_contiguous(encoder, CopyType::Scalar, in, out, 0, 0);
@@ -113,7 +114,7 @@ void reshape_gpu(const array& in, array& out, Stream s) {
  auto [copy_necessary, out_strides] = prepare_reshape(in, out);
  if (copy_necessary) {
    auto& encoder = cu::get_command_encoder(s);
-    out.set_data(cu::malloc_async(out.nbytes(), encoder));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
    copy_gpu_inplace(
        in,
        out,
--- a/mlx/backend/cuda/cudnn_utils.cpp
+++ b/mlx/backend/cuda/cudnn_utils.cpp
@@ -135,7 +135,9 @@ bool prepare_cudnn_plan(
  void* workspace_ptr = nullptr;
  if (workspace_size > 0) {
    array workspace(
-        cu::malloc_async(workspace_size, encoder), {workspace_size}, uint8);
+        cu::malloc_async(workspace_size, encoder.stream()),
+        {workspace_size},
+        uint8);
    encoder.add_temporary(workspace);
    workspace_ptr = gpu_ptr<void>(workspace);
  }
--- a/mlx/backend/cuda/cudnn_utils.h
+++ b/mlx/backend/cuda/cudnn_utils.h
@@ -44,13 +44,13 @@ inline SmallVector<T> convert_vector(const Vec& vec) {
 // There are 2 differences from the const_param util from kernel_utils.cuh:
 // 1. The rest of array is filled with 0.
 // 2. This util can be used in .cpp files.
-template <int NDIM = MAX_NDIM, typename T, template <typename U> class Vec>
-inline std::array<T, NDIM> vector_key(const Vec<T>& vec) {
-  if (vec.size() > NDIM) {
+template <typename T, template <typename U> class Vec>
+inline std::array<T, MAX_NDIM> vector_key(const Vec<T>& vec) {
+  if (vec.size() > MAX_NDIM) {
    throw std::runtime_error(
-        fmt::format("ndim can not be larger than {}.", NDIM));
+        fmt::format("ndim can not be larger than {}.", MAX_NDIM));
  }
-  std::array<T, NDIM> result = {};
+  std::array<T, MAX_NDIM> result = {};
  std::copy_n(vec.begin(), vec.size(), result.begin());
  return result;
 }
--- a/mlx/backend/cuda/custom_kernel.cpp
+++ b/mlx/backend/cuda/custom_kernel.cpp
@@ -57,7 +57,7 @@ std::string build_kernel(
    const std::vector<std::string>& output_names,
    const std::vector<Dtype>& output_dtypes,
    const std::vector<std::pair<std::string, TemplateArg>>& template_args,
-    const std::vector<std::tuple<bool, bool, bool>>& shape_infos) {
+    const std::vector<CustomKernelShapeInfo>& shape_infos) {
  std::string kernel_source;
  kernel_source.reserve(header.size() + source.size() + 8192);
  kernel_source += default_header;
@@ -81,17 +81,17 @@ std::string build_kernel(
    kernel_source += ",\n";
    // Add input shape, strides and ndim if present in the source
    if (arr.ndim() > 0) {
-      if (std::get<0>(shape_infos[i])) {
+      if (shape_infos[i].shape) {
        kernel_source += "    const __grid_constant__ Shape ";
        kernel_source += name;
        kernel_source += "_shape,\n";
      }
-      if (std::get<1>(shape_infos[i])) {
+      if (shape_infos[i].strides) {
        kernel_source += "    const __grid_constant__ Strides ";
        kernel_source += name;
        kernel_source += "_strides,\n";
      }
-      if (std::get<2>(shape_infos[i])) {
+      if (shape_infos[i].ndim) {
        kernel_source += "    const __grid_constant__ int ";
        kernel_source += name;
        kernel_source += "_ndim,\n";
@@ -154,12 +154,12 @@ CustomKernelFunction cuda_kernel(
        "[custom_kernel] Must specify at least one output.");
  }

-  std::vector<std::tuple<bool, bool, bool>> shape_infos;
+  std::vector<CustomKernelShapeInfo> shape_infos;
  for (auto& n : input_names) {
-    std::tuple<bool, bool, bool> shape_info;
-    std::get<0>(shape_info) = source.find(n + "_shape") != std::string::npos;
-    std::get<1>(shape_info) = source.find(n + "_strides") != std::string::npos;
-    std::get<2>(shape_info) = source.find(n + "_ndim") != std::string::npos;
+    CustomKernelShapeInfo shape_info;
+    shape_info.shape = source.find(n + "_shape") != std::string::npos;
+    shape_info.strides = source.find(n + "_strides") != std::string::npos;
+    shape_info.ndim = source.find(n + "_ndim") != std::string::npos;
    shape_infos.push_back(shape_info);
  }

@@ -254,8 +254,8 @@ std::vector<array> precompiled_cuda_kernel(
    std::optional<float> init_value,
    bool ensure_row_contiguous,
    StreamOrDevice s) {
-  std::vector<std::tuple<bool, bool, bool>> shape_infos(
-      inputs.size(), {false, false, false});
+  std::vector<CustomKernelShapeInfo> shape_infos(
+      inputs.size(), CustomKernelShapeInfo{false, false, false});
  return array::make_arrays(
      output_shapes,
      output_dtypes,
@@ -289,7 +289,7 @@ void CustomKernel::eval_gpu(
      copies.emplace_back(init_value_.value(), out.dtype());
      fill_gpu(copies.back(), out, s);
    } else {
-      out.set_data(cu::malloc_async(out.nbytes(), encoder));
+      out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
    }
  }

@@ -327,13 +327,13 @@ void CustomKernel::eval_gpu(
    const array& in = checked_inputs[i];
    auto& shape_info = shape_infos_[i];
    args.append(in);
-    if (std::get<0>(shape_info)) {
+    if (shape_info.shape) {
      args.append_ndim(in.shape());
    }
-    if (std::get<1>(shape_info)) {
+    if (shape_info.strides) {
      args.append_ndim(in.strides());
    }
-    if (std::get<2>(shape_info)) {
+    if (shape_info.ndim) {
      args.append<int32_t>(in.ndim());
    }
  }
--- a/mlx/backend/cuda/detect_cuda_arch.sh
+++ b/mlx/backend/cuda/detect_cuda_arch.sh
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-arch=`__nvcc_device_query`
-case "$arch" in
-    "90")
-        echo "90a" ;;
-    "100")
-        echo "100a" ;;
-    "121")
-        echo "121a" ;;
-    *)
-        echo "native" ;;
-esac
--- a/mlx/backend/cuda/device.cpp
+++ b/mlx/backend/cuda/device.cpp
@@ -46,7 +46,6 @@ Device::Device(int device) : device_(device) {
        "Device {} does not support synchronization in managed memory.",
        device_));
  }
-
  // The cublasLt handle is used by matmul.
  make_current();
  CHECK_CUBLAS_ERROR(cublasLtCreate(&lt_));
@@ -190,41 +189,12 @@ void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) {
  }
 }

-// Can be tuned with MLX_MAX_OPS_PER_BUFFER, MLX_MAX_MB_PER_BUFFER
-std::pair<int, int> get_graph_limits(Device& d) {
-  auto cc =
-      d.compute_capability_major() * 100 + d.compute_capability_minor() * 10;
-  int ops = 20;
-  int mb = 100;
-  switch (cc) {
-    case 800: // A100
-      ops = 20;
-      mb = 400;
-      break;
-    case 900: // H100
-      ops = 30;
-      mb = 400;
-      break;
-    case 1000: // B200
-      ops = 50;
-      mb = 500;
-      break;
-    case 1210: // DGX Spark
-      ops = 20;
-      mb = 25;
-      break;
-  }
-  return {env::max_ops_per_buffer(ops), env::max_mb_per_buffer(mb)};
-}
-
 CommandEncoder::CommandEncoder(Device& d)
    : device_(d),
      stream_(d),
      graph_(d),
      worker_(d),
-      graph_cache_("MLX_CUDA_GRAPH_CACHE_SIZE", /* default_capacity */ 400) {
-  std::tie(max_ops_per_graph_, max_mb_per_graph_) = get_graph_limits(d);
-}
+      graph_cache_("MLX_CUDA_GRAPH_CACHE_SIZE", /* default_capacity */ 400) {}

 void CommandEncoder::add_completed_handler(std::function<void()> task) {
  worker_.add_task(std::move(task));
@@ -234,7 +204,6 @@ void CommandEncoder::set_input_array(const array& arr) {
  if (!use_cuda_graphs()) {
    return;
  }
-  bytes_in_graph_ += arr.data_size();
  auto id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
  active_deps_.push_back(id);
 }
@@ -332,9 +301,8 @@ void CommandEncoder::add_graph_node(cudaGraph_t child) {
  insert_graph_dependencies(GraphNode{node, 'G'});
 }

-bool CommandEncoder::needs_commit() {
-  return (node_count_ > max_ops_per_graph_) ||
-      ((bytes_in_graph_ >> 20) > max_mb_per_graph_);
+int CommandEncoder::get_num_ops() {
+  return node_count_;
 }

 void CommandEncoder::commit() {
@@ -397,11 +365,10 @@ void CommandEncoder::commit() {
  // Put completion handlers in a batch.
  worker_.commit(stream_);
  node_count_ = 0;
-  bytes_in_graph_ = 0;
 }

 void CommandEncoder::synchronize() {
  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
  auto p = std::make_shared<std::promise<void>>();
  std::future<void> f = p->get_future();
  add_completed_handler([p = std::move(p)]() { p->set_value(); });
--- a/mlx/backend/cuda/device.h
+++ b/mlx/backend/cuda/device.h
@@ -84,7 +84,7 @@ class CommandEncoder {
  }

  void add_completed_handler(std::function<void()> task);
-  bool needs_commit();
+  int get_num_ops();
  void commit();

  Device& device() {
@@ -131,9 +131,6 @@ class CommandEncoder {
  std::vector<std::uintptr_t> active_deps_;
  std::vector<std::uintptr_t> active_outputs_;
  std::unordered_map<std::uintptr_t, GraphNode> node_map_;
-  size_t bytes_in_graph_{0};
-  int max_ops_per_graph_;
-  int max_mb_per_graph_;
 };

 class Device {
@@ -169,7 +166,6 @@ class Device {
  int device_;
  int compute_capability_major_;
  int compute_capability_minor_;
-  std::string device_name_;
  cublasLtHandle_t lt_;
  cudnnHandle_t cudnn_;
  std::unordered_map<int, CommandEncoder> encoders_;
--- a/mlx/backend/cuda/distributed.cu
+++ b/mlx/backend/cuda/distributed.cu
@@ -26,7 +26,7 @@ void AllReduce::eval_gpu(
      out.copy_shared_buffer(in);
      return {in, out};
    } else {
-      out.set_data(cu::malloc_async(out.nbytes(), encoder));
+      out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
      return {in, out};
    }
  };
@@ -53,69 +53,4 @@ void AllReduce::eval_gpu(
          "Only all reduce sum, max, and min are supported.");
  }
 }
-
-void AllGather::eval_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  assert(inputs.size() == 1);
-  assert(outputs.size() == 1);
-
-  auto& s = stream();
-  auto& encoder = cu::get_command_encoder(s);
-
-  auto ensure_contiguous = [&s, &encoder](const array& x) {
-    if (x.flags().row_contiguous) {
-      return x;
-    } else {
-      array x_copy = contiguous_copy_gpu(x, s);
-      encoder.add_temporary(x_copy);
-      return x_copy;
-    }
-  };
-
-  auto input = ensure_contiguous(inputs[0]);
-  outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder));
-
-  encoder.set_input_array(input);
-  encoder.set_output_array(outputs[0]);
-
-  auto capture = encoder.capture_context();
-  distributed::detail::all_gather(group(), input, outputs[0], s);
-}
-
-void ReduceScatter::eval_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  assert(inputs.size() == 1);
-  assert(outputs.size() == 1);
-
-  auto& s = stream();
-  auto& encoder = cu::get_command_encoder(s);
-
-  auto ensure_contiguous = [&s, &encoder](const array& x) {
-    if (x.flags().row_contiguous) {
-      return x;
-    } else {
-      array x_copy = contiguous_copy_gpu(x, s);
-      encoder.add_temporary(x_copy);
-      return x_copy;
-    }
-  };
-
-  auto input = ensure_contiguous(inputs[0]);
-  outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder));
-
-  encoder.set_input_array(input);
-  encoder.set_output_array(outputs[0]);
-
-  auto capture = encoder.capture_context();
-
-  switch (reduce_type_) {
-    case Sum:
-      distributed::detail::sum_scatter(group(), input, outputs[0], s);
-      break;
-    default:
-      throw std::runtime_error("Only sum scatter is supported. ");
-  }
-}
 } // namespace mlx::core::distributed
--- a/mlx/backend/cuda/eval.cpp
+++ b/mlx/backend/cuda/eval.cpp
@@ -11,6 +11,9 @@

 namespace mlx::core::gpu {

+// Can be tuned with MLX_MAX_OPS_PER_BUFFER
+constexpr int default_max_nodes_per_graph = 20;
+
 bool is_available() {
  return true;
 }
@@ -50,7 +53,8 @@ void eval(array& arr) {
    encoder.add_temporary(s);
  }

-  if (encoder.needs_commit()) {
+  if (encoder.get_num_ops() >=
+      env::max_ops_per_buffer(default_max_nodes_per_graph)) {
    scheduler::notify_new_task(stream);
    encoder.add_completed_handler(
        [stream]() { scheduler::notify_task_completion(stream); });
--- a/mlx/backend/cuda/fence.cpp
+++ b/mlx/backend/cuda/fence.cpp
@@ -1,8 +1,6 @@
 // Copyright © 2025 Apple Inc.

 #include "mlx/fence.h"
-#include "mlx/backend/cuda/allocator.h"
-#include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/event.h"

 namespace mlx::core {
@@ -22,24 +20,8 @@ void Fence::wait(Stream s, const array&) {
  fence->event.wait(fence->count);
 }

-void Fence::update(Stream s, const array& a, bool cross_device) {
+void Fence::update(Stream s, const array&) {
  auto* fence = static_cast<FenceImpl*>(fence_.get());
-  if (cross_device) {
-    // Move to managed memory if there is a device switch
-    auto& cbuf =
-        *static_cast<cu::CudaBuffer*>(const_cast<array&>(a).buffer().ptr());
-    if (cbuf.device != -1) {
-      void* new_data;
-      CHECK_CUDA_ERROR(cudaMallocManaged(&new_data, cbuf.size));
-      cbuf.device = -1;
-      auto& encoder = cu::device(s.device).get_command_encoder(s);
-      encoder.commit();
-      CHECK_CUDA_ERROR(cudaMemcpyAsync(
-          new_data, cbuf.data, cbuf.size, cudaMemcpyDefault, encoder.stream()));
-      CHECK_CUDA_ERROR(cudaFreeAsync(cbuf.data, encoder.stream()));
-      cbuf.data = new_data;
-    }
-  }
  fence->count++;
  fence->event.signal(s, fence->count);
 }
--- a/mlx/backend/cuda/gemms/cublas_gemm.cpp
+++ b/mlx/backend/cuda/gemms/cublas_gemm.cpp
@@ -370,7 +370,7 @@ void CublasGemm::execute(
    // Ensure workspace is 256-byte aligned
    int nbytes = cuda::ceil_div(heuristic_.workspaceSize, 256) * 256;
    array workspace(
-        cu::malloc_async(nbytes, encoder),
+        cu::malloc_async(nbytes, encoder.stream()),
        {static_cast<int>(heuristic_.workspaceSize)},
        int8);
    encoder.add_temporary(workspace);
--- a/mlx/backend/cuda/gemms/cublas_gemm_batched_12_9.cu
+++ b/mlx/backend/cuda/gemms/cublas_gemm_batched_12_9.cu
@@ -163,7 +163,7 @@ void CublasGemm::run_batched(

  // Launch kernel to set device offsets
  auto pointers = array(
-      cu::malloc_async(batch_count * sizeof(void*) * 3, encoder),
+      cu::malloc_async(batch_count * sizeof(void*) * 3, encoder.stream()),
      {batch_count * 3},
      uint64);

@@ -251,7 +251,7 @@ void CublasGemm::run_batched(

  // Launch kernel to set device offsets
  auto pointers = array(
-      cu::malloc_async(batch_count * sizeof(uint64_t) * 4, encoder),
+      cu::malloc_async(batch_count * sizeof(uint64_t) * 4, encoder.stream()),
      {batch_count * 4},
      uint64);

--- a/mlx/backend/cuda/indexing.cpp
+++ b/mlx/backend/cuda/indexing.cpp
@@ -61,7 +61,7 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
  if (out.size() == 0) {
    return;
  }
@@ -241,7 +241,7 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
  if (out.size() == 0) {
    return;
  }
--- a/mlx/backend/cuda/jit_module.cpp
+++ b/mlx/backend/cuda/jit_module.cpp
@@ -279,14 +279,11 @@ void compile(
  // Compile program.
  std::vector<const char*> args;
  bool use_sass = compiler_supports_device_sass(device);
-  auto cc = device.compute_capability_major();
-  std::string arch_tag = (cc == 90 || cc == 100 || cc == 121) ? "a" : "";
  std::string compute = fmt::format(
-      "--gpu-architecture={}_{}{}{}",
+      "--gpu-architecture={}_{}{}",
      use_sass ? "sm" : "compute",
-      cc,
-      device.compute_capability_minor(),
-      arch_tag);
+      device.compute_capability_major(),
+      device.compute_capability_minor());
  args.push_back(compute.c_str());
  std::string cccl_include = cccl_dir();
  if (!cccl_include.empty()) {
--- a/mlx/backend/cuda/layer_norm.cu
+++ b/mlx/backend/cuda/layer_norm.cu
@@ -244,7 +244,7 @@ void LayerNorm::eval_gpu(
        out.copy_shared_buffer(x);
      } else {
        out.set_data(
-            cu::malloc_async(x.data_size() * x.itemsize(), encoder),
+            cu::malloc_async(x.data_size() * x.itemsize(), encoder.stream()),
            x.data_size(),
            x.strides(),
            x.flags());
@@ -335,7 +335,7 @@ void LayerNormVJP::eval_gpu(
    gx.copy_shared_buffer(g);
    g_in_gx = true;
  } else {
-    gx.set_data(cu::malloc_async(gx.nbytes(), encoder));
+    gx.set_data(cu::malloc_async(gx.nbytes(), encoder.stream()));
  }
  if (g_copied && !g_in_gx) {
    encoder.add_temporary(g);
@@ -355,7 +355,7 @@ void LayerNormVJP::eval_gpu(
      g_in_gw = true;
      gw_temp.copy_shared_buffer(g);
    } else {
-      gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder));
+      gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder.stream()));
      encoder.add_temporary(gw_temp);
    }
  }
--- a/mlx/backend/cuda/load.cpp
+++ b/mlx/backend/cuda/load.cpp
@@ -32,7 +32,7 @@ void Load::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(stream());
  auto size = out.size();
  auto nbytes = size * out.itemsize();
-  out.set_data(cu::malloc_async(nbytes, encoder));
+  out.set_data(cu::malloc_async(nbytes, encoder.stream()));
  auto out_ptr = malloc(nbytes);
  reader_->read(static_cast<char*>(out_ptr), nbytes, offset_);
  if (swap_endianness_) {
--- a/mlx/backend/cuda/logsumexp.cu
+++ b/mlx/backend/cuda/logsumexp.cu
@@ -115,7 +115,7 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {

  auto in = ensure_contiguous(inputs[0]);
  if (in.flags().row_contiguous) {
-    out.set_data(cu::malloc_async(out.nbytes(), encoder));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
  } else {
    auto n = in.shape(-1);
    auto flags = in.flags();
@@ -130,7 +130,7 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
    }
    flags.col_contiguous = col_contig;
    out.set_data(
-        cu::malloc_async(in.nbytes() / n, encoder),
+        cu::malloc_async(in.nbytes() / n, encoder.stream()),
        in.data_size() / n,
        std::move(strides),
        flags);
--- a/mlx/backend/cuda/lru_cache.h
+++ b/mlx/backend/cuda/lru_cache.h
@@ -135,19 +135,12 @@ class LRUCache {
 };

 // Turn a POD struct into a container key by doing bytes compare.
-//
-// Usage:
-//   BytesKey<MyKey> key;
-//   key.pod = { ... };
 template <typename T>
 struct BytesKey {
  T pod;
  static_assert(std::is_standard_layout_v<T>, "T is not POD");

-  BytesKey() {
-    // Make sure the paddings between members are filled with 0.
-    memset(&pod, 0, sizeof(T));
-  }
+  BytesKey(T pod) : pod(std::move(pod)) {}

  BytesKey(const BytesKey& other) {
    memcpy(&pod, &other.pod, sizeof(T));
--- a/mlx/backend/cuda/matmul.cpp
+++ b/mlx/backend/cuda/matmul.cpp
@@ -121,7 +121,7 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
    return;
  }

-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));

  int M = a_pre.shape(-2);
  int N = b_pre.shape(-1);
@@ -163,7 +163,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {

  if (beta_ == 1 && a.dtype() != complex64 && c.strides(-1) == 1 &&
      c.data_size() == out.shape(-1)) {
-    out.set_data(cu::malloc_async(out.nbytes(), encoder));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
    gemm_and_bias(
        encoder,
        M,
@@ -187,10 +187,10 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
    auto sty = c.strides()[c.ndim() - 1];
    if (sty == 1 && stx == c.shape(-1)) {
      ldc = stx;
-      out.set_data(cu::malloc_async(out.nbytes(), encoder));
+      out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
    } else if (sty == 1 && stx == 0) {
      ldc = 0;
-      out.set_data(cu::malloc_async(out.nbytes(), encoder));
+      out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
    } else {
      // Copy C into out and set C to out
      ldc = c.shape(-1);
--- a/mlx/backend/cuda/primitives.cpp
+++ b/mlx/backend/cuda/primitives.cpp
@@ -37,9 +37,9 @@ NO_GPU(Inverse)
 NO_GPU(Cholesky)
 NO_GPU_MULTI(Eig)
 NO_GPU_MULTI(Eigh)
-NO_GPU(MaskedScatter)

 namespace distributed {
+NO_GPU_MULTI(AllGather)
 NO_GPU_MULTI(Send)
 NO_GPU_MULTI(Recv)
 } // namespace distributed
--- a/mlx/backend/cuda/quantized/quantized.cpp
+++ b/mlx/backend/cuda/quantized/quantized.cpp
@@ -59,7 +59,7 @@ void fast::Quantize::eval_gpu(
    auto scales = ensure_row_contiguous(inputs[1], enc, s);
    auto& w = outputs[0];

-    w.set_data(cu::malloc_async(w.nbytes(), enc));
+    w.set_data(cu::malloc_async(w.nbytes(), enc.stream()));

    if (mode_ == QuantizationMode::Affine) {
      auto biases = ensure_row_contiguous(inputs[2], enc, s);
@@ -72,11 +72,11 @@ void fast::Quantize::eval_gpu(
    auto& wq = outputs[0];
    auto& scales = outputs[1];

-    wq.set_data(cu::malloc_async(wq.nbytes(), enc));
-    scales.set_data(cu::malloc_async(scales.nbytes(), enc));
+    wq.set_data(cu::malloc_async(wq.nbytes(), enc.stream()));
+    scales.set_data(cu::malloc_async(scales.nbytes(), enc.stream()));
    if (mode_ == QuantizationMode::Affine) {
      auto& biases = outputs[2];
-      biases.set_data(cu::malloc_async(biases.nbytes(), enc));
+      biases.set_data(cu::malloc_async(biases.nbytes(), enc.stream()));
      affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
    } else {
      fp_quantize(w, wq, scales, group_size_, bits_, enc, s);
--- a/mlx/backend/cuda/random.cu
+++ b/mlx/backend/cuda/random.cu
@@ -145,7 +145,7 @@ void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
  uint32_t bytes_per_key = out.itemsize() * elems_per_key;
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
  if (out.size() == 0) {
    return;
  }
--- a/mlx/backend/cuda/reduce/all_reduce.cu
+++ b/mlx/backend/cuda/reduce/all_reduce.cu
@@ -66,7 +66,7 @@ void all_reduce(
    Reduce::ReduceType reduce_type) {
  constexpr int N_READS = 8;

-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));

  auto get_args = [](size_t size, int N) {
    int threads = std::min(512UL, (size + N - 1) / N);
@@ -107,7 +107,8 @@ void all_reduce(
  encoder.set_input_array(in);
  if (blocks > 1) {
    array intermediate({blocks}, out.dtype(), nullptr, {});
-    intermediate.set_data(cu::malloc_async(intermediate.nbytes(), encoder));
+    intermediate.set_data(
+        cu::malloc_async(intermediate.nbytes(), encoder.stream()));
    encoder.add_temporary(intermediate);
    encoder.set_output_array(intermediate);
    dispatch_all_types(dt, [&](auto type_tag) {
--- a/mlx/backend/cuda/reduce/init_reduce.cu
+++ b/mlx/backend/cuda/reduce/init_reduce.cu
@@ -28,7 +28,7 @@ void init_reduce(
    Reduce::ReduceType reduce_type) {
  // Allocate if needed
  if (out.data_shared_ptr() == nullptr) {
-    out.set_data(cu::malloc_async(out.nbytes(), encoder));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
  }

  encoder.set_output_array(out);
--- a/mlx/backend/cuda/reduce/reduce_utils.cuh
+++ b/mlx/backend/cuda/reduce/reduce_utils.cuh
@@ -96,7 +96,7 @@ inline void allocate_same_layout(
    const std::vector<int>& axes,
    cu::CommandEncoder& encoder) {
  if (in.flags().row_contiguous) {
-    out.set_data(cu::malloc_async(out.nbytes(), encoder));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
    return;
  }

@@ -135,7 +135,7 @@ inline void allocate_same_layout(
  fl.col_contiguous = cc;
  fl.contiguous = true;
  out.set_data(
-      cu::malloc_async(out.nbytes(), encoder),
+      cu::malloc_async(out.nbytes(), encoder.stream()),
      data_size,
      final_strides,
      fl,
--- a/mlx/backend/cuda/rms_norm.cu
+++ b/mlx/backend/cuda/rms_norm.cu
@@ -190,7 +190,7 @@ void RMSNorm::eval_gpu(
        out.copy_shared_buffer(x);
      } else {
        out.set_data(
-            cu::malloc_async(x.data_size() * x.itemsize(), encoder),
+            cu::malloc_async(x.data_size() * x.itemsize(), encoder.stream()),
            x.data_size(),
            x.strides(),
            x.flags());
@@ -274,7 +274,7 @@ void RMSNormVJP::eval_gpu(
    gx.copy_shared_buffer(g);
    g_in_gx = true;
  } else {
-    gx.set_data(cu::malloc_async(gx.nbytes(), encoder));
+    gx.set_data(cu::malloc_async(gx.nbytes(), encoder.stream()));
  }
  if (g_copied && !g_in_gx) {
    encoder.add_temporary(g);
@@ -292,7 +292,7 @@ void RMSNormVJP::eval_gpu(
    if (!g_in_gx && donate_g) {
      gw_temp.copy_shared_buffer(g);
    } else {
-      gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder));
+      gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder.stream()));
      encoder.add_temporary(gw_temp);
    }
  }
--- a/mlx/backend/cuda/rope.cu
+++ b/mlx/backend/cuda/rope.cu
@@ -292,14 +292,14 @@ void RoPE::eval_gpu(
      donated = true;
      out.copy_shared_buffer(in);
    } else {
-      out.set_data(cu::malloc_async(out.nbytes(), encoder));
+      out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
    }
    strides[0] = mat_size;
    strides[1] = in.strides()[ndim - 2];
    strides[2] = in.strides()[ndim - 1];
  } else if (dispatch_ndim == 3) {
    // Handle non-contiguous 3D inputs
-    out.set_data(cu::malloc_async(out.nbytes(), encoder));
+    out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));
    strides[0] = in.strides()[ndim - 3];
    strides[1] = in.strides()[ndim - 2];
    strides[2] = in.strides()[ndim - 1];
--- a/mlx/backend/cuda/scaled_dot_product_attention.cpp
+++ b/mlx/backend/cuda/scaled_dot_product_attention.cpp
@@ -1,537 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/cudnn_utils.h"
-#include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/lru_cache.h"
-#include "mlx/backend/gpu/copy.h"
-#include "mlx/fast_primitives.h"
-
-#include <nvtx3/nvtx3.hpp>
-
-namespace mlx::core {
-
-namespace fe = cudnn_frontend;
-
-namespace {
-
-#define CHECK_CUDNN_FE_ERROR(cmd)                                    \
-  do {                                                               \
-    auto error = cmd;                                                \
-    if (!error.is_good()) {                                          \
-      throw std::runtime_error(                                      \
-          fmt::format("{} failed: {}.", #cmd, error.get_message())); \
-    }                                                                \
-  } while (0)
-
-std::vector<int64_t> normalized_strides(const array& x) {
-  std::vector<int64_t> strides(x.strides().begin(), x.strides().end());
-  if (std::all_of(
-          strides.begin(), strides.end(), [](int64_t s) { return s == 0; })) {
-    strides.back() = 1;
-    return strides;
-  }
-  if (!x.flags().row_contiguous || x.ndim() < 2) {
-    return strides;
-  }
-  for (int i = x.ndim() - 2; i >= 0; --i) {
-    if (x.shape(i) == 1) {
-      strides[i] = x.shape(i + 1) * strides[i + 1];
-    }
-  }
-  return strides;
-}
-
-void set_tensor_attrs(
-    std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
-    int64_t uid,
-    const array& x) {
-  tensor->set_uid(uid)
-      .set_dim({x.shape().begin(), x.shape().end()})
-      .set_stride(normalized_strides(x));
-}
-
-array prepare_sdpa_input(const array& x, Stream s) {
-  // SDPA kernel's requirements on inputs:
-  // 1. last dim's stride be 1;
-  // 2. pointer be aligned.
-  if (x.strides(-1) != 1 || get_alignment(x) < 16) {
-    array x_copy = contiguous_copy_gpu(x, s);
-    auto& encoder = cu::get_command_encoder(s);
-    encoder.add_temporary(x_copy);
-    return x_copy;
-  }
-  return x;
-}
-
-constexpr int QKV_NDIM = 4;
-
-struct SDPACacheKey {
-  int device_id;
-  cudnnDataType_t cudnn_dtype;
-  std::array<int, QKV_NDIM> q_shape;
-  std::array<int, QKV_NDIM> k_shape;
-  std::array<int, QKV_NDIM> v_shape;
-  std::array<int64_t, QKV_NDIM> q_strides;
-  std::array<int64_t, QKV_NDIM> k_strides;
-  std::array<int64_t, QKV_NDIM> v_strides;
-  bool do_causal;
-  bool output_logsumexp;
-};
-
-inline BytesKey<SDPACacheKey> build_sdpa_cache_key(
-    cu::CommandEncoder& encoder,
-    const array& q,
-    const array& k,
-    const array& v,
-    bool do_causal,
-    bool output_logsumexp = true) {
-  BytesKey<SDPACacheKey> cache_key;
-  cache_key.pod = {
-      encoder.device().cuda_device(),
-      dtype_to_cudnn_type(q.dtype()),
-      vector_key<QKV_NDIM>(q.shape()),
-      vector_key<QKV_NDIM>(k.shape()),
-      vector_key<QKV_NDIM>(v.shape()),
-      vector_key<QKV_NDIM>(q.strides()),
-      vector_key<QKV_NDIM>(k.strides()),
-      vector_key<QKV_NDIM>(v.strides()),
-      do_causal,
-      output_logsumexp,
-  };
-  return cache_key;
-}
-
-auto& sdpa_cache() {
-  static LRUBytesKeyCache<SDPACacheKey, fe::graph::Graph> cache(
-      "MLX_CUDA_SDPA_CACHE_SIZE", /* default_capacity */ 16);
-  return cache;
-}
-
-auto& sdpa_backward_cache() {
-  static LRUBytesKeyCache<SDPACacheKey, fe::graph::Graph> cache(
-      "MLX_CUDA_SDPA_BACKWARD_CACHE_SIZE", /* default_capacity */ 16);
-  return cache;
-}
-
-enum UIDS {
-  Q,
-  K,
-  V,
-  SCALE,
-  O,
-  STATS,
-  // Backward graph:
-  D_Q,
-  D_K,
-  D_V,
-  D_O,
-};
-
-fe::graph::Graph build_sdpa_graph(
-    cudnnHandle_t handle,
-    const array& q,
-    const array& k,
-    const array& v,
-    bool do_causal,
-    bool output_logsumexp,
-    const array& o,
-    const array& stats) {
-  auto dtype = fe::DataType_t::HALF;
-  if (q.dtype() == bfloat16) {
-    dtype = fe::DataType_t::BFLOAT16;
-  }
-
-  fe::graph::Graph graph;
-  graph.set_io_data_type(dtype)
-      .set_intermediate_data_type(fe::DataType_t::FLOAT)
-      .set_compute_data_type(fe::DataType_t::FLOAT);
-
-  auto q_ = graph.tensor(fe::graph::Tensor_attributes().set_name("Q"));
-  auto k_ = graph.tensor(fe::graph::Tensor_attributes().set_name("K"));
-  auto v_ = graph.tensor(fe::graph::Tensor_attributes().set_name("V"));
-  set_tensor_attrs(q_, Q, q);
-  set_tensor_attrs(k_, K, k);
-  set_tensor_attrs(v_, V, v);
-
-  auto scale = graph.tensor(fe::graph::Tensor_attributes()
-                                .set_name("Scale")
-                                .set_uid(SCALE)
-                                .set_dim({1, 1, 1, 1})
-                                .set_stride({1, 1, 1, 1})
-                                .set_is_pass_by_value(true)
-                                .set_data_type(fe::DataType_t::FLOAT));
-
-  auto options = fe::graph::SDPA_attributes()
-                     .set_name("sdpa_cudnn")
-                     .set_attn_scale(scale)
-                     .set_causal_mask(do_causal)
-                     .set_generate_stats(output_logsumexp);
-
-  auto [o_, stats_] = graph.sdpa(q_, k_, v_, options);
-  o_->set_output(true);
-  set_tensor_attrs(o_, O, o);
-  if (output_logsumexp) {
-    stats_->set_output(true).set_data_type(fe::DataType_t::FLOAT);
-    set_tensor_attrs(stats_, STATS, stats);
-  }
-
-  CHECK_CUDNN_FE_ERROR(graph.validate());
-  CHECK_CUDNN_FE_ERROR(graph.build_operation_graph(handle));
-  CHECK_CUDNN_FE_ERROR(graph.create_execution_plans({fe::HeurMode_t::A}));
-  graph.select_behavior_notes(
-      {fe::BehaviorNote_t::SUPPORTS_CUDA_GRAPH_NATIVE_API});
-  CHECK_CUDNN_FE_ERROR(graph.check_support(handle));
-  CHECK_CUDNN_FE_ERROR(graph.build_plans(handle));
-
-  return graph;
-}
-
-fe::graph::Graph build_sdpa_backward_graph(
-    cudnnHandle_t handle,
-    const array& q,
-    const array& k,
-    const array& v,
-    bool do_causal,
-    const array& o,
-    const array& d_o,
-    const array& stats,
-    array& d_q,
-    array& d_k,
-    array& d_v) {
-  auto dtype = fe::DataType_t::HALF;
-  if (q.dtype() == bfloat16) {
-    dtype = fe::DataType_t::BFLOAT16;
-  }
-
-  fe::graph::Graph graph;
-  graph.set_io_data_type(dtype)
-      .set_intermediate_data_type(fe::DataType_t::FLOAT)
-      .set_compute_data_type(fe::DataType_t::FLOAT);
-
-  auto q_ = graph.tensor(fe::graph::Tensor_attributes().set_name("Q"));
-  auto k_ = graph.tensor(fe::graph::Tensor_attributes().set_name("K"));
-  auto v_ = graph.tensor(fe::graph::Tensor_attributes().set_name("V"));
-  auto o_ = graph.tensor(fe::graph::Tensor_attributes().set_name("O"));
-  auto d_o_ = graph.tensor(fe::graph::Tensor_attributes().set_name("D_O"));
-  auto stats_ = graph.tensor(fe::graph::Tensor_attributes().set_name("STATS"));
-  set_tensor_attrs(q_, Q, q);
-  set_tensor_attrs(k_, K, k);
-  set_tensor_attrs(v_, V, v);
-  set_tensor_attrs(o_, O, o);
-  set_tensor_attrs(d_o_, D_O, d_o);
-  set_tensor_attrs(stats_, STATS, stats);
-  stats_->set_data_type(fe::DataType_t::FLOAT);
-
-  auto scale = graph.tensor(fe::graph::Tensor_attributes()
-                                .set_name("Scale")
-                                .set_uid(SCALE)
-                                .set_dim({1, 1, 1, 1})
-                                .set_stride({1, 1, 1, 1})
-                                .set_is_pass_by_value(true)
-                                .set_data_type(fe::DataType_t::FLOAT));
-
-  auto options = fe::graph::SDPA_backward_attributes()
-                     .set_name("sdpa_backward_cudnn")
-                     .set_attn_scale(scale)
-                     .set_causal_mask(do_causal);
-
-  auto [d_q_, d_k_, d_v_] =
-      graph.sdpa_backward(q_, k_, v_, o_, d_o_, stats_, options);
-  d_q_->set_output(true);
-  d_k_->set_output(true);
-  d_v_->set_output(true);
-  set_tensor_attrs(d_q_, D_Q, d_q);
-  set_tensor_attrs(d_k_, D_K, d_k);
-  set_tensor_attrs(d_v_, D_V, d_v);
-
-  CHECK_CUDNN_FE_ERROR(graph.validate());
-  CHECK_CUDNN_FE_ERROR(graph.build_operation_graph(handle));
-  CHECK_CUDNN_FE_ERROR(graph.create_execution_plans({fe::HeurMode_t::A}));
-  graph.select_behavior_notes(
-      {fe::BehaviorNote_t::SUPPORTS_CUDA_GRAPH_NATIVE_API});
-  CHECK_CUDNN_FE_ERROR(graph.check_support(handle));
-  CHECK_CUDNN_FE_ERROR(graph.build_plans(handle));
-
-  return graph;
-}
-
-void execute_graph(
-    cu::CommandEncoder& encoder,
-    cudnnHandle_t handle,
-    fe::graph::Graph& graph,
-    std::unordered_map<int64_t, void*>& variant_pack) {
-  int64_t workspace_size = 0;
-  CHECK_CUDNN_FE_ERROR(graph.get_workspace_size(workspace_size));
-  void* workspace_ptr = nullptr;
-  if (workspace_size > 0) {
-    array workspace(
-        cu::malloc_async(workspace_size, encoder),
-        {static_cast<int>(workspace_size)},
-        uint8);
-    encoder.add_temporary(workspace);
-    workspace_ptr = gpu_ptr<void>(workspace);
-  }
-
-  cudnnSetStream(handle, encoder.stream());
-
-  CudaGraph cuda_graph(encoder.device());
-  CHECK_CUDNN_FE_ERROR(graph.populate_cuda_graph(
-      handle, variant_pack, workspace_ptr, cuda_graph));
-  encoder.add_graph_node(cuda_graph);
-}
-
-} // namespace
-
-bool supports_sdpa_cudnn(
-    const array& q,
-    const array& k,
-    const array& v,
-    bool has_mask,
-    bool do_causal,
-    Stream s) {
-  static bool enabled = env::get_var("MLX_CUDA_USE_CUDNN_SPDA", 1);
-  if (!enabled) {
-    return false;
-  }
-
-  // cuDNN SDPA requires Ampere and later.
-  if (cu::device(s.device).compute_capability_major() < 8) {
-    return false;
-  }
-
-  if (has_mask) {
-    // TODO: Support array masks.
-    if (!do_causal) {
-      return false;
-    }
-    // FIXME: Causal mask generates wrong results when L_Q != L_K.
-    if (q.shape(2) != k.shape(2)) {
-      return false;
-    }
-  }
-
-  // Only use cuDNN for prefilling and training.
-  if (q.shape(2) != k.shape(2)) {
-    return false;
-  }
-
-  // D_qk and D_v must be a multiple of 8 with maximum value 128.
-  if ((q.shape(-1) % 8 != 0) || (q.shape(-1) > 128) || (v.shape(-1) % 8 != 0) ||
-      (v.shape(-1) > 128)) {
-    return false;
-  }
-
-  Dtype dtype = q.dtype();
-  return dtype == float16 || dtype == bfloat16;
-}
-
-void sdpa_cudnn(
-    const array& q,
-    const array& k,
-    const array& v,
-    float scale,
-    array& o,
-    array& stats,
-    bool do_causal,
-    bool output_logsumexp,
-    Stream s) {
-  auto& encoder = cu::get_command_encoder(s);
-  auto handle = encoder.device().cudnn_handle();
-
-  // TODO: Handle donation.
-  // TODO: Make O use same memory layout with Q.
-  o.set_data(cu::malloc_async(o.nbytes(), encoder));
-
-  encoder.set_input_array(q);
-  encoder.set_input_array(k);
-  encoder.set_input_array(v);
-  encoder.set_output_array(o);
-
-  if (output_logsumexp) {
-    stats.set_data(cu::malloc_async(stats.nbytes(), encoder));
-    encoder.set_output_array(stats);
-  }
-
-  // Search cache.
-  auto cache_key =
-      build_sdpa_cache_key(encoder, q, k, v, do_causal, output_logsumexp);
-  auto it = sdpa_cache().find(cache_key);
-  if (it == sdpa_cache().end()) {
-    auto graph = build_sdpa_graph(
-        handle, q, k, v, do_causal, output_logsumexp, o, stats);
-    it = sdpa_cache().emplace(cache_key, std::move(graph)).first;
-  }
-  auto& graph = it->second;
-
-  std::unordered_map<int64_t, void*> variant_pack{
-      {Q, const_cast<void*>(gpu_ptr<void>(q))},
-      {K, const_cast<void*>(gpu_ptr<void>(k))},
-      {V, const_cast<void*>(gpu_ptr<void>(v))},
-      {SCALE, &scale},
-      {O, gpu_ptr<void>(o)}};
-  if (output_logsumexp) {
-    variant_pack[STATS] = gpu_ptr<void>(stats);
-  }
-
-  execute_graph(encoder, handle, graph, variant_pack);
-}
-
-void sdpa_backward_cudnn(
-    const array& q,
-    const array& k,
-    const array& v,
-    float scale,
-    const array& o,
-    const array& stats,
-    bool do_causal,
-    const array& d_o,
-    array& d_q,
-    array& d_k,
-    array& d_v,
-    Stream s) {
-  auto& encoder = cu::get_command_encoder(s);
-  auto handle = encoder.device().cudnn_handle();
-
-  // TODO: Handle donation.
-  d_q.set_data(cu::malloc_async(d_q.nbytes(), encoder));
-  d_k.set_data(cu::malloc_async(d_k.nbytes(), encoder));
-  d_v.set_data(cu::malloc_async(d_v.nbytes(), encoder));
-
-  encoder.set_input_array(q);
-  encoder.set_input_array(k);
-  encoder.set_input_array(v);
-  encoder.set_input_array(o);
-  encoder.set_input_array(stats);
-  encoder.set_input_array(d_o);
-  encoder.set_output_array(d_q);
-  encoder.set_output_array(d_k);
-  encoder.set_output_array(d_v);
-
-  // Search cache.
-  auto cache_key = build_sdpa_cache_key(encoder, q, k, v, do_causal);
-  auto it = sdpa_backward_cache().find(cache_key);
-  if (it == sdpa_backward_cache().end()) {
-    auto graph = build_sdpa_backward_graph(
-        handle, q, k, v, do_causal, o, d_o, stats, d_q, d_k, d_v);
-    it = sdpa_backward_cache().emplace(cache_key, std::move(graph)).first;
-  }
-  auto& graph = it->second;
-
-  std::unordered_map<int64_t, void*> variant_pack{
-      {Q, const_cast<void*>(gpu_ptr<void>(q))},
-      {K, const_cast<void*>(gpu_ptr<void>(k))},
-      {V, const_cast<void*>(gpu_ptr<void>(v))},
-      {SCALE, &scale},
-      {O, const_cast<void*>(gpu_ptr<void>(o))},
-      {STATS, const_cast<void*>(gpu_ptr<void>(stats))},
-      {D_O, const_cast<void*>(gpu_ptr<void>(d_o))},
-      {D_Q, gpu_ptr<void>(d_q)},
-      {D_K, gpu_ptr<void>(d_k)},
-      {D_V, gpu_ptr<void>(d_v)}};
-
-  execute_graph(encoder, handle, graph, variant_pack);
-}
-
-// Defined in scaled_dot_product_attention.cu file.
-bool supports_sdpa_vector(
-    const array& q,
-    const array& k,
-    const array& v,
-    bool has_mask,
-    bool has_arr_mask,
-    bool do_causal,
-    bool output_logsumexp);
-void sdpa_vector(
-    const array& q,
-    const array& k,
-    const array& v,
-    float scale,
-    array& o,
-    bool do_causal,
-    const std::optional<array>& sinks,
-    Stream s);
-
-namespace fast {
-
-bool ScaledDotProductAttention::use_fallback(
-    const array& q,
-    const array& k,
-    const array& v,
-    bool has_mask,
-    bool has_arr_mask,
-    bool do_causal,
-    bool is_training,
-    bool output_logsumexp,
-    Stream s) {
-  if (s.device == Device::cpu) {
-    return true;
-  }
-
-  return !supports_sdpa_vector(
-             q, k, v, has_mask, has_arr_mask, do_causal, output_logsumexp) &&
-      !supports_sdpa_cudnn(q, k, v, has_mask, do_causal, s);
-}
-
-void ScaledDotProductAttention::eval_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  nvtx3::scoped_range r("ScaledDotProductAttention::eval_gpu");
-
-  auto& s = stream();
-
-  array q = prepare_sdpa_input(inputs[0], s);
-  array k = prepare_sdpa_input(inputs[1], s);
-  array v = prepare_sdpa_input(inputs[2], s);
-  auto& out = outputs[0];
-  auto& stats = outputs[1];
-  bool has_mask = inputs.size() - has_sinks_ > 3;
-  bool has_arr_mask = has_mask && !do_causal_;
-
-  if (supports_sdpa_vector(
-          q, k, v, has_mask, has_arr_mask, do_causal_, output_logsumexp_)) {
-    if (has_sinks_) {
-      sdpa_vector(q, k, v, scale_, out, do_causal_, inputs.back(), s);
-    } else {
-      sdpa_vector(q, k, v, scale_, out, do_causal_, std::nullopt, s);
-    }
-  } else {
-    sdpa_cudnn(q, k, v, scale_, out, stats, do_causal_, output_logsumexp_, s);
-  }
-}
-
-bool ScaledDotProductAttentionVJP::use_fallback(const array& q, Stream s) {
-  // The frontend adds a padding mask when sequence length is not a multiple of
-  // tile size.
-  if (q.shape(2) % 128 != 0) {
-    return true;
-  }
-  return s.device == Device::cpu;
-}
-
-void ScaledDotProductAttentionVJP::eval_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  nvtx3::scoped_range r("ScaledDotProductAttentionVJP::eval_gpu");
-
-  auto& s = stream();
-
-  assert(inputs.size() == 6);
-  array q = prepare_sdpa_input(inputs[0], s);
-  array k = prepare_sdpa_input(inputs[1], s);
-  array v = prepare_sdpa_input(inputs[2], s);
-  array o = prepare_sdpa_input(inputs[3], s);
-  array stats = prepare_sdpa_input(inputs[4], s);
-  array d_o = prepare_sdpa_input(inputs[5], s);
-
-  assert(outputs.size() == 3);
-  auto& d_q = outputs[0];
-  auto& d_k = outputs[1];
-  auto& d_v = outputs[2];
-
-  sdpa_backward_cudnn(
-      q, k, v, scale_, o, stats, do_causal_, d_o, d_q, d_k, d_v, s);
-}
-
-} // namespace fast
-
-} // namespace mlx::core
--- a/mlx/backend/cuda/scaled_dot_product_attention.cu
+++ b/mlx/backend/cuda/scaled_dot_product_attention.cu
@@ -6,6 +6,10 @@
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/backend/gpu/copy.h"
 #include "mlx/dtype_utils.h"
+#include "mlx/fast_primitives.h"
+#include "mlx/transforms_impl.h"
+
+#include <nvtx3/nvtx3.hpp>

 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
@@ -561,9 +565,10 @@ void sdpa_vector_2pass_fallback(
  array sums(intermediate_shape, float32, nullptr, {});
  array maxs(std::move(intermediate_shape), float32, nullptr, {});

-  intermediate.set_data(cu::malloc_async(intermediate.nbytes(), encoder));
-  sums.set_data(cu::malloc_async(sums.nbytes(), encoder));
-  maxs.set_data(cu::malloc_async(maxs.nbytes(), encoder));
+  intermediate.set_data(
+      cu::malloc_async(intermediate.nbytes(), encoder.stream()));
+  sums.set_data(cu::malloc_async(sums.nbytes(), encoder.stream()));
+  maxs.set_data(cu::malloc_async(maxs.nbytes(), encoder.stream()));

  encoder.add_temporary(intermediate);
  encoder.add_temporary(sums);
@@ -658,16 +663,21 @@ void sdpa_vector_fallback(

 } // namespace

-bool supports_sdpa_vector(
+namespace fast {
+
+bool ScaledDotProductAttention::use_fallback(
    const array& q,
    const array& k,
    const array& v,
    bool has_mask,
    bool has_arr_mask,
    bool do_causal,
-    bool output_logsumexp) {
-  if (output_logsumexp) {
-    return false;
+    Stream s) {
+  if (detail::in_grad_tracing()) {
+    return true;
+  }
+  if (s.device == Device::cpu) {
+    return true;
  }

  const int value_head_dim = v.shape(-1);
@@ -681,24 +691,29 @@ bool supports_sdpa_vector(
  const bool supported_vector_config =
      sdpa_supported_head_dim && query_sequence_length < 4;

-  return supported_vector_config && !has_arr_mask;
+  const bool supported_config = supported_vector_config;
+
+  return has_arr_mask || !supported_config;
 }

-void sdpa_vector(
-    const array& q_pre,
-    const array& k_pre,
-    const array& v_pre,
-    float scale,
-    array& o,
-    bool do_causal,
-    const std::optional<array>& sinks_pre,
-    Stream s) {
+void ScaledDotProductAttention::eval_gpu(
+    const std::vector<array>& inputs,
+    array& out) {
+  nvtx3::scoped_range r("ScaledDotProductAttention::eval_gpu");
+
+  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
+
+  auto& q_pre = inputs[0];
+  auto& k_pre = inputs[1];
+  auto& v_pre = inputs[2];
+  auto& o = out;
+
  std::vector<array> copies;

  // Define some copy functions to ensure the layout of the inputs is as
  // expected.
-  copies.reserve(4);
+  copies.reserve(inputs.size());
  auto copy_unless = [&copies, &s](
                         auto predicate, const array& arr) -> const array& {
    if (!predicate(arr)) {
@@ -716,8 +731,8 @@ void sdpa_vector(
  };

  std::optional<array> sinks = std::nullopt;
-  if (sinks_pre) {
-    sinks = copy_unless(is_matrix_contiguous, sinks_pre.value());
+  if (has_sinks_) {
+    sinks = copy_unless(is_matrix_contiguous, inputs.back());
  }

  // We are in vector mode ie single query
@@ -773,7 +788,7 @@ void sdpa_vector(
      };

      o.set_data(
-          cu::malloc_async(o.nbytes(), encoder),
+          cu::malloc_async(o.nbytes(), encoder.stream()),
          o.size(),
          {str_oB, str_oH, str_oL, str_oD},
          flags);
@@ -783,7 +798,8 @@ void sdpa_vector(
      encoder.add_temporary(cp);
    }

-    sdpa_vector_fallback(s, encoder, q, k, v, scale, o, do_causal, sinks);
+    return sdpa_vector_fallback(
+        s, encoder, q, k, v, scale_, o, do_causal_, sinks);
  }

  // Full attention mode should never reach here
@@ -792,4 +808,6 @@ void sdpa_vector(
  }
 }

+} // namespace fast
+
 } // namespace mlx::core
--- a/mlx/backend/cuda/scan.cu
+++ b/mlx/backend/cuda/scan.cu
@@ -374,7 +374,7 @@ void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
      out.copy_shared_buffer(in);
    } else {
      out.set_data(
-          cu::malloc_async(in.data_size() * out.itemsize(), encoder),
+          cu::malloc_async(in.data_size() * out.itemsize(), encoder.stream()),
          in.data_size(),
          in.strides(),
          in.flags());
--- a/mlx/backend/cuda/slicing.cpp
+++ b/mlx/backend/cuda/slicing.cpp
@@ -24,7 +24,7 @@ void concatenate_gpu(
  std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());

  auto& encoder = cu::get_command_encoder(s);
-  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder.stream()));

  auto strides = out.strides();
  auto flags = out.flags();
@@ -89,7 +89,7 @@ array compute_dynamic_offset(
  if (donate) {
    offset.copy_shared_buffer(indices);
  } else {
-    offset.set_data(cu::malloc_async(offset.itemsize(), encoder));
+    offset.set_data(cu::malloc_async(offset.itemsize(), encoder.stream()));
  }

  encoder.add_temporary(offset);
--- a/mlx/backend/cuda/softmax.cu
+++ b/mlx/backend/cuda/softmax.cu
@@ -118,7 +118,7 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
        out.copy_shared_buffer(x);
      } else {
        out.set_data(
-            cu::malloc_async(x.data_size() * x.itemsize(), encoder),
+            cu::malloc_async(x.data_size() * x.itemsize(), encoder.stream()),
            x.data_size(),
            x.strides(),
            x.flags());
--- a/mlx/backend/cuda/sort.cu
+++ b/mlx/backend/cuda/sort.cu
@@ -49,12 +49,14 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
    array trans = swapaxes_in_eval(in, axis, last_dim);
    in = contiguous_copy_gpu(trans, s);
    encoder.add_temporary(in);
-    out =
-        array(cu::malloc_async(out.nbytes(), encoder), in.shape(), out.dtype());
+    out = array(
+        cu::malloc_async(out.nbytes(), encoder.stream()),
+        in.shape(),
+        out.dtype());
    encoder.add_temporary(out);
  } else {
    out.set_data(
-        cu::malloc_async(in.data_size() * out.itemsize(), encoder),
+        cu::malloc_async(in.data_size() * out.itemsize(), encoder.stream()),
        in.data_size(),
        in.strides(),
        in.flags());
@@ -72,13 +74,17 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
      if (argsort) {
        // Indices in the sorted dimension.
        array indices(
-            cu::malloc_async(out.nbytes(), encoder), in.shape(), out.dtype());
+            cu::malloc_async(out.nbytes(), encoder.stream()),
+            in.shape(),
+            out.dtype());
        encoder.add_temporary(indices);

        // In argsort though we don't need the result of sorted values, the
        // API requires us to provide an array to store it.
        array discard(
-            cu::malloc_async(in.nbytes(), encoder), in.shape(), in.dtype());
+            cu::malloc_async(in.nbytes(), encoder.stream()),
+            in.shape(),
+            in.dtype());
        encoder.add_temporary(discard);

        size_t size;
@@ -98,7 +104,9 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
            stream));

        array temp(
-            cu::malloc_async(size, encoder), {static_cast<int>(size)}, uint8);
+            cu::malloc_async(size, encoder.stream()),
+            {static_cast<int>(size)},
+            uint8);
        encoder.add_temporary(temp);

        // Start capturing after allocations
@@ -140,7 +148,9 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
            stream));

        array temp(
-            cu::malloc_async(size, encoder), {static_cast<int>(size)}, uint8);
+            cu::malloc_async(size, encoder.stream()),
+            {static_cast<int>(size)},
+            uint8);
        encoder.add_temporary(temp);

        // Start capturing after allocations
--- a/mlx/backend/cuda/ternary.cu
+++ b/mlx/backend/cuda/ternary.cu
@@ -257,8 +257,9 @@ void ternary_op_gpu(
  auto& c = inputs[2];
  auto topt = get_ternary_op_type(a, b, c);
  auto& encoder = cu::get_command_encoder(s);
-  set_ternary_op_output_data(
-      a, b, c, out, topt, [&](auto n) { return cu::malloc_async(n, encoder); });
+  set_ternary_op_output_data(a, b, c, out, topt, [&](auto n) {
+    return cu::malloc_async(n, encoder.stream());
+  });
  ternary_op_gpu_inplace<Op>(inputs, out, s);
 }

--- a/mlx/backend/cuda/unary/unary.cuh
+++ b/mlx/backend/cuda/unary/unary.cuh
@@ -208,8 +208,9 @@ void unary_op_gpu(
    const char* op,
    const Stream& s) {
  auto& encoder = cu::get_command_encoder(s);
-  set_unary_output_data(
-      inputs[0], out, [&](auto n) { return cu::malloc_async(n, encoder); });
+  set_unary_output_data(inputs[0], out, [&](auto n) {
+    return cu::malloc_async(n, encoder.stream());
+  });
  unary_op_gpu_inplace<Op>(inputs, out, op, s);
 }

--- a/mlx/backend/cuda/utils.cpp
+++ b/mlx/backend/cuda/utils.cpp
@@ -60,7 +60,7 @@ const char* dtype_to_cuda_type(const Dtype& dtype) {
    case float64:
      return "double";
    case complex64:
-      return "mlx::core::cu::complex64_t";
+      return "complex64_t";
    default:
      return "unknown";
  }
--- a/mlx/backend/cuda/worker.cpp
+++ b/mlx/backend/cuda/worker.cpp
@@ -44,7 +44,7 @@ void Worker::commit(cudaStream_t stream) {
  }
  signal_event_.record(stream);
  signal_event_.wait(signal_stream_);
-  CHECK_CUDA_ERROR(cudaLaunchHostFunc(signal_stream_, signal, this));
+  CHECK_CUDA_ERROR(cudaLaunchHostFunc(signal_stream_, signal, this));
 }

 void Worker::thread_fn() {
--- a/mlx/backend/gpu/slicing.cpp
+++ b/mlx/backend/gpu/slicing.cpp
@@ -11,7 +11,7 @@ void slice_gpu(
    array& out,
    const Shape& start_indices,
    const Shape& strides,
-    const Stream&) {
+    const Stream& s) {
  slice(in, out, start_indices, strides);
 }

--- a/mlx/backend/metal/CMakeLists.txt
+++ b/mlx/backend/metal/CMakeLists.txt
@@ -21,14 +21,19 @@ function(make_jit_source SRC_FILE)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/jit/${SRC_NAME}.cpp)
 endfunction(make_jit_source)

-make_jit_source(utils kernels/bf16.h kernels/bf16_math.h kernels/complex.h
-                kernels/defines.h)
+make_jit_source(
+  utils
+  kernels/jit/bf16.h
+  kernels/metal_3_0/bf16.h
+  kernels/metal_3_1/bf16.h
+  kernels/bf16_math.h
+  kernels/complex.h
+  kernels/defines.h)
 make_jit_source(unary_ops kernels/erf.h kernels/expm1f.h kernels/fp8.h)
 make_jit_source(binary_ops)
 make_jit_source(ternary_ops)
 make_jit_source(reduce_utils kernels/atomic.h kernels/reduction/ops.h)
 make_jit_source(indexing/scatter kernels/indexing/indexing.h)
-make_jit_source(indexing/masked_scatter)
 make_jit_source(indexing/gather kernels/indexing/indexing.h)
 make_jit_source(indexing/gather_front kernels/indexing/indexing.h)
 make_jit_source(indexing/gather_axis)
@@ -121,14 +126,6 @@ if(NOT MLX_METAL_PATH)
  set(MLX_METAL_PATH ${CMAKE_CURRENT_BINARY_DIR}/kernels/)
 endif()

-if((MLX_METAL_VERSION GREATER_EQUAL 400) AND (MACOS_SDK_VERSION GREATER_EQUAL
-                                              26.2))
-  set(MLX_ENABLE_NAX TRUE)
-  target_compile_definitions(mlx PRIVATE MLX_ENABLE_NAX)
-else()
-  set(MLX_ENABLE_NAX FALSE)
-endif()
-
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels)

 target_compile_definitions(mlx
--- a/mlx/backend/metal/custom_kernel.cpp
+++ b/mlx/backend/metal/custom_kernel.cpp
@@ -32,7 +32,7 @@ std::string write_signature(
    const std::vector<Dtype>& output_dtypes,
    const std::vector<std::pair<std::string, TemplateArg>>& template_args,
    const std::vector<std::string>& attributes,
-    const std::vector<std::tuple<bool, bool, bool>>& shape_infos,
+    const std::vector<CustomKernelShapeInfo>& shape_infos,
    bool atomic_outputs) {
  std::string kernel_source;
  kernel_source.reserve(header.size() + source.size() + 16384);
@@ -88,19 +88,19 @@ std::string write_signature(
    index++;
    // Add input shape, strides and ndim if present in the source
    if (arr.ndim() > 0) {
-      if (std::get<0>(shape_infos[i])) {
+      if (shape_infos[i].shape) {
        kernel_source +=
            ("  const constant int* " + name + "_shape [[buffer(" +
             std::to_string(index) + ")]],\n");
        index++;
      }
-      if (std::get<1>(shape_infos[i])) {
+      if (shape_infos[i].strides) {
        kernel_source +=
            ("  const constant int64_t* " + name + "_strides [[buffer(" +
             std::to_string(index) + ")]],\n");
        index++;
      }
-      if (std::get<2>(shape_infos[i])) {
+      if (shape_infos[i].ndim) {
        kernel_source +=
            ("  const constant int& " + name + "_ndim [[buffer(" +
             std::to_string(index) + ")]],\n");
@@ -184,12 +184,12 @@ CustomKernelFunction metal_kernel(
    throw std::invalid_argument(
        "[metal_kernel] Must specify at least one output.");
  }
-  std::vector<std::tuple<bool, bool, bool>> shape_infos;
+  std::vector<CustomKernelShapeInfo> shape_infos;
  for (auto& n : input_names) {
-    std::tuple<bool, bool, bool> shape_info;
-    std::get<0>(shape_info) = source.find(n + "_shape") != std::string::npos;
-    std::get<1>(shape_info) = source.find(n + "_strides") != std::string::npos;
-    std::get<2>(shape_info) = source.find(n + "_ndim") != std::string::npos;
+    CustomKernelShapeInfo shape_info;
+    shape_info.shape = source.find(n + "_shape") != std::string::npos;
+    shape_info.strides = source.find(n + "_strides") != std::string::npos;
+    shape_info.ndim = source.find(n + "_ndim") != std::string::npos;
    shape_infos.push_back(shape_info);
  }
  const std::vector<std::pair<std::string, std::string>> metal_attributes = {
@@ -388,15 +388,15 @@ void CustomKernel::eval_gpu(
    index++;
    if (in.ndim() > 0) {
      int ndim = in.ndim();
-      if (std::get<0>(shape_info)) {
+      if (shape_info.shape) {
        compute_encoder.set_vector_bytes(in.shape(), ndim, index);
        index++;
      }
-      if (std::get<1>(shape_info)) {
+      if (shape_info.strides) {
        compute_encoder.set_vector_bytes(in.strides(), ndim, index);
        index++;
      }
-      if (std::get<2>(shape_info)) {
+      if (shape_info.ndim) {
        compute_encoder.set_bytes(ndim, index);
        index++;
      }
--- a/mlx/backend/metal/device.cpp
+++ b/mlx/backend/metal/device.cpp
@@ -21,12 +21,12 @@ constexpr const char* default_mtllib_path = METAL_PATH;

 auto get_metal_version() {
  auto get_metal_version_ = []() {
-    if (__builtin_available(macOS 26, iOS 26, tvOS 26, visionOS 26, *)) {
-      return MTL::LanguageVersion4_0;
-    } else if (__builtin_available(macOS 15, iOS 18, tvOS 18, visionOS 2, *)) {
+    if (__builtin_available(macOS 15, iOS 18, tvOS 18, visionOS 2, *)) {
      return MTL::LanguageVersion3_2;
-    } else {
+    } else if (__builtin_available(macOS 14, iOS 17, tvOS 17, visionOS 1, *)) {
      return MTL::LanguageVersion3_1;
+    } else {
+      return MTL::LanguageVersion3_0;
    }
  };
  static auto metal_version_ = get_metal_version_();
@@ -119,10 +119,8 @@ std::pair<MTL::Library*, NS::Error*> load_swiftpm_library(
  // if SWIFTPM_BUNDLE is a framework identifier, try loading from that
  auto frameworks = NS::Bundle::allFrameworks();
  for (int i = 0, c = (int)frameworks->count(); i < c; i++) {
-    const auto bundle = reinterpret_cast<NS::Bundle*>(frameworks->object(i));
-    const auto identifier = bundle->bundleIdentifier();
-    if (identifier != nullptr &&
-        !strcmp(identifier->utf8String(), SWIFTPM_BUNDLE)) {
+    auto bundle = reinterpret_cast<NS::Bundle*>(frameworks->object(i));
+    if (!strcmp(bundle->bundleIdentifier()->utf8String(), SWIFTPM_BUNDLE)) {
      library = try_load_framework(device, bundle->resourceURL(), lib_name);
      if (library != nullptr) {
        return {library, nullptr};
@@ -382,8 +380,11 @@ MTL::CommandQueue* Device::get_queue(Stream stream) {

 bool Device::command_buffer_needs_commit(int index) {
  auto& stream = get_stream_(index);
-  return (stream.buffer_ops > max_ops_per_buffer_) ||
-      ((stream.buffer_sizes >> 20) > max_mb_per_buffer_);
+  if (stream.buffer_ops > max_ops_per_buffer_ ||
+      (stream.buffer_sizes >> 20) > max_mb_per_buffer_) {
+    return true;
+  }
+  return false;
 }

 MTL::CommandBuffer* Device::get_command_buffer(int index) {
--- a/mlx/backend/metal/device.h
+++ b/mlx/backend/metal/device.h
@@ -265,14 +265,4 @@ Device& device(mlx::core::Device);

 std::unique_ptr<void, std::function<void(void*)>> new_scoped_memory_pool();

-#ifdef MLX_ENABLE_NAX
-
-inline bool is_nax_available() {
-  static bool is_nax_available_ =
-      metal::device(mlx::core::Device::gpu).get_architecture_gen() >= 17;
-  return is_nax_available_;
-}
-
-#endif // MLX_ENABLE_NAX
-
 } // namespace mlx::core::metal
--- a/mlx/backend/metal/distributed.cpp
+++ b/mlx/backend/metal/distributed.cpp
@@ -30,9 +30,4 @@ void Recv::eval_gpu(const std::vector<array>&, std::vector<array>&) {
  throw std::runtime_error("[Recv::eval_gpu] has no GPU implementation.");
 }

-void ReduceScatter::eval_gpu(const std::vector<array>&, std::vector<array>&) {
-  throw std::runtime_error(
-      "[ReduceScatter::eval_gpu] has no GPU implementation.");
-}
-
 } // namespace mlx::core::distributed
--- a/mlx/backend/metal/fence.cpp
+++ b/mlx/backend/metal/fence.cpp
@@ -99,7 +99,7 @@ void Fence::wait(Stream stream, const array& x) {
      [fence_ = fence_](MTL::CommandBuffer* cbuf) {});
 }

-void Fence::update(Stream stream, const array& x, bool cross_device) {
+void Fence::update(Stream stream, const array& x) {
  auto& f = *static_cast<FenceImpl*>(fence_.get());
  f.count++;

@@ -130,23 +130,21 @@ void Fence::update(Stream stream, const array& x, bool cross_device) {

  // Launch input visibility kernels
  auto& compute_encoder = d.get_command_encoder(idx);
-  if (cross_device) {
-    auto kernel = d.get_kernel("input_coherent");
-    uint32_t nthreads = (x.data_size() * x.itemsize() + sizeof(uint32_t) - 1) /
-        sizeof(uint32_t);
-    MTL::Size group_dims = MTL::Size(1024, 1, 1);
-    MTL::Size grid_dims = MTL::Size((nthreads + 1024 - 1) / 1024, 1, 1);
-    compute_encoder.set_compute_pipeline_state(kernel);
-    compute_encoder.set_input_array(x, 0);
-    compute_encoder.set_bytes(nthreads, 1);
-    compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
-  }
+  auto kernel = d.get_kernel("input_coherent");
+  uint32_t nthreads =
+      (x.data_size() * x.itemsize() + sizeof(uint32_t) - 1) / sizeof(uint32_t);
+  MTL::Size group_dims = MTL::Size(1024, 1, 1);
+  MTL::Size grid_dims = MTL::Size((nthreads + 1024 - 1) / 1024, 1, 1);
+  compute_encoder.set_compute_pipeline_state(kernel);
+  compute_encoder.set_input_array(x, 0);
+  compute_encoder.set_bytes(nthreads, 1);
+  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);

  // Barrier on previous kernels
  compute_encoder.barrier();

  // Launch value update kernel
-  auto kernel = d.get_kernel("fence_update");
+  kernel = d.get_kernel("fence_update");
  MTL::Size kernel_dims = MTL::Size(1, 1, 1);
  compute_encoder.set_compute_pipeline_state(kernel);

--- a/mlx/backend/metal/indexing.cpp
+++ b/mlx/backend/metal/indexing.cpp
@@ -1,5 +1,4 @@
 // Copyright © 2023-2024 Apple Inc.
-
 #include <fmt/format.h>

 #include "mlx/backend/common/compiled.h"
@@ -9,9 +8,7 @@
 #include "mlx/backend/metal/jit/includes.h"
 #include "mlx/backend/metal/jit/indexing.h"
 #include "mlx/backend/metal/kernels.h"
-#include "mlx/backend/metal/scan.h"
 #include "mlx/backend/metal/utils.h"
-#include "mlx/dtype.h"
 #include "mlx/primitives.h"
 #include "mlx/utils.h"

@@ -644,84 +641,4 @@ void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  compute_encoder.dispatch_threads(grid_dims, group_dims);
 }

-void MaskedScatter::eval_gpu(const std::vector<array>& inputs, array& out) {
-  const array& dst = inputs[0];
-  const array& mask = inputs[1];
-  const array& src = inputs[2];
-
-  auto& s = stream();
-  auto& d = metal::device(s.device);
-
-  const size_t total = mask.size();
-  const CopyType ct = (total == 1)
-      ? CopyType::Scalar
-      : (dst.flags().row_contiguous ? CopyType::Vector : CopyType::General);
-  copy_gpu(dst, out, ct, s);
-  if (total == 0) {
-    return;
-  }
-
-  array mask_flat = flatten_in_eval(mask, 1, -1, s);
-  if (mask_flat.data<void>() != mask.data<void>()) {
-    d.add_temporary(mask_flat, s.index);
-  }
-
-  if (!mask_flat.flags().row_contiguous) {
-    mask_flat = contiguous_copy_gpu(mask_flat, s);
-    d.add_temporary(mask_flat, s.index);
-  }
-
-  // Prefix (exclusive) of mask → scatter_offsets
-  array scatter_offsets(mask_flat.shape(), uint32, nullptr, {});
-  scatter_offsets.set_data(allocator::malloc(scatter_offsets.nbytes()));
-  d.add_temporary(scatter_offsets, s.index);
-
-  scan_gpu_inplace(
-      mask_flat,
-      scatter_offsets,
-      Scan::Sum,
-      /*axis=*/1,
-      /*reverse=*/false,
-      /*inclusive=*/false,
-      s);
-
-  // Kernel selection/build
-  static constexpr std::string_view kBaseName = "masked_assign";
-  const std::string dtype_tag = type_to_name(out.dtype());
-  const std::string value_type = get_type_string(out.dtype());
-  const std::string contiguous =
-      (src.flags().row_contiguous) ? "true" : "false";
-  const std::string kernel_name =
-      fmt::format("{}_{}_{}", kBaseName, dtype_tag, contiguous);
-
-  auto lib = d.get_library(kernel_name, [&]() {
-    std::string source = metal::utils();
-    source += metal::masked_scatter();
-    source += fmt::format(
-        std::string(masked_assign_kernel), kernel_name, value_type, contiguous);
-    return source;
-  });
-  auto kernel = d.get_kernel(kernel_name, lib);
-
-  // Binding
-  int bind_idx = 0;
-  const int ndim = static_cast<int>(src.ndim());
-  auto& compute_encoder = d.get_command_encoder(s.index);
-  compute_encoder.set_compute_pipeline_state(kernel);
-  compute_encoder.set_input_array(mask_flat, bind_idx++);
-  compute_encoder.set_input_array(scatter_offsets, bind_idx++);
-  compute_encoder.set_input_array(src, bind_idx++);
-  compute_encoder.set_output_array(out, bind_idx++);
-  compute_encoder.set_vector_bytes(src.shape(), bind_idx++);
-  compute_encoder.set_vector_bytes(src.strides(), bind_idx++);
-  compute_encoder.set_bytes(ndim, bind_idx++);
-  compute_encoder.set_bytes(src.size() / src.shape(0), bind_idx++);
-  compute_encoder.set_bytes(mask_flat.size() / mask.shape(0), bind_idx++);
-
-  // Dispatch
-  auto group_dims = get_block_dims(total, 1, 1);
-  MTL::Size grid_dims(total, 1, 1);
-  compute_encoder.dispatch_threads(grid_dims, group_dims);
-}
-
 } // namespace mlx::core
--- a/mlx/backend/metal/jit/includes.h
+++ b/mlx/backend/metal/jit/includes.h
@@ -11,7 +11,6 @@ const char* ternary_ops();
 const char* reduce_utils();
 const char* gather();
 const char* scatter();
-const char* masked_scatter();

 const char* arange();
 const char* unary();
--- a/mlx/backend/metal/jit/indexing.h
+++ b/mlx/backend/metal/jit/indexing.h
@@ -70,7 +70,3 @@ constexpr std::string_view scatter_kernels = R"(
      gid);
 }}
 )";
-
-constexpr std::string_view masked_assign_kernel = R"(
-template [[host_name("{0}")]] [[kernel]] decltype(masked_assign_impl<{1}, {2}>) masked_assign_impl<{1}, {2}>;
-)";
--- a/mlx/backend/metal/kernels/CMakeLists.txt
+++ b/mlx/backend/metal/kernels/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(BASE_HEADERS
-    bf16.h
+    metal_3_1/bf16.h
+    metal_3_0/bf16.h
    bf16_math.h
    complex.h
    defines.h
@@ -9,20 +10,24 @@ set(BASE_HEADERS
    utils.h)

 function(build_kernel_base TARGET SRCFILE DEPS)
-  set(METAL_FLAGS -x metal -Wall -Wextra -fno-fast-math -Wno-c++17-extensions)
+  set(METAL_FLAGS -Wall -Wextra -fno-fast-math -Wno-c++17-extensions)
  if(MLX_METAL_DEBUG)
    set(METAL_FLAGS ${METAL_FLAGS} -gline-tables-only -frecord-sources)
  endif()
-  if(MLX_ENABLE_NAX)
-    set(METAL_FLAGS ${METAL_FLAGS} -Wno-c++20-extensions -std=metal4.0)
-  endif()
  if(NOT CMAKE_OSX_DEPLOYMENT_TARGET STREQUAL "")
    set(METAL_FLAGS ${METAL_FLAGS}
                    "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
  endif()
+  if(MLX_METAL_VERSION GREATER_EQUAL 310)
+    set(VERSION_INCLUDES
+        ${PROJECT_SOURCE_DIR}/mlx/backend/metal/kernels/metal_3_1)
+  else()
+    set(VERSION_INCLUDES
+        ${PROJECT_SOURCE_DIR}/mlx/backend/metal/kernels/metal_3_0)
+  endif()
  add_custom_command(
    COMMAND xcrun -sdk macosx metal ${METAL_FLAGS} -c ${SRCFILE}
-            -I${PROJECT_SOURCE_DIR} -o ${TARGET}.air
+            -I${PROJECT_SOURCE_DIR} -I${VERSION_INCLUDES} -o ${TARGET}.air
    DEPENDS ${SRCFILE} ${DEPS} ${BASE_HEADERS}
    OUTPUT ${TARGET}.air
    COMMENT "Building ${TARGET}.air"
@@ -123,30 +128,6 @@ if(NOT MLX_METAL_JIT)
  build_kernel(gemv_masked steel/utils.h)
 endif()

-if(MLX_ENABLE_NAX)
-
-  set(STEEL_NAX_HEADERS
-      steel/defines.h
-      steel/utils.h
-      steel/gemm/transforms.h
-      steel/gemm/nax.h
-      steel/gemm/gemm_nax.h
-      steel/utils/type_traits.h
-      steel/utils/integral_constant.h)
-
-  build_kernel(steel/gemm/kernels/steel_gemm_fused_nax ${STEEL_NAX_HEADERS})
-  build_kernel(steel/gemm/kernels/steel_gemm_gather_nax ${STEEL_NAX_HEADERS})
-
-  build_kernel(quantized_nax quantized_nax.h ${STEEL_NAX_HEADERS})
-  build_kernel(fp_quantized_nax fp_quantized_nax.h ${STEEL_NAX_HEADERS})
-
-  set(STEEL_NAX_ATTN_HEADERS
-      steel/defines.h steel/utils.h steel/attn/nax.h steel/utils/type_traits.h
-      steel/utils/integral_constant.h)
-
-  build_kernel(steel/attn/kernels/steel_attention_nax ${STEEL_NAX_ATTN_HEADERS})
-endif()
-
 add_custom_command(
  OUTPUT ${MLX_METAL_PATH}/mlx.metallib
  COMMAND xcrun -sdk macosx metallib ${KERNEL_AIR} -o
--- a/mlx/backend/metal/kernels/fp4.h
+++ b/mlx/backend/metal/kernels/fp4.h
@@ -49,10 +49,7 @@ struct fp4_e2m1 {
  }

  operator float() {
-    half converted = as_type<half>(ushort((bits & 7) << 9));
-    converted *= 16384.0;
-    converted = bits & 8 ? -converted : converted;
-    return converted;
+    return FP4_LUT[bits];
  }

  uint8_t bits;
--- a/mlx/backend/metal/kernels/fp8.h
+++ b/mlx/backend/metal/kernels/fp8.h
@@ -1,5 +1,12 @@
 #pragma once

+inline float fp32_from_bits(uint32_t bits) {
+  return *(reinterpret_cast<thread float*>(&bits));
+}
+inline uint32_t fp32_to_bits(float x) { // raw bit pattern of x, not a value cast
+  return *(reinterpret_cast<thread uint32_t*>(&x));
+}
+
 struct fp8_e4m3 {
  template <typename T>
  fp8_e4m3(T f) {
@@ -7,7 +14,7 @@ struct fp8_e4m3 {
    // https://github.com/pytorch/pytorch/blob/e3643e1e0e923f0fc063dfab6f45c956d568919d/c10/util/Float8_e4m3fn.h#L148
    uint32_t fp8_max = 543 << 21;
    uint32_t denorm_mask = 141 << 23;
-    uint32_t f_bits = as_type<uint32_t>(static_cast<float>(f));
+    uint32_t f_bits = fp32_to_bits(static_cast<float>(f));
    uint32_t sign = f_bits & 0x80000000;
    f_bits ^= sign;
    if (f_bits >= fp8_max) {
@@ -15,8 +22,8 @@ struct fp8_e4m3 {
      bits = 0x7E;
    } else {
      if (f_bits < (121 << 23)) {
-        f_bits = as_type<uint32_t>(
-            as_type<float>(f_bits) + as_type<float>(denorm_mask));
+        f_bits =
+            fp32_to_bits(fp32_from_bits(f_bits) + fp32_from_bits(denorm_mask));
        bits = static_cast<uint8_t>(f_bits - denorm_mask);
      } else {
        // resulting mantissa is odd
@@ -46,7 +53,7 @@ struct fp8_e4m3 {
        ((((nonsign << renorm_shift >> 4) + ((0x78 - renorm_shift) << 23)) |
          inf_nan_mask) &
         ~zero_mask);
-    return as_type<float>(result);
+    return fp32_from_bits(result);
  }

  uint8_t bits;
@@ -70,12 +77,11 @@ struct fp8_e8m0 {
    bits = static_cast<uint8_t>(n + 127);
  }

-  operator bfloat16_t() {
-    uint16_t out = (bits == 0 ? 0x40 : (static_cast<uint16_t>(bits) << 7));
-    return as_type<bfloat16_t>(out);
-  }
  operator float() {
-    return static_cast<float>(this->operator bfloat16_t());
+    if (bits == 0xFF) {
+      return metal::numeric_limits<float>::quiet_NaN();
+    }
+    return metal::ldexp(1.0f, static_cast<int>(bits) - 127);
  }

  uint8_t bits;
--- a/mlx/backend/metal/kernels/fp_quantized.h
+++ b/mlx/backend/metal/kernels/fp_quantized.h
@@ -29,31 +29,15 @@ inline constexpr short get_bytes_per_pack() {

 template <typename T>
 static inline T dequantize_scale(uint8_t s) {
-  return T(*(thread fp8_e8m0*)(&s));
+  using FOrI = union {
+    bfloat16_t f;
+    uint16_t i;
+  };
+  FOrI out;
+  out.i = (s == 0 ? 0x40 : (static_cast<uint16_t>(s) << 7));
+  return static_cast<T>(out.f);
 }

-template <int bits>
-struct Quantize {
-  uint8_t operator()(float x) {
-    if (bits == 8) {
-      return fp8_e4m3(x).bits;
-    } else {
-      return fp4_e2m1(x).bits;
-    }
-  }
-};
-
-template <int bits>
-struct Dequantize {
-  float operator()(uint8_t x) {
-    if (bits == 8) {
-      return float(*(thread fp8_e4m3*)(&x));
-    } else {
-      return float(*(thread fp4_e2m1*)(&x));
-    }
-  }
-};
-
 template <typename T, typename U, int values_per_thread>
 inline void load_vector(const device T* x, thread U* x_thread) {
  for (int i = 0; i < values_per_thread; i += 4) {
@@ -78,41 +62,62 @@ inline void load_vector_safe(const device T* x, thread U* x_thread, int N) {
  }
 }

+template <typename T>
+void load_fp4_lut(threadgroup T* lut, uint simd_gid, uint simd_lid) {
+  if (simd_gid == 0 && simd_lid < 16) { // lanes 0..15 of simdgroup 0 each copy one entry
+    lut[simd_lid] = static_cast<T>(FP4_LUT[simd_lid]);
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup); // reached by every caller, not only the writers
+}
+
 template <typename U, int values_per_thread>
-inline U qdot(const device uint8_t* w, const thread U* x_thread, U scale) {
+inline U qdot(
+    const device uint8_t* w,
+    const thread U* x_thread,
+    U scale,
+    const threadgroup U* lut) {
  U accum = 0;
  const device uint16_t* ws = (const device uint16_t*)w;
  for (int i = 0; i < (values_per_thread / 4); i++) {
    accum +=
-        (x_thread[4 * i] * Dequantize<4>{}(ws[i]) +
-         x_thread[4 * i + 1] * Dequantize<4>{}(ws[i] >> 4) +
-         x_thread[4 * i + 2] * Dequantize<4>{}(ws[i] >> 8) +
-         x_thread[4 * i + 3] * Dequantize<4>{}(ws[i] >> 12));
+        (x_thread[4 * i] * lut[ws[i] & 0xf] +
+         x_thread[4 * i + 1] * lut[(ws[i] >> 4) & 0xf] +
+         x_thread[4 * i + 2] * lut[(ws[i] >> 8) & 0xf] +
+         x_thread[4 * i + 3] * lut[(ws[i] >> 12) & 0xf]);
  }
  return scale * accum;
 }

 template <typename U, int values_per_thread>
-inline U
-qdot_safe(const device uint8_t* w, const thread U* x_thread, U scale, int N) {
+inline U qdot_safe(
+    const device uint8_t* w,
+    const thread U* x_thread,
+    U scale,
+    const threadgroup U* lut,
+    int N) {
  U accum = 0;

  const device uint16_t* ws = (const device uint16_t*)w;
  for (int i = 0; i < (N / 4); i++) {
    accum +=
-        (x_thread[4 * i] * Dequantize<4>{}(ws[i]) +
-         x_thread[4 * i + 1] * Dequantize<4>{}(ws[i] >> 4) +
-         x_thread[4 * i + 2] * Dequantize<4>{}(ws[i] >> 8) +
-         x_thread[4 * i + 3] * Dequantize<4>{}(ws[i] >> 12));
+        (x_thread[4 * i] * lut[ws[i] & 0xf] +
+         x_thread[4 * i + 1] * lut[(ws[i] >> 4) & 0xf] +
+         x_thread[4 * i + 2] * lut[(ws[i] >> 8) & 0xf] +
+         x_thread[4 * i + 3] * lut[(ws[i] >> 12) & 0xf]);
  }
  return scale * accum;
 }

 template <typename U, int values_per_thread>
-inline void qouter(const thread uint8_t* w, U x, U scale, thread U* result) {
+inline void qouter(
+    const thread uint8_t* w,
+    U x,
+    U scale,
+    thread U* result,
+    const threadgroup U* lut) {
  for (int i = 0; i < (values_per_thread / 2); i++) {
-    result[2 * i] += x * scale * Dequantize<4>{}(w[i]);
-    result[2 * i + 1] += x * scale * Dequantize<4>{}(w[i] >> 4);
+    result[2 * i] += x * scale * lut[w[i] & 0xf];
+    result[2 * i + 1] += x * scale * lut[(w[i] >> 4) & 0xf];
  }
 }

@@ -187,10 +192,7 @@ struct QuantizedBlockLoader {
            bj * bytes_per_pack),
        scales(scales_ + bi * src_ld / group_size),
        lut(lut_) {
-    if (simd_group_id == 0 && simd_lane_id < 16) {
-      lut[simd_lane_id] = static_cast<T>(FP4_LUT[simd_lane_id]);
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    load_fp4_lut(lut, simd_group_id, simd_lane_id);
  }

  void load_unsafe() const {
@@ -262,7 +264,10 @@ METAL_FUNC void fp_qmv_quad_impl(
    const constant int& out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint quad_gid [[quadgroup_index_in_threadgroup]],
-    uint quad_lid [[thread_index_in_quadgroup]]) {
+    uint quad_lid [[thread_index_in_quadgroup]],
+    uint simd_gid [[simdgroup_index_in_threadgroup]],
+    uint simd_lid [[thread_index_in_simdgroup]],
+    threadgroup float* lut) {
  constexpr int quads_per_simd = SIMD_SIZE / QUAD_SIZE;
  constexpr int pack_factor = 8;
  constexpr int values_per_thread = D / QUAD_SIZE;
@@ -274,6 +279,7 @@ METAL_FUNC void fp_qmv_quad_impl(

  thread U x_thread[values_per_thread];
  thread U result[results_per_quadgroup] = {0};
+  load_fp4_lut(lut, simd_gid, simd_lid);

  // Adjust positions
  const int in_vec_size_w = in_vec_size / pack_factor;
@@ -293,7 +299,7 @@ METAL_FUNC void fp_qmv_quad_impl(

    U s = dequantize_scale<U>(sl[0]);
    if (row * quads_per_simd + out_row < out_vec_size) {
-      result[row] += qdot<U, values_per_thread>(wl, x_thread, s);
+      result[row] += qdot<U, values_per_thread>(wl, x_thread, s, lut);
    }
  }

@@ -315,7 +321,8 @@ METAL_FUNC void fp_qmv_fast_impl(
    const constant int& out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
-    uint simd_lid [[thread_index_in_simdgroup]]) {
+    uint simd_lid [[thread_index_in_simdgroup]],
+    threadgroup float* lut) {
  constexpr int packs_per_thread = 2;
  constexpr int num_simdgroups = 2;
  constexpr int results_per_simdgroup = 4;
@@ -330,6 +337,7 @@ METAL_FUNC void fp_qmv_fast_impl(
  typedef float U;
  thread U x_thread[values_per_thread];
  thread U result[results_per_simdgroup] = {0};
+  load_fp4_lut(lut, simd_gid, simd_lid);

  // Adjust positions
  const int in_vec_size_w = in_vec_size * bytes_per_pack / pack_factor;
@@ -350,7 +358,7 @@ METAL_FUNC void fp_qmv_fast_impl(
      const device auto* sl = scales + row * in_vec_size_g;

      U s = dequantize_scale<U>(sl[0]);
-      result[row] += qdot<U, values_per_thread>(wl, x_thread, s);
+      result[row] += qdot<U, values_per_thread>(wl, x_thread, s, lut);
    }

    ws += block_size * bytes_per_pack / pack_factor;
@@ -376,7 +384,8 @@ METAL_FUNC void fp_qmv_impl(
    const constant int& out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
-    uint simd_lid [[thread_index_in_simdgroup]]) {
+    uint simd_lid [[thread_index_in_simdgroup]],
+    threadgroup float* lut) {
  constexpr int num_simdgroups = 2;
  constexpr int results_per_simdgroup = 4;
  constexpr int packs_per_thread = 1;
@@ -393,6 +402,7 @@ METAL_FUNC void fp_qmv_impl(

  thread U x_thread[values_per_thread];
  thread U result[results_per_simdgroup] = {0};
+  load_fp4_lut(lut, simd_gid, simd_lid);

  // Adjust positions
  const int in_vec_size_w = in_vec_size * bytes_per_pack / pack_factor;
@@ -423,7 +433,7 @@ METAL_FUNC void fp_qmv_impl(
        const device auto* sl = scales + row * in_vec_size_g;

        uint8_t s = sl[0];
-        result[row] += qdot<U, values_per_thread>(wl, x_thread, s);
+        result[row] += qdot<U, values_per_thread>(wl, x_thread, s, lut);
      }

      ws += block_size * bytes_per_pack / pack_factor;
@@ -442,7 +452,7 @@ METAL_FUNC void fp_qmv_impl(
        const device auto* sl = scales + row * in_vec_size_g;

        U s = dequantize_scale<U>(sl[0]);
-        result[row] += qdot<U, values_per_thread>(wl, x_thread, s);
+        result[row] += qdot<U, values_per_thread>(wl, x_thread, s, lut);
      }
    }

@@ -471,7 +481,7 @@ METAL_FUNC void fp_qmv_impl(
        const device auto* sl = scales + row * in_vec_size_g;

        U s = dequantize_scale<U>(sl[0]);
-        result[row] += qdot<U, values_per_thread>(wl, x_thread, s);
+        result[row] += qdot<U, values_per_thread>(wl, x_thread, s, lut);
      }

      ws += block_size * bytes_per_pack / pack_factor;
@@ -491,7 +501,7 @@ METAL_FUNC void fp_qmv_impl(

        U s = dequantize_scale<U>(sl[0]);
        result[row] +=
-            qdot_safe<U, values_per_thread>(wl, x_thread, s, remaining);
+            qdot_safe<U, values_per_thread>(wl, x_thread, s, lut, remaining);
      }
    }
    for (int row = 0; row < results_per_simdgroup; row++) {
@@ -513,7 +523,8 @@ METAL_FUNC void fp_qvm_impl(
    const int out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
-    uint simd_lid [[thread_index_in_simdgroup]]) {
+    uint simd_lid [[thread_index_in_simdgroup]],
+    threadgroup float* lut) {
  constexpr int num_simdgroups = 2;
  constexpr int pack_factor = get_pack_factor<32>();
  constexpr int bytes_per_pack = get_bytes_per_pack();
@@ -534,6 +545,8 @@ METAL_FUNC void fp_qvm_impl(
  thread U scale = 0;
  thread U x_local = 0;

+  load_fp4_lut(lut, simd_gid, simd_lid);
+
  // Adjust positions
  const int out_vec_size_w = out_vec_size * bytes_per_pack / pack_factor;
  const int out_vec_size_g = out_vec_size / group_size;
@@ -555,7 +568,7 @@ METAL_FUNC void fp_qvm_impl(
      scale = dequantize_scale<U>(*scales);
      w_local = *((device vec_w*)ws);
      qouter<U, tn * pack_factor>(
-          (thread uint8_t*)&w_local, x_local, scale, result);
+          (thread uint8_t*)&w_local, x_local, scale, result, lut);

      x += block_size;
      scales += block_size * out_vec_size_g;
@@ -568,7 +581,7 @@ METAL_FUNC void fp_qvm_impl(
      w_local = *((device vec_w*)ws);

      qouter<U, tn * pack_factor>(
-          (thread uint8_t*)&w_local, x_local, scale, result);
+          (thread uint8_t*)&w_local, x_local, scale, result, lut);

      x += block_size;
      scales += block_size * out_vec_size_g;
@@ -583,7 +596,7 @@ METAL_FUNC void fp_qvm_impl(
      scale = 0;
    }
    qouter<U, tn * pack_factor>(
-        (thread uint8_t*)&w_local, x_local, scale, result);
+        (thread uint8_t*)&w_local, x_local, scale, result, lut);
  }

 // Accumulate in the simdgroup
@@ -962,7 +975,9 @@ template <typename T, int group_size, int bits, int D, bool batched>
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint quad_gid [[quadgroup_index_in_threadgroup]],
-    uint quad_lid [[thread_index_in_quadgroup]]) {
+    uint quad_lid [[thread_index_in_quadgroup]],
+    uint simd_gid [[simdgroup_index_in_threadgroup]],
+    uint simd_lid [[thread_index_in_simdgroup]]) {
  if (batched) {
    int M = x_shape[x_batch_ndims];
    adjust_matrix_offsets(
@@ -980,8 +995,20 @@ template <typename T, int group_size, int bits, int D, bool batched>
        s_strides,
        tid);
  }
+  threadgroup float lut[16];
  fp_qmv_quad_impl<T, group_size, bits, D>(
-      w, scales, x, y, in_vec_size, out_vec_size, tid, quad_gid, quad_lid);
+      w,
+      scales,
+      x,
+      y,
+      in_vec_size,
+      out_vec_size,
+      tid,
+      quad_gid,
+      quad_lid,
+      simd_gid,
+      simd_lid,
+      lut);
 }

 template <typename T, int group_size, int bits, bool batched>
@@ -1019,8 +1046,9 @@ template <typename T, int group_size, int bits, bool batched>
        s_strides,
        tid);
  }
+  threadgroup float lut[16];
  fp_qmv_fast_impl<T, group_size, bits>(
-      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
+      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid, lut);
 }

 template <typename T, const int group_size, int bits, bool batched>
@@ -1058,8 +1086,9 @@ template <typename T, const int group_size, int bits, bool batched>
        s_strides,
        tid);
  }
+  threadgroup float lut[16];
  fp_qmv_impl<T, group_size, bits>(
-      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
+      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid, lut);
 }

 template <typename T, const int group_size, int bits, bool batched>
@@ -1097,8 +1126,9 @@ template <typename T, const int group_size, int bits, bool batched>
        s_strides,
        tid);
  }
+  threadgroup float lut[16];
  fp_qvm_impl<T, group_size, bits>(
-      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
+      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid, lut);
 }

 template <typename T, const int group_size, int bits, int split_k = 32>
@@ -1140,8 +1170,18 @@ template <typename T, const int group_size, int bits, int split_k = 32>
  int in_vec_size_adj =
      tid.z % split_k == split_k - 1 ? final_block_size : in_vec_size;

+  threadgroup float lut[16];
  fp_qvm_impl<T, group_size, bits>(
-      w, scales, x, y, in_vec_size_adj, out_vec_size, tid, simd_gid, simd_lid);
+      w,
+      scales,
+      x,
+      y,
+      in_vec_size_adj,
+      out_vec_size,
+      tid,
+      simd_gid,
+      simd_lid,
+      lut);
 }

 template <
@@ -1302,8 +1342,9 @@ template <typename T, int group_size, int bits>
      w_strides,
      s_strides,
      tid);
+  threadgroup float lut[16];
  fp_qmv_fast_impl<T, group_size, bits>(
-      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
+      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid, lut);
 }

 template <typename T, int group_size, int bits>
@@ -1351,8 +1392,9 @@ template <typename T, int group_size, int bits>
      w_strides,
      s_strides,
      tid);
+  threadgroup float lut[16];
  fp_qmv_impl<T, group_size, bits>(
-      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
+      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid, lut);
 }

 template <typename T, int group_size, int bits>
@@ -1400,8 +1442,9 @@ template <typename T, int group_size, int bits>
      w_strides,
      s_strides,
      tid);
+  threadgroup float lut[16];
  fp_qvm_impl<T, group_size, bits>(
-      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
+      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid, lut);
 }

 template <
@@ -1728,6 +1771,28 @@ template <
  }
 }

+// Returns the raw .bits of x converted to fp8_e4m3 (bits == 8) or fp4_e2m1.
+template <int bits>
+struct Quantize {
+  uint8_t operator()(float x) {
+    if (bits != 8) {
+      return fp4_e2m1(x).bits;
+    }
+    return fp8_e4m3(x).bits;
+  }
+};
+
+// Reads byte x as fp8_e4m3 (bits == 8) or fp4_e2m1 and converts it to float.
+template <int bits>
+struct Dequantize {
+  float operator()(uint8_t x) {
+    if (bits != 8) {
+      return float(*(thread fp4_e2m1*)(&x));
+    }
+    return float(*(thread fp8_e4m3*)(&x));
+  }
+};
+
 template <typename T, const int group_size, const int bits>
 [[kernel]] void fp_quantize(
    const device T* w [[buffer(0)]],
--- a/mlx/backend/metal/kernels/fp_quantized_nax.h
+++ b/mlx/backend/metal/kernels/fp_quantized_nax.h
--- a/mlx/backend/metal/kernels/fp_quantized_nax.metal
+++ b/mlx/backend/metal/kernels/fp_quantized_nax.metal
@@ -1,74 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-// clang-format off
-#include "mlx/backend/metal/kernels/utils.h"
-#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
-#include "mlx/backend/metal/kernels/quantized_utils.h"
-#include "mlx/backend/metal/kernels/steel/gemm/nax.h"
-#include "mlx/backend/metal/kernels/fp_quantized_nax.h"
-
-
-#define instantiate_quantized_batched(mode, name, type, bm, bn, bk, wm, wn, batched) \
-  instantiate_kernel( \
-      #mode "_" #name "_" #type "_gs_32_b_4_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_batch_" #batched, \
-      fp_ ## name,  \
-      type,         \
-      32,           \
-      4,            \
-      batched)
-
-#define instantiate_quantized_aligned(mode, name, type, bm, bn, bk, wm, wn, aligned) \
-  instantiate_kernel( \
-      #mode "_" #name "_" #type "_gs_32_b_4_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_alN_" #aligned, \
-      fp_ ## name, \
-      type,        \
-      32,          \
-      4,           \
-      aligned)
-
-#define instantiate_quantized_aligned_batched(mode, name, type, bm, bn, bk, wm, wn, aligned, batched) \
-  instantiate_kernel( \
-      #mode "_" #name "_" #type "_gs_32_b_4_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_alN_" #aligned "_batch_" #batched, \
-      fp_ ## name,    \
-      type,    \
-      32,      \
-      4,       \
-      aligned, \
-      batched)
-
-#define instantiate_gather_qmm_rhs(func, name, type, bm, bn, bk, wm, wn, transpose) \
-  instantiate_kernel( \
-      #name "_" #type "_gs_32_b_4_bm_" #bm "_bn_" #bn "_bk_" #bk "_wm_" #wm "_wn_" #wn, \
-      func,    \
-      type,    \
-      32,      \
-      4,       \
-      bm,      \
-      bn,      \
-      bk,      \
-      wm,      \
-      wn,      \
-      transpose)
-
-
-#define instantiate_quantized_all_aligned(type) \
-  instantiate_quantized_aligned(mxfp4, gather_qmm_t_nax, type, 64, 64, 64, 2, 2, true)      \
-  instantiate_quantized_aligned(mxfp4, gather_qmm_t_nax, type, 64, 64, 64, 2, 2, false)     \
-  instantiate_quantized_aligned_batched(mxfp4, qmm_t_nax, type, 64, 64, 64, 2, 2, true, 1)  \
-  instantiate_quantized_aligned_batched(mxfp4, qmm_t_nax, type, 64, 64, 64, 2, 2, true, 0)  \
-  instantiate_quantized_aligned_batched(mxfp4, qmm_t_nax, type, 64, 64, 64, 2, 2, false, 1) \
-  instantiate_quantized_aligned_batched(mxfp4, qmm_t_nax, type, 64, 64, 64, 2, 2, false, 0)
-
-
-#define instantiate_quantized_all_rhs(type) \
-  instantiate_gather_qmm_rhs(fp_gather_qmm_rhs_nax, mxfp4_gather_qmm_rhs_nax_nt, type, 64, 64, 64, 2, 2, true) \
-  instantiate_gather_qmm_rhs(fp_gather_qmm_rhs_nax, mxfp4_gather_qmm_rhs_nax_nn, type, 64, 64, 64, 2, 2, false) 
-
-#define instantiate_quantized_types(type) \
-  instantiate_quantized_all_aligned(type) \
-  instantiate_quantized_all_rhs(type)
-
-instantiate_quantized_types(float)
-instantiate_quantized_types(bfloat16_t)
-instantiate_quantized_types(float16_t)
-    // clang-format on
--- a/mlx/backend/metal/kernels/indexing/masked_scatter.h
+++ b/mlx/backend/metal/kernels/indexing/masked_scatter.h
@@ -1,38 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-template <typename T, bool src_contiguous>
-[[kernel]] void masked_assign_impl(
-    const device bool* mask [[buffer(0)]],
-    const device uint* scatter_offsets [[buffer(1)]],
-    const device T* src [[buffer(2)]],
-    device T* out [[buffer(3)]],
-    const constant int* src_shapes [[buffer(4)]],
-    const constant int64_t* src_strides [[buffer(5)]],
-    const constant int& src_ndim [[buffer(6)]],
-    const constant int64_t& src_batch_size [[buffer(7)]],
-    const constant int64_t& mask_batch_size [[buffer(8)]],
-    uint idx [[thread_position_in_grid]]) {
-  const bool mask_value = mask[idx];
-  if (!mask_value) {
-    return;
-  }
-
-  const uint src_index = scatter_offsets[idx];
-  if (src_index >= src_batch_size) {
-    return;
-  }
-
-  const uint batch_idx = idx / mask_batch_size;
-
-  if (src_contiguous) {
-    out[idx] = src[batch_idx * src_batch_size + src_index];
-  } else {
-    out[idx] = src[elem_to_loc<uint>(
-        batch_idx * src_batch_size + src_index,
-        src_shapes,
-        src_strides,
-        src_ndim)];
-  }
-}
--- a/mlx/backend/metal/kernels/jit/bf16.h
+++ b/mlx/backend/metal/kernels/jit/bf16.h
@@ -0,0 +1,16 @@
+// Copyright © 2024 Apple Inc.
+
+// clang-format off
+#define jit_if #if
+#define jit_else #else
+#define jit_endif #endif
+
+jit_if (__METAL_VERSION__ >= 310)
+
+#include "mlx/backend/metal/kernels/metal_3_1/bf16.h"
+
+jit_else
+
+#include "mlx/backend/metal/kernels/metal_3_0/bf16.h"
+
+jit_endif // clang-format on
--- a/mlx/backend/metal/kernels/metal_3_0/bf16.h
+++ b/mlx/backend/metal/kernels/metal_3_0/bf16.h
@@ -0,0 +1,314 @@
+// Copyright © 2023 Apple Inc.
+
+#pragma once
+
+#include <metal_stdlib>
+
+using namespace metal;
+
+/////////////////////////////////////////////////////////////////////////////
+// Helpers
+/////////////////////////////////////////////////////////////////////////////
+
+constexpr METAL_FUNC uint16_t float_to_bfloat_bits(float x) {
+  // View the float as its raw 32-bit pattern
+  uint32_t raw = as_type<uint32_t>(x);
+  // A magnitude above the infinity pattern is a NaN: emit the quiet NaN 0x7FC0
+  if ((raw & ~_fp_encoding_traits<float>::sign_mask) >
+      _fp_encoding_traits<float>::inf_mask) {
+    return uint16_t(0x7FC0);
+  }
+
+  // Round to nearest even: add 0x7FFF plus the lowest bit that will be kept
+  raw += uint32_t(0x7FFF) + ((raw >> 16) & 1);
+
+  // The upper 16 bits are the bfloat pattern
+  return raw >> 16;
+}
+
+constexpr METAL_FUNC float bfloat_bits_to_float(uint16_t x) {
+  // Put the 16 bfloat bits in the high half of a 32-bit word; low half is 0
+  return as_type<float>(static_cast<uint32_t>(x) << 16);
+}
+
+struct _MLX_BFloat16;
+
+template <typename T> // T is not _MLX_BFloat16 and T converts to float
+static constexpr constant bool can_convert_to_bfloat =
+    !is_same_v<T, _MLX_BFloat16> && is_convertible_v<T, float>;
+
+template <typename T> // T is not _MLX_BFloat16 and float converts to T
+static constexpr constant bool can_convert_from_bfloat =
+    !is_same_v<T, _MLX_BFloat16> && is_convertible_v<float, T>;
+
+/////////////////////////////////////////////////////////////////////////////
+// Bfloat struct
+/////////////////////////////////////////////////////////////////////////////
+
+struct _MLX_BFloat16 {
+  /////////////////////////////////////////////////////////////////////////////
+  // Raw storage and defaulted constructors, one per address space
+  uint16_t bits_; // raw 16-bit bfloat pattern
+  _MLX_BFloat16() thread = default;
+  _MLX_BFloat16() threadgroup = default;
+  _MLX_BFloat16() device = default;
+  _MLX_BFloat16() constant = default;
+
+  struct bits_to_bfloat_struct {}; // tag for the raw-bits constructor
+  static constexpr METAL_FUNC bits_to_bfloat_struct bits_to_bfloat() {
+    return bits_to_bfloat_struct();
+  }
+  constexpr METAL_FUNC _MLX_BFloat16(uint16_t bits, bits_to_bfloat_struct)
+      : bits_(bits) {}
+
+  /////////////////////////////////////////////////////////////////////////////
+  // Conversions to bfloat: cast to float, then float_to_bfloat_bits rounds
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_to_bfloat<T>>::type>
+  constexpr METAL_FUNC _MLX_BFloat16(T x) thread
+      : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_to_bfloat<T>>::type>
+  constexpr METAL_FUNC _MLX_BFloat16(T x) threadgroup
+      : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_to_bfloat<T>>::type>
+  constexpr METAL_FUNC _MLX_BFloat16(T x) device
+      : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_to_bfloat<T>>::type>
+  constexpr METAL_FUNC _MLX_BFloat16(T x) constant
+      : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
+
+  /////////////////////////////////////////////////////////////////////////////
+  // Conversions from bfloat: widen bits_ to float, then static_cast to T
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_from_bfloat<T>>::type>
+  constexpr METAL_FUNC operator T() const thread {
+    return static_cast<T>(bfloat_bits_to_float(bits_));
+  }
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_from_bfloat<T>>::type>
+  constexpr METAL_FUNC operator T() const threadgroup {
+    return static_cast<T>(bfloat_bits_to_float(bits_));
+  }
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_from_bfloat<T>>::type>
+  constexpr METAL_FUNC operator T() const device {
+    return static_cast<T>(bfloat_bits_to_float(bits_));
+  }
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_from_bfloat<T>>::type>
+  constexpr METAL_FUNC operator T() const constant {
+    return static_cast<T>(bfloat_bits_to_float(bits_));
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////
+// Bfloat operators
+/////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+// Unary ops
+constexpr METAL_FUNC _MLX_BFloat16 operator-(_MLX_BFloat16 x) {
+  return -static_cast<float>(x); // negated as float, then converted back
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Binary operators
+#define bfloat_binop_base(__op__, __operator__, otype, atype, btype, ctype) \
+  constexpr METAL_FUNC otype __operator__(atype lhs, btype rhs) {           \
+    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);          \
+  }
+
+#define bfloat_binop_helper(__op__, __operator__, otype, itype, ctype)    \
+  constexpr METAL_FUNC otype __operator__(_MLX_BFloat16 lhs, itype rhs) { \
+    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);        \
+  }                                                                       \
+  constexpr METAL_FUNC otype __operator__(itype lhs, _MLX_BFloat16 rhs) { \
+    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);        \
+  }
+
+/////////////////////////////////////////////////////////////////////////////
+// Arithmetic operators: float result with float/half operands, else bfloat
+#define bfloat_binop(_op_, _operator_)                                       \
+  bfloat_binop_base(                                                         \
+      _op_, _operator_, _MLX_BFloat16, _MLX_BFloat16, _MLX_BFloat16, float); \
+  bfloat_binop_helper(_op_, _operator_, float, float, float);                \
+  bfloat_binop_helper(_op_, _operator_, float, half, float);                 \
+  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int32_t, float);      \
+  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint32_t, float);     \
+  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int64_t, float);      \
+  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint64_t, float);
+
+bfloat_binop(+, operator+);
+bfloat_binop(-, operator-);
+bfloat_binop(*, operator*);
+bfloat_binop(/, operator/);
+
+/////////////////////////////////////////////////////////////////////////////
+// Comparison ops: operands are compared as float and the result is bool
+#define bfloat_compop(__op__, __operator__)                             \
+  bfloat_binop_base(                                                    \
+      __op__, __operator__, bool, _MLX_BFloat16, _MLX_BFloat16, float); \
+  bfloat_binop_helper(__op__, __operator__, bool, float, float);        \
+  bfloat_binop_helper(__op__, __operator__, bool, half, float);         \
+  bfloat_binop_helper(__op__, __operator__, bool, int32_t, float);      \
+  bfloat_binop_helper(__op__, __operator__, bool, uint32_t, float);     \
+  bfloat_binop_helper(__op__, __operator__, bool, int64_t, float);      \
+  bfloat_binop_helper(__op__, __operator__, bool, uint64_t, float);
+
+bfloat_compop(>, operator>);
+bfloat_compop(<, operator<);
+bfloat_compop(>=, operator>=);
+bfloat_compop(<=, operator<=);
+bfloat_compop(==, operator==);
+bfloat_compop(!=, operator!=);
+
+#undef bfloat_compop
+#undef bfloat_binop_base
+#undef bfloat_binop_helper
+#undef bfloat_binop
+
+/////////////////////////////////////////////////////////////////////////////
+// Inplace operators (device/thread/threadgroup lhs): computed in float
+#define bfloat_inplace_op_helper(__op__, __operator__, itype, addr_space) \
+  constexpr METAL_FUNC addr_space _MLX_BFloat16& __operator__(            \
+      addr_space _MLX_BFloat16& lhs, itype rhs) {                         \
+    lhs = static_cast<float>(lhs) __op__ static_cast<float>(rhs);         \
+    return lhs;                                                           \
+  }                                                                       \
+  constexpr METAL_FUNC addr_space itype& __operator__(                    \
+      addr_space itype& lhs, _MLX_BFloat16 rhs) {                         \
+    lhs = static_cast<float>(lhs) __op__ static_cast<float>(rhs);         \
+    return lhs;                                                           \
+  }
+
+#define bfloat_inplace_op_addr_space_helper(__op__, __operator__, itype) \
+  bfloat_inplace_op_helper(__op__, __operator__, itype, device);         \
+  bfloat_inplace_op_helper(__op__, __operator__, itype, thread);         \
+  bfloat_inplace_op_helper(__op__, __operator__, itype, threadgroup);
+
+#define bfloat_inplace_op(itype)                             \
+  bfloat_inplace_op_addr_space_helper(+, operator+=, itype); \
+  bfloat_inplace_op_addr_space_helper(-, operator-=, itype); \
+  bfloat_inplace_op_addr_space_helper(*, operator*=, itype); \
+  bfloat_inplace_op_addr_space_helper(/, operator/=, itype);
+
+bfloat_inplace_op(float);
+bfloat_inplace_op(half);
+bfloat_inplace_op(int16_t);
+bfloat_inplace_op(int32_t);
+bfloat_inplace_op(int64_t);
+bfloat_inplace_op(uint16_t);
+bfloat_inplace_op(uint32_t);
+bfloat_inplace_op(uint64_t);
+
+#undef bfloat_inplace_op_helper
+#undef bfloat_inplace_op_addr_space_helper
+#undef bfloat_inplace_op
+
+#define bfloat_inplace_op_helper(__op__, __operator__, addr_space) \
+  constexpr METAL_FUNC addr_space _MLX_BFloat16& __operator__(     \
+      addr_space _MLX_BFloat16& lhs, _MLX_BFloat16 rhs) {          \
+    lhs = static_cast<float>(lhs) __op__ static_cast<float>(rhs);  \
+    return lhs;                                                    \
+  }
+
+#define bfloat_inplace_op_addr_space_helper(__op__, __operator__) \
+  bfloat_inplace_op_helper(__op__, __operator__, device);         \
+  bfloat_inplace_op_helper(__op__, __operator__, thread);         \
+  bfloat_inplace_op_helper(__op__, __operator__, threadgroup);
+
+bfloat_inplace_op_addr_space_helper(+, operator+=);
+bfloat_inplace_op_addr_space_helper(-, operator-=);
+bfloat_inplace_op_addr_space_helper(*, operator*=);
+bfloat_inplace_op_addr_space_helper(/, operator/=);
+
+#undef bfloat_inplace_op_helper
+#undef bfloat_inplace_op_addr_space_helper
+
+/////////////////////////////////////////////////////////////////////////////
+// Bfloat typedef
+/////////////////////////////////////////////////////////////////////////////
+
+typedef struct _MLX_BFloat16 bfloat16_t;
+
+/////////////////////////////////////////////////////////////////////////////
+// Bfloat numeric limits
+/////////////////////////////////////////////////////////////////////////////
+
+#pragma METAL internals : enable
+
+namespace metal {
+
+template <>
+struct _numeric_limits_impl<bfloat16_t> : _fp_numeric_limits_impl_base {
+  static constexpr constant int digits = 8; // 7 stored mantissa bits + 1
+  static constexpr constant int digits10 = 2;
+  static constexpr constant int max_digits10 = 4;
+  static constexpr constant int radix = 2;
+  static constexpr constant int min_exponent = -125;
+  static constexpr constant int min_exponent10 = -37;
+  static constexpr constant int max_exponent = 128;
+  static constexpr constant int max_exponent10 = 38;
+  // Member functions return raw patterns: 1 sign, 8 exponent, 7 mantissa bits
+  static constexpr bfloat16_t min() { // smallest positive normal
+    return _MLX_BFloat16(0x0080, _MLX_BFloat16::bits_to_bfloat());
+  }
+  static constexpr bfloat16_t lowest() { // most negative finite
+    return _MLX_BFloat16(0xFF7F, _MLX_BFloat16::bits_to_bfloat());
+  }
+  static constexpr bfloat16_t max() { // largest finite
+    return _MLX_BFloat16(0x7F7F, _MLX_BFloat16::bits_to_bfloat());
+  }
+  static constexpr bfloat16_t epsilon() { // 2^-7
+    return _MLX_BFloat16(0x3C00, _MLX_BFloat16::bits_to_bfloat());
+  }
+  static constexpr bfloat16_t round_error() { // 0.5
+    return _MLX_BFloat16(0x3F00, _MLX_BFloat16::bits_to_bfloat());
+  }
+  static constexpr bfloat16_t infinity() { // exponent all ones, mantissa 0
+    return _MLX_BFloat16(0x7F80, _MLX_BFloat16::bits_to_bfloat());
+  }
+  static constexpr bfloat16_t quiet_NaN() { // quiet bit set
+    return _MLX_BFloat16(0x7FC0, _MLX_BFloat16::bits_to_bfloat());
+  }
+  static constexpr bfloat16_t signaling_NaN() { // quiet bit clear, not inf
+    return _MLX_BFloat16(0x7FA0, _MLX_BFloat16::bits_to_bfloat());
+  }
+  static constexpr bfloat16_t denorm_min() { // smallest positive subnormal
+    return _MLX_BFloat16(0x0001, _MLX_BFloat16::bits_to_bfloat());
+  }
+};
+
+METAL_FUNC bool isnan(_MLX_BFloat16 x) {
+  return x != x; // only a NaN compares unequal to itself
+}
+
+} // namespace metal
+
+#pragma METAL internals : disable
+inline uint16_t bfloat16_to_uint16(const bfloat16_t x) {
+  return x.bits_; // raw storage bits, no numeric conversion
+}
+
+inline bfloat16_t uint16_to_bfloat16(const uint16_t x) {
+  return _MLX_BFloat16(x, _MLX_BFloat16::bits_to_bfloat()); // x used as raw bits
+}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Awni Hannun	529842fed9	fix	2025-11-03 16:43:19 -08:00
Awni Hannun	cc6df9fc8a	fix	2025-11-03 15:07:01 -08:00
Awni Hannun	742033fefe	remove use of cuda pool, use cuda free async	2025-11-03 09:14:17 -08:00
Awni Hannun	c27a0647a3	load eval gpu for cuda	2025-11-01 13:18:57 -07:00
Awni Hannun	d378567cc6	refactor for regular cuda malloc	2025-10-31 14:12:15 -07:00
Awni Hannun	b84fc978d3	add pool threshold	2025-10-30 10:32:57 -07:00
Awni Hannun	764b4b7ce8	Use async cuda malloc managed with cuda 13	2025-10-30 10:32:57 -07:00