use faster dequant for fp4 qmv (#2720 )

fix docs path (#2719 )
Fix the order of hosts in the ring (#2718 )
2025-12-16 01:49:05 +08:00 · 2025-10-31 11:49:59 -07:00 · 2025-10-30 19:12:49 -05:00 · 2025-10-30 15:02:39 -07:00 · 2025-10-30 14:27:15 -07:00 · 2025-10-30 13:17:20 -07:00
249 changed files with 11431 additions and 3281 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -18,16 +18,17 @@ jobs:
        type: boolean
        default: false
    macos:
-      xcode: "16.2.0"
+      xcode: "26.0.0"
-    resource_class: m2pro.medium
+    resource_class: m4pro.medium
    steps:
      - checkout
      - run:
          name: Install
          command: |
-            brew install python@3.9
+            xcodebuild -downloadComponent MetalToolchain
            brew install python@3.10
            brew install doxygen
-            python3.9 -m venv env
+            python3.10 -m venv env
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
@@ -89,7 +90,8 @@ jobs:
          command: |
            uv venv
            uv pip install cmake
-            uv pip install -e ".[dev]" -v
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
              uv pip install -e ".[dev]" -v
      - run:
          name: Generate package stubs
          command: |
@@ -118,7 +120,7 @@ jobs:
    parameters:
      xcode_version:
        type: string
-        default: "16.2.0"
+        default: "26.0.0"
      macosx_deployment_target:
        type: string
        default: ""
@@ -126,18 +128,19 @@ jobs:
      xcode: << parameters.xcode_version >>
    environment:
      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
-    resource_class: m2pro.medium
+    resource_class: m4pro.medium
    steps:
      - checkout
      - run:
          name: Install dependencies
          command: |
            xcodebuild -downloadComponent MetalToolchain
            HOMEBREW_NO_AUTO_UPDATE=1 HOMEBREW_NO_INSTALL_CLEANUP=1 \
              brew install openmpi uv
      - run:
          name: Install Python package
          command: |
-            uv venv --python 3.9
+            uv venv --python 3.10
            uv pip install \
              nanobind==2.4.0 \
              cmake \
@@ -196,7 +199,7 @@ jobs:
          name: Run Python tests with JIT
          command: |
            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
-              uv pip install -e .
+              uv pip install -e . -v
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
              METAL_DEBUG_ERROR_MODE=0 \
              uv run --no-project python -m xmlrunner discover \
@@ -222,15 +225,20 @@ jobs:
            sudo apt-get update
            sudo apt-get install libcudnn9-dev-cuda-12
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
            sudo apt-get install libnccl2 libnccl-dev
            curl -sL https://github.com/ccache/ccache/releases/download/v4.11.3/ccache-4.11.3-linux-x86_64.tar.xz | tar xJf -
            sudo mv ccache-4.11.3-linux-x86_64/ccache /usr/bin/ccache
            rm -rf ccache-4.11.3-linux-x86_64
            curl -LsSf https://astral.sh/uv/install.sh | sh
      - run:
          name: Set CCache size
          command: ccache --max-size 1G
      - run:
          name: Install Python package
          command: |
            uv venv
-            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+            uv pip install cmake
            DEBUG=1 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
              uv pip install -e ".[dev]" -v
      - run:
          name: Run Python tests
@@ -238,12 +246,23 @@ jobs:
            source .venv/bin/activate
            LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
            LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
      - run:
          name: Build CPP only
          command: |
            source .venv/bin/activate
            cmake . -B build \
              -DMLX_BUILD_CUDA=ON \
              -DCMAKE_CUDA_COMPILER=`which nvcc` \
              -DCMAKE_BUILD_TYPE=DEBUG
            cmake --build build -j `nproc`
      - run:
          name: Run CPP tests
          command: ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
      - run:
          name: CCache report
          command: |
            ccache --show-stats
            ccache --zero-stats
            ccache --max-size 400MB
            ccache --cleanup
      - save_cache:
          key: cuda-<< parameters.image_date >>-{{ arch }}-{{ epoch }}
@@ -254,10 +273,10 @@ jobs:
    parameters:
      python_version:
        type: string
-        default: "3.9"
+        default: "3.10"
      xcode_version:
        type: string
-        default: "16.2.0"
+        default: "26.0.0"
      build_env:
        type: string
        default: ""
@@ -266,7 +285,7 @@ jobs:
        default: ""
    macos:
      xcode: << parameters.xcode_version >>
-    resource_class: m2pro.medium
+    resource_class: m4pro.medium
    environment:
      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
    steps:
@@ -274,11 +293,15 @@ jobs:
      - run:
          name: Install dependencies
          command: |
-            brew install python@<< parameters.python_version >>
+            xcodebuild -downloadComponent MetalToolchain
-            brew install openmpi
+            mkdir -p ~/miniconda3
-            python<< parameters.python_version >> -m venv env
+            curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o ~/miniconda3/miniconda.sh
-            source env/bin/activate
+            bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
-            pip install --upgrade pip
+            rm ~/miniconda3/miniconda.sh
            source ~/miniconda3/bin/activate
            conda init --all
            conda create -n env python=<< parameters.python_version >> -y
            conda activate env
            pip install --upgrade cmake
            pip install nanobind==2.4.0
            pip install --upgrade setuptools
@@ -288,29 +311,29 @@ jobs:
      - run:
          name: Install Python package
          command: |
-            source env/bin/activate
+            conda activate env
            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
              pip install . -v
      - run:
          name: Generate package stubs
          command: |
-            source env/bin/activate
+            conda activate env
            pip install typing_extensions
            python setup.py generate_stubs
      - run:
          name: Build Python package
          command: |
-            source env/bin/activate
+            conda activate env
            python setup.py clean --all
            << parameters.build_env >> MLX_BUILD_STAGE=1 python -m build -w
      - when:
          condition:
-            equal: ["3.9", << parameters.python_version >>]
+            equal: ["3.10", << parameters.python_version >>]
          steps:
            - run:
                name: Build common package
                command: |
-                  source env/bin/activate
+                  conda activate env
                  python setup.py clean --all
                  << parameters.build_env >> MLX_BUILD_STAGE=2 python -m build -w
      - when:
@@ -319,7 +342,7 @@ jobs:
            - run:
                name: Upload package
                command: |
-                  source env/bin/activate
+                  conda activate env
                  twine upload dist/*
      - store_artifacts:
          path: dist/
@@ -328,7 +351,7 @@ jobs:
    parameters:
      python_version:
        type: string
-        default: "3.9"
+        default: "3.10"
      build_env:
        type: string
        default: ""
@@ -364,7 +387,7 @@ jobs:
            bash python/scripts/repair_linux.sh
      - when:
          condition:
-            equal: ["3.9", << parameters.python_version >>]
+            equal: ["3.10", << parameters.python_version >>]
          steps:
            - run:
                name: Build common package
@@ -392,7 +415,7 @@ jobs:
        default: ""
    machine:
      image: ubuntu-2204:current
-      resource_class: large
+      resource_class: xlarge
    steps:
      - checkout
      - run:
@@ -439,7 +462,7 @@ workflows:
      - mac_build_and_test:
          matrix:
            parameters:
-              macosx_deployment_target: ["13.5", "14.0"]
+              macosx_deployment_target: ["13.5", "15.0"]
      - linux_build_and_test
      - cuda_build_and_test:
          matrix:
@@ -461,71 +484,10 @@ workflows:
              ignore: /.*/
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              macosx_deployment_target: ["13.5", "14.0", "15.0"]
              build_env: ["PYPI_RELEASE=1"]
-              xcode_version: ["16.2.0", "15.0.0"]
+              xcode_version: ["26.0.0"]
            exclude:
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.9"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.10"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.11"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.12"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.13"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.9"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.10"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.11"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.12"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.13"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.9"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.10"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.11"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.12"
                build_env: "PYPI_RELEASE=1"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.13"
                build_env: "PYPI_RELEASE=1"
      - build_documentation:
          filters:
            tags:
@@ -541,7 +503,7 @@ workflows:
              ignore: /.*/
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              build_env: ["PYPI_RELEASE=1"]
      - build_cuda_release:
          filters:
@@ -567,7 +529,7 @@ workflows:
          requires: [ hold ]
          matrix:
            parameters:
-              macosx_deployment_target: ["13.5", "14.0"]
+              macosx_deployment_target: ["13.5", "15.0"]
      - linux_build_and_test:
          requires: [ hold ]
      - cuda_build_and_test:
@@ -584,59 +546,13 @@ workflows:
      - build_release:
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              macosx_deployment_target: ["13.5", "14.0", "15.0"]
-              xcode_version: ["16.2.0", "15.0.0"]
+              xcode_version: ["26.0.0"]
            exclude:
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.9"
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.10"
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.11"
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.12"
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.13"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.9"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.10"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.11"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.12"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.13"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.9"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.10"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.11"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.12"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.13"
      - build_linux_release:
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
      - build_cuda_release
  build_dev_release:
@@ -648,75 +564,14 @@ workflows:
      - build_release:
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              macosx_deployment_target: ["13.5", "14.0", "15.0"]
              build_env: ["DEV_RELEASE=1"]
-              xcode_version: ["16.2.0", "15.0.0"]
+              xcode_version: ["26.0.0"]
            exclude:
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.9"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.10"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.11"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.12"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "13.5"
                xcode_version: "16.2.0"
                python_version: "3.13"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.9"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.10"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.11"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.12"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "14.0"
                xcode_version: "15.0.0"
                python_version: "3.13"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.9"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.10"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.11"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.12"
                build_env: "DEV_RELEASE=1"
              - macosx_deployment_target: "15.0"
                xcode_version: "15.0.0"
                python_version: "3.13"
                build_env: "DEV_RELEASE=1"
      - build_linux_release:
          matrix:
            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
              build_env: ["DEV_RELEASE=1"]
      - build_cuda_release:
          matrix:
--- a/.github/actions/build-cuda-release/action.yml
+++ b/.github/actions/build-cuda-release/action.yml
@@ -0,0 +1,24 @@
 # Composite action: build the MLX wheel with the CUDA backend enabled.
 name: "Build CUDA wheel"
 description: "Build CUDA wheel"
 inputs:
  # Path to the nvcc binary; forwarded to CMake as CMAKE_CUDA_COMPILER.
  nvcc-location:
    required: true
    description: "Location of nvcc compiler"
 runs:
  using: composite
  steps:
    # Clean any previous build output, build the wheel, then run the CUDA
    # repair script when it exists in the checkout.
    - name: Build package
      env:
        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
        MLX_BUILD_STAGE: 2
      shell: bash
      run: |
        pip install auditwheel build patchelf setuptools
        python setup.py clean --all
        python -m build -w
        if [ -f "python/scripts/repair_cuda.sh" ]; then
          bash python/scripts/repair_cuda.sh
        fi
--- a/.github/actions/build-cuda/action.yml
+++ b/.github/actions/build-cuda/action.yml
@@ -0,0 +1,68 @@
 # Composite action: install MLX with the CUDA backend enabled, optionally run
 # the Python and C++ test suites, and build a wheel for release builds.
 name: 'Build and Test with CUDA'
 description: 'Build and test MLX with CUDA'
 inputs:
  # Selects the final steps: 'debug' builds the C++ targets, 'release' builds
  # the wheel through the build-cuda-release action.
  build-type:
    description: 'Build type (debug, release)'
    required: false
    default: 'debug'
  # Compared against the string 'true' in the step conditions below.
  run-tests:
    description: 'Whether to run tests'
    required: false
    default: 'true'
  # Passed to CMake as CMAKE_CUDA_COMPILER and forwarded to build-cuda-release.
  nvcc-location:
    description: 'Location of nvcc compiler'
    required: true
    default: '/usr/local/cuda-12.9/bin/nvcc'
    # this value is dependent on the CUDA tools installed in the setup-linux workflow
 runs:
  using: "composite"
  steps:
    # Editable install with the dev extras; DEBUG=1 and warnings-as-errors are
    # set through the environment for this step only.
    - name: Install Python package
      shell: bash
      env:
        DEBUG: 1
        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }}
      run: pip install -e ".[dev]" -v
    # Smoke test: the step fails if the freshly built extension cannot be imported.
    - name: Check if build actually worked
      shell: bash
      run: python -c "import mlx.core"
    - name: Run Python tests - CPU
      if: inputs.run-tests == 'true'
      shell: bash
      env:
        LOW_MEMORY: 1
        DEVICE: cpu
      run: python -m unittest discover python/tests -v
    # NOTE(review): this runs the `tests` module rather than `unittest`;
    # presumably a project-provided runner for GPU runs - confirm.
    - name: Run Python tests - GPU
      if: inputs.run-tests == 'true'
      shell: bash
      env:
        LOW_MEMORY: 1
        DEVICE: gpu
      run: python -m tests discover python/tests -v
    # Standalone CMake configure + build of the C++ targets in ./build (debug only).
    - name: Build CPP only
      if: inputs.build-type == 'debug'
      shell: bash
      run: |
        cmake . -B build \
          -DMLX_BUILD_CUDA=ON \
          -DCMAKE_CUDA_COMPILER=${{ inputs.nvcc-location }} \
          -DCMAKE_BUILD_TYPE=DEBUG
        cmake --build build -j $(nproc)
    # NOTE(review): -sfe is given the fft and linalg test-file patterns;
    # presumably it excludes those source files from the run - confirm.
    - name: Run CPP tests
      if: ${{ inputs.build-type == 'debug' && inputs.run-tests == 'true' }}
      shell: bash
      run: ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
    # Release builds delegate wheel creation to the sibling composite action.
    - name: Build Python package
      if: inputs.build-type == 'release'
      uses: ./.github/actions/build-cuda-release
      with:
        nvcc-location: ${{ inputs.nvcc-location }}
--- a/.github/actions/build-docs/action.yml
+++ b/.github/actions/build-docs/action.yml
@@ -0,0 +1,38 @@
 # Composite action: build the documentation and upload it as a tar artifact
 # named github-pages.
 name: 'Build Documentation'
 description: 'Build documentation on a mac'
 runs:
  using: "composite"
  steps:
    # Reuses the macOS setup action with its default inputs.
    - name: Setup machine
      uses: ./.github/actions/setup-macos
    # Installs doxygen, the docs requirements, and the package itself.
    - name: Install dependencies
      shell: sh
      run: |
        brew install doxygen
        uv pip install --upgrade pip cmake
        uv pip install -r docs/requirements.txt
        uv pip install . -v
    # Runs doxygen and then the html make target from inside docs/.
    # NOTE(review): O=-W presumably turns Sphinx warnings into errors - confirm.
    - name: Build documentation
      shell: bash
      run: |
        source .venv/bin/activate
        cd docs
        doxygen
        make html O=-W
    # Packs docs/build/html and docs/index.html into artifact.tar, following
    # symlinks (--dereference).
    - name: Create artifact tar
      shell: sh
      run: tar -cf artifact.tar --cd docs --dereference build/html index.html
    # Do it manually because upload-pages-artifact requires gtar
    # The artifact is kept for one day and the step errors if the tar is missing.
    - name: Upload artifact
      id: upload-artifact
      uses: actions/upload-artifact@v5
      with:
        name: github-pages
        path: artifact.tar
        retention-days: 1
        if-no-files-found: error
--- a/.github/actions/build-linux/action.yml
+++ b/.github/actions/build-linux/action.yml
@@ -0,0 +1,78 @@
 # Composite action: install MLX on Linux, optionally run the Python,
 # distributed and C++ tests, and build wheels for release builds.
 name: 'Build and Test on Linux'
 description: 'Build and test MLX on Linux'
 inputs:
  # 'debug' exports DEBUG=1 and builds the C++ targets; 'release' builds wheels.
  build-type:
    description: 'Build type'
    required: false
    default: 'debug'
    type: choice
    options:
      - debug
      - release
  # Compared against the string 'true' in the step conditions below.
  run-tests:
    description: 'Whether to run tests'
    required: false
    default: 'true'
    type: boolean
 runs:
  using: "composite"
  steps:
    # Written to GITHUB_ENV so DEBUG=1 is visible to every later step.
    - name: Set DEBUG
      shell: sh
      if: inputs.build-type == 'debug'
      run: echo "DEBUG=1" >> $GITHUB_ENV
    # Editable install with the dev extras; compiler warnings are errors.
    - name: Install Python package
      shell: sh
      env:
        CMAKE_ARGS: "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
      run: pip install -e ".[dev]" -v
    - name: Generate package stubs
      shell: sh
      run: |
        pip install typing_extensions
        python setup.py generate_stubs
    # Runs the unit tests, the MPI test with 8 processes, and the ring test
    # with 8 processes. The ring test's stderr is copied to stderr.log, and
    # any line containing the literal text [WARN] fails the step.
    - name: Run Python tests
      if: inputs.run-tests == 'true'
      shell: bash
      run: |
        python -m unittest discover python/tests -v
        mpirun --bind-to none --allow-run-as-root -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
        if grep -Fq '[WARN]' stderr.log ; then
          grep -F '[WARN]' stderr.log
          echo "Distributed ring test failed";
          exit 1;
        fi
    # Standalone debug build of the C++ targets with Metal disabled.
    - name: Build CPP only
      if: inputs.build-type == 'debug'
      shell: bash
      run: |
        mkdir -p build && cd build
        cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
        make -j $(nproc)
    - name: Run CPP tests
      if: ${{ inputs.build-type == 'debug' && inputs.run-tests == 'true' }}
      shell: sh
      run: ./build/tests/tests
    # Builds the stage 1 wheel, runs the repair script when present, then
    # builds the stage 2 wheel and repairs the mlx_cpu wheel with auditwheel.
    # NOTE(review): the meaning of each MLX_BUILD_STAGE value is defined by
    # setup.py, which is not visible here - confirm.
    - name: Build Python package
      if: inputs.build-type == 'release'
      shell: bash
      run: |
        pip install auditwheel patchelf build
        python setup.py clean --all
        MLX_BUILD_STAGE=1 python -m build -w
        if [ -f "python/scripts/repair_linux.sh" ]; then
          bash python/scripts/repair_linux.sh
        fi
        python setup.py clean --all
        MLX_BUILD_STAGE=2 python -m build -w
        auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_x86_64
--- a/.github/actions/build-macos-release/action.yml
+++ b/.github/actions/build-macos-release/action.yml
@@ -0,0 +1,22 @@
 # Composite action: build the MLX wheels for one macOS deployment target.
 name: "Build macOS release"
 description: "Build MLX releases macOS"
 inputs:
  # Exported to the build as MACOSX_DEPLOYMENT_TARGET.
  macos-target:
    required: false
    default: "15.0"
    description: "macOS build target"
 runs:
  using: composite
  steps:
    # Two wheel builds (MLX_BUILD_STAGE 1, then 2), each preceded by a clean.
    - name: Build Python package(s)
      env:
        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
      shell: bash
      run: |
        uv pip install build
        uv run --no-project setup.py clean --all
        MLX_BUILD_STAGE=1 uv run -m build -w
        uv run --no-project setup.py clean --all
        MLX_BUILD_STAGE=2 uv run -m build -w
--- a/.github/actions/build-macos/action.yml
+++ b/.github/actions/build-macos/action.yml
@@ -0,0 +1,124 @@
 # Composite action: install MLX on macOS, optionally run the Python, example
 # extension, C++ and JIT tests, and build wheels for release builds.
 name: 'Build and Test on macOS'
 description: 'Build and test MLX on macOS'
 inputs:
  # 'debug' builds the standalone C++ targets; 'release' builds the wheels.
  build-type:
    description: 'Build type (debug, release)'
    required: false
    default: 'debug'
    type: choice
    options:
      - debug
      - release
  # Compared against the string 'true' in the step conditions.
  run-tests:
    description: 'Whether to run tests'
    required: false
    default: 'true'
  # Compared against the string 'true'; gates the JIT build and JIT test steps.
  build-jit:
    description: 'Whether to build with JIT'
    required: false
    default: 'true'
 runs:
  using: "composite"
  steps:
    # Installs build tooling and the test-time packages, then MLX itself as an
    # editable install with DEBUG=1 and DEV_RELEASE=1 in the environment.
    - name: Install dependencies
      shell: sh
      env:
        DEBUG: 1
        DEV_RELEASE: 1
      run: |
        uv pip install --upgrade pip cmake setuptools
        uv pip install nanobind==2.4.0 \
          numpy torch tensorflow unittest-xml-reporting
        uv pip install -e . -v
    - name: Generate package stubs
      shell: bash
      run: |
        uv pip install typing_extensions
        uv run --no-project setup.py generate_stubs
    # Runs the Python suite on CPU and GPU, then the MPI test and the ring
    # distributed test with 8 processes each. The ring test's stderr is copied
    # to stderr.log, and any line containing the literal text [WARN] fails the
    # step. The check tests grep's exit status directly: the earlier
    # `if $(grep ...)` form executed the matched text as a command, so a
    # warning never triggered the failure branch.
    - name: Run Python tests
      if: inputs.run-tests == 'true'
      shell: bash
      env:
        LOW_MEMORY: 1
      run: |
        DEVICE=cpu uv run -m xmlrunner discover -v python/tests -o test-results/cpu
        DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 uv run -m xmlrunner discover -v python/tests -o test-results/gpu
        mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
        if grep -Fq '[WARN]' stderr.log; then
          grep -F '[WARN]' stderr.log
          echo "Distributed ring test failed"
          exit 1
        fi
    # Builds the example extension in place and runs its test script.
    - name: Build example extension
      if: inputs.run-tests == 'true'
      shell: bash
      run: |
        cd examples/extensions
        uv pip install -r requirements.txt
        uv run --no-project setup.py build_ext --inplace
        uv run --no-project test.py
    # Standalone CMake build of the C++ targets in ./build with default options.
    - name: Build CPP only
      if: inputs.build-type == 'debug'
      shell: bash
      run: |
        mkdir -p build
        cd build
        cmake ..
        make -j $(sysctl -n hw.ncpu)
    - name: Run CPP tests
      if: ${{ inputs.build-type == 'debug' && inputs.run-tests == 'true' }}
      shell: bash
      env:
        DEVICE: gpu
        METAL_DEVICE_WRAPPER_TYPE: 1
        METAL_DEBUG_ERROR_MODE: 0
      run: ./build/tests/tests
    # Reconfigures the same ./build directory as a MinSizeRel shared-library
    # build with the Metal JIT on and CPU, safetensors and GGUF support off.
    - name: Build small binary with JIT
      if: inputs.build-jit == 'true'
      shell: bash
      run: |
        mkdir -p build
        cd build
        cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
          -DBUILD_SHARED_LIBS=ON \
          -DMLX_BUILD_CPU=OFF \
          -DMLX_BUILD_SAFETENSORS=OFF \
          -DMLX_BUILD_GGUF=OFF \
          -DMLX_METAL_JIT=ON
        make -j $(sysctl -n hw.ncpu)
    # Reinstalls the Python package with the Metal JIT enabled and reruns the
    # Python tests on the GPU, writing results to test-results/gpu_jit.
    - name: Run Python tests with JIT
      if: ${{ inputs.build-jit == 'true' && inputs.run-tests == 'true' }}
      shell: bash
      env:
        LOW_MEMORY: 1
        DEVICE: gpu
        METAL_DEVICE_WRAPPER_TYPE: 1
        METAL_DEBUG_ERROR_MODE: 0
      run: |
        CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
          uv pip install -e . -v
        uv run -m xmlrunner discover \
            -v python/tests \
            -o test-results/gpu_jit
    # Release builds produce one set of wheels per macOS deployment target.
    # The targets are quoted so YAML keeps them as version strings; an unquoted
    # 13.0 is parsed as a float and can lose its ".0" before it reaches
    # MACOSX_DEPLOYMENT_TARGET in the build-macos-release action.
    - name: Build macOS 13 package
      if: inputs.build-type == 'release'
      uses: ./.github/actions/build-macos-release
      with:
        macos-target: '13.0'
    - name: Build macOS 14 package
      if: inputs.build-type == 'release'
      uses: ./.github/actions/build-macos-release
      with:
        macos-target: '14.0'
    - name: Build macOS 15 package
      if: inputs.build-type == 'release'
      uses: ./.github/actions/build-macos-release
      with:
        macos-target: '15.0'
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@@ -0,0 +1,83 @@
 # Composite action: prepare a Linux runner (system packages, Python venv, and
 # either MPI or the CUDA toolkit depending on runner-type).
 name: 'Setup Linux Environment'
 description: 'Install dependencies for Linux builds'
 inputs:
  # 'linux' frees disk space and installs MPI; 'cuda' installs the CUDA
  # toolkit and prints a driver report.
  runner-type:
    description: 'Whether to set this up as a linux or CUDA runner'
    required: false
    default: 'linux'
    type: choice
    options:
      - linux
      - cuda
  # Forwarded to actions/setup-python.
  python-version:
    description: 'Version of python to set up'
    required: false
    default: '3.10'
 runs:
  using: "composite"
  steps:
    # Deletes the directory named by AGENT_TOOLSDIRECTORY (linux runners only).
    - name: Free disk space
      shell: sh
      if: inputs.runner-type == 'linux'
      run: sudo rm -rf "$AGENT_TOOLSDIRECTORY"
    # BLAS/LAPACK development packages plus tzdata and zip, for both runner types.
    - name: Install common dependencies
      env:
        TZ: Etc/UTC
      shell: bash
      run: |
        sudo apt-get update
        sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev tzdata zip
        sudo apt autoremove -y
    - uses: actions/setup-python@v6
      with:
        python-version: ${{ inputs.python-version }}
        cache: 'pip'
    # Creates and activates .venv, then writes the resulting PATH to GITHUB_ENV
    # so later steps pick up the venv's executables.
    - name: setup python venv
      shell: bash
      run: |
        python -m venv .venv
        source .venv/bin/activate
        echo PATH=$PATH >> $GITHUB_ENV
        pip install --upgrade pip cmake
    - name: Install MPI
      if: inputs.runner-type == 'linux'
      shell: bash
      run: sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev
    # Registers NVIDIA's apt repository via the cuda-keyring package, then
    # installs cuDNN, NCCL and the CUDA 12.9 toolkit (cuda runners only).
    - name: Network CUDA installation from packages
      id: install-cuda
      if: inputs.runner-type == 'cuda'
      env:
        TZ: Etc/UTC
      shell: bash ## Specific to Ubuntu 22.04 & Architecture x86_64
      run: |
        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
        sudo dpkg -i cuda-keyring_1.1-1_all.deb
        sudo apt-get update
        sudo apt-get install -y libcudnn9-dev-cuda-12 libnccl2 libnccl-dev cuda-toolkit-12-9
      # Note: This installs CUDA 12.9, which is the latest supported by cuDNN 9.x and works with the NVidia 570 drivers
      # cuda-toolkit by itself installs version 13 (+) and requires updated drivers (580+), which require a reboot to function properly.
      # Compatibility matrix: https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
      # This also drops `nvcc` into `/usr/local/cuda-12.9/bin/nvcc` - but it's *not* on the default PATH
    # Diagnostic output only. The commands followed by `|| echo ...` print a
    # fallback message instead of failing the step when the tool is missing.
    - name: Package and Driver Report
      if: inputs.runner-type == 'cuda'
      shell: bash
      run: |
        sudo apt-get install -y ubuntu-drivers-common dkms
        echo "NVIDIA Driver Packages Available:"
        sudo ubuntu-drivers list --gpgpu
        echo "NVIDIA Driver Version:"
        cat /proc/driver/nvidia/version || echo "nvidia driver not found"
        echo "Installed NVIDIA and CUDA packages:"
        dpkg -l | egrep "cuda|nvidia" -i
        echo "DKMS Status:"
        dkms status || echo "dkms not found"
        echo "NVIDIA-SMI Status:"
        nvidia-smi || echo "nvidia-smi not found"
--- a/.github/actions/setup-macos/action.yml
+++ b/.github/actions/setup-macos/action.yml
@@ -0,0 +1,31 @@
 # Composite action that prepares a macOS runner: optionally installs OpenMPI
 # through Homebrew, checks that the Metal toolchain component is present, and
 # sets up uv with the requested Python version.
 name: 'Setup macOS Environment'
 description: 'Install dependencies for macOS builds'
 inputs:
  # Action inputs are always delivered as strings (action metadata has no
  # `type` key), which is why the step below compares against 'true'.
  install-mpi:
    description: 'Whether to install MPI'
    required: false
    default: 'true'
  python-version:
    description: 'Python version to use'
    required: false
    default: '3.10'
 runs:
  using: "composite"
  steps:
    - name: Install Homebrew packages
      shell: sh
      if: inputs.install-mpi == 'true'
      run: /opt/homebrew/bin/brew install openmpi
    - name: Verify MetalToolchain installed
      shell: bash
      run: xcodebuild -showComponent MetalToolchain
    - name: Setup uv
      uses: astral-sh/setup-uv@v6
      with:
        python-version: ${{ inputs.python-version }}
        activate-environment: true
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
 # Dependabot configuration: check the repository's GitHub Actions
 # dependencies for updates once a week.
 version: 2
 updates:
  - package-ecosystem: 'github-actions'
    directory: '/'
    schedule:
      interval: 'weekly'
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -0,0 +1,28 @@
 # Manually triggered workflow: build the documentation on a self-hosted
 # macOS runner, then deploy it to GitHub Pages.
 name: Documentation
 on:
  workflow_dispatch:
 # Default token scope; the deploy job widens it for itself only.
 permissions:
  contents: read
 jobs:
  build:
    runs-on:
      - self-hosted
      - macos
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/build-docs
  deploy:
    runs-on: ubuntu-latest
    needs: build
    # Pages deployment needs to write to Pages and request an OIDC token.
    permissions:
      pages: write
      id-token: write
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - id: deployment
        name: Deploy to GitHub Pages
        uses: actions/deploy-pages@v4
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -0,0 +1,93 @@
 # Nightly workflow: builds release wheels and runs the test builds for Linux,
 # macOS and CUDA. Also runnable on demand through workflow_dispatch.
 name: Nightly Build
 on:
  schedule:
    # 06:33 UTC, Monday through Friday.
    - cron: '33 6 * * 1-5'
  workflow_dispatch:
 permissions:
  contents: read
 jobs:
  # Linux release wheels, one matrix leg per Python version.
  build_linux_release:
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.10", "3.14"]
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
          # Forward the matrix entry; without it every leg is set up with the
          # action's default Python and the matrix has no effect on the build.
          python-version: ${{ matrix.python_version }}
      - uses: ./.github/actions/build-linux
        with:
          build-type: release
          run-tests: false
      - name: Upload mlx artifacts
        uses: actions/upload-artifact@v5
        with:
          name: linux-wheels-${{ matrix.python_version }}
          path: wheelhouse/mlx-*.whl
          retention-days: 7
      # The mlx-cpu wheel is uploaded from a single leg only, so the artifact
      # name stays unique across the matrix.
      - name: Upload mlx-cpu artifacts
        if: matrix.python_version == '3.10'
        uses: actions/upload-artifact@v5
        with:
          name: mlx-cpu
          path: wheelhouse/mlx_cpu-*.whl
          retention-days: 7
  # Linux build using the build-linux action's defaults, for every listed
  # Python version.
  build_linux_with_tests:
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
          python-version: ${{ matrix.python_version }}
      - uses: ./.github/actions/build-linux
  # macOS build on the self-hosted runners.
  build_mac_release:
    strategy:
      matrix:
        python-version: ["3.10", "3.13"]
        # TODO: 3.14 had issues finding a compatible tensorflow
    env:
      MACOSX_DEPLOYMENT_TARGET: "15.0"
    runs-on: [self-hosted, macos]
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-macos
        with:
          python-version: ${{ matrix.python-version }}
      - uses: ./.github/actions/build-macos
  # CUDA build on a GPU runner.
  build_cuda_with_tests:
    runs-on: gpu-t4-4-core
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
          runner-type: 'cuda'
      - uses: ./.github/actions/build-cuda
  # CUDA release wheel; nvcc is passed by absolute path because the toolkit
  # install does not put it on the default PATH.
  build_cuda_release:
    runs-on: ubuntu-22-large
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
          runner-type: 'cuda'
      - name: Build Python package
        uses: ./.github/actions/build-cuda-release
        with:
          nvcc-location: '/usr/local/cuda-12.9/bin/nvcc'
      - name: Upload artifacts
        uses: actions/upload-artifact@v5
        with:
          name: mlx-cuda
          path: wheelhouse/mlx_cuda-*.whl
          retention-days: 7
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -1,20 +1,46 @@
-on:
+name: Build and Test
-  pull_request:
+
-    branches:
+on: pull_request  
-      - main
+
 permissions:
  contents: read
 jobs:
  check_lint:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
-      - uses: actions/setup-python@v4
+      - uses: ./.github/actions/setup-linux
      - uses: pre-commit/action@v3.0.1
  linux_build_and_test:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
      - uses: ./.github/actions/build-linux
  mac_build_and_test:
    runs-on: [self-hosted, macos]
    needs: check_lint
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-macos
      - uses: ./.github/actions/build-macos
  cuda_build_and_test:
    runs-on: gpu-t4-4-core
    needs: check_lint
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
-          python-version: 3.8
+          runner-type: 'cuda'
-      - name: Install dependencies
+      - uses: ./.github/actions/build-cuda
-        run: |
+
-          python -m pip install --upgrade pip
+  build_documentation:
-          pip install pre-commit black isort clang-format
+    runs-on: [self-hosted, macos]
-      - name: Run lint
+    needs: check_lint
-        run: |
+    steps:
-          pre-commit run --all-files
+      - uses: actions/checkout@v5
      - uses: ./.github/actions/build-docs
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -0,0 +1,188 @@
 # Release workflow: runs on `v*` tags (or manually), builds the docs and the
 # Linux / macOS / CUDA wheels, then stages them in the PyPI publish jobs.
 # The actual publish steps are currently commented out.
 name: PyPI Release
 on:
  push:
    tags:
      - 'v*'
  workflow_dispatch:
 permissions:
  contents: read
 jobs:
  # ---- Documentation ----
  build_documentation:
    runs-on:
      - self-hosted
      - macos
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/build-docs
  deploy_documentation:
    runs-on: ubuntu-latest
    needs: build_documentation
    permissions:
      pages: write
      id-token: write
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - id: deployment
        name: Deploy to GitHub Pages
        uses: actions/deploy-pages@v4
  # ---- Wheel builds ----
  build_linux_release:
    runs-on: ubuntu-22.04
    strategy:
      matrix:
        python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
    env:
      PYPI_RELEASE: "1"
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
          python-version: ${{ matrix.python_version }}
      - uses: ./.github/actions/build-linux
        with:
          build-type: release
          run-tests: false
      - name: Upload MLX artifacts
        uses: actions/upload-artifact@v5
        with:
          name: linux-wheels-${{ matrix.python_version }}
          path: wheelhouse/mlx-*.whl
      # Uploaded from the 3.10 leg only so the artifact name is unique.
      - name: Upload CPU artifacts
        if: matrix.python_version == '3.10'
        uses: actions/upload-artifact@v5
        with:
          name: mlx-cpu
          path: wheelhouse/mlx_cpu-*.whl
  build_mac_release:
    runs-on:
      - self-hosted
      - macos
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12", "3.13"]
        # TODO: 3.14 had issues finding a compatible tensorflow
    env:
      PYPI_RELEASE: "1"
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-macos
        with:
          python-version: ${{ matrix.python-version }}
      - uses: ./.github/actions/build-macos
        with:
          build-type: release
      - name: Upload MLX artifacts
        uses: actions/upload-artifact@v5
        with:
          name: mac-wheels-${{ matrix.python-version }}
          path: dist/mlx-*.whl
      # Uploaded from the 3.10 leg only so the artifact name is unique.
      - name: Upload Metal artifacts
        if: matrix.python-version == '3.10'
        uses: actions/upload-artifact@v5
        with:
          name: mlx-metal
          path: dist/mlx_metal-*.whl
  build_cuda_release:
    runs-on: ubuntu-22-large
    env:
      PYPI_RELEASE: "1"
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/setup-linux
        with:
          runner-type: 'cuda'
      - name: Build Python package
        uses: ./.github/actions/build-cuda-release
        with:
          nvcc-location: '/usr/local/cuda-12.9/bin/nvcc'
      - name: Upload artifacts
        uses: actions/upload-artifact@v5
        with:
          name: mlx-cuda
          path: wheelhouse/mlx_cuda-*.whl
  # ---- Publish jobs (one per PyPI project) ----
  # Each job downloads the matching artifacts into ./artifacts and lists
  # them; `id-token: write` is requested for the job.
  pypi-publish:
    name: Upload release to PyPI
    runs-on: ubuntu-latest
    needs:
      - build_linux_release
      - build_mac_release
    permissions:
      id-token: write
    environment:
      name: pypi
      url: https://pypi.org/p/mlx
    steps:
      - uses: actions/download-artifact@v6
        with:
          pattern: linux-wheels-*
          merge-multiples: true
          path: artifacts
      - uses: actions/download-artifact@v6
        with:
          pattern: mac-wheels-*
          merge-multiples: true
          path: artifacts
      - name: Display structure of downloaded files
        run: ls -R artifacts
      # - name: Publish package distributions to PyPI
      #  uses: pypa/gh-action-pypi-publish@release/v1
  pypi-publish-cuda:
    name: Upload CUDA release to PyPI
    runs-on: ubuntu-latest
    needs: build_cuda_release
    permissions:
      id-token: write
    environment:
      name: pypi
      url: https://pypi.org/p/mlx-cuda
    steps:
      - uses: actions/download-artifact@v6
        with:
          name: mlx-cuda
          path: artifacts
      - name: Display structure of downloaded files
        run: ls -R artifacts
      # - name: Publish package distributions to PyPI
      #  uses: pypa/gh-action-pypi-publish@release/v1
  pypi-publish-cpu:
    name: Upload CPU release to PyPI
    runs-on: ubuntu-latest
    needs: build_linux_release
    permissions:
      id-token: write
    environment:
      name: pypi
      url: https://pypi.org/p/mlx-cpu
    steps:
      - uses: actions/download-artifact@v6
        with:
          name: mlx-cpu
          path: artifacts
      - name: Display structure of downloaded files
        run: ls -R artifacts
      # - name: Publish package distributions to PyPI
      #  uses: pypa/gh-action-pypi-publish@release/v1
  pypi-publish-metal:
    name: Upload Metal release to PyPI
    runs-on: ubuntu-latest
    needs: build_mac_release
    permissions:
      id-token: write
    environment:
      name: pypi
      url: https://pypi.org/p/mlx-metal
    steps:
      - uses: actions/download-artifact@v6
        with:
          name: mlx-metal
          path: artifacts
      - name: Display structure of downloaded files
        run: ls -R artifacts
      # - name: Publish package distributions to PyPI
      #  uses: pypa/gh-action-pypi-publish@release/v1
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,10 @@
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v6.0.0
    hooks:
    -   id: check-yaml
    # -   id: end-of-file-fixer
    # -   id: trailing-whitespace
 -   repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.7
    hooks:
--- a/ACKNOWLEDGMENTS.md
+++ b/ACKNOWLEDGMENTS.md
@@ -19,12 +19,17 @@ MLX was developed with contributions from the following individuals:
 - Gleb Pobudzey: Added the `where` primitive, and groups in 1D and 2D convolutions.
 - Paul Paczuski: Improved stability of BCE loss calculation
 - Max-Heinrich Laves: Added `conv_transpose1d`, `conv_transpose2d`, and `conv_transpose3d` ops.
- Gökdeniz Gülmez: Added the `Muon (MomentUm Orthogonalized by Newton-schulz)` optimizer.
+- Gökdeniz Gülmez: Added the `Muon (MomentUm Orthogonalized by Newton-schulz)` optimizer, and the `ReLU²` activation function.
 <a href="https://github.com/ml-explore/mlx/graphs/contributors">
  <img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />
 </a>
 # Organizations
 MLX has received contributions from the following companies:
 - NVIDIA Corporation & Affiliates
 # Third-Party Software
 MLX leverages several third-party software, listed here together with
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,7 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_INSTALL_MESSAGE NEVER)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 # ----------------------------- Configuration -----------------------------
 option(MLX_BUILD_TESTS "Build tests for mlx" ON)
@@ -87,22 +88,26 @@ cmake_policy(SET CMP0135 NEW)
 add_library(mlx)
-if(MLX_BUILD_METAL)
+# Suppress warnings: note: parameter passing for argument of type
-  set(METAL_LIB "-framework Metal")
+# ‘std::pair<float, float>’ when C++17 is enabled changed to match C++14 in GCC
-  set(FOUNDATION_LIB "-framework Foundation")
+# 10.1
-  set(QUARTZ_LIB "-framework QuartzCore")
+target_compile_options(mlx PRIVATE -Wno-psabi)
 endif()
 if(MLX_BUILD_CUDA)
  enable_language(CUDA)
 endif()
-if(MLX_BUILD_METAL AND NOT METAL_LIB)
+if(MLX_BUILD_METAL)
-  message(STATUS "Metal not found. Unable to build GPU")
+  find_library(METAL_LIB Metal)
-  set(MLX_BUILD_METAL OFF)
+  find_library(FOUNDATION_LIB Foundation)
-  set(MLX_METAL_DEBUG OFF)
+  find_library(QUARTZ_LIB QuartzCore)
-elseif(MLX_BUILD_METAL)
+  if(METAL_LIB)
-  message(STATUS "Building METAL sources")
+    message(STATUS "Metal found ${METAL_LIB}")
  else()
    message(
      FATAL_ERROR
        "Metal not found. Set MLX_BUILD_METAL=OFF to build without GPU")
  endif()
  if(MLX_METAL_DEBUG)
    add_compile_definitions(MLX_METAL_DEBUG)
@@ -111,7 +116,8 @@ elseif(MLX_BUILD_METAL)
  # Throw an error if xcrun not found
  execute_process(
    COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
-    OUTPUT_VARIABLE MACOS_SDK_VERSION COMMAND_ERROR_IS_FATAL ANY)
+    OUTPUT_VARIABLE MACOS_SDK_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ERROR_IS_FATAL ANY)
  if(${MACOS_SDK_VERSION} LESS 14.0)
    message(
@@ -140,6 +146,12 @@ elseif(MLX_BUILD_METAL)
  target_link_libraries(mlx PUBLIC ${METAL_LIB} ${FOUNDATION_LIB} ${QUARTZ_LIB})
 endif()
 if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
  # With newer clang/gcc versions following libs are implicitly linked, but when
  # building on old distributions they need to be explicitly listed.
  target_link_libraries(mlx PRIVATE dl pthread)
 endif()
 if(WIN32)
  if(MSVC)
    # GGUF does not build with MSVC.
@@ -167,7 +179,7 @@ if(MLX_BUILD_CPU)
    message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
    set(MLX_BUILD_ACCELERATE ON)
  else()
-    message(STATUS "Accelerate or arm neon not found, using default backend.")
+    message(STATUS "Accelerate not found, using default backend.")
    set(MLX_BUILD_ACCELERATE OFF)
  endif()
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 [**Quickstart**](#quickstart) | [**Installation**](#installation) |
 [**Documentation**](https://ml-explore.github.io/mlx/build/html/index.html) |
-[**Examples**](#examples) 
+[**Examples**](#examples)
 [![CircleCI](https://circleci.com/gh/ml-explore/mlx.svg?style=svg)](https://circleci.com/gh/ml-explore/mlx)
@@ -11,37 +11,37 @@ brought to you by Apple machine learning research.
 Some key features of MLX include:
- - **Familiar APIs**: MLX has a Python API that closely follows NumPy. MLX
+- **Familiar APIs**: MLX has a Python API that closely follows NumPy. MLX
   also has fully featured C++, [C](https://github.com/ml-explore/mlx-c), and
   [Swift](https://github.com/ml-explore/mlx-swift/) APIs, which closely mirror
   the Python API. MLX has higher-level packages like `mlx.nn` and
   `mlx.optimizers` with APIs that closely follow PyTorch to simplify building
   more complex models.
- - **Composable function transformations**: MLX supports composable function
+- **Composable function transformations**: MLX supports composable function
-   transformations for automatic differentiation, automatic vectorization,
+  transformations for automatic differentiation, automatic vectorization,
-   and computation graph optimization.
+  and computation graph optimization.
- - **Lazy computation**: Computations in MLX are lazy. Arrays are only
+- **Lazy computation**: Computations in MLX are lazy. Arrays are only
-   materialized when needed.
+  materialized when needed.
- - **Dynamic graph construction**: Computation graphs in MLX are constructed
+- **Dynamic graph construction**: Computation graphs in MLX are constructed
-   dynamically. Changing the shapes of function arguments does not trigger
+  dynamically. Changing the shapes of function arguments does not trigger
-   slow compilations, and debugging is simple and intuitive.
+  slow compilations, and debugging is simple and intuitive.
- - **Multi-device**: Operations can run on any of the supported devices
+- **Multi-device**: Operations can run on any of the supported devices
-   (currently the CPU and the GPU).
+  (currently the CPU and the GPU).
- - **Unified memory**: A notable difference from MLX and other frameworks
+- **Unified memory**: A notable difference from MLX and other frameworks
-   is the *unified memory model*. Arrays in MLX live in shared memory.
+  is the *unified memory model*. Arrays in MLX live in shared memory.
-   Operations on MLX arrays can be performed on any of the supported
+  Operations on MLX arrays can be performed on any of the supported
-   device types without transferring data.
+  device types without transferring data.
 MLX is designed by machine learning researchers for machine learning
 researchers. The framework is intended to be user-friendly, but still efficient
 to train and deploy models. The design of the framework itself is also
 conceptually simple. We intend to make it easy for researchers to extend and
-improve MLX with the goal of quickly exploring new ideas. 
+improve MLX with the goal of quickly exploring new ideas.
 The design of MLX is inspired by frameworks like
 [NumPy](https://numpy.org/doc/stable/index.html),
@@ -91,7 +91,7 @@ Checkout the
 [documentation](https://ml-explore.github.io/mlx/build/html/install.html#)
 for more information on building the C++ and Python APIs from source.
-## Contributing 
+## Contributing
 Check out the [contribution guidelines](https://github.com/ml-explore/mlx/tree/main/CONTRIBUTING.md) for more information
 on contributing to MLX. See the
@@ -110,7 +110,7 @@ Hannun, Jagrit Digani, Angelos Katharopoulos, and Ronan Collobert. If you find
 MLX useful in your research and wish to cite it, please use the following
 BibTex entry:
-```
+```text
@software{mlx2023,
  author = {Awni Hannun and Jagrit Digani and Angelos Katharopoulos and Ronan Collobert},
  title = {{MLX}: Efficient and flexible machine learning on Apple silicon},
--- a/benchmarks/python/blas/bench_gemm.py
+++ b/benchmarks/python/blas/bench_gemm.py
@@ -142,9 +142,7 @@ def bench_shape(B, M, N, K, np_dtype, transpose="nn"):
    t_b = (0, 1, 2) if transpose[1] == "n" else (0, 2, 1)
    c_mlx = a_mx.transpose(t_a) @ b_mx.transpose(t_b)
-    c_npy = a_np.transpose(t_a).astype(np.float32) @ b_np.transpose(t_b).astype(
+    c_npy = a_np.transpose(t_a).astype(np_dtype) @ b_np.transpose(t_b).astype(np_dtype)
        np.float32
    )
    atol = 1e-5 if np_dtype == np.float32 else 1e-4
@@ -163,7 +161,7 @@ def get_gflop_count(B, M, N, K):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run gemm benchmarks")
-    dtypes = ("float32", "float16")
+    dtypes = ("float32", "float16", "complex64")
    transposes = ("nn", "nt", "tn")
    shapes = (
        (16, 234, 768, 3072),
@@ -187,7 +185,7 @@ if __name__ == "__main__":
                diff = gflops_mx / gflops_pt - 1.0
                print(
-                    f"{B:3d}, {M:4d}, {N:4d}, {K:4d}, {dtype}, {transpose}, {gflops_pt:05.3f}, {gflops_mx:05.3f}, {100. * diff:+5.2f}%"
+                    f"{B:3d}, {M:4d}, {N:4d}, {K:4d}, {dtype}, {transpose}, {gflops_pt:05.3f}, {gflops_mx:05.3f}, {100.0 * diff:+5.2f}%"
                )
                if gflops_pt >= 2.0 * gflops_mx:
                    print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/blas/bench_gemv.py
+++ b/benchmarks/python/blas/bench_gemv.py
@@ -196,7 +196,7 @@ def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, transpose):
 for transpose in (False, True):
-    for dtype in ("float32", "float16"):
+    for dtype in ("float32", "float16", "complex64"):
        fig, axs = plt.subplots(
            len(in_vec_sizes), 2, figsize=(8.5, 11), layout="constrained"
        )
@@ -215,7 +215,7 @@ for transpose in (False, True):
        fig.suptitle(f"{device_name}: {dtype} {op_name}")
        fig.savefig(
            os.path.join(
-                results_dir, f'{device_name.replace(" ", "_")}_{dtype}_{op_name}.pdf'
+                results_dir, f"{device_name.replace(' ', '_')}_{dtype}_{op_name}.pdf"
            )
        )
        plt.close(fig)
--- a/cmake/FindNCCL.cmake
+++ b/cmake/FindNCCL.cmake
@@ -0,0 +1,54 @@
 # FindNCCL.cmake
 #
 # Find module for the NVIDIA NCCL library and its include directories.
 #
 # Search hints: NCCL_ROOT_DIR (seeded from the environment variable of the same
 # name), NCCL_INCLUDE_DIR, NCCL_LIB_DIR and CUDA_TOOLKIT_ROOT_DIR. When the
 # USE_STATIC_NCCL environment variable evaluates to true the static archive is
 # searched for instead of the shared library.
 #
 # Results: NCCL_FOUND, NCCL_INCLUDE_DIRS, NCCL_LIBRARIES and, when the header
 # defines NCCL_MAJOR, NCCL_MAJOR_VERSION.
 set(NCCL_ROOT_DIR $ENV{NCCL_ROOT_DIR} CACHE PATH "Folder contains NVIDIA NCCL")

 # Directory containing nccl.h.
 find_path(NCCL_INCLUDE_DIRS
  NAMES nccl.h
  HINTS
    ${NCCL_INCLUDE_DIR}
    ${NCCL_ROOT_DIR}
    ${NCCL_ROOT_DIR}/include
    ${CUDA_TOOLKIT_ROOT_DIR}/include)

 # Choose between the static archive and the regular library name.
 if($ENV{USE_STATIC_NCCL})
  message(STATUS "USE_STATIC_NCCL detected. Linking against static NCCL library")
  set(NCCL_LIBNAME "libnccl_static.a")
 else()
  set(NCCL_LIBNAME "nccl")
 endif()

 find_library(NCCL_LIBRARIES
  NAMES ${NCCL_LIBNAME}
  HINTS
    ${NCCL_LIB_DIR}
    ${NCCL_ROOT_DIR}
    ${NCCL_ROOT_DIR}/lib
    ${NCCL_ROOT_DIR}/lib/x86_64-linux-gnu
    ${NCCL_ROOT_DIR}/lib64
    ${CUDA_TOOLKIT_ROOT_DIR}/lib
    ${CUDA_TOOLKIT_ROOT_DIR}/lib64)

 # Sets NCCL_FOUND once both the header directory and the library were located.
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(
  NCCL DEFAULT_MSG
  NCCL_INCLUDE_DIRS NCCL_LIBRARIES)

 if(NCCL_FOUND)
  # Read the major version from the first `#define NCCL_MAJOR <n>` line.
  set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
  message(STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}")
  file(STRINGS ${NCCL_HEADER_FILE} NCCL_MAJOR_VERSION_DEFINED
    REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$"
    LIMIT_COUNT 1)
  if(NCCL_MAJOR_VERSION_DEFINED)
    # Strip everything up to the number, leaving the remainder of the line.
    string(
      REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+" ""
      NCCL_MAJOR_VERSION ${NCCL_MAJOR_VERSION_DEFINED})
    message(STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}")
  endif()
  message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
  mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
 endif()
--- a/docs/src/dev/custom_metal_kernels.rst
+++ b/docs/src/dev/custom_metal_kernels.rst
@@ -127,7 +127,7 @@ relying on a copy from ``ensure_row_contiguous``:
      name="myexp_strided",
      input_names=["inp"],
      output_names=["out"],
-      source=source
+      source=source,
      ensure_row_contiguous=False,
  )
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -16,7 +16,7 @@ silicon computer is
 To install from PyPI your system must meet the following requirements:
 - Using an M series chip (Apple silicon)
- Using a native Python >= 3.9
+- Using a native Python >= 3.10
 - macOS >= 13.5
 .. note::
@@ -39,7 +39,7 @@ requirements:
 - Nvidia driver >= 550.54.14
 - CUDA toolkit >= 12.0
 - Linux distribution with glibc >= 2.35
- Python >= 3.9
+- Python >= 3.10
 CPU-only (Linux)
@@ -55,7 +55,7 @@ To install the CPU-only package from PyPi your system must meet the following
 requirements:
 - Linux distribution with glibc >= 2.35
- Python >= 3.9
+- Python >= 3.10
 Troubleshooting
@@ -271,7 +271,7 @@ and the CUDA toolkit. For example on Ubuntu, run the following:
   dpkg -i cuda-keyring_1.1-1_all.deb
   apt-get update -y
   apt-get -y install cuda-toolkit-12-9
-   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+   apt-get install libblas-dev liblapack-dev liblapacke-dev libcudnn9-dev-cuda-12 -y
 When building either the Python or C++ APIs make sure to pass the cmake flag
--- a/docs/src/python/nn/functions.rst
+++ b/docs/src/python/nn/functions.rst
@@ -27,6 +27,7 @@ simple functions.
   mish
   prelu
   relu
   relu2
   relu6
   selu
   sigmoid
--- a/docs/src/python/nn/layers.rst
+++ b/docs/src/python/nn/layers.rst
@@ -50,6 +50,7 @@ Layers
   QuantizedLinear
   RMSNorm
   ReLU
   ReLU2
   ReLU6
   RNN
   RoPE
--- a/docs/src/python/ops.rst
+++ b/docs/src/python/ops.rst
@@ -112,6 +112,7 @@ Operations
   max
   maximum
   mean
   median
   meshgrid
   min
   minimum
--- a/docs/src/usage/compile.rst
+++ b/docs/src/usage/compile.rst
@@ -130,8 +130,8 @@ Now make an array, and benchmark both functions:
 .. code-block:: python
  x = mx.random.uniform(shape=(32, 1000, 4096))
-  timeit(nn.gelu, x)
+  timeit(gelu, x)
-  timeit(mx.compile(nn.gelu), x)
+  timeit(mx.compile(gelu), x)
 On an M1 Max the times are 15.5 and 3.1 milliseconds. The compiled ``gelu`` is
 five times faster.
--- a/docs/src/usage/distributed.rst
+++ b/docs/src/usage/distributed.rst
@@ -184,7 +184,7 @@ almost identical to the example above:
    def step(model, x, y):
        loss, grads = loss_grad_fn(model, x, y)
-        grads = mlx.nn.average_gradients(grads) # <---- This line was added
+        grads = mx.nn.average_gradients(grads)  # <---- This line was added
        optimizer.update(model, grads)
        return loss
--- a/docs/src/usage/export.rst
+++ b/docs/src/usage/export.rst
@@ -164,11 +164,11 @@ to export a function which can be used for inputs with variable shapes:
 .. code-block:: python
-  mx.export_function("fun.mlxfn", mx.abs, mx.array(0.0), shapeless=True)
+  mx.export_function("fun.mlxfn", mx.abs, mx.array([0.0]), shapeless=True)
  imported_abs = mx.import_function("fun.mlxfn")
  # Ok
-  out, = imported_abs(mx.array(-1.0))
+  out, = imported_abs(mx.array([-1.0]))
  # Also ok
  out, = imported_abs(mx.array([-1.0, -2.0]))
--- a/docs/src/usage/indexing.rst
+++ b/docs/src/usage/indexing.rst
@@ -107,8 +107,20 @@ same array:
  >>> a
  array([1, 2, 0], dtype=int32)
 Note that unlike NumPy, slicing an array creates a copy, not a view. So
 mutating it does not mutate the original array:
-Note, unlike NumPy, updates to the same location are nondeterministic:
+.. code-block:: shell
  >>> a = mx.array([1, 2, 3])
  >>> b = a[:]
  >>> b[2] = 0
  >>> b
  array([1, 2, 0], dtype=int32)
  >>> a
  array([1, 2, 3], dtype=int32)
 Also unlike NumPy, updates to the same location are nondeterministic:
 .. code-block:: shell
--- a/mlx/array.cpp
+++ b/mlx/array.cpp
@@ -241,8 +241,8 @@ array::ArrayDesc::ArrayDesc(
    std::vector<array> inputs)
    : shape(std::move(shape)),
      dtype(dtype),
      status(Status::unscheduled),
      primitive(std::move(primitive)),
      status(Status::unscheduled),
      inputs(std::move(inputs)) {
  init();
 }
--- a/mlx/backend/common/matmul.h
+++ b/mlx/backend/common/matmul.h
@@ -13,7 +13,7 @@ inline std::tuple<Shape, Strides, Strides> collapse_batches(
    const array& a,
    const array& b) {
  if (a.ndim() == 2) {
-    return {{1}, {0}, {0}};
+    return {Shape{1}, Strides{0}, Strides{0}};
  }
  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
@@ -38,7 +38,7 @@ inline std::tuple<Shape, Strides, Strides> collapse_batches(
 inline std::tuple<Shape, Strides, Strides, Strides>
 collapse_batches(const array& a, const array& b, const array& c) {
  if (a.ndim() == 2) {
-    return {{1}, {0}, {0}, {0}};
+    return {Shape{1}, Strides{0}, Strides{0}, Strides{0}};
  }
  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
--- a/mlx/backend/common/ternary.h
+++ b/mlx/backend/common/ternary.h
@@ -11,6 +11,8 @@ namespace mlx::core {
 enum class TernaryOpType {
  ScalarScalarScalar,
  VectorVectorVector,
  VectorVectorScalar,
  VectorScalarVector,
  General,
 };
@@ -25,6 +27,14 @@ get_ternary_op_type(const array& a, const array& b, const array& c) {
      (a.flags().col_contiguous && b.flags().col_contiguous &&
       c.flags().col_contiguous)) {
    topt = TernaryOpType::VectorVectorVector;
  } else if (
      b.data_size() == 1 && a.flags().row_contiguous &&
      c.flags().row_contiguous) {
    topt = TernaryOpType::VectorScalarVector;
  } else if (
      c.data_size() == 1 && a.flags().row_contiguous &&
      b.flags().row_contiguous) {
    topt = TernaryOpType::VectorVectorScalar;
  } else {
    topt = TernaryOpType::General;
  }
@@ -59,6 +69,8 @@ inline void set_ternary_op_output_data(
            b.flags());
      }
      break;
    case TernaryOpType::VectorVectorScalar:
    case TernaryOpType::VectorScalarVector:
    case TernaryOpType::General:
      // Try to donate an input which is row_contiguous
      if (!((a.flags().row_contiguous && maybe_donate(a)) ||
--- a/mlx/backend/cpu/compiled.cpp
+++ b/mlx/backend/cpu/compiled.cpp
@@ -15,6 +15,7 @@
 #include "mlx/backend/cpu/jit_compiler.h"
 #include "mlx/device.h"
 #include "mlx/graph_utils.h"
 #include "mlx/version.h"
 namespace mlx::core {
@@ -94,7 +95,11 @@ void* compile(
    kernel_file_name = kernel_name;
  }
-  auto output_dir = std::filesystem::temp_directory_path();
+  auto output_dir =
      std::filesystem::temp_directory_path() / "mlx" / version() / "cpu";
  if (!std::filesystem::exists(output_dir)) {
    std::filesystem::create_directories(output_dir);
  }
  std::string shared_lib_name = "lib" + kernel_file_name + ".so";
  auto shared_lib_path = (output_dir / shared_lib_name).string();
--- a/mlx/backend/cpu/conv.cpp
+++ b/mlx/backend/cpu/conv.cpp
@@ -996,131 +996,6 @@ void explicit_gemm_conv_1D_cpu(
  encoder.add_temporaries(std::move(temps));
 }
 void explicit_gemm_conv_2D_cpu(
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    Stream stream) {
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
  const int iH = in.shape(1); // Input spatial dim
  const int iW = in.shape(2); // Input spatial dim
  const int oH = out.shape(1); // Output spatial dim
  const int oW = out.shape(2); // Output spatial dim
  const int O = wt.shape(0); // Out channels
  const int C = wt.shape(3); // In channels
  const int wH = wt.shape(1); // Weight spatial dim
  const int wW = wt.shape(2); // Weight spatial dim
  auto conv_dtype = out.dtype();
  auto& encoder = cpu::get_command_encoder(stream);
  // Pad input
  Shape padded_shape = {
      N,
      iH + padding_lo[0] + padding_hi[0],
      iW + padding_lo[1] + padding_hi[1],
      C};
  array in_padded(padded_shape, conv_dtype, nullptr, {});
  // Fill with zeros
  std::vector<array> temps;
  temps.push_back(array(0, conv_dtype));
  copy_cpu(temps.back(), in_padded, CopyType::Scalar, stream);
  // Pick input slice from padded
  size_t data_offset = padding_lo[0] * in_padded.strides()[1] +
      padding_lo[1] * in_padded.strides()[2];
  array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
  in_padded_slice.copy_shared_buffer(
      in_padded,
      in_padded.strides(),
      in_padded.flags(),
      in_padded_slice.size(),
      data_offset);
  temps.push_back(in_padded_slice);
  // Copy input values into the slice
  copy_cpu_inplace(in, in_padded_slice, CopyType::GeneralGeneral, stream);
  // Make strided view
  Shape strided_shape = {N, oH, oW, wH, wW, C};
  Strides strided_strides = {
      in_padded.strides()[0],
      in_padded.strides()[1] * wt_strides[0],
      in_padded.strides()[2] * wt_strides[1],
      in_padded.strides()[1],
      in_padded.strides()[2],
      in_padded.strides()[3]};
  auto flags = in_padded.flags();
  array in_strided_view(strided_shape, in_padded.dtype(), nullptr, {});
  in_strided_view.copy_shared_buffer(
      in_padded, strided_strides, flags, in_strided_view.size(), 0);
  // Materialize strided view
  Shape strided_reshape = {N * oH * oW, wH * wW * C};
  array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
  copy_cpu(in_strided_view, in_strided, CopyType::General, stream);
  temps.push_back(in_strided);
  // Check wt dtype and prepare
  auto gemm_wt = wt;
  auto gemm_out = out;
  if (wt.dtype() != float32 || !wt.flags().row_contiguous) {
    auto ctype =
        wt.flags().row_contiguous ? CopyType::Vector : CopyType::General;
    gemm_wt = array(wt.shape(), float32, nullptr, {});
    copy_cpu(wt, gemm_wt, ctype, stream);
    temps.push_back(gemm_wt);
  }
  if (out.dtype() != float32) {
    gemm_out = array(out.shape(), float32, nullptr, {});
    gemm_out.set_data(allocator::malloc(gemm_out.nbytes()));
    temps.push_back(gemm_out);
  }
  encoder.set_input_array(in_strided);
  encoder.set_input_array(gemm_wt);
  encoder.set_output_array(gemm_out);
  encoder.dispatch([in_strided_ptr = in_strided.data<float>(),
                    gemm_wt_ptr = gemm_wt.data<float>(),
                    gemm_out_ptr = gemm_out.data<float>(),
                    strided_reshape = std::move(strided_reshape),
                    O]() {
    // Perform gemm
    cblas_sgemm(
        CblasRowMajor,
        CblasNoTrans, // no trans A
        CblasTrans, // transB
        strided_reshape[0], // M
        O, // N
        strided_reshape[1], // K
        1.0f, // alpha
        in_strided_ptr,
        strided_reshape[1], // lda
        gemm_wt_ptr,
        strided_reshape[1], // ldb
        0.0f, // beta
        gemm_out_ptr,
        O // ldc
    );
  });
  // Copy results if needed
  if (out.dtype() != float32) {
    copy_cpu_inplace(gemm_out, out, CopyType::Vector, stream);
  }
  encoder.add_temporaries(std::move(temps));
 }
 void explicit_gemm_conv_ND_cpu(
    const array& in,
    const array& wt,
--- a/mlx/backend/cpu/eig.cpp
+++ b/mlx/backend/cpu/eig.cpp
@@ -46,7 +46,6 @@ void eig_impl(
    int info;
    {
      T work;
      int iwork;
      geev<T>(
          &jobl,
          &jobr,
--- a/mlx/backend/cpu/gemms/bnns.cpp
+++ b/mlx/backend/cpu/gemms/bnns.cpp
@@ -1,5 +1,4 @@
 // Copyright © 2023-2024 Apple Inc.
 #include <Accelerate/Accelerate.h>
 #include "mlx/array.h"
@@ -49,9 +48,15 @@ void matmul_bnns(
  size_t K = a_shape[ndim - 1];
  BNNSDataType bnns_dtype = to_bnns_dtype<T>();
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
  if (beta != 1.0 && beta != 0.0) {
    // scale the output
    for (auto i = 0; i < batch_size * M * N; ++i) {
      out[i] *= beta;
    }
    beta = 1.0;
  }
  const BNNSLayerParametersBroadcastMatMul gemm_params{
      /* float alpha = */ alpha,
      /* float beta = */ beta,
--- a/mlx/backend/cpu/gemms/cblas.cpp
+++ b/mlx/backend/cpu/gemms/cblas.cpp
@@ -88,4 +88,47 @@ void matmul<double>(
  }
 }
 // complex64 specialization of the batched matmul: issues one cblas_cgemm
 // call per batch element.
 //
 // a, b      : input matrices; per-batch offsets are resolved through
 //             elem_to_loc using the supplied shapes/strides.
 // out       : output buffer, written densely (batch i starts at M * N * i).
 // *_transposed : select CblasTrans vs CblasNoTrans (plain transpose, not a
 //             conjugate transpose).
 // lda/ldb/ldc : leading dimensions forwarded to cblas_cgemm.
 // alpha/beta  : real scalars, converted to complex64_t before the call.
 template <>
 void matmul<complex64_t>(
    const complex64_t* a,
    const complex64_t* b,
    complex64_t* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
    size_t ldc,
    float alpha,
    float beta,
    size_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  auto ndim = a_shape.size();
  size_t M = a_shape[ndim - 2];
  size_t N = b_shape[ndim - 1];
  size_t K = a_shape[ndim - 1];
  // cblas_cgemm takes alpha/beta by pointer to a complex value.
  auto calpha = static_cast<complex64_t>(alpha);
  auto cbeta = static_cast<complex64_t>(beta);
  // The counter is size_t to match batch_size: avoids a signed/unsigned
  // comparison and keeps the M * K * i offsets in unsigned arithmetic.
  for (size_t i = 0; i < batch_size; ++i) {
    cblas_cgemm(
        CblasRowMajor,
        a_transposed ? CblasTrans : CblasNoTrans, // transA
        b_transposed ? CblasTrans : CblasNoTrans, // transB
        M,
        N,
        K,
        &calpha,
        a + elem_to_loc(M * K * i, a_shape, a_strides),
        lda,
        b + elem_to_loc(K * N * i, b_shape, b_strides),
        ldb,
        &cbeta,
        out + M * N * i,
        ldc);
  }
 }
 } // namespace mlx::core
--- a/mlx/backend/cpu/masked_mm.cpp
+++ b/mlx/backend/cpu/masked_mm.cpp
@@ -215,18 +215,18 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  encoder.set_input_array(a);
  encoder.set_input_array(b);
-  const void* a_mask_ptr;
+  const void* a_mask_ptr = nullptr;
-  const void* b_mask_ptr;
+  const void* b_mask_ptr = nullptr;
-  const void* out_mask_ptr;
+  const void* out_mask_ptr = nullptr;
  Shape a_mask_shape;
  Shape b_mask_shape;
  Shape out_mask_shape;
  Strides a_mask_strides;
  Strides b_mask_strides;
  Strides out_mask_strides;
-  bool a_mask_bool;
+  bool a_mask_bool = false;
-  bool b_mask_bool;
+  bool b_mask_bool = false;
-  bool out_mask_bool;
+  bool out_mask_bool = false;
  if (has_op_mask) {
    auto& a_mask = inputs[inputs.size() - 2];
    auto& b_mask = inputs[inputs.size() - 1];
@@ -423,7 +423,6 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& rhs_indices = inputs[3];
  auto batch_shape = get_batch_dims(out.shape());
  int batch_ndim = batch_shape.size();
  auto batch_shape_A = get_batch_dims(a.shape());
  auto batch_strides_A = get_batch_dims(a.strides());
--- a/mlx/backend/cpu/matmul.cpp
+++ b/mlx/backend/cpu/matmul.cpp
@@ -91,7 +91,6 @@ void matmul_general(
  auto [b_transposed, ldb, b] = check_transpose(b_pre);
  size_t M = a.shape(-2);
  size_t N = b.shape(-1);
  size_t K = a.shape(-1);
  if (M == 0 || N == 0) {
    return;
  }
@@ -108,6 +107,9 @@ void matmul_general(
  } else if (out.dtype() == float64) {
    matmul_dispatch<double>(
        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
  } else if (out.dtype() == complex64) {
    matmul_dispatch<complex64_t>(
        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
  } else {
    throw std::runtime_error("[Matmul::eval_cpu] Invalid type.");
  }
@@ -128,10 +130,6 @@ void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
 }
 void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  if (out.dtype() != float32) {
    throw std::runtime_error(
        "[AddMM::eval_cpu] Currently only supports float32.");
  }
  if (out.size() == 0) {
    out.set_data(allocator::malloc(out.nbytes()));
    return;
--- a/mlx/backend/cpu/quantized.cpp
+++ b/mlx/backend/cpu/quantized.cpp
@@ -1,10 +1,11 @@
 // Copyright © 2023 Apple Inc.
-#include <cassert>
+#include "mlx/backend/common/unary.h"
 #include "mlx/backend/cpu/copy.h"
 #include "mlx/backend/cpu/encoder.h"
 #include "mlx/backend/cpu/simd/simd.h"
 #include "mlx/backend/cpu/unary.h"
 #include "mlx/backend/cpu/unary_ops.h"
 #include "mlx/fast_primitives.h"
 #include "mlx/primitives.h"
 #include "mlx/utils.h"
@@ -13,6 +14,35 @@ namespace mlx::core {
 namespace {
 // Decode table for 4-bit weight codes: the index is the raw nibble. Entries
 // 8..15 are the negated values of entries 0..7 (bit 3 acts as the sign).
 const static float MXFP4_LUT[16] = {
    // codes 0-7
    +0.0f, +0.5f, +1.0f, +1.5f, +2.0f, +3.0f, +4.0f, +6.0f,
    // codes 8-15
    -0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f};
 // Turns an 8-bit scale code into a value of type T by building a bfloat16
 // bit pattern: the code is shifted left by 7 bits; code 0 is special-cased
 // to the pattern 0x40 instead of an all-zero pattern.
 // NOTE(review): presumably `s` is a biased power-of-two exponent and the
 // shift places it in the bfloat16 exponent field - confirm against the
 // quantization format.
 template <typename T>
 static inline T dequantize_scale(uint8_t s) {
  union {
    bfloat16_t f;
    uint16_t i;
  } pattern;
  if (s == 0) {
    pattern.i = 0x40;
  } else {
    pattern.i = static_cast<uint16_t>(s) << 7;
  }
  return static_cast<T>(pattern.f);
 }
 // Number of quantized values stored per packing unit for a given bit width.
 // 3- and 5-bit values pack 8 per unit, 6-bit values pack 4 per unit; every
 // other width packs wsize / bits values into a word of `wsize` bits.
 inline constexpr short get_pack_factor(int bits, int wsize = 8) {
  if (bits == 3 || bits == 5) {
    return 8;
  }
  if (bits == 6) {
    return 4;
  }
  return wsize / bits;
 }
@@ -407,6 +437,229 @@ void _qmm_dispatch(
  }
 }
 // Scalar kernel for result = x * W with W stored non-transposed as packed
 // 4-bit codes.
 //
 // result : output, M rows of N values, written row by row.
 // x      : input, read sequentially (K values per output row).
 // w      : packed weights, read as a byte stream (two 4-bit codes per byte,
 //          low nibble first); restarted from the beginning for every row.
 // scales : one scale byte per group of 32 consecutive output columns,
 //          consumed in the same order as the weights.
 //
 // The inner loops always process a full group of 32 columns, so N is
 // expected to be a multiple of group_size.
 template <typename T>
 void mxfp4_qmm(
    T* result,
    const T* x,
    const uint32_t* w,
    const uint8_t* scales,
    int M,
    int N,
    int K) {
  constexpr int group_size = 32;
  // get_pack_factor(4, 8) == 8 / 4: two codes per byte.
  constexpr int pack_factor = get_pack_factor(4, 8);
  constexpr int packs_in_group = group_size / pack_factor;
  for (int m = 0; m < M; m++) {
    // Every output row re-reads the whole weight/scale stream.
    const uint8_t* w_local = (const uint8_t*)w;
    const uint8_t* scales_local = scales;
    // Outputs are accumulated in place, so start the row from zero.
    std::fill(result, result + N, 0);
    for (int k = 0; k < K; k++) {
      T* result_local = result;
      // One input element contributes to all N outputs of this row.
      T xi = *x++;
      for (int n = 0; n < N; n += group_size) {
        T scale = dequantize_scale<T>(*scales_local++);
        for (int ng = 0; ng < packs_in_group; ng++) {
          uint8_t wi = *w_local++;
 #pragma clang loop unroll(full)
          for (int p = 0; p < pack_factor; p++) {
            // Decode the low nibble, then shift the next code into place.
            (*result_local++) +=
                xi * scale * static_cast<T>(MXFP4_LUT[wi & 0xf]);
            wi >>= 4;
          }
        }
      }
    }
    result += N;
  }
 }
 // Scalar kernel for result = x * W^T with W stored as packed 4-bit codes,
 // one weight row of K codes per output column.
 //
 // result : output, M rows of N values, written sequentially.
 // x      : input, M rows of K values.
 // w      : packed weights, read as a byte stream (two 4-bit codes per byte,
 //          low nibble first); restarted from the beginning for every row of x.
 // scales : one scale byte per group of 32 consecutive codes.
 //
 // The inner loops always process a full group of 32 codes, so K is expected
 // to be a multiple of group_size.
 template <typename T>
 void mxfp4_qmm_t(
    T* result,
    const T* x,
    const uint32_t* w,
    const uint8_t* scales,
    int M,
    int N,
    int K) {
  constexpr int group_size = 32;
  // get_pack_factor(4, 8) == 8 / 4: two codes per byte.
  constexpr int pack_factor = get_pack_factor(4, 8);
  constexpr int packs_in_group = group_size / pack_factor;
  for (int m = 0; m < M; m++) {
    const uint8_t* w_local = (const uint8_t*)w;
    const uint8_t* scales_local = scales;
    for (int n = 0; n < N; n++) {
      // Each output is the dot product of the current x row with one weight row.
      const T* x_local = x;
      T sum = 0;
      for (int k = 0; k < K; k += group_size) {
        T scale = dequantize_scale<T>(*scales_local++);
        // Accumulate the unscaled group sum first; the scale is applied once
        // per group below.
        T gsum = 0;
        for (int kw = 0; kw < packs_in_group; kw++) {
          uint8_t wi = *w_local++;
 #pragma clang loop unroll(full)
          for (int p = 0; p < pack_factor; p++) {
            gsum += (*x_local++) * static_cast<T>(MXFP4_LUT[wi & 0xf]);
            wi >>= 4;
          }
        }
        sum += scale * gsum;
      }
      *result = sum;
      result++;
    }
    x += K;
  }
 }
 // Decodes the eight 4-bit codes held in the 32-bit word at `w` into a vector
 // of floats via MXFP4_LUT (lane i holds the code at bits [4i, 4i + 3]).
 // Only S == 8 is implemented; any other width throws.
 template <int S>
 simd::Simd<float, S> mxfp4_extract_bits_simd(const uint32_t* w) {
  if constexpr (S != 8) {
    // Appease compiler.. but should never get here
    throw std::runtime_error("Unsupported combination for simd qmm.");
  } else {
    constexpr std::array<uint32_t, 8> nibble_shifts = {
        {0, 4, 8, 12, 16, 20, 24, 28}};
    auto shift_vec(*(simd::Simd<uint32_t, S>*)&nibble_shifts);
    // Broadcast the word to all lanes, then isolate one nibble per lane.
    auto codes = simd::Simd<uint32_t, S>(*w);
    codes = codes >> shift_vec;
    codes = codes & 0xf;
    simd::Simd<float, S> decoded;
    for (int lane = 0; lane < S; ++lane) {
      decoded[lane] = MXFP4_LUT[codes[lane]];
    }
    return decoded;
  }
 }
 // SIMD variant of the transposed kernel (result = x * W^T): the same
 // traversal as the scalar version, but the weights are consumed as 32-bit
 // words (eight 4-bit codes each) and products are accumulated in float
 // vectors of width S = simd::max_size<T>.
 //
 // result : output, M rows of N values, written sequentially.
 // x      : input, M rows of K values.
 // w      : packed weights; restarted from the beginning for every row of x.
 // scales : one scale byte per group of 32 consecutive codes.
 //
 // K is expected to be a multiple of group_size since every iteration
 // consumes a full group. Note that mxfp4_extract_bits_simd only implements
 // S == 8 and throws for any other width.
 template <typename T>
 void mxfp4_qmm_t_simd(
    T* result,
    const T* x,
    const uint32_t* w,
    const uint8_t* scales,
    int M,
    int N,
    int K) {
  constexpr int group_size = 32;
  // Eight 4-bit codes per 32-bit word.
  constexpr int pack_factor = 32 / 4;
  constexpr int packs_in_group = group_size / pack_factor;
  constexpr int S = simd::max_size<T>;
  static_assert(
      S % pack_factor == 0, "SIMD size must be divisible by pack factor");
  // Number of 32-bit words consumed per SIMD step.
  constexpr int packs_per_simd = S / pack_factor;
  for (int m = 0; m < M; m++) {
    const uint32_t* w_local = w;
    const uint8_t* scales_local = scales;
    for (int n = 0; n < N; n++) {
      // Lane-wise accumulator; reduced to a scalar once per output.
      simd::Simd<float, S> acc(0);
      auto x_local = x;
      for (int k = 0; k < K; k += group_size) {
        T scale = dequantize_scale<T>(*scales_local++);
        // Unscaled partial sums for this group; scaled once after the loop.
        simd::Simd<float, S> g_acc(0);
        for (int kw = 0; kw < packs_in_group; kw += packs_per_simd) {
          // Extract bits
          auto wf = mxfp4_extract_bits_simd<S>(w_local);
          w_local += packs_per_simd;
          simd::Simd<float, S> x_simd = simd::load<T, S>(x_local);
          g_acc = g_acc + x_simd * wf;
          x_local += S;
        }
        acc = acc + scale * g_acc;
      }
      *result = T(simd::sum(acc));
      result++;
    }
    x += K;
  }
 }
 // Picks the kernel for one (M x K) by (K x N) product: the non-transposed
 // scalar kernel, or for transposed weights the SIMD kernel when the SIMD
 // width for T is a multiple of 8, falling back to the scalar one otherwise.
 template <typename T>
 void mxfp4_qmm_dispatch_transpose(
    T* result,
    const T* x,
    const uint32_t* w,
    const uint8_t* scales,
    int M,
    int N,
    int K,
    bool transposed_w) {
  if (!transposed_w) {
    mxfp4_qmm<T>(result, x, w, scales, M, N, K);
    return;
  }
  // the simd size must be a multiple of the number of elements per word
  constexpr bool simd_ok = (simd::max_size<T> % 8 == 0);
  if constexpr (simd_ok) {
    mxfp4_qmm_t_simd<T>(result, x, w, scales, M, N, K);
  } else {
    mxfp4_qmm_t<T>(result, x, w, scales, M, N, K);
  }
 }
 // Runs the quantized matmul for element type T over every batch entry of x.
 // Matrix sizes come from the trailing dimensions of x and out; when w has at
 // most two dimensions its per-batch element count is zero, so every batch
 // entry uses the same weight and scale offset.
 template <typename T>
 void mxfp4_qmm_dispatch_typed(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    bool transposed_w) {
  int K = x.shape(-1);
  int M = 1;
  if (x.ndim() > 1) {
    M = x.shape(-2);
  }
  int N = out.shape(-1);

  // Elements per batch entry of the weights / scales (0 => not batched).
  int w_els = 0;
  int g_els = 0;
  if (w.ndim() > 2) {
    w_els = w.shape(-1) * w.shape(-2);
    g_els = scales.shape(-1) * scales.shape(-2);
  }
  int batch_size = x.size() / (K * M);

  auto out_base = out.data<T>();
  auto x_base = x.data<T>();
  auto w_base = w.data<uint32_t>();
  auto scales_base = scales.data<uint8_t>();

  for (int b = 0; b < batch_size; b++) {
    // Inputs may be strided, so logical offsets go through elem_to_loc;
    // the output is written densely.
    auto x_off = elem_to_loc(b * M * K, x.shape(), x.strides());
    auto w_off = elem_to_loc(b * w_els, w.shape(), w.strides());
    auto s_off = elem_to_loc(b * g_els, scales.shape(), scales.strides());
    mxfp4_qmm_dispatch_transpose<T>(
        out_base + b * M * N,
        x_base + x_off,
        w_base + w_off,
        scales_base + s_off,
        M,
        N,
        K,
        transposed_w);
  }
 }
 // Selects the typed implementation from the dtype of x. Only bfloat16,
 // float16 and float32 are handled; anything else raises
 // std::invalid_argument.
 void mxfp4_qmm_dispatch(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    bool transposed_w) {
  const auto dtype = x.dtype();
  if (dtype == bfloat16) {
    mxfp4_qmm_dispatch_typed<bfloat16_t>(out, x, w, scales, transposed_w);
  } else if (dtype == float16) {
    mxfp4_qmm_dispatch_typed<float16_t>(out, x, w, scales, transposed_w);
  } else if (dtype == float32) {
    mxfp4_qmm_dispatch_typed<float>(out, x, w, scales, transposed_w);
  } else {
    throw std::invalid_argument(
        "[quantized_matmul] only floating types are supported");
  }
 }
 template <typename T>
 void _bs_qmm_dispatch_typed(
    array& out,
@@ -513,115 +766,198 @@ void _bs_qmm_dispatch(
  }
 }
 // Gathered variant of the quantized matmul for element type T: for every
 // entry i of the index arrays, multiplies the x matrix selected by
 // lhs_indices[i] with the weight matrix (and its scales) selected by
 // rhs_indices[i], writing the i-th M x N output block.
 template <typename T>
 void mxfp4_bs_qmm_dispatch_typed(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    const array& lhs_indices,
    const array& rhs_indices,
    bool transposed_w) {
  int K = x.shape(-1);
  int M = x.shape(-2);
  int N = out.shape(-1);

  // Elements per weight matrix / per scale matrix.
  int w_els = w.shape(-1) * w.shape(-2);
  int g_els = scales.shape(-1) * scales.shape(-2);

  auto out_base = out.data<T>();
  auto x_base = x.data<T>();
  auto w_base = w.data<uint32_t>();
  auto scales_base = scales.data<uint8_t>();
  auto lhs_base = lhs_indices.data<uint32_t>();
  auto rhs_base = rhs_indices.data<uint32_t>();

  for (int i = 0; i < lhs_indices.size(); i++) {
    // The index arrays may be strided, so look each entry up via elem_to_loc.
    auto lhs_loc = elem_to_loc(i, lhs_indices.shape(), lhs_indices.strides());
    auto rhs_loc = elem_to_loc(i, rhs_indices.shape(), rhs_indices.strides());
    int x_idx = lhs_base[lhs_loc];
    int w_idx = rhs_base[rhs_loc];

    auto x_off = elem_to_loc(x_idx * M * K, x.shape(), x.strides());
    auto w_off = elem_to_loc(w_idx * w_els, w.shape(), w.strides());
    auto s_off = elem_to_loc(w_idx * g_els, scales.shape(), scales.strides());
    mxfp4_qmm_dispatch_transpose<T>(
        out_base + i * M * N,
        x_base + x_off,
        w_base + w_off,
        scales_base + s_off,
        M,
        N,
        K,
        transposed_w);
  }
 }
 // Selects the typed gathered implementation from the dtype of x. Only
 // float32, float16 and bfloat16 are handled; anything else raises
 // std::invalid_argument.
 void mxfp4_bs_qmm_dispatch(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    const array& lhs_indices,
    const array& rhs_indices,
    bool transposed_w) {
  const auto dtype = x.dtype();
  if (dtype == float32) {
    mxfp4_bs_qmm_dispatch_typed<float>(
        out, x, w, scales, lhs_indices, rhs_indices, transposed_w);
  } else if (dtype == float16) {
    mxfp4_bs_qmm_dispatch_typed<float16_t>(
        out, x, w, scales, lhs_indices, rhs_indices, transposed_w);
  } else if (dtype == bfloat16) {
    mxfp4_bs_qmm_dispatch_typed<bfloat16_t>(
        out, x, w, scales, lhs_indices, rhs_indices, transposed_w);
  } else {
    throw std::invalid_argument(
        "[quantized_matmul] only floating types are supported");
  }
 }
 } // namespace
 void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 4);
  auto& x_pre = inputs[0];
  auto& w_pre = inputs[1];
  auto& scales_pre = inputs[2];
  auto& biases_pre = inputs[3];
-  std::vector<array> temps;
+  auto& encoder = cpu::get_command_encoder(stream());
-  auto ensure_row_contiguous = [s = stream(), &temps](const array& arr) {
+  auto ensure_row_contiguous = [s = stream(), &encoder](const array& arr) {
    if (arr.flags().row_contiguous) {
      return arr;
    } else {
-      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
+      auto arr_cpy = array(arr.shape(), arr.dtype(), nullptr, {});
-      copy_cpu(arr, temps.back(), CopyType::General, s);
+      copy_cpu(arr, arr_cpy, CopyType::General, s);
-      return temps.back();
+      encoder.add_temporary(arr_cpy);
      return arr_cpy;
    }
  };
  auto x = ensure_row_contiguous(x_pre);
  auto w = ensure_row_contiguous(w_pre);
  auto scales = ensure_row_contiguous(scales_pre);
  auto biases = ensure_row_contiguous(biases_pre);
  out.set_data(allocator::malloc(out.nbytes()));
  auto& encoder = cpu::get_command_encoder(stream());
  encoder.add_temporaries(std::move(temps));
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(scales);
  encoder.set_input_array(biases);
  encoder.set_output_array(out);
-  encoder.dispatch([out = array::unsafe_weak_copy(out),
+  if (mode_ == QuantizationMode::Affine) {
-                    x = array::unsafe_weak_copy(x),
+    auto biases = ensure_row_contiguous(inputs[3]);
-                    w = array::unsafe_weak_copy(w),
+    encoder.set_input_array(biases);
-                    scales = array::unsafe_weak_copy(scales),
+    encoder.dispatch([out = array::unsafe_weak_copy(out),
-                    biases = array::unsafe_weak_copy(biases),
+                      x = array::unsafe_weak_copy(x),
-                    group_size_ = group_size_,
+                      w = array::unsafe_weak_copy(w),
-                    bits_ = bits_,
+                      scales = array::unsafe_weak_copy(scales),
-                    transpose_ = transpose_]() mutable {
+                      biases = array::unsafe_weak_copy(biases),
-    _qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
+                      group_size_ = group_size_,
-  });
+                      bits_ = bits_,
                      transpose_ = transpose_]() mutable {
      _qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
    });
  } else {
    encoder.dispatch([out = array::unsafe_weak_copy(out),
                      x = array::unsafe_weak_copy(x),
                      w = array::unsafe_weak_copy(w),
                      scales = array::unsafe_weak_copy(scales),
                      transpose_ = transpose_]() mutable {
      mxfp4_qmm_dispatch(out, x, w, scales, transpose_);
    });
  }
 }
 void GatherQMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 6);
  auto& x_pre = inputs[0];
  auto& w_pre = inputs[1];
  auto& scales_pre = inputs[2];
-  auto& biases_pre = inputs[3];
+  auto& lhs_indices = inputs[inputs.size() - 2];
-  auto& lhs_indices = inputs[4];
+  auto& rhs_indices = inputs[inputs.size() - 1];
  auto& rhs_indices = inputs[5];
-  std::vector<array> temps;
+  auto& encoder = cpu::get_command_encoder(stream());
  auto ensure_row_contiguous_last_dims = [s = stream(),
-                                          &temps](const array& arr) {
+                                          &encoder](const array& arr) {
    auto stride_0 = arr.strides()[arr.ndim() - 2];
    auto stride_1 = arr.strides()[arr.ndim() - 1];
    if (stride_0 == arr.shape(-1) && stride_1 == 1) {
      return arr;
    } else {
-      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
+      auto arr_cpy = array(arr.shape(), arr.dtype(), nullptr, {});
-      copy_cpu(arr, temps.back(), CopyType::General, s);
+      copy_cpu(arr, arr_cpy, CopyType::General, s);
-      return temps.back();
+      encoder.add_temporary(arr_cpy);
      return arr_cpy;
    }
  };
  auto x = ensure_row_contiguous_last_dims(x_pre);
  auto w = ensure_row_contiguous_last_dims(w_pre);
  auto scales = ensure_row_contiguous_last_dims(scales_pre);
  auto biases = ensure_row_contiguous_last_dims(biases_pre);
  out.set_data(allocator::malloc(out.nbytes()));
  auto& encoder = cpu::get_command_encoder(stream());
  encoder.add_temporaries(std::move(temps));
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(scales);
  encoder.set_input_array(biases);
  encoder.set_input_array(lhs_indices);
  encoder.set_input_array(rhs_indices);
  encoder.set_output_array(out);
-  encoder.dispatch([out = array::unsafe_weak_copy(out),
+  if (mode_ == QuantizationMode::Affine) {
-                    x = array::unsafe_weak_copy(x),
+    auto biases = ensure_row_contiguous_last_dims(inputs[3]);
-                    w = array::unsafe_weak_copy(w),
+    encoder.set_input_array(biases);
-                    scales = array::unsafe_weak_copy(scales),
+    encoder.dispatch([out = array::unsafe_weak_copy(out),
-                    biases = array::unsafe_weak_copy(biases),
+                      x = array::unsafe_weak_copy(x),
-                    lhs_indices = array::unsafe_weak_copy(lhs_indices),
+                      w = array::unsafe_weak_copy(w),
-                    rhs_indices = array::unsafe_weak_copy(rhs_indices),
+                      scales = array::unsafe_weak_copy(scales),
-                    group_size_ = group_size_,
+                      biases = array::unsafe_weak_copy(biases),
-                    bits_ = bits_,
+                      lhs_indices = array::unsafe_weak_copy(lhs_indices),
-                    transpose_ = transpose_]() mutable {
+                      rhs_indices = array::unsafe_weak_copy(rhs_indices),
-    _bs_qmm_dispatch(
+                      group_size_ = group_size_,
-        out,
+                      bits_ = bits_,
-        x,
+                      transpose_ = transpose_]() mutable {
-        w,
+      _bs_qmm_dispatch(
-        scales,
+          out,
-        biases,
+          x,
-        lhs_indices,
+          w,
-        rhs_indices,
+          scales,
-        group_size_,
+          biases,
-        bits_,
+          lhs_indices,
-        transpose_);
+          rhs_indices,
-  });
+          group_size_,
          bits_,
          transpose_);
    });
  } else {
    encoder.dispatch([out = array::unsafe_weak_copy(out),
                      x = array::unsafe_weak_copy(x),
                      w = array::unsafe_weak_copy(w),
                      scales = array::unsafe_weak_copy(scales),
                      lhs_indices = array::unsafe_weak_copy(lhs_indices),
                      rhs_indices = array::unsafe_weak_copy(rhs_indices),
                      transpose_ = transpose_]() mutable {
      mxfp4_bs_qmm_dispatch(
          out, x, w, scales, lhs_indices, rhs_indices, transpose_);
    });
  }
 }
 template <typename T, typename U>
@@ -705,7 +1041,7 @@ void dispatch_quantize(
      w_ptr, out_ptr, scales_ptr, biases_ptr, bits, group_size, w.size());
 }
-void fast::AffineQuantize::eval_cpu(
+void fast::Quantize::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto ensure_row_contiguous = [s = stream()](const array& arr) {
@@ -764,7 +1100,47 @@ void fast::AffineQuantize::eval_cpu(
      }
    } else {
      throw std::runtime_error(
-          "[fast::AffineQuantize::eval_cpu] Only supports floating point inputs");
+          "[fast::Quantize::eval_cpu] Only supports floating point inputs");
    }
  });
 }
 void fast::ConvertFP8::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto& in = inputs[0];
  auto& out = outputs[0];
  set_unary_output_data(in, out);
  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  encoder.dispatch([in = array::unsafe_weak_copy(in),
                    out = array::unsafe_weak_copy(out),
                    to_fp8 = to_fp8_]() mutable {
    if (to_fp8) {
      switch (in.dtype()) {
        case float16:
          unary_op<float16_t, uint8_t>(in, out, detail::ToFP8());
          break;
        case bfloat16:
          unary_op<bfloat16_t, uint8_t>(in, out, detail::ToFP8());
          break;
        default:
          unary_op<float, uint8_t>(in, out, detail::ToFP8());
          break;
      }
    } else {
      switch (out.dtype()) {
        case float16:
          unary_op<uint8_t, float16_t>(in, out, detail::FromFP8());
          break;
        case bfloat16:
          unary_op<uint8_t, bfloat16_t>(in, out, detail::FromFP8());
          break;
        default:
          unary_op<uint8_t, float>(in, out, detail::FromFP8());
          break;
      }
    }
  });
 }
--- a/mlx/backend/cpu/simd/accelerate_simd.h
+++ b/mlx/backend/cpu/simd/accelerate_simd.h
@@ -1,5 +1,6 @@
 #pragma once
 #include <arm_neon.h>
 #include <simd/math.h>
 #include <simd/vector.h>
@@ -9,7 +10,7 @@
 #include "mlx/backend/cpu/simd/base_simd.h"
-// There seems to be a bug in sims/base.h
+// There seems to be a bug in simd/base_simd.h
 // __XROS_2_0 is not defined, the expression evaluates
 // to true instead of false setting the SIMD library
 // higher than it should be even on macOS < 15
@@ -200,6 +201,15 @@ SIMD_DEFAULT_COMPARISONS(<=)
 SIMD_DEFAULT_COMPARISONS(==)
 SIMD_DEFAULT_COMPARISONS(!=)
 // Per-lane count of leading zero bits. The vector is processed as two
 // uint32x4_t halves with vclzq_u32 and reassembled with asd::make_uint8.
 // NOTE(review): the body reads x as eight 32-bit lanes regardless of T and
 // N - presumably only instantiated for Simd<uint32_t, 8>; confirm.
 template <typename T, int N>
 Simd<T, N> clz(Simd<T, N> x) {
  auto halves = (uint32x4_t*)(&x);
  auto lo = halves[0];
  auto hi = halves[1];
  lo = vclzq_u32(lo);
  hi = vclzq_u32(hi);
  return asd::make_uint8(lo, hi);
 }
 template <typename T, int N>
 Simd<T, N> atan2(Simd<T, N> a, Simd<T, N> b) {
  return asd::atan2(a.value, b.value);
@@ -234,6 +244,7 @@ Simd<T, N> remainder(Simd<T, N> a, Simd<T, N> b) {
 template <typename MaskT, typename T1, typename T2, int N>
 Simd<T1, N> select(Simd<MaskT, N> mask, Simd<T1, N> x, Simd<T2, N> y) {
  static_assert(std::is_same_v<MaskT, bool>);
  if constexpr (sizeof(T1) == 1) {
    return asd::bitselect(y.value, x.value, asd::convert<char>(mask.value));
  } else if constexpr (sizeof(T1) == 2) {
@@ -251,9 +262,13 @@ Simd<T, N> pow(Simd<T, N> base, Simd<T, N> exp) {
    return asd::pow(base.value, exp.value);
  } else {
    Simd<T, N> res = 1;
-    while (any(exp)) {
+    // Raising an integer to a negative power is undefined
-      res = select(exp & 1, res * base, res);
+    if (any(exp < 0)) {
-      base = select(exp, base * base, base);
+      return 0;
    }
    while (any(exp > 0)) {
      res = select((exp & 1) != 0, res * base, res);
      base = select(exp > 0, base * base, base);
      exp = exp >> 1;
    }
    return res;
--- a/mlx/backend/cpu/simd/base_simd.h
+++ b/mlx/backend/cpu/simd/base_simd.h
@@ -171,6 +171,11 @@ DEFAULT_BINARY(&)
 DEFAULT_BINARY(&&)
 DEFAULT_BINARY(||)
 // Scalar fallback: number of leading zero bits of the value, counted on an
 // `unsigned int` (the operand type of __builtin_clz).
 //
 // __builtin_clz has an undefined result for a zero argument, and callers
 // such as the FP8 decoder do pass zero. Return the full bit width in that
 // case, which is also what the NEON vclzq_u32 path produces.
 template <typename T>
 Simd<T, 1> clz(Simd<T, 1> x_) {
  if (x_.value == 0) {
    return static_cast<int>(sizeof(unsigned int) * 8);
  }
  return __builtin_clz(x_.value);
 }
 template <typename T>
 Simd<T, 1> remainder(Simd<T, 1> a_, Simd<T, 1> b_) {
  T a = a_.value;
--- a/mlx/backend/cpu/sort.cpp
+++ b/mlx/backend/cpu/sort.cpp
@@ -15,6 +15,18 @@ namespace mlx::core {
 namespace {
 // NaN-aware comparator that places NaNs at the end
 template <typename T>
 bool nan_aware_less(T a, T b) {
  // The NaN checks are only compiled for std floating-point types and
  // complex64_t; every other T falls straight through to operator<.
  constexpr bool check_nan =
      std::is_floating_point_v<T> || std::is_same_v<T, complex64_t>;
  if constexpr (check_nan) {
    // A NaN is never "less" than anything, so it sorts after all other values.
    if (std::isnan(a)) {
      return false;
    }
    // Any non-NaN value is "less" than a NaN.
    if (std::isnan(b)) {
      return true;
    }
  }
  return a < b;
 }
 template <typename T>
 struct StridedIterator {
  using iterator_category = std::random_access_iterator_tag;
@@ -27,7 +39,7 @@ struct StridedIterator {
  StridedIterator() = default;
  explicit StridedIterator(T* ptr, int64_t stride, difference_type offset = 0)
-      : ptr_(ptr + offset * stride), stride_(stride) {}
+      : stride_(stride), ptr_(ptr + offset * stride) {}
  explicit StridedIterator(array& arr, int axis, difference_type offset = 0)
      : StridedIterator(arr.data<T>(), arr.strides()[axis], offset) {}
@@ -130,7 +142,7 @@ void sort(array& out, int axis) {
    StridedIterator st(data_ptr, axis_stride, 0);
    StridedIterator ed(data_ptr, axis_stride, axis_size);
-    std::stable_sort(st, ed);
+    std::stable_sort(st, ed, nan_aware_less<T>);
    src_it.step();
  }
 }
@@ -184,6 +196,15 @@ void argsort(const array& in, array& out, int axis) {
    std::stable_sort(st, ed, [data_ptr, in_stride](IdxT a, IdxT b) {
      auto v1 = data_ptr[a * in_stride];
      auto v2 = data_ptr[b * in_stride];
      // Handle NaNs (place them at the end)
      if (std::is_floating_point<T>::value) {
        if (std::isnan(v1))
          return false;
        if (std::isnan(v2))
          return true;
      }
      return v1 < v2 || (v1 == v2 && a < b);
    });
  }
@@ -219,7 +240,7 @@ void partition(array& out, int axis, int kth) {
    StridedIterator md(data_ptr, axis_stride, kth);
    StridedIterator ed(data_ptr, axis_stride, axis_size);
-    std::nth_element(st, md, ed);
+    std::nth_element(st, md, ed, nan_aware_less<T>);
  }
 }
@@ -276,6 +297,15 @@ void argpartition(const array& in, array& out, int axis, int kth) {
    std::nth_element(st, md, ed, [data_ptr, in_stride](IdxT a, IdxT b) {
      auto v1 = data_ptr[a * in_stride];
      auto v2 = data_ptr[b * in_stride];
      // Handle NaNs (place them at the end)
      if (std::is_floating_point<T>::value) {
        if (std::isnan(v1))
          return false;
        if (std::isnan(v2))
          return true;
      }
      return v1 < v2 || (v1 == v2 && a < b);
    });
  }
--- a/mlx/backend/cpu/svd.cpp
+++ b/mlx/backend/cpu/svd.cpp
@@ -83,8 +83,6 @@ void svd_impl(
    auto jobz = (u_ptr) ? "A" : "N";
    // Will contain the number of singular values after the call has returned.
    int ns = 0;
    T workspace_dimension = 0;
    // Will contain the indices of eigenvectors that failed to converge (not
--- a/mlx/backend/cpu/unary.h
+++ b/mlx/backend/cpu/unary.h
@@ -24,9 +24,9 @@ void unary_op(const array& a, array& out, Op) {
  auto ndim = a.ndim();
  if (a.flags().contiguous) {
    auto size = a.data_size();
-    constexpr int N = simd::max_size<T>;
+    constexpr int N = std::min(simd::max_size<T>, simd::max_size<U>);
    while (size >= N) {
-      simd::store(dst, Op{}(simd::load<T, N>(src)));
+      simd::store(dst, simd::Simd<U, N>(Op{}(simd::load<T, N>(src))));
      size -= N;
      src += N;
      dst += N;
--- a/mlx/backend/cpu/unary_ops.h
+++ b/mlx/backend/cpu/unary_ops.h
@@ -77,7 +77,8 @@ struct Real {
 struct Sigmoid {
  template <int N, typename T>
  Simd<T, N> operator()(Simd<T, N> x) {
-    return 1.0f / (1.0f + simd::exp(-x));
+    auto y = 1.0f / (1.0f + simd::exp(simd::abs(x)));
    return simd::select(x < Simd<T, N>{0}, y, Simd<T, N>{1} - y);
  }
  SINGLE()
 };
@@ -107,4 +108,73 @@ struct Square {
  SINGLE()
 };
 // Reinterprets the lanes of a uint32 vector as float lanes (no value
 // conversion; the bits are kept as-is).
 template <int N>
 Simd<float, N> fp32_from_bits(Simd<uint32_t, N> x) {
  auto* as_float = reinterpret_cast<Simd<float, N>*>(&x);
  return *as_float;
 }
 // Reinterprets the lanes of a float vector as uint32 lanes (no value
 // conversion; the bits are kept as-is).
 template <int N>
 Simd<uint32_t, N> fp32_to_bits(Simd<float, N> x) {
  auto* as_bits = reinterpret_cast<Simd<uint32_t, N>*>(&x);
  return *as_bits;
 }
 // Functor converting floating-point lanes to an 8-bit float encoding
 // (1 sign bit in bit 7, remaining 7 bits taken from the rebiased float32
 // exponent/mantissa). Works entirely on the float32 bit pattern.
 // NOTE(review): the constants (exponent bias 7, 3 kept mantissa bits,
 // saturation code 0x7E) look like FP8 E4M3 - confirm against the format spec.
 struct ToFP8 {
  template <typename T, int N>
  Simd<uint8_t, N> operator()(Simd<T, N> f) {
    // 543 << 21 == 0x43E00000, the float32 bit pattern of 448.0; magnitudes
    // at or above it are saturated below.
    uint32_t fp8_max = 543 << 21;
    // Float32 pattern with biased exponent 141 and zero mantissa; adding it
    // as a float makes the FPU round small inputs to the target precision.
    auto denorm_mask = Simd<uint32_t, N>(141 << 23);
    Simd<uint32_t, N> f_bits;
    // Convert the input lanes to float32 before inspecting the bits.
    Simd<float, N> f32 = f;
    f_bits = fp32_to_bits(f32);
    Simd<uint8_t, N> result = 0u;
    // Split off the sign; f_bits holds the magnitude from here on.
    auto sign = f_bits & 0x80000000;
    f_bits = f_bits ^ sign;
    // Small-magnitude path: float add of the magic constant, then subtract
    // its bit pattern to leave the low-order result bits.
    auto f_bits_low =
        fp32_to_bits(fp32_from_bits(f_bits) + fp32_from_bits(denorm_mask));
    auto result_low = Simd<uint8_t, N>(f_bits_low - denorm_mask);
    // Regular path: rebias the exponent from 127 to 7 (unsigned wrap-around
    // is intended), add a rounding bias of 0x7FFFF plus the lowest kept
    // mantissa bit (presumably round-half-to-even), then keep the bits
    // above bit 20.
    auto mant_odd = Simd<uint8_t, N>((f_bits >> 20) & 1);
    auto f_bits_high = f_bits + (((uint32_t)(7 - 127) << 23) + 0x7FFFF);
    f_bits_high = f_bits_high + Simd<uint32_t, N>(mant_odd);
    auto result_high = Simd<uint8_t, N>(f_bits_high >> 20);
    // Magnitudes whose biased float32 exponent is below 121 use the
    // small-magnitude result.
    result = select(f_bits < (121 << 23), result_low, result_high);
    // Saturate: every magnitude pattern >= fp8_max (this includes the
    // float32 inf/NaN patterns) becomes 0x7E.
    auto result_sat = Simd<uint8_t, N>(0x7E);
    result = select(f_bits >= fp8_max, result_sat, result);
    // Move the float32 sign bit (bit 31) down to bit 7 of the result.
    return result | Simd<uint8_t, N>(sign >> 24);
  }
  // Scalar overload: wraps the value in a 1-lane Simd and reuses the vector
  // implementation.
  template <typename T>
  uint8_t operator()(T x) {
    return (*this)(Simd<T, 1>(x)).value;
  }
 };
 // Functor decoding 8-bit float codes into float32 lanes by constructing the
 // float32 bit pattern directly.
 // NOTE(review): the layout assumed here (1 sign, 4 exponent, 3 mantissa
 // bits, exponent bias 7) looks like FP8 E4M3 - confirm against the format
 // spec.
 struct FromFP8 {
  template <int N>
  Simd<float, N> operator()(Simd<uint8_t, N> x) {
    // Place the 8-bit code in the top byte of a 32-bit word.
    auto w = Simd<uint32_t, N>(x) << 24;
    auto sign = w & 0x80000000;
    auto nonsign = w & 0x7FFFFFFF;
    // Leading zeros beyond the first 4 indicate a zero exponent field; the
    // excess is the left shift needed to renormalize the mantissa.
    auto renorm_shift = clz(nonsign);
    renorm_shift = simd::select(
        renorm_shift > Simd<uint32_t, N>{4},
        renorm_shift - Simd<uint32_t, N>{4},
        Simd<uint32_t, N>{0});
    // All float32 exponent bits set when nonsign + 0x01000000 carries into
    // the sign bit (i.e. nonsign == 0x7F000000), otherwise zero.
    Simd<int32_t, N> inf_nan_mask =
        (Simd<int32_t, N>(nonsign + 0x01000000) >> 8) & 0x7F800000;
    // All ones when nonsign == 0 (arithmetic shift of -1), otherwise zero.
    auto zero_mask = Simd<int32_t, N>(nonsign - 1) >> 31;
    // Shift exponent/mantissa into their float32 positions, rebias the
    // exponent by 0x78 (120 == 127 - 7) corrected for the renormalization
    // shift, force the exponent for the inf/NaN case, and clear everything
    // but the sign for zero inputs.
    auto result = sign |
        ((((nonsign << renorm_shift >> 4) + ((0x78 - renorm_shift) << 23)) |
          inf_nan_mask) &
         ~zero_mask);
    return fp32_from_bits(result);
  }
  // Scalar overload: wraps the byte in a 1-lane Simd and reuses the vector
  // implementation.
  float operator()(uint8_t x) {
    return (*this)(Simd<uint8_t, 1>(x)).value;
  }
 };
 } // namespace mlx::core::detail
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -22,6 +22,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/cudnn_utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/custom_kernel.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/event.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp
@@ -50,12 +51,19 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/fp_quantize.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/convert_fp8.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/binary)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unary)
 # fp4 is not available on < 12.8
 if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8.0)
  target_include_directories(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/quantized/)
 endif()
 if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9.0)
  target_sources(
    mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm_batched_12_9.cu)
@@ -169,7 +177,6 @@ target_link_libraries(mlx PRIVATE CUDNN::cudnn_all)
 # Suppress nvcc warnings on MLX headers.
 target_compile_options(mlx PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe
                                   --diag_suppress=997>)
 # Install CCCL headers for JIT.
 install(DIRECTORY ${cccl_SOURCE_DIR}/include/cuda
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cccl)
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -30,8 +30,20 @@ SmallSizePool::SmallSizePool() {
  next_free_ = buffer_;
  CHECK_CUDA_ERROR(cudaMallocManaged(&data_, small_pool_size));
-  CHECK_CUDA_ERROR(
+
-      cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetReadMostly, 0));
+  int device_count = 0;
  CHECK_CUDA_ERROR(cudaGetDeviceCount(&device_count));
  for (int i = 0; i < device_count; ++i) {
 #if CUDART_VERSION >= 13000
    cudaMemLocation loc;
    loc.type = cudaMemLocationTypeDevice;
    loc.id = i;
 #else
    int loc = i;
 #endif // CUDART_VERSION >= 13000
    CHECK_CUDA_ERROR(
        cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetAccessedBy, loc));
  }
  auto curr = next_free_;
  for (size_t i = 1; i < num_blocks; ++i) {
@@ -79,13 +91,12 @@ CudaAllocator::CudaAllocator()
  // TODO: Set memory limit for multi-device.
  size_t free, total;
  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total));
-  memory_limit_ = total * 0.8;
+  memory_limit_ = total * 0.95;
  max_pool_size_ = memory_limit_;
 }
 Buffer CudaAllocator::malloc(size_t size) {
  // Find available buffer from cache.
  auto orig_size = size;
  std::unique_lock lock(mutex_);
  if (size <= small_block_size) {
    size = 8;
@@ -119,7 +130,7 @@ Buffer CudaAllocator::malloc(size_t size) {
    }
    lock.lock();
  }
-  active_memory_ += size;
+  active_memory_ += buf->size;
  peak_memory_ = std::max(active_memory_, peak_memory_);
  // Maintain the cache below the requested limit.
--- a/mlx/backend/cuda/arange.cu
+++ b/mlx/backend/cuda/arange.cu
@@ -6,23 +6,33 @@
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"
 #include <cooperative_groups.h>
 #include <nvtx3/nvtx3.hpp>
 #include <thrust/device_ptr.h>
 #include <thrust/transform.h>
 namespace mlx::core {
 namespace cu {
-template <typename T>
+namespace cg = cooperative_groups;
 struct Arange {
  const T start;
  const T step;
-  __device__ T operator()(uint32_t i) const {
+template <typename T, typename IdxT, int N_WRITES>
-    return start + i * step;
+__global__ void arange(T* out, IdxT size, T start, T step) {
  IdxT index = cg::this_grid().thread_rank();
  if ((index + 1) * N_WRITES > size) {
    for (IdxT i = index * N_WRITES; i < size; ++i) {
      out[i] = start + i * step;
    }
  } else {
    AlignedVector<T, N_WRITES> out_vec;
 #pragma unroll
    for (int i = 0; i < N_WRITES; ++i) {
      out_vec[i] = start + (index * N_WRITES + i) * step;
    }
    store_vector<N_WRITES>(out, index, out_vec);
  }
-};
+}
 } // namespace cu
@@ -36,19 +46,23 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(stream());
  encoder.set_output_array(out);
  auto capture = encoder.capture_context();
  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
    using CTYPE = MLX_GET_TYPE(type_tag);
    using OutType = cuda_type_t<CTYPE>;
-    CTYPE step =
+    constexpr int N_WRITES = 16 / sizeof(OutType);
-        static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
+    dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
-    thrust::transform(
+      using IdxT = std::conditional_t<large(), int64_t, int32_t>;
-        cu::thrust_policy(encoder.stream()),
+      auto [num_blocks, block_dims] = get_launch_args(out, large(), N_WRITES);
-        thrust::counting_iterator<uint32_t>(0),
+      encoder.add_kernel_node(
-        thrust::counting_iterator<uint32_t>(out.data_size()),
+          cu::arange<OutType, IdxT, N_WRITES>,
-        thrust::device_pointer_cast(out.data<OutType>()),
+          num_blocks,
-        cu::Arange<OutType>{
+          block_dims,
-            static_cast<OutType>(start_), static_cast<OutType>(step)});
+          0,
          out.data<OutType>(),
          out.data_size(),
          static_cast<CTYPE>(start_),
          static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_));
    });
  });
 }
--- a/mlx/backend/cuda/compiled.cpp
+++ b/mlx/backend/cuda/compiled.cpp
@@ -332,9 +332,9 @@ void Compiled::eval_gpu(
    encoder.set_output_array(out);
  }
-  auto kernel = mod.get_kernel(kernel_name);
+  auto [kernel, max_block_dims] = mod.get_kernel_and_dims(kernel_name);
  auto [num_blocks, block_dims] =
-      get_launch_args(outputs[0], large, work_per_thread);
+      get_launch_args(outputs[0], large, work_per_thread, max_block_dims);
  encoder.add_kernel_node(kernel, num_blocks, block_dims, 0, args.args());
 }
--- a/mlx/backend/cuda/conv.cpp
+++ b/mlx/backend/cuda/conv.cpp
@@ -47,7 +47,7 @@ auto& conv_cache() {
      std::pair<
          cudnnBackendDescriptorType_t,
          std::optional<cudnn_frontend::ExecutionPlan>>>
-      cache(/* capacity */ 128);
+      cache("MLX_CUDA_CONV_CACHE_SIZE", /* default_capacity */ 128);
  return cache;
 }
@@ -382,20 +382,19 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
  }
  if (op_graph) {
    // Setup inputs and outputs.
    register_args(encoder, backend_type, in, wt, out, out_);
    // Find a plan for the graph and execute it.
    auto plan = find_cudnn_plan_from_op_graph(
        encoder.device().cudnn_handle(), backend_type, dtype, *op_graph);
-    if (!plan) {
+    if (plan) {
-      throw std::runtime_error("[conv] Unable to find an execution plan.");
+      // Setup inputs and outputs.
-    }
+      register_args(encoder, backend_type, in, wt, out, out_);
-    auto [x, w, y] = dispatch_args(backend_type, in, wt, out);
+
-    if (encode_cudnn_plan(encoder, *plan, {'x', 'w', 'y'}, x, w, y)) {
+      auto [x, w, y] = dispatch_args(backend_type, in, wt, out);
-      conv_cache().emplace(
+      if (encode_cudnn_plan(encoder, *plan, {'x', 'w', 'y'}, x, w, y)) {
-          cache_key, std::make_pair(backend_type, std::move(*plan)));
+        conv_cache().emplace(
-      return;
+            cache_key, std::make_pair(backend_type, std::move(*plan)));
        return;
      }
    }
  }
--- a/mlx/backend/cuda/copy.cu
+++ b/mlx/backend/cuda/copy.cu
@@ -15,8 +15,8 @@ void copy_gpu_inplace(
    int64_t offset_out,
    CopyType ctype,
    const Stream& s,
-    const std::optional<array>& dynamic_offset_in,
+    std::optional<array> dynamic_offset_in,
-    const std::optional<array>& dynamic_offset_out) {
+    std::optional<array> dynamic_offset_out) {
  if (out.size() == 0) {
    return;
  }
@@ -44,6 +44,16 @@ void copy_gpu_inplace(
          strides_vec[0]);
    } else {
      if (dynamic_offset_in || dynamic_offset_out) {
        if (!dynamic_offset_in) {
          dynamic_offset_in = array(0, int64);
          encoder.add_temporary(*dynamic_offset_in);
        }
        if (!dynamic_offset_out) {
          dynamic_offset_out = array(0, int64);
          encoder.add_temporary(*dynamic_offset_out);
        }
        encoder.set_input_array(*dynamic_offset_in);
        encoder.set_input_array(*dynamic_offset_out);
        copy_general_dynamic(
            encoder,
            ctype,
@@ -54,8 +64,8 @@ void copy_gpu_inplace(
            shape_collapsed,
            strides_vec[0],
            strides_vec[1],
-            dynamic_offset_in ? *dynamic_offset_in : array(0, int64),
+            *dynamic_offset_in,
-            dynamic_offset_out ? *dynamic_offset_out : array(0, int64));
+            *dynamic_offset_out);
      } else {
        copy_general(
            encoder,
--- a/mlx/backend/cuda/cudnn_utils.cpp
+++ b/mlx/backend/cuda/cudnn_utils.cpp
@@ -210,6 +210,9 @@ std::optional<cudnn_frontend::ExecutionPlan> find_cudnn_plan_from_op_graph(
    Dtype dtype,
    cudnn_frontend::OperationGraph& op_graph) {
  auto engine_configs = get_cudnn_engine_configs(backend_type, dtype, op_graph);
  if (engine_configs.empty()) {
    return std::nullopt;
  }
  return find_cudnn_plan_from_engine_configs(handle, engine_configs, op_graph);
 }
--- a/mlx/backend/cuda/device.cpp
+++ b/mlx/backend/cuda/device.cpp
@@ -14,10 +14,6 @@ namespace mlx::core::cu {
 namespace {
 // Can be tuned with MLX_MAX_OPS_PER_BUFFER
 // This should be less than 255
 constexpr int default_max_nodes_per_graph = 20;
 #define CHECK_CUDNN_ERROR(cmd) check_cudnn_error(#cmd, (cmd))
 void check_cudnn_error(const char* name, cudnnStatus_t err) {
@@ -27,11 +23,11 @@ void check_cudnn_error(const char* name, cudnnStatus_t err) {
  }
 }
-int cuda_graph_cache_size() {
+bool use_cuda_graphs() {
-  static int cache_size = []() {
+  static bool use_graphs = []() {
-    return env::get_var("MLX_CUDA_GRAPH_CACHE_SIZE", 100);
+    return env::get_var("MLX_USE_CUDA_GRAPHS", true);
  }();
-  return cache_size;
+  return use_graphs;
 }
 } // namespace
@@ -68,8 +64,8 @@ Device::~Device() {
 void Device::make_current() {
  // We need to set/get current CUDA device very frequently, cache it to reduce
-  // actual calls of CUDA APIs. This function assumes single-thread in host.
+  // actual calls of CUDA APIs.
-  static int current = 0;
+  static thread_local int current = 0;
  if (current != device_) {
    CHECK_CUDA_ERROR(cudaSetDevice(device_));
    current = device_;
@@ -86,11 +82,19 @@ CommandEncoder& Device::get_command_encoder(Stream s) {
 CommandEncoder::CaptureContext::CaptureContext(CommandEncoder& enc) : enc(enc) {
  // The encoder's device is made current unconditionally; stream capture is
  // only started when CUDA graphs are enabled.
  enc.device().make_current();
  if (use_cuda_graphs()) {
    CHECK_CUDA_ERROR(
        cudaStreamBeginCapture(enc.stream(), cudaStreamCaptureModeGlobal));
  }
 }
 CommandEncoder::CaptureContext::~CaptureContext() {
  if (!use_cuda_graphs()) {
    enc.node_count_++;
    return;
  }
  graph.end_capture(enc.stream());
  if (discard) {
    return;
@@ -105,6 +109,9 @@ CommandEncoder::ConcurrentContext::ConcurrentContext(CommandEncoder& enc)
 CommandEncoder::ConcurrentContext::~ConcurrentContext() {
  enc.in_concurrent_ = false;
  if (!use_cuda_graphs()) {
    return;
  }
  // Use an empty graph node for synchronization
  CommandEncoder::GraphNode empty{NULL, 'E', std::to_string(enc.node_count_++)};
@@ -186,35 +193,43 @@ CommandEncoder::CommandEncoder(Device& d)
    : device_(d),
      stream_(d),
      graph_(d),
-      graph_cache_(cuda_graph_cache_size()) {}
+      worker_(d),
      graph_cache_("MLX_CUDA_GRAPH_CACHE_SIZE", /* default_capacity */ 400) {}
 // Queues `task` on worker_ (the callable is moved in, not copied).
 // NOTE(review): presumably the task runs once the committed GPU work has
 // completed - confirm against the Worker implementation.
 void CommandEncoder::add_completed_handler(std::function<void()> task) {
  worker_.add_task(std::move(task));
 }
 void CommandEncoder::set_input_array(const array& arr) {
  // Dependency tracking is only needed while building CUDA graphs. The
  // array's buffer pointer value serves as the dependency id.
  if (use_cuda_graphs()) {
    active_deps_.push_back(
        reinterpret_cast<std::uintptr_t>(arr.buffer().ptr()));
  }
 }
 void CommandEncoder::set_output_array(const array& arr) {
  // Outputs are recorded both as a dependency and as an output of the node
  // being encoded; nothing is tracked when CUDA graphs are disabled.
  if (use_cuda_graphs()) {
    const auto buffer_id =
        reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
    active_deps_.push_back(buffer_id);
    active_outputs_.push_back(buffer_id);
  }
 }
 void CommandEncoder::maybe_commit() {
  // Commit once the pending node count reaches the configured per-buffer cap.
  const auto node_limit = env::max_ops_per_buffer(default_max_nodes_per_graph);
  if (node_count_ < node_limit) {
    return;
  }
  commit();
 }
 void CommandEncoder::add_kernel_node(
    void* func,
    dim3 grid_dim,
    dim3 block_dim,
    uint32_t smem_bytes,
    void** params) {
  if (!use_cuda_graphs()) {
    node_count_++;
    CHECK_CUDA_ERROR(cudaLaunchKernel(
        func, grid_dim, block_dim, params, smem_bytes, stream()));
    return;
  }
  cudaKernelNodeParams kernel_params = {0};
  kernel_params.func = func;
  kernel_params.gridDim = grid_dim;
@@ -230,6 +245,23 @@ void CommandEncoder::add_kernel_node(
    dim3 block_dim,
    uint32_t smem_bytes,
    void** params) {
  if (!use_cuda_graphs()) {
    node_count_++;
    CHECK_CUDA_ERROR(cuLaunchKernel(
        func,
        grid_dim.x,
        grid_dim.y,
        grid_dim.z,
        block_dim.x,
        block_dim.y,
        block_dim.z,
        smem_bytes,
        stream(),
        params,
        nullptr));
    return;
  }
  CUDA_KERNEL_NODE_PARAMS kernel_params = {0};
  kernel_params.func = func;
  kernel_params.gridDimX = grid_dim.x;
@@ -256,20 +288,38 @@ void CommandEncoder::add_kernel_node(const CUDA_KERNEL_NODE_PARAMS& params) {
 }
 void CommandEncoder::add_graph_node(cudaGraph_t child) {
  if (use_cuda_graphs()) {
    // Graph mode: embed `child` as a child-graph node of the graph being built
    // and register it as a 'G' node for dependency tracking.
    cudaGraphNode_t child_node;
    CHECK_CUDA_ERROR(
        cudaGraphAddChildGraphNode(&child_node, graph_, NULL, 0, child));
    insert_graph_dependencies(GraphNode{child_node, 'G'});
  } else {
    // Graphs disabled: count the op, then instantiate and launch `child`
    // directly on the encoder's stream.
    ++node_count_;
    CudaGraphExec child_exec;
    child_exec.instantiate(child);
    device_.make_current();
    CHECK_CUDA_ERROR(cudaGraphLaunch(child_exec, stream()));
  }
 }
 // Returns node_count_, the number of ops counted since commit() last reset it
 // to zero.
 int CommandEncoder::get_num_ops() {
  return node_count_;
 }
 void CommandEncoder::commit() {
  nvtx3::scoped_range r("CommandEncoder::commit");
  if (!temporaries_.empty()) {
    add_completed_handler([temporaries = std::move(temporaries_)]() {});
  }
-  if (node_count_ > 0) {
+  if (use_cuda_graphs() && node_count_ > 0) {
    if (!from_nodes_.empty()) {
      CHECK_CUDA_ERROR(cudaGraphAddDependencies(
-          graph_, from_nodes_.data(), to_nodes_.data(), from_nodes_.size()));
+          graph_,
          from_nodes_.data(),
          to_nodes_.data(),
 #if CUDART_VERSION >= 13000
          nullptr, // edgeData
 #endif // CUDART_VERSION >= 13000
          from_nodes_.size()));
    }
    graph_key_ += ".";
@@ -303,7 +353,6 @@ void CommandEncoder::commit() {
    CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
    // Reset state
    node_count_ = 0;
    graph_node_count_ = 0;
    empty_node_count_ = 0;
    from_nodes_.clear();
@@ -315,6 +364,7 @@ void CommandEncoder::commit() {
  // Put completion handlers in a batch.
  worker_.commit(stream_);
  node_count_ = 0;
 }
 void CommandEncoder::synchronize() {
--- a/mlx/backend/cuda/device.h
+++ b/mlx/backend/cuda/device.h
@@ -76,9 +76,6 @@ class CommandEncoder {
      uint32_t smem_bytes,
      void** params);
  // Low-level graph helpers.
  void add_kernel_node(const cudaKernelNodeParams& params);
  void add_kernel_node(const CUDA_KERNEL_NODE_PARAMS& params);
  void add_graph_node(cudaGraph_t child);
  void add_temporary(const array& arr) {
@@ -86,7 +83,7 @@ class CommandEncoder {
  }
  void add_completed_handler(std::function<void()> task);
-  void maybe_commit();
+  int get_num_ops();
  void commit();
  Device& device() {
@@ -101,6 +98,9 @@ class CommandEncoder {
  void synchronize();
 private:
  void add_kernel_node(const cudaKernelNodeParams& params);
  void add_kernel_node(const CUDA_KERNEL_NODE_PARAMS& params);
  struct GraphNode {
    cudaGraphNode_t node;
    // K = kernel
@@ -140,7 +140,7 @@ class Device {
  Device(const Device&) = delete;
  Device& operator=(const Device&) = delete;
-  // Make this device the current cuda device, required by some cuda calls.
+  // Make this device the current cuda device, this method is thread-safe.
  void make_current();
  CommandEncoder& get_command_encoder(Stream s);
--- a/mlx/backend/cuda/device/binary_ops.cuh
+++ b/mlx/backend/cuda/device/binary_ops.cuh
@@ -204,6 +204,12 @@ struct Power {
  __device__ T operator()(T base, T exp) {
    if constexpr (cuda::std::is_integral_v<T>) {
      T res = 1;
      // Raising an integer to a negative power is undefined
      if constexpr (cuda::std::is_signed_v<T>) {
        if (exp < 0) {
          return 0;
        }
      }
      while (exp) {
        if (exp & 1) {
          res *= base;
--- a/mlx/backend/cuda/device/cast_op.cuh
+++ b/mlx/backend/cuda/device/cast_op.cuh
@@ -6,7 +6,6 @@
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #include <thrust/iterator/transform_iterator.h>
 namespace mlx::core::cu {
@@ -116,15 +115,4 @@ inline __host__ __device__ auto cast_to(SrcT x) {
  return CastOp<SrcT, DstT>{}(x);
 }
 // Returns an iterator whose elements are converted to DstT with CastOp. When
 // the iterator's value type already is DstT, `it` is returned unchanged.
 template <typename DstT, typename Iterator>
 inline __host__ __device__ auto make_cast_iterator(Iterator it) {
  using ValueT = typename cuda::std::iterator_traits<Iterator>::value_type;
  constexpr bool needs_cast = !std::is_same_v<ValueT, DstT>;
  if constexpr (needs_cast) {
    return thrust::make_transform_iterator(it, CastOp<ValueT, DstT>{});
  } else {
    return it;
  }
 }
 } // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/unary_ops.cuh
+++ b/mlx/backend/cuda/device/unary_ops.cuh
@@ -2,6 +2,8 @@
 #pragma once
 #include <cuda_fp8.h>
 #include "mlx/backend/cuda/device/fp16_math.cuh"
 #include "mlx/backend/cuda/device/utils.cuh"
@@ -257,8 +259,8 @@ struct Round {
 struct Sigmoid {
  template <typename T>
  __device__ T operator()(T x) {
-    T y = 1 / (1 + exp(-abs(x)));
+    T y = 1 / (1 + exp(abs(x)));
-    return (x < 0) ? 1 - y : y;
+    return (x < 0) ? y : 1 - y;
  }
 };
@@ -334,4 +336,17 @@ struct Tanh {
  }
 };
 struct ToFP8 {
  // Converts `x` to an FP8 E4M3 value and returns its raw storage byte.
  template <typename T>
  __device__ uint8_t operator()(T x) {
    const __nv_fp8_e4m3 encoded(x);
    return encoded.__x;
  }
 };
 struct FromFP8 {
  // Interprets `x` as the raw storage byte of an FP8 E4M3 value and converts
  // it to float.
  //
  // The byte is written into the type's public `__x` storage member (the same
  // member ToFP8 reads) instead of reinterpreting `&x` as a __nv_fp8_e4m3
  // pointer, which would read a uint8_t object through an unrelated type.
  __device__ float operator()(uint8_t x) {
    __nv_fp8_e4m3 decoded;
    decoded.__x = x;
    return float(decoded);
  }
 };
 } // namespace mlx::core::cu
--- a/mlx/backend/cuda/device/utils.cuh
+++ b/mlx/backend/cuda/device/utils.cuh
@@ -1,6 +1,6 @@
 // Copyright © 2025 Apple Inc.
-// This file must not include any host-only code, utilies that work under both
+// This file must not include any host-only code, utilities that work under both
 // host and device can be put here.
 //
 // See more about the requirements at:
@@ -202,7 +202,7 @@ struct Limits<
  }
 };
-// CUDA 11 does not have host side arithmatic operators for half types.
+// CUDA 11 does not have host side arithmetic operators for half types.
 template <typename T>
 struct Limits<
    T,
--- a/mlx/backend/cuda/distributed.cu
+++ b/mlx/backend/cuda/distributed.cu
@@ -0,0 +1,56 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/backend/gpu/copy.h"
 #include "mlx/distributed/primitives.h"
 #include "mlx/primitives.h"
 #include <cassert>
 namespace mlx::core::distributed {
 void AllReduce::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 1);
  assert(outputs.size() == 1);
  // Picks the arrays the reduction reads from and writes to:
  //  - non row-contiguous input: copy it into the output and reduce in place;
  //  - donatable input: let the output share the input's buffer;
  //  - otherwise: allocate a fresh output buffer.
  auto prepare_buffers = [copy_stream = stream()](
                             const array& src,
                             array& dst) -> std::pair<array, array> {
    if (!src.flags().row_contiguous) {
      copy_gpu(src, dst, CopyType::General, copy_stream);
      return {dst, dst};
    }
    if (src.is_donatable()) {
      dst.copy_shared_buffer(src);
    } else {
      dst.set_data(allocator::malloc(dst.nbytes()));
    }
    return {src, dst};
  };
  auto [reduce_in, reduce_out] = prepare_buffers(inputs[0], outputs[0]);
  auto& encoder = cu::get_command_encoder(stream());
  encoder.set_input_array(reduce_in);
  encoder.set_output_array(reduce_out);
  // The collective below is issued while this capture context is alive.
  auto capture = encoder.capture_context();
  auto& reduce_stream = stream();
  switch (reduce_type_) {
    case Sum:
      distributed::detail::all_sum(
          group(), reduce_in, reduce_out, reduce_stream);
      break;
    case Max:
      distributed::detail::all_max(
          group(), reduce_in, reduce_out, reduce_stream);
      break;
    case Min:
      distributed::detail::all_min(
          group(), reduce_in, reduce_out, reduce_stream);
      break;
    default:
      throw std::runtime_error(
          "Only all reduce sum, max, and min are supported.");
  }
 }
 } // namespace mlx::core::distributed
--- a/mlx/backend/cuda/eval.cpp
+++ b/mlx/backend/cuda/eval.cpp
@@ -5,18 +5,24 @@
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/gpu/available.h"
 #include "mlx/primitives.h"
 #include "mlx/scheduler.h"
 #include <nvtx3/nvtx3.hpp>
 namespace mlx::core::gpu {
 // Can be tuned with MLX_MAX_OPS_PER_BUFFER
 constexpr int default_max_nodes_per_graph = 20;
 // Unconditionally returns true; no runtime device probing is done here.
 bool is_available() {
  return true;
 }
 void new_stream(Stream s) {
-  // Force initalization of cuda, so cuda runtime get destroyed at last.
+  // Force initialization of CUDA, so the CUDA runtime gets destroyed last.
  cudaFree(nullptr);
  // Make sure the CUDA event pool gets destroyed after device and stream.
  cu::CudaEvent::init_pool();
  // Ensure the static stream objects get created.
  cu::get_command_encoder(s);
 }
@@ -34,7 +40,8 @@ void eval(array& arr) {
    arr.primitive().eval_gpu(arr.inputs(), outputs);
  }
-  auto& encoder = cu::get_command_encoder(arr.primitive().stream());
+  auto& stream = arr.primitive().stream();
  auto& encoder = cu::get_command_encoder(stream);
  // Keep used buffers alive until kernel finishes running.
  for (auto& in : arr.inputs()) {
    // Except for the donated one.
@@ -45,7 +52,14 @@ void eval(array& arr) {
  for (auto& s : arr.siblings()) {
    encoder.add_temporary(s);
  }
-  encoder.maybe_commit();
+
  if (encoder.get_num_ops() >=
      env::max_ops_per_buffer(default_max_nodes_per_graph)) {
    scheduler::notify_new_task(stream);
    encoder.add_completed_handler(
        [stream]() { scheduler::notify_task_completion(stream); });
    encoder.commit();
  }
 }
 void finalize(Stream s) {
--- a/mlx/backend/cuda/event.cu
+++ b/mlx/backend/cuda/event.cu
@@ -3,10 +3,12 @@
 #include "mlx/backend/cuda/allocator.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/event.h"
 #include "mlx/backend/cuda/utils.h"
 #include "mlx/event.h"
 #include "mlx/scheduler.h"
 #include <map>
 #include <vector>
 #include <nvtx3/nvtx3.hpp>
 namespace mlx::core {
@@ -17,104 +19,180 @@ namespace cu {
 // CudaEvent implementations
 ///////////////////////////////////////////////////////////////////////////////
-// Cuda event managed with RAII.
+namespace {
-class CudaEventHandle {
+
 // Manage cached cudaEvent_t objects.
 class CudaEventPool {
 public:
-  CudaEventHandle() {
+  CudaEventHandle create(Device& d, int flags) {
-    CHECK_CUDA_ERROR(cudaEventCreateWithFlags(
+    if (!on_creation_thread()) {
-        &event_, cudaEventDisableTiming | cudaEventBlockingSync));
+      return CudaEventHandle(d, flags);
    }
    auto& cache = cache_for(d, flags);
    if (cache.empty()) {
      return CudaEventHandle(d, flags);
    } else {
      CudaEventHandle ret = std::move(cache.back());
      cache.pop_back();
      return ret;
    }
  }
-  ~CudaEventHandle() {
+  void release(CudaEventHandle event) {
-    CHECK_CUDA_ERROR(cudaEventDestroy(event_));
+    if (!on_creation_thread()) {
-  }
+      // Event will be destroyed directly instead of getting moved to cache.
-
+      return;
-  CudaEventHandle(const CudaEventHandle&) = delete;
+    }
-  CudaEventHandle& operator=(const CudaEventHandle&) = delete;
+    cache_for(event.device, event.flags).push_back(std::move(event));
  operator cudaEvent_t() const {
    return event_;
  }
 private:
-  cudaEvent_t event_;
+  std::vector<CudaEventHandle>& cache_for(Device& d, int flags) {
    return cache_[d.cuda_device()][flags];
  }
  bool on_creation_thread() {
    return std::this_thread::get_id() == thread_id_;
  }
  // The CudaEvent may be created and destroyed on different threads (for
  // example when waiting on GPU work in CPU stream), we don't want to make
  // the cache thread-safe as it adds overhead, so we just skip cache when
  // using events in worker threads.
  std::thread::id thread_id_{std::this_thread::get_id()};
  // {device: {flags: [events]}}
  std::map<int, std::map<int, std::vector<CudaEventHandle>>> cache_;
 };
-CudaEvent::CudaEvent() : event_(std::make_shared<CudaEventHandle>()) {}
+CudaEventPool& cuda_event_pool() {
  static CudaEventPool pool;
  return pool;
 }
 } // namespace
 // Makes `d` the current device and creates a CUDA event with the given
 // creation `flags`. Both values are stored on the handle; the event pool uses
 // them to choose the cache bucket.
 CudaEventHandle::CudaEventHandle(Device& d, int flags)
    : device(d), flags(flags) {
  device.make_current();
  CHECK_CUDA_ERROR(cudaEventCreateWithFlags(&handle_, flags));
  // Sanity check on the created handle (compiled out when NDEBUG is defined).
  assert(handle_ != nullptr);
 }
 // Obtains the underlying handle from the process-wide event pool, which
 // either reuses a cached event for (device, flags) or creates a new one.
 CudaEvent::CudaEvent(Device& d, int flags)
    : event_(cuda_event_pool().create(d, flags)) {}
 // Hands the handle back to the event pool. The pool caches it only when called
 // on the pool's creation thread; otherwise the handle is simply destroyed.
 CudaEvent::~CudaEvent() {
  cuda_event_pool().release(std::move(event_));
 }
 void CudaEvent::wait() {
  nvtx3::scoped_range r("cu::CudaEvent::wait");
-  if (!recorded_) {
+  event_.device.make_current();
-    throw std::runtime_error("Should not wait on a CudaEvent before record.");
+  cudaEventSynchronize(event_);
  }
  cudaEventSynchronize(*event_);
 }
 void CudaEvent::wait(cudaStream_t stream) {
-  if (!recorded_) {
+  event_.device.make_current();
-    throw std::runtime_error("Should not wait on a CudaEvent before record.");
+  cudaStreamWaitEvent(stream, event_);
  }
  cudaStreamWaitEvent(stream, *event_);
 }
 void CudaEvent::wait(Stream s) {
  // CPU stream: enqueue a scheduler task that calls the host-side wait() on a
  // copy of this event.
  if (s.device == mlx::core::Device::cpu) {
    scheduler::enqueue(s, [*this]() mutable { wait(); });
    return;
  }
  // GPU stream: commit pending work, then make the CUDA stream wait.
  auto& encoder = cu::get_command_encoder(s);
  encoder.commit();
  wait(encoder.stream());
 }
 void CudaEvent::record(cudaStream_t stream) {
-  cudaEventRecord(*event_, stream);
+  event_.device.make_current();
-  recorded_ = true;
+  cudaEventRecord(event_, stream);
 }
 void CudaEvent::record(Stream s) {
  // Recording is only supported on GPU streams.
  if (s.device == mlx::core::Device::cpu) {
    throw std::runtime_error("CudaEvent can not wait on cpu stream.");
  }
  // Commit pending work first, then record on the encoder's CUDA stream.
  auto& encoder = cu::get_command_encoder(s);
  encoder.commit();
  record(encoder.stream());
 }
 bool CudaEvent::completed() const {
-  return cudaEventQuery(*event_) == cudaSuccess;
+  // Note: cudaEventQuery can be safely called from any device.
  return cudaEventQuery(event_) == cudaSuccess;
 }
 // static
 // Touches cuda_event_pool() so its function-local static pool is constructed
 // at this point; the returned reference is intentionally unused.
 void CudaEvent::init_pool() {
  cuda_event_pool();
 }
 // Copyable facade over CudaEvent:
 // 1. Copies share one underlying CudaEvent through a shared_ptr.
 // 2. wait/record take mlx Streams; waiting also works for CPU streams.
 // 3. Stream waits verify that the event has been recorded first.
 // Note: recorded_ is stored per copy, it is not part of the shared state.
 class CopyableCudaEvent {
 public:
  explicit CopyableCudaEvent(Device& d)
      : event_(std::make_shared<CudaEvent>(
            d,
            cudaEventDisableTiming | cudaEventBlockingSync)) {}
  // Host-side wait on the underlying event (no recorded check).
  void wait() {
    event_->wait();
  }
  // Makes stream `s` wait for the event.
  void wait(Stream s) {
    if (s.device == mlx::core::Device::cpu) {
      // CPU stream: the check and the host-side wait run inside the enqueued
      // task, on a copy of this object.
      scheduler::enqueue(s, [*this]() mutable {
        check_recorded();
        event_->wait();
      });
      return;
    }
    // GPU stream: commit pending work, then wait on the CUDA stream.
    check_recorded();
    auto& gpu_encoder = cu::get_command_encoder(s);
    gpu_encoder.commit();
    event_->wait(gpu_encoder.stream());
  }
  // Records the event on GPU stream `s`; CPU streams are rejected.
  void record(Stream s) {
    if (s.device == mlx::core::Device::cpu) {
      throw std::runtime_error("CudaEvent can not wait on CPU stream.");
    }
    auto& gpu_encoder = cu::get_command_encoder(s);
    gpu_encoder.commit();
    event_->record(gpu_encoder.stream());
    recorded_ = true;
  }
  // True only after record() was called on this object and the underlying
  // event reports completion.
  bool is_signaled() const {
    return recorded_ && event_->completed();
  }
 private:
  // Throws if record() has not been called on this object.
  void check_recorded() const {
    if (recorded_) {
      return;
    }
    throw std::runtime_error(
        "Should not wait on a CudaEvent before recording.");
  }
  std::shared_ptr<CudaEvent> event_;
  bool recorded_ = false;
 };
 ///////////////////////////////////////////////////////////////////////////////
-// SharedEvent implementations
+// AtomicEvent implementations
 ///////////////////////////////////////////////////////////////////////////////
-__host__ __device__ void event_wait(SharedEvent::Atomic* ac, uint64_t value) {
+__host__ __device__ void event_wait(AtomicEvent::Atomic* ac, uint64_t value) {
  uint64_t current;
  while ((current = ac->load()) < value) {
    ac->wait(current);
  }
 }
-__host__ __device__ void event_signal(SharedEvent::Atomic* ac, uint64_t value) {
+__host__ __device__ void event_signal(AtomicEvent::Atomic* ac, uint64_t value) {
  ac->store(value);
  ac->notify_all();
 }
-__global__ void event_wait_kernel(SharedEvent::Atomic* ac, uint64_t value) {
+__global__ void event_wait_kernel(AtomicEvent::Atomic* ac, uint64_t value) {
  event_wait(ac, value);
 }
-__global__ void event_signal_kernel(SharedEvent::Atomic* ac, uint64_t value) {
+__global__ void event_signal_kernel(AtomicEvent::Atomic* ac, uint64_t value) {
  event_signal(ac, value);
 }
-SharedEvent::Atomic* to_atomic(std::shared_ptr<Buffer> buf) {
+AtomicEvent::AtomicEvent() {
  return static_cast<SharedEvent::Atomic*>(buf->raw_ptr());
 }
 SharedEvent::SharedEvent() {
  buf_ = std::shared_ptr<Buffer>(
      new Buffer{allocator().malloc(sizeof(Atomic))}, [](Buffer* ptr) {
        allocator().free(*ptr);
@@ -123,17 +201,17 @@ SharedEvent::SharedEvent() {
  *static_cast<uint64_t*>(buf_->raw_ptr()) = 0;
 }
-void SharedEvent::wait(uint64_t value) {
+void AtomicEvent::wait(uint64_t value) {
-  nvtx3::scoped_range r("cu::SharedEvent::wait");
+  nvtx3::scoped_range r("cu::AtomicEvent::wait");
-  event_wait(to_atomic(buf_), value);
+  event_wait(atomic(), value);
 }
-void SharedEvent::wait(cudaStream_t stream, uint64_t value) {
+void AtomicEvent::wait(cudaStream_t stream, uint64_t value) {
-  event_wait_kernel<<<1, 1, 0, stream>>>(to_atomic(buf_), value);
+  event_wait_kernel<<<1, 1, 0, stream>>>(atomic(), value);
 }
-void SharedEvent::wait(Stream s, uint64_t value) {
+void AtomicEvent::wait(Stream s, uint64_t value) {
-  nvtx3::scoped_range r("cu::SharedEvent::wait(s)");
+  nvtx3::scoped_range r("cu::AtomicEvent::wait(s)");
  if (s.device == mlx::core::Device::cpu) {
    scheduler::enqueue(s, [*this, value]() mutable { wait(value); });
  } else {
@@ -144,17 +222,17 @@ void SharedEvent::wait(Stream s, uint64_t value) {
  }
 }
-void SharedEvent::signal(uint64_t value) {
+void AtomicEvent::signal(uint64_t value) {
-  nvtx3::scoped_range r("cu::SharedEvent::signal");
+  nvtx3::scoped_range r("cu::AtomicEvent::signal");
-  event_signal(to_atomic(buf_), value);
+  event_signal(atomic(), value);
 }
-void SharedEvent::signal(cudaStream_t stream, uint64_t value) {
+void AtomicEvent::signal(cudaStream_t stream, uint64_t value) {
-  event_signal_kernel<<<1, 1, 0, stream>>>(to_atomic(buf_), value);
+  event_signal_kernel<<<1, 1, 0, stream>>>(atomic(), value);
 }
-void SharedEvent::signal(Stream s, uint64_t value) {
+void AtomicEvent::signal(Stream s, uint64_t value) {
-  nvtx3::scoped_range r("cu::SharedEvent::signal(s)");
+  nvtx3::scoped_range r("cu::AtomicEvent::signal(s)");
  if (s.device == mlx::core::Device::cpu) {
    // Signal through a GPU stream so the atomic is updated in GPU - updating
    // the atomic in CPU sometimes does not get GPU notified.
@@ -168,14 +246,14 @@ void SharedEvent::signal(Stream s, uint64_t value) {
  }
 }
-bool SharedEvent::is_signaled(uint64_t value) const {
+bool AtomicEvent::is_signaled(uint64_t value) const {
-  nvtx3::scoped_range r("cu::SharedEvent::is_signaled");
+  nvtx3::scoped_range r("cu::AtomicEvent::is_signaled");
-  return to_atomic(buf_)->load() >= value;
+  return atomic()->load() >= value;
 }
-uint64_t SharedEvent::value() const {
+uint64_t AtomicEvent::value() const {
-  nvtx3::scoped_range r("cu::SharedEvent::value");
+  nvtx3::scoped_range r("cu::AtomicEvent::value");
-  return to_atomic(buf_)->load();
+  return atomic()->load();
 }
 } // namespace cu
@@ -188,14 +266,14 @@ namespace {
 struct EventImpl {
  // CudaEvent is preferred when possible because it is fast, however we have
-  // to fallback to SharedEvent in following cases:
+  // to fallback to AtomicEvent in following cases:
  // 1. the event is used to wait/signal a cpu stream;
  // 2. signal value other than 1 has been specified.
-  std::unique_ptr<cu::CudaEvent> cuda;
+  std::unique_ptr<cu::CopyableCudaEvent> cuda;
-  std::unique_ptr<cu::SharedEvent> shared;
+  std::unique_ptr<cu::AtomicEvent> atomic;
  bool is_created() const {
-    return cuda || shared;
+    return cuda || atomic;
  }
  void ensure_created(Stream s, uint64_t signal_value) {
@@ -203,10 +281,10 @@ struct EventImpl {
      return;
    }
    if (s.device == mlx::core::Device::cpu || signal_value > 1) {
-      nvtx3::mark("Using slow SharedEvent");
+      nvtx3::mark("Using slow AtomicEvent");
-      shared = std::make_unique<cu::SharedEvent>();
+      atomic = std::make_unique<cu::AtomicEvent>();
    } else {
-      cuda = std::make_unique<cu::CudaEvent>();
+      cuda = std::make_unique<cu::CopyableCudaEvent>(cu::device(s.device));
    }
  }
 };
@@ -225,7 +303,7 @@ void Event::wait() {
    assert(value() == 1);
    event->cuda->wait();
  } else {
-    event->shared->wait(value());
+    event->atomic->wait(value());
  }
 }
@@ -236,7 +314,7 @@ void Event::wait(Stream s) {
    assert(value() == 1);
    event->cuda->wait(s);
  } else {
-    event->shared->wait(s, value());
+    event->atomic->wait(s, value());
  }
 }
@@ -247,7 +325,7 @@ void Event::signal(Stream s) {
    assert(value() == 1);
    event->cuda->record(s);
  } else {
-    event->shared->signal(s, value());
+    event->atomic->signal(s, value());
  }
 }
@@ -258,9 +336,9 @@ bool Event::is_signaled() const {
  }
  if (event->cuda) {
    assert(value() == 1);
-    return event->cuda->recorded() && event->cuda->completed();
+    return event->cuda->is_signaled();
  } else {
-    return event->shared->is_signaled(value());
+    return event->atomic->is_signaled(value());
  }
 }
--- a/mlx/backend/cuda/event.h
+++ b/mlx/backend/cuda/event.h
@@ -3,49 +3,60 @@
 #pragma once
 #include "mlx/allocator.h"
 #include "mlx/backend/cuda/utils.h"
 #include "mlx/stream.h"
 #include <memory>
 #include <cuda_runtime.h>
 #include <cuda/atomic>
 #include <memory>
 namespace mlx::core::cu {
-class CudaEventHandle;
+class Device;
 // RAII-managed move-only wrapper of cudaEvent_t.
 struct CudaEventHandle : public CudaHandle<cudaEvent_t, cudaEventDestroy> {
  CudaEventHandle(Device& d, int flags);
  Device& device;
  int flags;
 };
 // Wrapper of native cuda event. It can synchronize between GPU streams, or wait
 // on GPU stream in CPU stream, but can not wait on CPU stream.
 class CudaEvent {
 public:
-  CudaEvent();
+  CudaEvent(Device& d, int flags);
  ~CudaEvent();
  CudaEvent(CudaEvent&&) = default;
  CudaEvent& operator=(CudaEvent&&) = default;
  CudaEvent(const CudaEvent&) = delete;
  CudaEvent& operator=(const CudaEvent&) = delete;
  void wait();
  void wait(cudaStream_t stream);
  void wait(Stream s);
  void record(cudaStream_t stream);
  void record(Stream s);
  // Return whether the recorded kernels have completed. Note that this method
  // returns true if record() has not been called.
  bool completed() const;
-  bool recorded() const {
+  // Internal: make sure event pool is initialized.
-    return recorded_;
+  static void init_pool();
  }
 private:
-  bool recorded_{false};
+  CudaEventHandle event_;
  std::shared_ptr<CudaEventHandle> event_;
 };
 // Event that can synchronize between CPU and GPU. It is much slower than
 // CudaEvent so the latter should always be preferred when possible.
-class SharedEvent {
+class AtomicEvent {
 public:
  using Atomic = cuda::atomic<uint64_t>;
-  SharedEvent();
+  AtomicEvent();
  void wait(uint64_t value);
  void wait(cudaStream_t stream, uint64_t value);
@@ -57,7 +68,11 @@ class SharedEvent {
  uint64_t value() const;
 private:
-  std::shared_ptr<mlx::core::allocator::Buffer> buf_;
+  Atomic* atomic() const {
    return static_cast<AtomicEvent::Atomic*>(buf_->raw_ptr());
  }
  std::shared_ptr<allocator::Buffer> buf_;
 };
 } // namespace mlx::core::cu
--- a/mlx/backend/cuda/fence.cpp
+++ b/mlx/backend/cuda/fence.cpp
@@ -7,7 +7,7 @@ namespace mlx::core {
 struct FenceImpl {
  uint32_t count;
-  cu::SharedEvent event;
+  cu::AtomicEvent event;
 };
 Fence::Fence(Stream s) {
--- a/mlx/backend/cuda/gemms/cublas_gemm.cpp
+++ b/mlx/backend/cuda/gemms/cublas_gemm.cpp
@@ -50,8 +50,10 @@ cublasComputeType_t dtype_to_compute_type(Dtype dtype) {
      return mlx::core::env::enable_tf32() ? CUBLAS_COMPUTE_32F_FAST_TF32
                                           : CUBLAS_COMPUTE_32F;
    case float64:
    case complex64:
      return CUBLAS_COMPUTE_64F;
    case complex64:
      return mlx::core::env::enable_tf32() ? CUBLAS_COMPUTE_32F_FAST_TF32
                                           : CUBLAS_COMPUTE_32F;
    default:
      throw std::runtime_error(fmt::format(
          "Unsupported dtype in CublasGemm: {}.", dtype_to_string(dtype)));
@@ -85,10 +87,10 @@ cublasLtMatrixLayout_t create_matrix_layout(
    int32_t batch_count,
    int64_t batch_stride) {
  cublasLtMatrixLayout_t desc;
  if (transposed) {
    std::swap(rows, cols);
  }
  CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutCreate(&desc, type, rows, cols, ld));
  cublasLtOrder_t order = transposed ? CUBLASLT_ORDER_COL : CUBLASLT_ORDER_ROW;
  CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutSetAttribute(
      desc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order, sizeof(cublasLtOrder_t)));
  if (batch_count > 1) {
    CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutSetAttribute(
        desc,
@@ -126,37 +128,47 @@ CublasGemm::CublasGemm(
      N_(b_cols) {
  heuristic_.state = CUBLAS_STATUS_NOT_INITIALIZED;
-  auto scale_type = dtype_to_cublas_type(dtype);
+  scale_type_ = dtype_to_cublas_type(dtype);
  if (dtype == bfloat16 || dtype == float16) {
-    scale_type = CUDA_R_32F;
+    scale_type_ = CUDA_R_32F;
  }
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescCreate(
-      &matmul_desc_, dtype_to_compute_type(dtype), scale_type));
+      &matmul_desc_, dtype_to_compute_type(dtype), scale_type_));
  int32_t pointer_mode = CUBLASLT_POINTER_MODE_HOST;
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_POINTER_MODE,
      &pointer_mode,
      sizeof(int32_t)));
-  cublasOperation_t op = CUBLAS_OP_N;
+
  // In cublasLt matrices use column-major layout, while it is possible to use
  // the CUBLASLT_ORDER_ROW option to switch to row-major layout, the bias
  // epilogue does not work with the option. So instead we swap A and B to make
  // cublasLt return the row-major result, which works because:
  // - the data of a matrix in row-major layout is identical to its transpose in
  //   column-major layout
  // - C^T = (A @ B)^T = B^T @ A^T
  cublasOperation_t a_op = b_transposed ? CUBLAS_OP_T : CUBLAS_OP_N;
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_TRANSA,
-      &op,
+      &a_op,
      sizeof(cublasOperation_t)));
  cublasOperation_t b_op = a_transposed ? CUBLAS_OP_T : CUBLAS_OP_N;
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_TRANSB,
-      &op,
+      &b_op,
      sizeof(cublasOperation_t)));
  auto type = dtype_to_cublas_type(dtype);
  a_desc_ = create_matrix_layout(
-      type, a_rows, a_cols, a_transposed, lda, batch_count, a_batch_stride);
+      type, b_cols, b_rows, b_transposed, ldb, batch_count, b_batch_stride);
  b_desc_ = create_matrix_layout(
-      type, b_rows, b_cols, b_transposed, ldb, batch_count, b_batch_stride);
+      type, a_cols, a_rows, a_transposed, lda, batch_count, a_batch_stride);
  out_desc_ = create_matrix_layout(
-      type, a_rows, b_cols, false, b_cols, batch_count, a_rows * b_cols);
+      type, b_cols, a_rows, false, b_cols, batch_count, a_rows * b_cols);
 }
 CublasGemm::CublasGemm(
@@ -191,7 +203,7 @@ CublasGemm::CublasGemm(
          b_batch_stride) {
  auto type = dtype_to_cublas_type(dtype);
  c_desc_ = create_matrix_layout(
-      type, a_rows, b_cols, false, ldc, batch_count, c_batch_stride);
+      type, b_cols, a_rows, false, ldc, batch_count, c_batch_stride);
 }
 CublasGemm::~CublasGemm() {
@@ -213,14 +225,30 @@ void CublasGemm::set_out(
  CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutDestroy(out_desc_));
  out_desc_ = create_matrix_layout(
      dtype_to_cublas_type(dtype),
      rows,
      cols,
      rows,
      transposed,
      ld,
      batch_count,
      batch_stride);
 }
 // Register |bias| as an input of |encoder| and configure the matmul
 // descriptor so that the bias epilogue is used with |bias|'s data pointer.
 void CublasGemm::set_bias(cu::CommandEncoder& encoder, const array& bias) {
  encoder.set_input_array(bias);
  // Select the bias epilogue on the matmul descriptor.
  cublasLtEpilogue_t bias_epilogue = CUBLASLT_EPILOGUE_BIAS;
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_EPILOGUE,
      &bias_epilogue,
      sizeof(bias_epilogue)));
  // The attribute value is the pointer itself, so the address and size of the
  // local pointer variable are what get passed.
  auto* bias_data = bias.data<void>();
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_BIAS_POINTER,
      &bias_data,
      sizeof(bias_data)));
 }
 void CublasGemm::run(
    cu::CommandEncoder& encoder,
    array& out,
@@ -228,11 +256,19 @@ void CublasGemm::run(
    const array& b,
    const Shape& batch_shape,
    const Strides& a_batch_strides,
-    const Strides& b_batch_strides) {
+    const Strides& b_batch_strides,
    float alpha) {
  int batch_count = out.size() / (M_ * N_);
  if (batch_count / batch_shape.back() > 1) {
    run_batched(
-        encoder, out, a, b, batch_shape, a_batch_strides, b_batch_strides);
+        encoder,
        out,
        a,
        b,
        batch_shape,
        a_batch_strides,
        b_batch_strides,
        alpha);
    return;
  }
@@ -240,7 +276,13 @@ void CublasGemm::run(
  encoder.set_input_array(b);
  encoder.set_output_array(out);
-  execute(encoder, out.data<void>(), a.data<void>(), b.data<void>(), nullptr);
+  execute(
      encoder,
      out.data<void>(),
      a.data<void>(),
      b.data<void>(),
      nullptr,
      alpha);
 }
 void CublasGemm::run(
@@ -313,6 +355,16 @@ void CublasGemm::execute(
    }
  }
  const void* alpha_ptr = &alpha;
  const void* beta_ptr = &beta;
  complex64_t alpha_c, beta_c;
  if (scale_type_ == CUDA_C_32F) {
    alpha_c = complex64_t{alpha, 0.0f};
    beta_c = complex64_t{beta, 0.0f};
    alpha_ptr = &alpha_c;
    beta_ptr = &beta_c;
  }
  void* workspace_ptr = nullptr;
  if (heuristic_.workspaceSize > 0) {
    // Ensure workspace is 256-byte aligned
@@ -329,12 +381,12 @@ void CublasGemm::execute(
  CHECK_CUBLAS_ERROR(cublasLtMatmul(
      handle_,
      matmul_desc_,
-      &alpha,
+      alpha_ptr,
-      a,
+      b, // a and b are swapped
      a_desc_,
-      b,
+      a,
      b_desc_,
-      &beta,
+      beta_ptr,
      c ? c : out,
      c ? c_desc_ : out_desc_,
      out,
--- a/mlx/backend/cuda/gemms/cublas_gemm.h
+++ b/mlx/backend/cuda/gemms/cublas_gemm.h
@@ -55,6 +55,8 @@ class CublasGemm {
      int32_t batch_count,
      int64_t batch_stride);
  void set_bias(cu::CommandEncoder& encoder, const array& bias);
  void run(
      cu::CommandEncoder& encoder,
      array& out,
@@ -62,7 +64,8 @@ class CublasGemm {
      const array& b,
      const Shape& batch_shape,
      const Strides& a_batch_strides,
-      const Strides& b_batch_strides);
+      const Strides& b_batch_strides,
      float alpha = 1.0f);
  void run(
      cu::CommandEncoder& encoder,
@@ -85,7 +88,8 @@ class CublasGemm {
      const array& b,
      const Shape& batch_shape,
      const Strides& a_batch_strides,
-      const Strides& b_batch_strides);
+      const Strides& b_batch_strides,
      float alpha);
  void run_batched(
      cu::CommandEncoder& encoder,
@@ -111,6 +115,7 @@ class CublasGemm {
  uint64_t M_;
  uint64_t N_;
  cudaDataType_t scale_type_;
  cublasLtMatmulPreference_t pref_{nullptr};
  cublasLtHandle_t handle_{nullptr};
  cublasLtMatmulDesc_t matmul_desc_{nullptr};
--- a/mlx/backend/cuda/gemms/cublas_gemm_batched_12_0.cpp
+++ b/mlx/backend/cuda/gemms/cublas_gemm_batched_12_0.cpp
@@ -13,7 +13,8 @@ void CublasGemm::run_batched(
    const array& b,
    const Shape& batch_shape,
    const Strides& a_batch_strides,
-    const Strides& b_batch_strides) {
+    const Strides& b_batch_strides,
    float alpha) {
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
@@ -27,7 +28,8 @@ void CublasGemm::run_batched(
        out.data<int8_t>() + out.itemsize() * i * batch_shape.back() * M_ * N_,
        a.data<int8_t>() + a.itemsize() * a_it.loc,
        b.data<int8_t>() + b.itemsize() * b_it.loc,
-        nullptr);
+        nullptr,
        alpha);
    a_it.step();
    b_it.step();
  }
--- a/mlx/backend/cuda/gemms/cublas_gemm_batched_12_9.cu
+++ b/mlx/backend/cuda/gemms/cublas_gemm_batched_12_9.cu
@@ -154,7 +154,8 @@ void CublasGemm::run_batched(
    const array& b,
    const Shape& batch_shape,
    const Strides& a_batch_strides,
-    const Strides& b_batch_strides) {
+    const Strides& b_batch_strides,
    float alpha) {
  int batch_count = out.size() / (M_ * N_);
  set_pointer_mode(a_desc_, batch_count);
  set_pointer_mode(b_desc_, batch_count);
@@ -226,7 +227,8 @@ void CublasGemm::run_batched(
      reinterpret_cast<void*>(out_pointers),
      reinterpret_cast<void*>(a_pointers),
      reinterpret_cast<void*>(b_pointers),
-      nullptr);
+      nullptr,
      alpha);
 }
 void CublasGemm::run_batched(
--- a/mlx/backend/cuda/gemms/gemv.cu
+++ b/mlx/backend/cuda/gemms/gemv.cu
@@ -13,6 +13,37 @@ namespace cg = cooperative_groups;
 static constexpr int rows_per_block = 8;
 // Maps an element type T to the type used to accumulate gemv partial sums.
 // By default the accumulator is T itself.
 template <typename T>
 struct GemvAccType {
  typedef T type;
 };
 // 16-bit floating point inputs are accumulated in float.
 template <>
 struct GemvAccType<__nv_bfloat16> {
  typedef float type;
 };
 template <>
 struct GemvAccType<__half> {
  typedef float type;
 };
 // The remaining specializations spell out the accumulator explicitly; each
 // one is the element type itself.
 template <>
 struct GemvAccType<float> {
  typedef float type;
 };
 template <>
 struct GemvAccType<double> {
  typedef double type;
 };
 template <>
 struct GemvAccType<cu::complex64_t> {
  typedef cu::complex64_t type;
 };
 template <typename T, int rows_per_block, int n_per_thread>
 __device__ void
 gemv_impl(const T* mat, const T* vec, T* out, int rows, int cols) {
@@ -24,7 +55,8 @@ gemv_impl(const T* mat, const T* vec, T* out, int rows, int cols) {
  int row = g_idx.x * rows_per_block + t_idx.y;
  if (row < rows) {
-    float sum = 0.0f;
+    using Acc = typename GemvAccType<T>::type;
    Acc sum = Acc(0);
    for (int col = n_per_thread * warp.thread_rank(); col < cols;
         col += (WARP_SIZE * n_per_thread)) {
      auto local_mat =
@@ -32,12 +64,11 @@ gemv_impl(const T* mat, const T* vec, T* out, int rows, int cols) {
      auto local_vec = unsafe_load_vector<n_per_thread>(vec + col, 0);
 #pragma unroll
      for (int j = 0; j < n_per_thread; ++j) {
-        sum +=
+        sum += static_cast<Acc>(local_mat[j]) * static_cast<Acc>(local_vec[j]);
            static_cast<float>(local_mat[j]) * static_cast<float>(local_vec[j]);
      }
    }
-    sum = cg::reduce(warp, sum, cg::plus<float>{});
+    sum = cg::reduce(warp, sum, cg::plus<Acc>{});
    if (warp.thread_rank() == 0) {
      out[row] = static_cast<T>(sum);
    }
@@ -107,7 +138,7 @@ void gemv(
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
-  dispatch_float_types(out.dtype(), "gemv", [&](auto type_tag) {
+  dispatch_inexact_types(out.dtype(), "gemv", [&](auto type_tag) {
    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    dim3 block_dims{WARP_SIZE, rows_per_block};
    const DataType* mat;
--- a/mlx/backend/cuda/indexing.cpp
+++ b/mlx/backend/cuda/indexing.cpp
@@ -110,7 +110,7 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
  args.append<int32_t>(src.ndim());
  args.append_ndim(slice_sizes_);
  args.append(slice_size);
-  args.append(SmallVector<int32_t>(axes_.begin(), axes_.end()));
+  args.append(axes_);
  append_indices_arg(args, inputs, nidx, idx_ndim);
  std::string kernel_name = fmt::format(
@@ -211,7 +211,7 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
  args.append_ndim(out.shape());
  args.append_ndim(out.strides());
  args.append<int32_t>(out.ndim());
-  args.append(SmallVector<int32_t>(axes_.begin(), axes_.end()));
+  args.append(axes_);
  append_indices_arg(args, inputs, nidx, idx_ndim);
  std::string kernel_name = fmt::format(
--- a/mlx/backend/cuda/jit_module.cpp
+++ b/mlx/backend/cuda/jit_module.cpp
@@ -67,9 +67,11 @@ const std::string& cccl_dir() {
      return path.string();
    }
    // Finally check the environment variable.
-    path = std::getenv("MLX_CCCL_DIR");
+    if (const char* env = std::getenv("MLX_CCCL_DIR"); env) {
-    if (!path.empty() && std::filesystem::exists(path)) {
+      path = env;
-      return path.string();
+      if (!path.empty() && std::filesystem::exists(path)) {
        return path.string();
      }
    }
    return std::string();
  }();
@@ -97,6 +99,30 @@ const std::filesystem::path& ptx_cache_dir() {
  return cache;
 }
 // Map |module_name| to the location of its cached PTX file under |cache_dir|.
 //
 // A name that fits in max_file_name_length characters is stored directly as
 // "<cache_dir>/<module_name>.ptx". A longer name is cut into chunks of at most
 // max_file_name_length characters: every chunk but the last becomes a nested
 // directory and the last chunk (plus ".ptx") becomes the file name.
 std::filesystem::path get_ptx_path(
    const std::filesystem::path& cache_dir,
    const std::string& module_name) {
  // Sizes and offsets are kept unsigned so that they are never mixed with
  // signed values when compared against std::string::size().
 #ifdef _WIN32
  constexpr size_t max_file_name_length = 140;
 #else
  constexpr size_t max_file_name_length = 245;
 #endif
  if (module_name.size() <= max_file_name_length) {
    return cache_dir / (module_name + ".ptx");
  }
  auto ptx_path = cache_dir;
  size_t offset = 0;
  // Peel off full-length chunks as directories while more than one chunk of
  // the name is left.
  while (module_name.size() - offset > max_file_name_length) {
    ptx_path /= module_name.substr(offset, max_file_name_length);
    offset += max_file_name_length;
  }
  // Whatever remains is the file name.
  ptx_path /= module_name.substr(offset) + ".ptx";
  return ptx_path;
 }
 // Try to read the cached |ptx| and |ptx_kernels| from |cache_dir|.
 bool read_cached_ptx(
    const std::filesystem::path& cache_dir,
@@ -107,7 +133,7 @@ bool read_cached_ptx(
    return false;
  }
-  auto ptx_path = cache_dir / (module_name + ".ptx");
+  auto ptx_path = get_ptx_path(cache_dir, module_name);
  std::error_code error;
  auto ptx_size = std::filesystem::file_size(ptx_path, error);
  if (error) {
@@ -120,7 +146,7 @@ bool read_cached_ptx(
  ptx.resize(ptx_size);
  ptx_file.read(ptx.data(), ptx_size);
-  std::ifstream txt_file(cache_dir / (module_name + ".txt"), std::ios::binary);
+  std::ifstream txt_file(ptx_path.replace_extension(".txt"), std::ios::binary);
  std::string line;
  while (std::getline(txt_file, line)) {
    auto tab = line.find('\t');
@@ -142,16 +168,26 @@ void write_cached_ptx(
    return;
  }
-  std::ofstream ptx_file(cache_dir / (module_name + ".ptx"), std::ios::binary);
+  auto ptx_path = get_ptx_path(cache_dir, module_name);
  // Ensure that the directory exists
  auto parent = ptx_path.parent_path();
  if (parent != cache_dir) {
    std::filesystem::create_directories(parent);
  }
  // Write the compiled code and mangled names
  std::ofstream ptx_file(ptx_path, std::ios::binary);
  if (!ptx.empty()) {
    ptx_file.write(&ptx.front(), ptx.size());
  }
-  std::ofstream txt_file(cache_dir / (module_name + ".txt"), std::ios::binary);
+  std::ofstream txt_file(ptx_path.replace_extension(".txt"), std::ios::binary);
  for (const auto& [name, mangled] : ptx_kernels) {
    txt_file << name << "\t" << mangled << std::endl;
  }
-  std::ofstream source_file(cache_dir / (module_name + ".cu"));
+  // Write the generated code
  std::ofstream source_file(ptx_path.replace_extension(".cu"));
  source_file << source_code;
 }
@@ -295,7 +331,8 @@ void load_module(
    const std::string& ptx,
    const std::vector<std::pair<std::string, std::string>>& ptx_kernels,
    CUmodule& module_,
-    std::unordered_map<std::string, std::pair<CUfunction, bool>>& kernels) {
+    std::unordered_map<std::string, std::tuple<CUfunction, bool, uint>>&
        kernels) {
  // Load module.
  char jit_log[4089] = {};
  CUjit_option options[] = {
@@ -312,7 +349,7 @@ void load_module(
  for (const auto& [name, mangled] : ptx_kernels) {
    CUfunction kernel;
    CHECK_CUDA_ERROR(cuModuleGetFunction(&kernel, module_, mangled.c_str()));
-    kernels[name] = std::make_pair(kernel, false);
+    kernels[name] = std::make_tuple(kernel, false, 0);
  }
 }
@@ -356,7 +393,7 @@ JitModule::~JitModule() {
  CHECK_CUDA_ERROR(cuModuleUnload(module_));
 }
-CUfunction JitModule::get_kernel(
+std::pair<CUfunction, uint> JitModule::get_kernel_and_dims(
    const std::string& kernel_name,
    std::function<void(CUfunction)> configure_kernel) {
  auto it = kernels_.find(kernel_name);
@@ -367,14 +404,22 @@ CUfunction JitModule::get_kernel(
  // If it is the first time we run this kernel then configure it. Do it only
  // once!
-  if (!it->second.second) {
+  auto kernel = std::get<0>(it->second);
  if (!std::get<1>(it->second)) {
    if (configure_kernel) {
-      configure_kernel(it->second.first);
+      configure_kernel(kernel);
    }
-    it->second.second = true;
+    std::get<1>(it->second) = true;
    std::get<2>(it->second) = max_occupancy_block_dim(kernel);
  }
-  return it->second.first;
+  return {kernel, std::get<2>(it->second)};
 }
 // Convenience wrapper around get_kernel_and_dims() that returns only the
 // kernel handle and discards the second element of the pair.
 CUfunction JitModule::get_kernel(
    const std::string& kernel_name,
    std::function<void(CUfunction)> configure_kernel) {
  auto kernel_and_dims =
      get_kernel_and_dims(kernel_name, std::move(configure_kernel));
  return kernel_and_dims.first;
 }
 std::unordered_map<std::string, JitModule>& get_jit_module_cache() {
--- a/mlx/backend/cuda/jit_module.h
+++ b/mlx/backend/cuda/jit_module.h
@@ -46,6 +46,11 @@ struct KernelArgs {
    append_ptr(std::get<SmallVector<T>>(storage_.back()).data());
  }
  // Append the contents of a std::vector as a kernel argument. The elements
  // are copied into a SmallVector<T> which is then forwarded to another
  // append() overload of this class.
  template <typename T>
  void append(const std::vector<T>& vec) {
    append(SmallVector<T>(vec.begin(), vec.end()));
  }
  // Make sure the arg is copied to an array with size of NDIM.
  template <size_t NDIM = MAX_NDIM, typename T>
  void append_ndim(SmallVector<T> vec) {
@@ -94,10 +99,13 @@ class JitModule {
  CUfunction get_kernel(
      const std::string& kernel_name,
      std::function<void(CUfunction)> configure_kernel = nullptr);
  std::pair<CUfunction, uint> get_kernel_and_dims(
      const std::string& kernel_name,
      std::function<void(CUfunction)> configure_kernel = nullptr);
 private:
  CUmodule module_{nullptr};
-  std::unordered_map<std::string, std::pair<CUfunction, bool>> kernels_;
+  std::unordered_map<std::string, std::tuple<CUfunction, bool, uint>> kernels_;
 };
 std::unordered_map<std::string, JitModule>& get_jit_module_cache();
--- a/mlx/backend/cuda/kernel_utils.cu
+++ b/mlx/backend/cuda/kernel_utils.cu
@@ -35,12 +35,10 @@ std::tuple<dim3, uint> get_launch_args(
    const Shape& shape,
    const Strides& strides,
    bool large,
-    int work_per_thread) {
+    int work_per_thread /* = 1 */,
    uint max_block_dim /* = 1024 */) {
  size_t nthreads = cuda::ceil_div(size, work_per_thread);
-  uint block_dim = 1024;
+  uint block_dim = max_block_dim < nthreads ? max_block_dim : nthreads;
  if (block_dim > nthreads) {
    block_dim = nthreads;
  }
  dim3 num_blocks;
  if (large) {
    num_blocks = get_2d_grid_dims(shape, strides, work_per_thread);
--- a/mlx/backend/cuda/kernel_utils.cuh
+++ b/mlx/backend/cuda/kernel_utils.cuh
@@ -1,8 +1,8 @@
 // Copyright © 2025 Apple Inc.
-// This file includes host-only utilies for writing CUDA kernels, the difference
+// This file includes host-only utilities for writing CUDA kernels, the
-// from backend/cuda/device/utils.cuh is that the latter file only include
+// difference from backend/cuda/device/utils.cuh is that the latter file only
-// device-only code.
+// include device-only code.
 #pragma once
@@ -120,19 +120,28 @@ dim3 get_2d_grid_dims(
    size_t divisor);
 std::pair<dim3, dim3> get_grid_and_block(int dim0, int dim1, int dim2);
-// Get the num_blocks and block_dims that maximize occupancy for |kernel|,
+// Get the num_blocks and block_dims assuming each thread handles
-// assuming each thread handles |work_per_thread| elements of |arr|.
+// |work_per_thread| elements of |arr|.
 std::tuple<dim3, uint> get_launch_args(
    size_t size,
    const Shape& shape,
    const Strides& strides,
    bool large,
-    int work_per_thread = 1);
+    int work_per_thread = 1,
    uint max_block_dim = 1024);
-inline std::tuple<dim3, uint>
+inline std::tuple<dim3, uint> get_launch_args(
-get_launch_args(const array& arr, bool large, int work_per_thread = 1) {
+    const array& arr,
    bool large,
    int work_per_thread = 1,
    uint max_block_dim = 1024) {
  return get_launch_args(
-      arr.size(), arr.shape(), arr.strides(), large, work_per_thread);
+      arr.size(),
      arr.shape(),
      arr.strides(),
      large,
      work_per_thread,
      max_block_dim);
 }
 } // namespace mlx::core
--- a/mlx/backend/cuda/lru_cache.h
+++ b/mlx/backend/cuda/lru_cache.h
@@ -2,11 +2,15 @@
 #pragma once
 #include "mlx/utils.h"
 #include <cstring>
 #include <list>
 #include <unordered_map>
 #include <utility>
 #include <fmt/format.h>
 namespace mlx::core {
 template <
@@ -27,6 +31,14 @@ class LRUCache {
    }
  }
  // Initialize with capacity read from |env_name|.
  LRUCache(const char* env_name, int default_capacity)
      : LRUCache(env::get_var(env_name, default_capacity)) {
    if (env::get_var("MLX_ENABLE_CACHE_THRASHING_CHECK", 1)) {
      env_name_ = env_name;
    }
  }
  size_t size() const {
    return map_.size();
  }
@@ -76,6 +88,14 @@ class LRUCache {
      return {it->second, false};
    }
    if (env_name_ && ++cache_misses_ > 2 * capacity_) {
      throw std::runtime_error(fmt::format(
          "Cache thrashing is happening, please set the environment variable "
          "{} to a larger value than {} to fix degraded performance.",
          env_name_,
          capacity_));
    }
    vlist_.emplace_front(key, std::forward<U>(value));
    map_[key] = vlist_.begin();
@@ -106,6 +126,9 @@ class LRUCache {
    }
  }
  const char* env_name_{nullptr};
  size_t cache_misses_{0};
  list_type vlist_;
  map_type map_;
  size_t capacity_;
--- a/mlx/backend/cuda/matmul.cpp
+++ b/mlx/backend/cuda/matmul.cpp
@@ -11,6 +11,7 @@
 #include <numeric>
 namespace mlx::core {
 namespace {
 std::tuple<bool, int64_t, array>
@@ -28,6 +29,80 @@ check_transpose(cu::CommandEncoder& enc, const Stream& s, const array& arr) {
  }
 }
 // Run the matrix product of |a| (M x K) and |b| (K x N) into |out|, scaled by
 // |alpha| and optionally with |bias| applied through the cublasLt bias
 // epilogue.
 //
 // - |a_transposed| / |lda| and |b_transposed| / |ldb| describe the layout of
 //   the inputs and are forwarded unchanged to CublasGemm.
 // - Batch dimensions are collapsed first; a contiguous batch of |a| that is
 //   multiplied with a broadcast |b| is folded into M.
 // - The gemv kernel is used instead of cublasLt when there is no bias, alpha
 //   is 1 and cu::can_use_gemv() accepts the problem.
 //
 // Throws std::runtime_error if a bias is given for complex64 inputs.
 void gemm_and_bias(
    cu::CommandEncoder& encoder,
    int M,
    int N,
    int K,
    bool a_transposed,
    int64_t lda,
    bool b_transposed,
    int64_t ldb,
    array& out,
    const array& a,
    const array& b,
    const std::optional<array>& bias = std::nullopt,
    float alpha = 1.0f) {
  // Check and collapse batch dimensions
  auto [batch_shape, a_batch_strides, b_batch_strides] = collapse_batches(a, b);
  auto batch_count = out.size() / (M * N);
  // Collapse batches into M if needed: this applies when there is a single
  // batch dimension, the rows of |a| are K apart, consecutive batches of |a|
  // are M * K apart and |b| has batch stride 0.
  if (batch_count > 1 && !a_transposed && batch_shape.size() == 1 &&
      a.strides()[a.ndim() - 2] == K && a_batch_strides.back() == M * K &&
      b_batch_strides.back() == 0) {
    M *= batch_shape.back();
    batch_count = 1;
    a_batch_strides = {0};
    b_batch_strides = {0};
    batch_shape = {1};
  }
  // Use gemv when possible. The gemv call takes neither a bias nor alpha, so
  // it is only taken when there is no bias and alpha is exactly 1; otherwise
  // the scaling would be silently dropped.
  if (!bias && alpha == 1.0f &&
      cu::can_use_gemv(M, N, K, a_transposed, b_transposed)) {
    cu::gemv(
        a,
        b,
        out,
        M,
        N,
        K,
        batch_count,
        batch_shape,
        a_batch_strides,
        b_batch_strides,
        encoder);
    return;
  }
  // Invoke cublasLt
  CublasGemm gemm(
      encoder.device(),
      a.dtype(),
      a_transposed,
      M,
      K,
      lda,
      b_transposed,
      K,
      N,
      ldb,
      batch_shape.back(),
      a_batch_strides.back(),
      b_batch_strides.back());
  if (bias) {
    if (a.dtype() == complex64) {
      throw std::runtime_error(
          "[gemm_and_bias] complex64 bias epilogue isn’t supported in cublasLtMatmul.");
    }
    gemm.set_bias(encoder, *bias);
  }
  gemm.run(
      encoder, out, a, b, batch_shape, a_batch_strides, b_batch_strides, alpha);
 }
 } // namespace
 void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -48,9 +123,6 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));
  /////////////////////////////////////////////////////////////////////////////
  // Init checks and prep
  int M = a_pre.shape(-2);
  int N = b_pre.shape(-1);
  int K = a_pre.shape(-1);
@@ -60,58 +132,8 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);
-  /////////////////////////////////////////////////////////////////////////////
+  gemm_and_bias(
-  // Check and collapse batch dimensions
+      encoder, M, N, K, a_transposed, lda, b_transposed, ldb, out, a, b);
  auto [batch_shape, a_batch_strides, b_batch_strides] = collapse_batches(a, b);
  auto batch_count = out.size() / (M * N);
  // Collapse batches into M if needed
  if (batch_count > 1 && !a_transposed && batch_shape.size() == 1 &&
      a.strides()[a.ndim() - 2] == K && a_batch_strides.back() == M * K &&
      b_batch_strides.back() == 0) {
    M *= batch_shape.back();
    batch_count = 1;
    a_batch_strides = {0};
    b_batch_strides = {0};
    batch_shape = {1};
  }
  if (cu::can_use_gemv(M, N, K, a_transposed, b_transposed)) {
    cu::gemv(
        a,
        b,
        out,
        M,
        N,
        K,
        batch_count,
        batch_shape,
        a_batch_strides,
        b_batch_strides,
        encoder);
    return;
  }
  /////////////////////////////////////////////////////////////////////////////
  // Invoke cublasLt
  CublasGemm gemm(
      cu::device(s.device),
      a.dtype(),
      a_transposed,
      M,
      K,
      lda,
      b_transposed,
      K,
      N,
      ldb,
      batch_shape.back(),
      a_batch_strides.back(),
      b_batch_strides.back());
  gemm.run(encoder, out, a, b, batch_shape, a_batch_strides, b_batch_strides);
 }
 void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -136,6 +158,29 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);
  /////////////////////////////////////////////////////////////////////////////
  // Dispatch to GEMM with epilogue or AddMM
  if (beta_ == 1 && a.dtype() != complex64 && c.strides(-1) == 1 &&
      c.data_size() == out.shape(-1)) {
    out.set_data(allocator::malloc(out.nbytes()));
    gemm_and_bias(
        encoder,
        M,
        N,
        K,
        a_transposed,
        lda,
        b_transposed,
        ldb,
        out,
        a,
        b,
        c,
        alpha_);
    return;
  }
  int64_t ldc;
  {
    auto stx = c.strides()[c.ndim() - 2];
@@ -177,7 +222,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  }
  /////////////////////////////////////////////////////////////////////////////
-  // Invoke cublasLt
+  // Invoke cublasLt with AddMM settings
  CublasGemm gemm(
      cu::device(s.device),
--- a/mlx/backend/cuda/primitives.cpp
+++ b/mlx/backend/cuda/primitives.cpp
@@ -24,8 +24,6 @@ namespace mlx::core {
  }
 NO_GPU(BlockMaskedMM)
 NO_GPU(DynamicSlice)
 NO_GPU(DynamicSliceUpdate)
 NO_GPU(FFT)
 NO_GPU(GatherMM)
 NO_GPU(GatherQMM)
@@ -42,7 +40,6 @@ NO_GPU_MULTI(Eig)
 NO_GPU_MULTI(Eigh)
 namespace distributed {
 NO_GPU_MULTI(AllReduce)
 NO_GPU_MULTI(AllGather)
 NO_GPU_MULTI(Send)
 NO_GPU_MULTI(Recv)
--- a/mlx/backend/cuda/quantized/affine_quantize.cu
+++ b/mlx/backend/cuda/quantized/affine_quantize.cu
@@ -306,7 +306,7 @@ void affine_dequantize(
  enc.set_input_array(scales);
  enc.set_input_array(biases);
  enc.set_output_array(w);
-  dispatch_float_types(w.dtype(), "affine_quantize", [&](auto type_tag) {
+  dispatch_float_types(w.dtype(), "affine_dequantize", [&](auto type_tag) {
    dispatch_groups(group_size_, [&](auto group_size) {
      dispatch_bits(bits_, [&](auto bits) {
        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
--- a/mlx/backend/cuda/quantized/convert_fp8.cu
+++ b/mlx/backend/cuda/quantized/convert_fp8.cu
@@ -0,0 +1,19 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/cuda/unary/unary.cuh"
 #include "mlx/fast_primitives.h"
 namespace mlx::core {
 // GPU evaluation of ConvertFP8: runs the cu::ToFP8 unary op when to_fp8_ is
 // set and the cu::FromFP8 unary op otherwise, writing into outputs[0] on the
 // stream of that output's primitive.
 void fast::ConvertFP8::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("ConvertFP8::eval_gpu");
  // unary_op_gpu receives the whole |inputs| vector, so no separate reference
  // to inputs[0] is needed here.
  auto& out = outputs[0];
  auto& s = out.primitive().stream();
  if (to_fp8_) {
    unary_op_gpu<cu::ToFP8>(inputs, out, name(), s);
  } else {
    unary_op_gpu<cu::FromFP8>(inputs, out, name(), s);
  }
 }
 } // namespace mlx::core
--- a/mlx/backend/cuda/quantized/cuda_fp4.h
+++ b/mlx/backend/cuda/quantized/cuda_fp4.h
@@ -0,0 +1,83 @@
 #pragma once
 // Software stand-in for an 8-bit exponent-only (E8M0) value. The stored byte
 // is a power-of-two exponent with a bias of 127; 0xFF is reserved for NaN.
 struct __nv_fp8_e8m0 {
  // Encodes x as the nearest power of two (rounding log2(x) to the nearest
  // integer and clamping the exponent to [-127, 127]).
  //  - non-finite x (NaN or +/-inf) -> 0xFF
  //  - x <= 0                       -> 0x00 (the smallest exponent)
  __device__ __nv_fp8_e8m0(float x) {
    if (!std::isfinite(x)) {
      __x = 0xFF;
      return;
    }
    // Zero (including -0.0f) is handled here as well: log2f(0) is -inf and
    // converting -inf to int is undefined in C++, so never reach that path.
    if (x <= 0.0f) {
      __x = 0x00;
      return;
    }
    float le = std::log2f(x);
    int n = static_cast<int>(std::nearbyintf(le));
    n = n < -127 ? -127 : n;
    n = n > 127 ? 127 : n;
    __x = static_cast<uint8_t>(n + 127);
  }
  // Decodes to 2^(__x - 127), or a quiet NaN for the 0xFF encoding.
  __device__ operator float() {
    if (__x == 0xFF) {
      return std::numeric_limits<float>::quiet_NaN();
    }
    return std::ldexp(1.0f, static_cast<int>(__x) - 127);
  }
  // Raw biased exponent byte.
  uint8_t __x{0};
 };
 // Software stand-in for a 4-bit E2M1 float. Bit 3 of the stored nibble is the
 // sign; bits 0-2 select one of the magnitudes {0, 0.5, 1, 1.5, 2, 3, 4, 6}.
 struct __nv_fp4_e2m1 {
  // Encodes x as the closest representable magnitude. A value exactly halfway
  // between two magnitudes goes to the one with the even code. NaN maps to
  // 0x7, and any magnitude above 5 saturates to the largest code.
  __device__ __nv_fp4_e2m1(float x) {
    if (std::isnan(x)) {
      __x = 0x7;
      return;
    }
    const bool negative = std::signbit(x);
    const float mag = std::abs(x);
    // Walk the rounding boundaries from the smallest magnitude upwards.
    uint8_t code;
    if (mag <= 0.25f) {
      code = 0x0;
    } else if (mag < 0.75f) {
      code = 0x1;
    } else if (mag <= 1.25f) {
      code = 0x2;
    } else if (mag < 1.75f) {
      code = 0x3;
    } else if (mag <= 2.5f) {
      code = 0x4;
    } else if (mag < 3.5f) {
      code = 0x5;
    } else if (mag <= 5.0f) {
      code = 0x6;
    } else {
      code = 0x7;
    }
    __x = static_cast<uint8_t>(code | (negative ? 0x8 : 0x0));
  }
  // Decodes the nibble through a 16-entry table indexed by the raw code.
  __device__ operator float() {
    static const float LUT[16] = {
        0.0f,  0.5f,  1.0f,  1.5f,  2.0f,  3.0f,  4.0f,  6.0f,
        -0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f};
    return LUT[__x];
  }
  // Raw 4-bit code, stored in the low nibble.
  uint8_t __x{0};
 };
--- a/mlx/backend/cuda/quantized/fp_quantize.cu
+++ b/mlx/backend/cuda/quantized/fp_quantize.cu
@@ -0,0 +1,216 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/backend/cuda/quantized/quantized.h"
 #include "mlx/dtype_utils.h"
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 #include <cuda_fp4.h>
 #include <cuda_fp8.h>
 namespace mlx::core {
 namespace cu {
 // Functor that converts a float to its raw storage byte: the FP8 E4M3
 // encoding when bits == 8, the FP4 E2M1 encoding for any other bit width.
 template <int bits>
 struct Quantize {
  __device__ uint8_t operator()(float x) {
    if constexpr (bits != 8) {
      return __nv_fp4_e2m1(x).__x;
    } else {
      return __nv_fp8_e4m3(x).__x;
    }
  }
 };
 // Functor that reinterprets a raw storage byte as FP8 E4M3 (bits == 8) or
 // FP4 E2M1 (any other bit width) and converts it to float.
 template <int bits>
 struct Dequantize {
  __device__ float operator()(uint8_t x) {
    if constexpr (bits != 8) {
      return static_cast<float>(*reinterpret_cast<__nv_fp4_e2m1*>(&x));
    } else {
      return static_cast<float>(*reinterpret_cast<__nv_fp8_e4m3*>(&x));
    }
  }
 };
 namespace cg = cooperative_groups;
 // Quantizes `size` elements of `w` to FP8 (bits == 8) or FP4 (bits == 4)
 // codes, one input element per thread. Every `group_size` consecutive
 // elements share one scale: the group's maximum absolute value divided by 6
 // (4-bit) or 448 (8-bit), rounded to an E8M0 byte when use_mx_scale is set
 // and to an E4M3 byte otherwise. The scale byte is written to `scales` and
 // the codes to `out`; 4-bit codes are packed two per byte, low nibble first.
 template <typename T, int group_size, int bits, bool use_mx_scale>
 __global__ void
 fp_quantize(const T* w, uint8_t* out, uint8_t* scales, size_t size) {
  auto block_size = cg::this_thread_block().dim_threads();
  auto block_idx = cg::this_thread_block().group_index();
  auto idx_in_block = cg::this_thread_block().thread_index();
  auto tidx = block_idx.x * block_size.x + idx_in_block.x;
  auto tidy = block_idx.y * block_size.y + idx_in_block.y;
  // Total number of threads along x over the whole grid. This is the row
  // pitch needed to linearize (tidx, tidy) when a 2D grid is launched, so it
  // must use the block size rather than the index of the current block.
  auto grid_dim_x = size_t(cg::this_grid().dim_blocks().x) * block_size.x;
  size_t index = tidx + grid_dim_x * size_t(tidy);
  if (index >= size) {
    return;
  }
  float w_thread = w[index];
  // Maximum absolute value over the tile of group_size threads.
  cg::greater<float> max_op;
  auto warp = cg::tiled_partition<group_size>(cg::this_thread_block());
  float scale = cg::reduce(warp, abs(w_thread), max_op);
  scale /= bits == 4 ? 6.0f : 448.0f;
  // Convert to mx scale or nv scale
  using ScaleType =
      std::conditional_t<use_mx_scale, __nv_fp8_e8m0, __nv_fp8_e4m3>;
  auto s = ScaleType(scale);
  uint8_t q_scale = s.__x;
  // Quantize against the rounded scale so dequantization reproduces it.
  scale = float(s);
  // Write out the scales
  size_t gindex = index / group_size;
  if (index % group_size == 0) {
    scales[gindex] = q_scale;
  }
  // A zero scale means the whole group is zero; avoid dividing by it.
  uint8_t output = Quantize<bits>{}(scale == 0 ? 0.0f : w_thread / scale);
  if (bits == 4) {
    // Fetch the next thread's nibble and place it in the high half.
    uint8_t sval = warp.shfl_down(output, 1);
    output |= sval << bits;
  }
  // Only the first thread of each pack stores the packed byte.
  constexpr int pack_factor = bits == 8 ? 1 : 2;
  if (index % pack_factor == 0) {
    out[index / pack_factor] = output;
  }
 }
 // Expands packed FP4/FP8 codes in `w` to values of type T in `out`. Each
 // thread reads one byte of `w` (one 8-bit code, or two 4-bit codes with the
 // low nibble first), decodes it, and multiplies by the scale of the group of
 // `group_size` output elements it belongs to. Scales are read from `scales`
 // as E8M0 bytes when use_mx_scale is set and as E4M3 bytes otherwise.
 // `size` is the number of output elements.
 template <typename T, int group_size, int bits, bool use_mx_scale>
 __global__ void
 fp_dequantize(const uint8_t* w, const uint8_t* scales, T* out, size_t size) {
  auto block_size = cg::this_thread_block().dim_threads();
  auto block_idx = cg::this_thread_block().group_index();
  auto idx_in_block = cg::this_thread_block().thread_index();
  auto tidx = block_idx.x * block_size.x + idx_in_block.x;
  auto tidy = block_idx.y * block_size.y + idx_in_block.y;
  // Total number of threads along x over the whole grid. This is the row
  // pitch needed to linearize (tidx, tidy) when a 2D grid is launched, so it
  // must use the block size rather than the index of the current block.
  auto grid_dim_x = size_t(cg::this_grid().dim_blocks().x) * block_size.x;
  constexpr int pack_factor = bits == 8 ? 1 : 2;
  // `offset` indexes packed input bytes; `oindex` the first output element.
  size_t offset = tidx + grid_dim_x * size_t(tidy);
  size_t oindex = offset * pack_factor;
  if (oindex >= size) {
    return;
  }
  size_t gindex = oindex / group_size;
  using ScaleType =
      std::conditional_t<use_mx_scale, __nv_fp8_e8m0, __nv_fp8_e4m3>;
  auto scale = float(((ScaleType*)(scales))[gindex]);
  out += oindex;
  uint val = w[offset];
 #pragma clang loop unroll(full)
  for (int i = 0; i < pack_factor; i++) {
    uint8_t d;
    if (bits == 4) {
      // Select the i-th nibble of the packed byte.
      d = (val >> (bits * i)) & 0x0f;
    } else if (bits == 8) {
      d = val;
    }
    out[i] = static_cast<T>(scale * Dequantize<bits>{}(d));
  }
 }
 } // namespace cu
 // Host-side launcher for the cu::fp_quantize kernel. Registers `w` as input
 // and `wq` / `scales` as outputs on the encoder, picks the kernel
 // instantiation from `bits` and `group_size`, and launches one thread per
 // element of `w`. Throws std::runtime_error for float64 input.
 void fp_quantize(
    const array& w,
    array& wq,
    array& scales,
    int group_size,
    int bits,
    cu::CommandEncoder& enc,
    const Stream& s) {
  enc.set_input_array(w);
  enc.set_output_array(wq);
  enc.set_output_array(scales);
  dispatch_float_types(w.dtype(), "fp_quantize", [&](auto type_tag) {
    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    if constexpr (std::is_same_v<T, double>) {
      throw std::runtime_error(
          "[Quantize::eval_gpu] Can not quantize input with type float64.");
    } else {
      // 8-bit always uses groups of 32 with an mx scale; 4-bit uses groups
      // of 16 with an nv scale when requested, groups of 32 with an mx scale
      // otherwise.
      auto kernel = cu::fp_quantize<T, 32, 8, true>;
      if (bits != 8) {
        if (group_size == 16) {
          kernel = cu::fp_quantize<T, 16, 4, false>;
        } else {
          kernel = cu::fp_quantize<T, 32, 4, true>;
        }
      }
      bool large = w.size() > UINT_MAX;
      auto [num_blocks, block_dims] =
          get_launch_args(w.size(), w.shape(), w.strides(), large);
      enc.add_kernel_node(
          kernel,
          num_blocks,
          block_dims,
          0,
          w.data<T>(),
          wq.data<uint8_t>(),
          scales.data<uint8_t>(),
          w.size());
    }
  });
 }
 // Host-side launcher for the cu::fp_dequantize kernel. Registers `wq` and
 // `scales` as inputs and `w` as output on the encoder, picks the kernel
 // instantiation from `bits` and `group_size`, and launches one thread per
 // packed byte (w.size() / (8 / bits) threads). Throws std::runtime_error
 // when the output type is float64.
 void fp_dequantize(
    const array& wq,
    const array& scales,
    array& w,
    int group_size,
    int bits,
    cu::CommandEncoder& enc,
    const Stream& s) {
  constexpr int uint8_per_uint32 = 4;
  // Number of quantized values stored in each byte.
  int packs_per_int = 8 / bits;
  size_t size = w.size() / packs_per_int;
  bool large = size > UINT_MAX;
  auto grid_shape = w.shape();
  grid_shape.back() *= uint8_per_uint32;
  enc.set_input_array(wq);
  enc.set_input_array(scales);
  enc.set_output_array(w);
  dispatch_float_types(w.dtype(), "fp_dequantize", [&](auto type_tag) {
    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    if constexpr (!std::is_same_v<T, double>) {
      auto kernel = cu::fp_dequantize<T, 32, 4, true>;
      if (bits == 8) {
        kernel = cu::fp_dequantize<T, 32, 8, true>;
      } else if (group_size == 16) {
        kernel = cu::fp_dequantize<T, 16, 4, false>;
      }
      auto [num_blocks, block_dims] =
          get_launch_args(size, grid_shape, w.strides(), large);
      // The kernel reads scales as raw bytes (const uint8_t*), matching what
      // fp_quantize writes, so pass the byte pointer rather than a T pointer.
      enc.add_kernel_node(
          kernel,
          num_blocks,
          block_dims,
          0,
          wq.data<uint8_t>(),
          scales.data<uint8_t>(),
          w.data<T>(),
          w.size());
    } else {
      throw std::runtime_error(
          "[Quantize::eval_gpu] Can not dequantize to output with type float64.");
    }
  });
 }
 } // namespace mlx::core
--- a/mlx/backend/cuda/quantized/quantized.cpp
+++ b/mlx/backend/cuda/quantized/quantized.cpp
@@ -46,10 +46,10 @@ inline array ensure_row_contiguous_matrix(
 } // namespace
-void fast::AffineQuantize::eval_gpu(
+void fast::Quantize::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
-  nvtx3::scoped_range r("AffineQuantize::eval_gpu");
+  nvtx3::scoped_range r("Quantize::eval_gpu");
  auto& s = stream();
  auto& d = cu::device(s.device);
  auto& enc = d.get_command_encoder(s);
@@ -57,23 +57,30 @@ void fast::AffineQuantize::eval_gpu(
  if (dequantize_) {
    auto wq = ensure_row_contiguous(inputs[0], enc, s);
    auto scales = ensure_row_contiguous(inputs[1], enc, s);
    auto biases = ensure_row_contiguous(inputs[2], enc, s);
    auto& w = outputs[0];
    w.set_data(allocator::malloc(w.nbytes()));
-    affine_dequantize(wq, scales, biases, w, group_size_, bits_, enc, s);
+    if (mode_ == QuantizationMode::Affine) {
      auto biases = ensure_row_contiguous(inputs[2], enc, s);
      affine_dequantize(wq, scales, biases, w, group_size_, bits_, enc, s);
    } else {
      fp_dequantize(wq, scales, w, group_size_, bits_, enc, s);
    }
  } else {
    auto w = ensure_row_contiguous(inputs[0], enc, s);
    auto& wq = outputs[0];
    auto& scales = outputs[1];
    auto& biases = outputs[2];
    wq.set_data(allocator::malloc(wq.nbytes()));
    scales.set_data(allocator::malloc(scales.nbytes()));
-    biases.set_data(allocator::malloc(biases.nbytes()));
+    if (mode_ == QuantizationMode::Affine) {
-
+      auto& biases = outputs[2];
-    affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
+      biases.set_data(allocator::malloc(biases.nbytes()));
      affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
    } else {
      fp_quantize(w, wq, scales, group_size_, bits_, enc, s);
    }
  }
 }
--- a/mlx/backend/cuda/quantized/quantized.h
+++ b/mlx/backend/cuda/quantized/quantized.h
@@ -24,4 +24,22 @@ void affine_dequantize(
    cu::CommandEncoder& enc,
    const Stream& s);
 // Quantizes `w` to FP4/FP8 codes on the GPU. Packed codes are written to
 // `wq` and one scale byte per group to `scales`; the kernel variant is
 // chosen from `group_size` and `bits`, and the launch is recorded on `enc`.
 void fp_quantize(
    const array& w,
    array& wq,
    array& scales,
    int group_size,
    int bits,
    cu::CommandEncoder& enc,
    const Stream& s);
 // Dequantizes the packed FP4/FP8 codes in `wq` on the GPU, using the
 // per-group scale bytes in `scales`, and writes the result to `w`; the
 // kernel variant is chosen from `group_size` and `bits`, and the launch is
 // recorded on `enc`.
 void fp_dequantize(
    const array& wq,
    const array& scales,
    array& w,
    int group_size,
    int bits,
    cu::CommandEncoder& enc,
    const Stream& s);
 } // namespace mlx::core
--- a/mlx/backend/cuda/reduce/col_reduce.cu
+++ b/mlx/backend/cuda/reduce/col_reduce.cu
@@ -181,6 +181,47 @@ col_reduce_looped(T* in, U* out, const __grid_constant__ ColReduceArgs args) {
  }
 }
 // Column reduction in which each thread produces N_READS consecutive output
 // elements. The thread loads N_READS inputs at a time, steps through the
 // reduced axis args.reduction_size times with a stride of
 // args.reduction_stride, folds every load into per-element accumulators
 // with Op, and stores the accumulators with a single vector store. `total`
 // is the number of output elements.
 template <typename T, typename U, typename Op, int N_READS = 4>
 __global__ void col_reduce_small(
    const T* in,
    U* out,
    const __grid_constant__ ColReduceArgs args,
    size_t total) {
  Op op;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  // First output element handled by this thread, split into the part before
  // and the part after the reduced axis.
  const auto first_out = grid.thread_rank() * N_READS;
  const auto outer = first_out / args.reduction_stride;
  const auto inner = first_out % args.reduction_stride;
  const auto first_in =
      outer * args.reduction_stride * args.reduction_size + inner;
  if (first_out >= total) {
    return;
  }
  const T* src = in + first_in;
  U* dst = out + first_out;
  AlignedVector<U, N_READS> acc;
  for (int lane = 0; lane < N_READS; lane++) {
    acc[lane] = ReduceInit<Op, T>::value();
  }
  for (int step = 0; step < args.reduction_size;
       step++, src += args.reduction_stride) {
    auto chunk = load_vector<N_READS>(src, 0);
    for (int lane = 0; lane < N_READS; lane++) {
      acc[lane] = op(acc[lane], cast_to<U>(chunk[lane]));
    }
  }
  store_vector(dst, 0, acc);
 }
 } // namespace cu
 inline auto output_grid_for_col_reduce(
@@ -206,7 +247,7 @@ void col_reduce_looped(
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan,
-    cu::ColReduceArgs args) {
+    const cu::ColReduceArgs& args) {
  // Allocate data for the output using in's layout to access them as
  // contiguously as possible.
  allocate_same_layout(out, in, axes);
@@ -230,12 +271,55 @@ void col_reduce_looped(
        auto kernel =
            cu::col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS>;
        encoder.add_kernel_node(
-            kernel, grid, blocks, 0, indata, out.data<U>(), args);
+            kernel,
            grid,
            blocks,
            0,
            indata,
            out.data<U>(),
            static_cast<cu::ColReduceArgs>(args));
      });
    });
  });
 }
 // Host-side launcher for the cu::col_reduce_small kernel. Allocates `out`
 // with the same layout as `in`, registers both arrays on the encoder, and
 // launches the kernel instantiated for the input dtype and reduce op, with
 // N_READS chosen so that one load covers 16 bytes.
 void col_reduce_small(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan,
    const cu::ColReduceArgs& args) {
  // Give the output the layout of the input so that accesses stay as
  // contiguous as possible.
  allocate_same_layout(out, in, axes);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  dispatch_all_types(in.dtype(), [&](auto type_tag) {
    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      using OP = MLX_GET_TYPE(reduce_type_tag);
      using U = typename cu::ReduceResult<OP, T>::type;
      constexpr int N_READS = 16 / sizeof(T);
      // Derive the launch shape from the output's 2D grid dimensions.
      auto out_dims = get_2d_grid_dims(out.shape(), out.strides());
      auto [grid, block] = get_grid_and_block(out_dims.x, out_dims.y, 1);
      encoder.add_kernel_node(
          cu::col_reduce_small<T, U, OP, N_READS>,
          grid,
          block,
          0,
          in.data<T>(),
          out.data<U>(),
          static_cast<cu::ColReduceArgs>(args),
          out.size());
    });
  });
 }
 void col_reduce(
    cu::CommandEncoder& encoder,
    const array& in,
@@ -258,6 +342,13 @@ void col_reduce(
  // Make the args struct to help route to the best kernel
  cu::ColReduceArgs args(in, plan, axes);
  // Small col reduce with a single or contiguous reduction axis
  if (args.non_col_reductions == 1 && args.reduction_size <= 32 &&
      args.reduction_stride % (16 / in.itemsize()) == 0) {
    col_reduce_small(encoder, in, out, reduce_type, axes, plan, args);
    return;
  }
  // Fallback col reduce
  col_reduce_looped(encoder, in, out, reduce_type, axes, plan, args);
 }
--- a/mlx/backend/cuda/reduce/row_reduce.cu
+++ b/mlx/backend/cuda/reduce/row_reduce.cu
@@ -7,8 +7,6 @@
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_reduce.cuh>
 namespace mlx::core {
@@ -83,7 +81,8 @@ struct RowReduceArgs {
 };
 template <typename T, typename U, typename ReduceOp, int N = 4, int M = 1>
-__global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
+__global__ void
 row_reduce_simple(const T* in, U* out, size_t n_rows, int size) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);
@@ -91,8 +90,8 @@ __global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
  const U init = cu::ReduceInit<ReduceOp, T>::value();
  ReduceOp op;
-  T vals[M][N];
+  AlignedVector<T, N> vals[M];
-  U accs[M];
+  AlignedVector<U, M> accs;
  for (int i = 0; i < M; i++) {
    accs[i] = init;
  }
@@ -101,43 +100,31 @@ __global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
      min(n_rows - M, static_cast<size_t>(grid.block_rank() * M));
  const size_t full_blocks = size / (block.size() * N);
  const size_t final_offset = full_blocks * (block.size() * N);
-  in += start_row * size;
+  in += start_row * size + block.thread_rank() * N;
  out += start_row;
-  if (size % N == 0) {
+  for (size_t r = 0; r < full_blocks; r++) {
-    for (size_t r = 0; r < full_blocks; r++) {
+    for (int k = 0; k < M; k++) {
-      for (int k = 0; k < M; k++) {
+      vals[k] = load_vector<N>(in + k * size, 0);
-        cub::LoadDirectBlockedVectorized<T, N>(
+    }
-            block.thread_rank(),
+    for (int k = 0; k < M; k++) {
-            in + k * size + r * (block.size() * N),
+      for (int j = 0; j < N; j++) {
-            vals[k]);
+        accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
        for (int j = 0; j < N; j++) {
          accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
        }
      }
    }
  } else {
    for (size_t r = 0; r < full_blocks; r++) {
      for (int k = 0; k < M; k++) {
        cub::LoadDirectBlocked(
            block.thread_rank(),
            in + k * size + r * (block.size() * N),
            vals[k]);
        for (int j = 0; j < N; j++) {
          accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
        }
      }
    }
    in += block.size() * N;
  }
  if (final_offset < size) {
    for (int k = 0; k < M; k++) {
-      cub::LoadDirectBlocked(
+      for (int i = 0; i < N; i++) {
-          block.thread_rank(),
+        vals[k][i] = ((final_offset + block.thread_rank() * N + i) < size)
-          in + k * size + final_offset,
+            ? in[k * size + i]
-          vals[k],
+            : cast_to<T>(init);
-          size,
+      }
-          cast_to<T>(init));
+    }
    for (int k = 0; k < M; k++) {
      for (int j = 0; j < N; j++) {
        accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
      }
@@ -145,13 +132,11 @@ __global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
  }
  __shared__ U shared_accumulators[32 * M];
-  block_reduce(block, warp, accs, shared_accumulators, op, init);
+  block_reduce(block, warp, accs.val, shared_accumulators, op, init);
  if (block.thread_rank() == 0) {
    if (grid.block_rank() * M + M <= n_rows) {
-      for (int i = 0; i < M; i++) {
+      store_vector(out, 0, accs);
        out[i] = accs[i];
      }
    } else {
      short offset = grid.block_rank() * M + M - n_rows;
      for (int i = offset; i < M; i++) {
@@ -161,17 +146,10 @@ __global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
  }
 }
-template <
+template <typename T, typename U, typename Op, int NDIM, int N_READS = 4>
    typename T,
    typename U,
    typename Op,
    int NDIM,
    int BLOCK_DIM,
    int N_READS = 4>
 __global__ void row_reduce_looped(
-    T* in,
+    const T* in,
    U* out,
    size_t out_size,
    const __grid_constant__ RowReduceArgs args) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
@@ -185,36 +163,60 @@ __global__ void row_reduce_looped(
  U init = ReduceInit<Op, T>::value();
  total[0] = init;
  LoopedElemToLoc<NDIM, (NDIM > 2)> loop(args.reduce_ndim);
-  size_t full_blocks = args.row_size / (BLOCK_DIM * N_READS);
+  const size_t full_blocks = args.row_size / (block.size() * N_READS);
-  size_t final_offset = full_blocks * BLOCK_DIM * N_READS;
+  const size_t final_offset = full_blocks * (block.size() * N_READS);
  in += elem_to_loc(out_idx, args.shape.data(), args.strides.data(), args.ndim);
  in += block.thread_rank() * N_READS;
-  for (size_t n = 0; n < args.non_row_reductions; n++) {
+  // Unaligned reduce
-    for (size_t r = 0; r < full_blocks; r++) {
+  if (final_offset < args.row_size) {
-      T vals[N_READS];
+    bool mask[N_READS];
-      cub::LoadDirectBlockedVectorized<T, N_READS>(
+    for (int i = 0; i < N_READS; i++) {
-          block.thread_rank(),
+      mask[i] =
-          in + loop.location() + r * BLOCK_DIM * N_READS,
+          (final_offset + block.thread_rank() * N_READS + i) < args.row_size;
          vals);
      for (int i = 0; i < N_READS; i++) {
        total[0] = op(total[0], cast_to<U>(vals[i]));
      }
    }
-    if (final_offset < args.row_size) {
+
-      T vals[N_READS];
+    for (size_t n = 0; n < args.non_row_reductions; n++) {
-      cub::LoadDirectBlocked(
+      const T* inlocal = in + loop.location();
-          block.thread_rank(),
+
-          in + loop.location() + final_offset,
+      for (size_t r = 0; r < full_blocks; r++) {
-          vals,
+        auto vals = load_vector<N_READS>(inlocal, 0);
-          args.row_size - final_offset,
+        for (int i = 0; i < N_READS; i++) {
-          cast_to<T>(init));
+          total[0] = op(total[0], cast_to<U>(vals[i]));
-      for (int i = 0; i < N_READS; i++) {
+        }
-        total[0] = op(total[0], cast_to<U>(vals[i]));
+        inlocal += block.size() * N_READS;
      }
      {
        T vals[N_READS];
        for (int i = 0; i < N_READS; i++) {
          vals[i] = mask[i] ? inlocal[i] : cast_to<T>(init);
        }
        for (int i = 0; i < N_READS; i++) {
          total[0] = op(total[0], cast_to<U>(vals[i]));
        }
      }
      loop.next(args.reduce_shape.data(), args.reduce_strides.data());
    }
  }
  // Aligned case
  else {
    for (size_t n = 0; n < args.non_row_reductions; n++) {
      const T* inlocal = in + loop.location();
      for (size_t r = 0; r < full_blocks; r++) {
        auto vals = load_vector<N_READS>(inlocal, 0);
        for (int i = 0; i < N_READS; i++) {
          total[0] = op(total[0], cast_to<U>(vals[i]));
        }
        inlocal += block.size() * N_READS;
      }
      loop.next(args.reduce_shape.data(), args.reduce_strides.data());
    }
    // TODO: Maybe block.sync() here?
    loop.next(args.reduce_shape.data(), args.reduce_strides.data());
  }
  __shared__ U shared_accumulators[32];
@@ -234,8 +236,6 @@ void row_reduce_simple(
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan) {
  constexpr int N_READS = 8;
  // Allocate data for the output using in's layout to avoid elem_to_loc in the
  // kernel.
  allocate_same_layout(out, in, axes);
@@ -250,14 +250,15 @@ void row_reduce_simple(
      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      using U = typename cu::ReduceResult<OP, T>::type;
-      // Cub doesn't like const pointers for vectorized loads. (sigh)
+      constexpr int N_READS = 16 / sizeof(T);
      T* indata = const_cast<T*>(in.data<T>());
      // Calculate the grid and block dims
      size_t reductions = (plan.shape.back() + N_READS - 1) / N_READS;
      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
-      int threads = std::min(1024UL, reductions);
+      int warps = (reductions + WARP_SIZE - 1) / WARP_SIZE;
-      threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+      warps /= 4;
      warps = std::max(std::min(warps, 32), 1);
      int threads = warps * WARP_SIZE;
      dim3 block(threads, 1, 1);
      // Pick the kernel
@@ -267,6 +268,7 @@ void row_reduce_simple(
        kernel = cu::row_reduce_simple<T, U, OP, N_READS, 2>;
      }
      T* indata = const_cast<T*>(in.data<T>());
      int size = plan.shape.back();
      encoder.add_kernel_node(
          kernel, grid, block, 0, indata, out.data<U>(), out.size(), size);
@@ -282,8 +284,6 @@ void row_reduce_looped(
    const std::vector<int>& axes,
    const ReductionPlan& plan,
    cu::RowReduceArgs args) {
  constexpr int N_READS = 8;
  // Allocate data for the output using in's layout to access them as
  // contiguously as possible.
  allocate_same_layout(out, in, axes);
@@ -295,34 +295,27 @@ void row_reduce_looped(
      using OP = MLX_GET_TYPE(reduce_type_tag);
      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      using U = typename cu::ReduceResult<OP, T>::type;
-      // Cub doesn't like const pointers for vectorized loads. (sigh)
+
-      T* indata = const_cast<T*>(in.data<T>());
+      constexpr int N_READS = 16 / sizeof(T);
      // Calculate the grid and block dims
      args.sort_access_pattern(in, axes);
      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
      size_t reductions = (args.row_size + N_READS - 1) / N_READS;
-      int threads = std::min(1024UL, reductions);
+      int warps = (reductions + WARP_SIZE - 1) / WARP_SIZE;
-      threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+      warps /= 4;
      warps = std::max(std::min(warps, 32), 1);
      int threads = warps * WARP_SIZE;
      dim3 block(threads, 1, 1);
      // Pick the kernel
-      auto kernel = cu::row_reduce_looped<T, U, OP, 1, 32, N_READS>;
+      auto kernel = cu::row_reduce_looped<T, U, OP, 1, N_READS>;
      dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
-        dispatch_block_dim(threads, [&](auto threads_constant) {
+        kernel = cu::row_reduce_looped<T, U, OP, reduce_ndim.value, N_READS>;
          kernel = cu::row_reduce_looped<
              T,
              U,
              OP,
              reduce_ndim.value,
              threads_constant.value,
              N_READS>;
          block.x = threads_constant.value;
        });
      });
      encoder.add_kernel_node(
-          kernel, grid, block, 0, indata, out.data<U>(), out.size(), args);
+          kernel, grid, block, 0, in.data<T>(), out.data<U>(), args);
    });
  });
 }
--- a/mlx/backend/cuda/rope.cu
+++ b/mlx/backend/cuda/rope.cu
@@ -103,15 +103,21 @@ template <typename T, bool traditional, bool forward, int N = 4>
 __device__ void rope_impl(
    const T* in,
    T* out,
-    int offset,
+    const int* offset,
    float inv_freq,
    float scale,
    const cuda::std::array<int64_t, 3> strides,
    const cuda::std::array<int64_t, 3> out_strides,
-    int64_t n_batch,
+    int64_t offset_stride,
    int n_head,
    uint3 pos,
    uint3 dims) {
-  float L = scale * static_cast<float>(pos.y + offset);
+  auto n_head_up = N * ((n_head + N - 1) / N);
  auto head_idx = static_cast<int>((pos.z * N) % n_head_up);
  auto batch_idx = (pos.z * N) / n_head_up;
  auto batch_offset = offset[batch_idx * offset_stride];
  float L = scale * static_cast<float>(pos.y + batch_offset);
  auto mat_idx = batch_idx * n_head + head_idx;
  // Compute costheta, sintheta
  float theta = L * inv_freq;
@@ -123,20 +129,19 @@ __device__ void rope_impl(
  size_t out_index_1, out_index_2;
  if (traditional) {
    out_index_1 = 2 * pos.x * out_strides[2] + pos.y * out_strides[1] +
-        N * pos.z * out_strides[0];
+        mat_idx * out_strides[0];
    out_index_2 = out_index_1 + 1;
    in_index_1 =
-        2 * pos.x * strides[2] + pos.y * strides[1] + N * pos.z * strides[0];
+        2 * pos.x * strides[2] + pos.y * strides[1] + mat_idx * strides[0];
    in_index_2 = in_index_1 + strides[2];
  } else {
    out_index_1 = pos.x * out_strides[2] + pos.y * out_strides[1] +
-        N * pos.z * out_strides[0];
+        mat_idx * out_strides[0];
    out_index_2 = out_index_1 + dims.x * out_strides[2];
-    in_index_1 =
+    in_index_1 = pos.x * strides[2] + pos.y * strides[1] + mat_idx * strides[0];
        pos.x * strides[2] + pos.y * strides[1] + N * pos.z * strides[0];
    in_index_2 = in_index_1 + dims.x * strides[2];
  }
-  for (int i = 0; i < N && pos.z * N + i < n_batch; ++i) {
+  for (int i = 0; i < N && head_idx + i < n_head; ++i) {
    // Read and write the output
    float x1 = static_cast<float>(in[in_index_1]);
    float x2 = static_cast<float>(in[in_index_2]);
@@ -167,7 +172,8 @@ __global__ void rope(
    float base,
    const __grid_constant__ cuda::std::array<int64_t, 3> strides,
    const __grid_constant__ cuda::std::array<int64_t, 3> out_strides,
-    int64_t n_batch,
+    int64_t offset_stride,
    int n_head,
    uint3 dims) {
  uint3 pos = make_uint3(
      blockIdx.x * blockDim.x + threadIdx.x,
@@ -182,12 +188,13 @@ __global__ void rope(
  rope_impl<T, traditional, forward>(
      in,
      out,
-      *offset,
+      offset,
      inv_freq,
      scale,
      strides,
      out_strides,
-      n_batch,
+      offset_stride,
      n_head,
      pos,
      dims);
 }
@@ -202,7 +209,8 @@ __global__ void rope_freqs(
    float base,
    const __grid_constant__ cuda::std::array<int64_t, 3> strides,
    const __grid_constant__ cuda::std::array<int64_t, 3> out_strides,
-    int64_t n_batch,
+    int64_t offset_stride,
    int n_head,
    uint3 dims,
    int64_t freq_stride) {
  uint3 pos = make_uint3(
@@ -217,12 +225,13 @@ __global__ void rope_freqs(
  rope_impl<T, traditional, forward>(
      in,
      out,
-      *offset,
+      offset,
      inv_freq,
      scale,
      strides,
      out_strides,
-      n_batch,
+      offset_stride,
      n_head,
      pos,
      dims);
 }
@@ -245,23 +254,28 @@ void RoPE::eval_gpu(
  auto& offset = inputs[1];
  auto& out = outputs[0];
  if (in.ndim() < 3) {
    throw std::runtime_error("[RoPE] Input must have at least 3 dimensions");
  }
  cuda::std::array<int64_t, 3> strides;
  cuda::std::array<int64_t, 3> out_strides;
  bool donated = false;
  int ndim = in.ndim();
-  int dispatch_ndim = in.ndim();
+
  int B = in.shape(0);
  int T = in.shape(-2);
  int D = in.shape(-1);
  size_t mat_size = T * D;
  int dispatch_ndim = ndim;
  while (in.shape(-dispatch_ndim) == 1 && dispatch_ndim > 3) {
    dispatch_ndim--;
  }
-  size_t mat_size = in.shape(-2) * in.shape(-1);
+
  int N = 1;
  for (int i = 1; i < (ndim - 2); ++i) {
    N *= in.shape(i);
  }
  // We apply rope to less than the whole vector so copy to output and then
  // apply in-place.
-  if (dims_ < in.shape(-1)) {
+  if (dims_ < D) {
    donated = true;
    auto ctype =
        (in.flags().row_contiguous) ? CopyType::Vector : CopyType::General;
@@ -302,7 +316,7 @@ void RoPE::eval_gpu(
  out_strides[2] = out.strides()[ndim - 1];
  // Some flags to help us dispatch below
-  bool single = in.flags().row_contiguous && (mat_size == in.shape(-1));
+  bool single = in.flags().row_contiguous && B == 1 && T == 1;
  bool with_freqs = inputs.size() == 3;
  auto& encoder = cu::get_command_encoder(s);
@@ -319,7 +333,7 @@ void RoPE::eval_gpu(
        if (single && !with_freqs) {
          auto kernel =
              cu::rope_single<DataType, traditional.value, forward.value>;
-          uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
+          uint2 dims = make_uint2(dims_ / 2, N);
          auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
          encoder.add_kernel_node(
              kernel,
@@ -336,7 +350,7 @@ void RoPE::eval_gpu(
        } else if (single) {
          auto kernel =
              cu::rope_single_freqs<DataType, traditional.value, forward.value>;
-          uint2 dims = make_uint2(dims_ / 2, in.size() / mat_size);
+          uint2 dims = make_uint2(dims_ / 2, N);
          auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
          encoder.add_kernel_node(
              kernel,
@@ -354,10 +368,14 @@ void RoPE::eval_gpu(
        } else if (with_freqs) {
          auto kernel =
              cu::rope_freqs<DataType, traditional.value, forward.value>;
-          uint3 dims =
+          int n_per_thread = 4;
-              make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
+          uint32_t dimz = B * ((N + n_per_thread - 1) / n_per_thread);
-          dims.z = (dims.z + 3) / 4;
+          uint3 dims = make_uint3(dims_ / 2, T, dimz);
          auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
          int64_t offset_stride = 0;
          if (inputs[1].ndim() > 0) {
            offset_stride = inputs[1].strides()[0];
          }
          encoder.add_kernel_node(
              kernel,
              grid,
@@ -371,15 +389,20 @@ void RoPE::eval_gpu(
              std::log2(base_),
              strides,
              out_strides,
-              in.size() / mat_size,
+              offset_stride,
              N,
              dims,
              inputs[2].strides(0));
        } else {
          auto kernel = cu::rope<DataType, traditional.value, forward.value>;
-          uint3 dims =
+          int n_per_thread = 4;
-              make_uint3(dims_ / 2, in.shape(-2), in.size() / mat_size);
+          uint32_t dimz = B * ((N + n_per_thread - 1) / n_per_thread);
-          dims.z = (dims.z + 3) / 4;
+          uint3 dims = make_uint3(dims_ / 2, T, dimz);
          auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
          int64_t offset_stride = 0;
          if (inputs[1].ndim() > 0) {
            offset_stride = inputs[1].strides()[0];
          }
          encoder.add_kernel_node(
              kernel,
              grid,
@@ -392,7 +415,8 @@ void RoPE::eval_gpu(
              std::log2(base_),
              strides,
              out_strides,
-              in.size() / mat_size,
+              offset_stride,
              N,
              dims);
        }
      });
--- a/mlx/backend/cuda/scaled_dot_product_attention.cu
+++ b/mlx/backend/cuda/scaled_dot_product_attention.cu
@@ -4,7 +4,6 @@
 #include "mlx/backend/cuda/device/config.h"
 #include "mlx/backend/cuda/device/utils.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/backend/cuda/lru_cache.h"
 #include "mlx/backend/gpu/copy.h"
 #include "mlx/dtype_utils.h"
 #include "mlx/fast_primitives.h"
@@ -46,6 +45,7 @@ __global__ void kernel_sdpav_1pass(
    const T* K,
    const T* V,
    T* O,
    const T* sinks,
    __grid_constant__ const AttnParams params) {
  constexpr int BN = 32;
  constexpr int BD = 32;
@@ -65,7 +65,7 @@ __global__ void kernel_sdpav_1pass(
  __shared__ U max_scores[BN];
  __shared__ U sum_exp_scores[BN];
-  const U scale_log2 = params.scale * 1.44269504089f;
+  const U scale_log2 = params.scale * M_LOG2E;
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<32>(block);
@@ -108,8 +108,12 @@ __global__ void kernel_sdpav_1pass(
    o[i] = 0.f;
  }
-  U max_score = -INFINITY;
+  U max_score = Limits<U>::finite_min();
  U sum_exp_score = 0.f;
  if (sinks && warp_idx == 0) {
    max_score = M_LOG2E * static_cast<U>(sinks[head_idx]);
    sum_exp_score = 1.f;
  }
  // For each key
  for (int i = kv_seq_idx; i < params.kL; i += BN) {
@@ -167,7 +171,7 @@ __global__ void kernel_sdpav_1pass(
  U factor = exp2f(max_score - new_max);
  sum_exp_score =
      cg::reduce(warp, sum_exp_scores[lane_idx] * factor, cg::plus<U>());
-  sum_exp_score = __frcp_rn(sum_exp_score);
+  sum_exp_score = sum_exp_score == 0 ? 0 : __frcp_rn(sum_exp_score);
  // Now we need to aggregate all the outputs
  PRAGMA_LOOP_UNROLL
@@ -193,6 +197,7 @@ __global__ void kernel_sdpav_2pass_1(
    const T* Q,
    const T* K,
    const T* V,
    const T* sinks,
    float* partials,
    float* sums,
    float* maxs,
@@ -268,8 +273,12 @@ __global__ void kernel_sdpav_2pass_1(
    o[i] = 0.f;
  }
-  U max_score = -1e9;
+  U max_score = Limits<U>::finite_min();
  U sum_exp_score = 0.f;
  if (sinks && warp_idx == 0 && block_idx == 0) {
    max_score = M_LOG2E * static_cast<U>(sinks[head_idx]);
    sum_exp_score = 1.f;
  }
  // For each key
  for (int i = kv_seq_idx; i < params.kL; i += blocks * BN) {
@@ -410,7 +419,7 @@ __global__ void kernel_sdpav_2pass_2(
  U new_max = cg::reduce(warp, max_score, cg::greater<U>());
  U factor = exp2f(max_score - new_max);
  U sum_exp_score = cg::reduce(warp, sums[lane_idx] * factor, cg::plus<U>());
-  sum_exp_score = __frcp_rn(sum_exp_score);
+  sum_exp_score = sum_exp_score == 0 ? 0 : __frcp_rn(sum_exp_score);
  PRAGMA_LOOP_UNROLL
  for (int i = 0; i < v_per_thread; i++) {
@@ -463,10 +472,14 @@ void sdpa_vector_1pass_fallback(
    const array& v,
    const float scale,
    array& o,
-    bool do_causal_ = false) {
+    bool do_causal,
    const std::optional<array>& sinks) {
  encoder.set_input_array(q);
  encoder.set_input_array(k);
  encoder.set_input_array(v);
  if (sinks) {
    encoder.set_input_array(*sinks);
  }
  encoder.set_output_array(o);
  cu::AttnParams params{
@@ -489,7 +502,7 @@ void sdpa_vector_1pass_fallback(
  dim3 block_dim(1024, 1, 1);
  dispatch_float_types(o.dtype(), "kernel_sdpav_1pass", [&](auto type_tag) {
-    dispatch_bool(do_causal_, [&](auto do_causal) {
+    dispatch_bool(do_causal, [&](auto do_causal) {
      dispatch_headdim(params.D, [&](auto headdim) {
        using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
@@ -504,6 +517,7 @@ void sdpa_vector_1pass_fallback(
            k.data<DataType>(),
            v.data<DataType>(),
            o.data<DataType>(),
            sinks ? (*sinks).data<DataType>() : nullptr,
            params);
      });
    });
@@ -518,7 +532,8 @@ void sdpa_vector_2pass_fallback(
    const array& v,
    const float scale,
    array& o,
-    bool do_causal_ = false) {
+    bool do_causal,
    const std::optional<array>& sinks) {
  cu::AttnParams params{
      /* int B = */ q.shape(0),
      /* int H = */ q.shape(1),
@@ -559,7 +574,7 @@ void sdpa_vector_2pass_fallback(
  encoder.add_temporary(maxs);
  dispatch_float_types(o.dtype(), "kernel_sdpav_2pass", [&](auto type_tag) {
-    dispatch_bool(do_causal_, [&](auto do_causal) {
+    dispatch_bool(do_causal, [&](auto do_causal) {
      dispatch_headdim(params.D, [&](auto headdim) {
        using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
@@ -570,6 +585,10 @@ void sdpa_vector_2pass_fallback(
          encoder.set_input_array(q);
          encoder.set_input_array(k);
          encoder.set_input_array(v);
          if (sinks) {
            encoder.set_input_array(*sinks);
          }
          encoder.set_output_array(intermediate);
          encoder.set_output_array(sums);
          encoder.set_output_array(maxs);
@@ -585,6 +604,7 @@ void sdpa_vector_2pass_fallback(
              q.data<DataType>(),
              k.data<DataType>(),
              v.data<DataType>(),
              sinks ? (*sinks).data<DataType>() : nullptr,
              intermediate.data<float>(),
              sums.data<float>(),
              maxs.data<float>(),
@@ -627,15 +647,16 @@ void sdpa_vector_fallback(
    const array& v,
    const float scale,
    array& o,
-    bool do_causal_ = false) {
+    bool do_causal,
    const std::optional<array>& sinks) {
  int kL = k.shape(2);
  if (kL > 1024) {
    return sdpa_vector_2pass_fallback(
-        s, encoder, q, k, v, scale, o, do_causal_);
+        s, encoder, q, k, v, scale, o, do_causal, sinks);
  } else {
    return sdpa_vector_1pass_fallback(
-        s, encoder, q, k, v, scale, o, do_causal_);
+        s, encoder, q, k, v, scale, o, do_causal, sinks);
  }
 }
@@ -691,7 +712,7 @@ void ScaledDotProductAttention::eval_gpu(
  // Define some copy functions to ensure the layout of the inputs is as
  // expected.
-  copies.reserve(3);
+  copies.reserve(inputs.size());
  auto copy_unless = [&copies, &s](
                         auto predicate, const array& arr) -> const array& {
    if (!predicate(arr)) {
@@ -703,6 +724,16 @@ void ScaledDotProductAttention::eval_gpu(
    }
  };
  // Checks that the headdim dimension has stride 1.
  auto is_matrix_contiguous = [](const array& arr) {
    return arr.strides(-1) == 1;
  };
  std::optional<array> sinks = std::nullopt;
  if (has_sinks_) {
    sinks = copy_unless(is_matrix_contiguous, inputs.back());
  }
  // We are in vector mode ie single query
  if (q_pre.shape(2) < 4) {
    auto q_copy_unless = [](const array& arr) {
@@ -740,10 +771,6 @@ void ScaledDotProductAttention::eval_gpu(
    const auto& k = copy_unless(kv_copy_unless, k_pre);
    const auto& v = copy_unless(kv_copy_unless, v_pre);
    for (const auto& cp : copies) {
      encoder.add_temporary(cp);
    }
    // Donate the query if possible
    if (q.is_donatable() && q.flags().row_contiguous && q.size() == o.size()) {
      o.copy_shared_buffer(q);
@@ -752,22 +779,26 @@ void ScaledDotProductAttention::eval_gpu(
      int64_t str_oH = o.shape(3);
      int64_t str_oL = o.shape(1) * str_oH;
      int64_t str_oB = o.shape(2) * str_oL;
      size_t data_size = o.shape(0) * str_oB;
      array::Flags flags{
          /* bool contiguous = */ 1,
          /* bool row_contiguous = */ o.shape(2) == 1,
-          /* bool col_contiguous = */ 0,
+          /* bool col_contiguous = */ o.size() == o.shape(3),
      };
      o.set_data(
          allocator::malloc(o.nbytes()),
-          data_size,
+          o.size(),
          {str_oB, str_oH, str_oL, str_oD},
          flags);
    }
-    return sdpa_vector_fallback(s, encoder, q, k, v, scale_, o, do_causal_);
+    for (const auto& cp : copies) {
      encoder.add_temporary(cp);
    }
    return sdpa_vector_fallback(
        s, encoder, q, k, v, scale_, o, do_causal_, sinks);
  }
  // Full attention mode should never reach here
--- a/mlx/backend/cuda/slicing.cpp
+++ b/mlx/backend/cuda/slicing.cpp
@@ -1,8 +1,11 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/common/slicing.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/jit_module.h"
 #include "mlx/backend/gpu/copy.h"
 #include "mlx/backend/gpu/slicing.h"
 #include "mlx/dtype_utils.h"
 #include <numeric>
@@ -27,8 +30,7 @@ void concatenate_gpu(
  flags.row_contiguous = false;
  flags.col_contiguous = false;
  flags.contiguous = false;
-  // TODO: Handle concurrent outputs:
+  auto concurrent = cu::get_command_encoder(s).concurrent_context();
  // https://github.com/ml-explore/mlx/pull/2145#discussion_r2070753816
  for (int i = 0; i < inputs.size(); i++) {
    array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
    size_t data_offset = strides[axis] * sizes[i];
@@ -38,4 +40,71 @@ void concatenate_gpu(
  }
 }
 // Computes, on the device, the element offset selected by `indices`:
 //   offset = sum_i indices[i] * strides[axes[i]]
 // and returns it as a one-element int64 array.
 //
 // indices: array holding one start index per entry of `axes`.
 // strides: strides of the array being sliced.
 // axes:    for each index, the axis whose stride it is multiplied with.
 // s:       stream whose command encoder receives the kernel.
 array compute_dynamic_offset(
    const array& indices,
    const Strides& strides,
    const std::vector<int>& axes,
    const Stream& s) {
  Dtype dtype = indices.dtype();
  int nidx = axes.size();
  // Module and kernel names are keyed on the index dtype and the number of
  // indices, matching the template parameters <T, NIDX> of the kernel below.
  std::string module_name =
      fmt::format("compute_dynamic_offset_{}_{}", dtype_to_string(dtype), nidx);
  std::string kernel_name = fmt::format(
      "mlx::core::cu::compute_dynamic_offset<{}, {}>",
      dtype_to_cuda_type(dtype),
      nidx);
  // The lambda supplies the kernel source for this module.
  // NOTE(review): presumably it only runs when the module is not already
  // cached by get_jit_module -- confirm.
  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
    std::string source = R"(
        #include "mlx/backend/cuda/device/utils.cuh"
        namespace mlx::core::cu {
        template <typename T, int NIDX>
        __global__ void compute_dynamic_offset(
            const T* indices,
            int64_t* offset,
            const __grid_constant__ Strides strides,
            const __grid_constant__ cuda::std::array<int, NIDX> axes) {
          int64_t acc = 0;
          #pragma unroll
          for (int i = 0; i < NIDX; ++i) {
            acc += indices[i] * strides[axes[i]];
          }
          *offset = acc;
        }
        } // namespace mlx::core::cu
    )";
    return std::make_tuple(false, std::move(source), std::vector{kernel_name});
  });
  // Prepare output.
  array offset({1}, int64, nullptr, {});
  // Reuse the indices buffer for the result when it is donatable and holds at
  // least as many bytes as one int64. The kernel reads every index into a
  // local accumulator before its single write to *offset.
  bool donate = indices.is_donatable() &&
      (indices.data_size() * indices.itemsize()) >= offset.itemsize();
  if (donate) {
    offset.copy_shared_buffer(indices);
  } else {
    offset.set_data(allocator::malloc(offset.itemsize()));
  }
  auto& encoder = cu::get_command_encoder(s);
  // Register the result as a temporary of the encoder.
  // NOTE(review): presumably this keeps the buffer alive until the queued
  // work has run -- confirm against the encoder implementation.
  encoder.add_temporary(offset);
  encoder.set_input_array(indices);
  encoder.set_output_array(offset);
  // Argument order must match the kernel signature in the source string above.
  cu::KernelArgs args;
  args.append(indices);
  args.append(offset);
  args.append_ndim(strides);
  args.append(axes);
  auto kernel = mod.get_kernel(kernel_name);
  // Launch parameters 1, 1, 0 -- the kernel body is written for a single
  // thread (it has no thread indexing).
  encoder.add_kernel_node(kernel, 1, 1, 0, args.args());
  return offset;
 }
 } // namespace mlx::core
--- a/mlx/backend/cuda/sort.cu
+++ b/mlx/backend/cuda/sort.cu
@@ -9,7 +9,7 @@
 #include <nvtx3/nvtx3.hpp>
 #include <thrust/device_ptr.h>
 #include <thrust/transform.h>
-#include <cub/device/device_segmented_sort.cuh>
+#include <cub/device/device_segmented_radix_sort.cuh>
 #include <cassert>
@@ -79,7 +79,7 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
        encoder.add_temporary(discard);
        size_t size;
-        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
+        CHECK_CUDA_ERROR(cub::DeviceSegmentedRadixSort::SortPairs(
            nullptr,
            size,
            in.data<Type>(),
@@ -90,6 +90,8 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
            in.data_size() / nsort,
            offsets,
            offsets + 1,
            0,
            sizeof(Type) * 8,
            stream));
        array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
@@ -104,7 +106,7 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
            thrust::device_pointer_cast(indices.data<uint32_t>()),
            ModOp<uint32_t>{static_cast<uint32_t>(nsort)});
-        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortPairs(
+        CHECK_CUDA_ERROR(cub::DeviceSegmentedRadixSort::SortPairs(
            temp.data<void>(),
            size,
            in.data<Type>(),
@@ -115,10 +117,12 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
            in.data_size() / nsort,
            offsets,
            offsets + 1,
            0,
            sizeof(Type) * 8,
            stream));
      } else {
        size_t size;
-        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
+        CHECK_CUDA_ERROR(cub::DeviceSegmentedRadixSort::SortKeys(
            nullptr,
            size,
            in.data<Type>(),
@@ -127,6 +131,8 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
            in.data_size() / nsort,
            offsets,
            offsets + 1,
            0,
            sizeof(Type) * 8,
            stream));
        array temp(allocator::malloc(size), {static_cast<int>(size)}, uint8);
@@ -134,7 +140,7 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
        // Start capturing after allocations
        auto capture = encoder.capture_context();
-        CHECK_CUDA_ERROR(cub::DeviceSegmentedSort::StableSortKeys(
+        CHECK_CUDA_ERROR(cub::DeviceSegmentedRadixSort::SortKeys(
            temp.data<void>(),
            size,
            in.data<Type>(),
@@ -143,6 +149,8 @@ void gpu_sort(const Stream& s, array in, array& out_, int axis, bool argsort) {
            in.data_size() / nsort,
            offsets,
            offsets + 1,
            0,
            sizeof(Type) * 8,
            stream));
      }
    } else {
--- a/mlx/backend/cuda/ternary.cu
+++ b/mlx/backend/cuda/ternary.cu
@@ -156,7 +156,25 @@ void ternary_op_gpu_inplace(
    using DType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    auto topt = get_ternary_op_type(a, b, c);
-    if (topt == TernaryOpType::General) {
+    if (topt == TernaryOpType::VectorVectorVector ||
        topt == TernaryOpType::ScalarScalarScalar) {
      dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
        constexpr int N_READS = 16 / sizeof(DType);
        auto [num_blocks, block_dims] = get_launch_args(
            out.data_size(), out.shape(), out.strides(), large(), N_READS);
        encoder.add_kernel_node(
            cu::ternary_v<Op, DType, IdxT, N_READS>,
            num_blocks,
            block_dims,
            0,
            a.data<bool>(),
            b.data<DType>(),
            c.data<DType>(),
            out.data<DType>(),
            out.data_size());
      });
    } else {
      dispatch_bool(
          a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
              c.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
@@ -225,23 +243,6 @@ void ternary_op_gpu_inplace(
                  ndim);
            }
          });
    } else {
      dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
        constexpr int N_READS = 16 / sizeof(DType);
        auto [num_blocks, block_dims] = get_launch_args(
            out.data_size(), out.shape(), out.strides(), large(), N_READS);
        encoder.add_kernel_node(
            cu::ternary_v<Op, DType, IdxT, N_READS>,
            num_blocks,
            block_dims,
            0,
            a.data<bool>(),
            b.data<DType>(),
            c.data<DType>(),
            out.data<DType>(),
            out.data_size());
      });
    }
  });
 }
--- a/mlx/backend/cuda/unary.cu
+++ b/mlx/backend/cuda/unary.cu
@@ -1,284 +0,0 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/common/unary.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/device/unary_ops.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"
 #include <cooperative_groups.h>
 #include <nvtx3/nvtx3.hpp>
 namespace mlx::core {
 namespace cu {
 namespace cg = cooperative_groups;
 // Element-wise unary kernel over a flat range of `size` elements.
 // Each thread handles up to N_READS consecutive elements starting at
 // thread_rank * N_READS and writes Op{}(in[i]) to out[i].
 template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void unary_v(const In* in, Out* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();
  // This thread's N_READS-wide chunk would run past `size`: process the
  // remaining elements one by one (no iterations if the chunk starts at or
  // beyond `size`).
  if ((index + 1) * N_READS > size) {
    for (IdxT i = index * N_READS; i < size; ++i) {
      out[i] = Op{}(in[i]);
    }
  } else {
    // Full chunk: load N_READS inputs at once, apply Op per lane, and store
    // the N_READS results at once.
    auto in_vec = load_vector<N_READS>(in, index);
    AlignedVector<Out, N_READS> out_vec;
 #pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      out_vec[i] = Op{}(in_vec[i]);
    }
    store_vector<N_READS>(out, index, out_vec);
  }
 }
 // Element-wise unary kernel for inputs addressed through shape/strides.
 // The y launch dimension selects a "row" (index over all dimensions except
 // the last, `size_rest` rows in total); the x launch dimension selects a
 // group of N_READS elements along the last dimension.
 // Output is written at out + shape_x * row, i.e. with rows of length shape_x
 // laid out back to back.
 template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void unary_g(
    const In* in,
    Out* out,
    IdxT size_rest,
    const __grid_constant__ Shape shape,
    const __grid_constant__ Strides strides,
    int ndim) {
  auto block = cg::this_thread_block();
  auto grid = cg::this_grid();
  IdxT index_rest =
      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
  // Threads past the last row have nothing to do.
  if (index_rest >= size_rest) {
    return;
  }
  // Extent and stride of the innermost dimension.
  auto shape_x = shape[ndim - 1];
  auto stride_x = strides[ndim - 1];
  IdxT index_x =
      grid.block_index().x * block.dim_threads().x + block.thread_index().x;
  // Input location of the first element of this row.
  auto idx =
      elem_to_loc(index_rest * shape_x, shape.data(), strides.data(), ndim);
  // NOTE(review): In(0) is presumably the fill value for lanes that fall
  // beyond shape_x -- confirm against load_vector.
  auto in_vec =
      load_vector<N_READS>(in + idx, index_x, shape_x, stride_x, In(0));
  AlignedVector<Out, N_READS> out_vec;
 #pragma unroll
  for (int i = 0; i < N_READS; ++i) {
    out_vec[i] = Op{}(in_vec[i]);
  }
  store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
 }
 // Compile-time table of which (Op, In, Out) combinations have a kernel.
 // Returns true when the unary op `Op` may be instantiated with input type
 // `In` and output type `Out`; any op not listed below is unsupported.
 template <typename Op, typename In, typename Out>
 constexpr bool supports_unary_op() {
  // Ops that only require matching input and output types.
  if (std::is_same_v<Op, Abs> || std::is_same_v<Op, Negative> ||
      std::is_same_v<Op, Sign> || std::is_same_v<Op, Square>) {
    return std::is_same_v<In, Out>;
  }
  // Ops restricted to floating types (same type in and out).
  if (std::is_same_v<Op, ArcCosh> || std::is_same_v<Op, ArcSinh> ||
      std::is_same_v<Op, ArcTanh> || std::is_same_v<Op, Erf> ||
      std::is_same_v<Op, ErfInv> || std::is_same_v<Op, Expm1> ||
      std::is_same_v<Op, Sigmoid>) {
    return std::is_same_v<In, Out> && is_floating_v<In>;
  }
  // Bitwise invert: integral types only, excluding bool.
  if (std::is_same_v<Op, BitwiseInvert>) {
    return std::is_same_v<In, Out> && std::is_integral_v<In> &&
        !std::is_same_v<In, bool>;
  }
  // Ceil / Floor: any non-complex type.
  if (std::is_same_v<Op, Ceil> || std::is_same_v<Op, Floor>) {
    return std::is_same_v<In, Out> && !mlx::core::is_complex_v<In>;
  }
  // Conjugate: complex types only.
  if (std::is_same_v<Op, Conjugate>) {
    return std::is_same_v<In, Out> && mlx::core::is_complex_v<In>;
  }
  // Ops restricted to inexact types (same type in and out).
  if (std::is_same_v<Op, ArcCos> || std::is_same_v<Op, ArcSin> ||
      std::is_same_v<Op, ArcTan> || std::is_same_v<Op, Cos> ||
      std::is_same_v<Op, Cosh> || std::is_same_v<Op, Exp> ||
      std::is_same_v<Op, Log> || std::is_same_v<Op, Log2> ||
      std::is_same_v<Op, Log10> || std::is_same_v<Op, Log1p> ||
      std::is_same_v<Op, Round> || std::is_same_v<Op, Rsqrt> ||
      std::is_same_v<Op, Sqrt> || std::is_same_v<Op, Sin> ||
      std::is_same_v<Op, Sinh> || std::is_same_v<Op, Tan> ||
      std::is_same_v<Op, Tanh>) {
    return std::is_same_v<In, Out> && is_inexact_v<In>;
  }
  // Imag / Real: complex input, float output.
  if (std::is_same_v<Op, Imag> || std::is_same_v<Op, Real>) {
    return mlx::core::is_complex_v<In> && std::is_same_v<Out, float>;
  }
  // LogicalNot: bool in, bool out.
  if (std::is_same_v<Op, LogicalNot>) {
    return std::is_same_v<In, Out> && std::is_same_v<In, bool>;
  }
  return false;
 }
 } // namespace cu
 // Encodes the kernel for unary op `Op` reading inputs[0] and writing `out`.
 // This function does not set up `out`'s data itself.
 //
 // inputs: only inputs[0] is read.
 // out:    destination array.
 // op:     op name, used only in the error message.
 // s:      stream whose command encoder receives the kernel.
 //
 // Throws std::runtime_error when supports_unary_op rejects the
 // (Op, input dtype, output dtype) combination.
 template <typename Op>
 void unary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  auto& in = inputs[0];
  // Nothing to launch for an empty input.
  if (in.size() == 0) {
    return;
  }
  bool contig = in.flags().contiguous;
  // `large` selects 64-bit indexing. The threshold differs because the
  // contiguous kernel uses an unsigned 32-bit index while the strided kernel
  // uses a signed one (see the IdxT aliases below).
  bool large;
  if (!contig) {
    large = in.data_size() > INT32_MAX || out.size() > INT32_MAX;
  } else {
    large = in.data_size() > UINT32_MAX;
  }
  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  // Turn the runtime dtypes into compile-time types; unsupported pairs fall
  // into the throwing branch at the bottom.
  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
      if constexpr (cu::supports_unary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
        dispatch_bool(large, [&](auto large) {
          using InType = cuda_type_t<CTYPE_IN>;
          using OutType = cuda_type_t<CTYPE_OUT>;
          if (contig) {
            // Contiguous input: flat kernel over out.data_size() elements.
            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
            // Each thread produces 16 bytes worth of output elements.
            constexpr int N_READS = 16 / sizeof(OutType);
            auto [num_blocks, block_dims] = get_launch_args(
                out.data_size(), out.shape(), out.strides(), large, N_READS);
            encoder.add_kernel_node(
                cu::unary_v<Op, InType, OutType, IdxT, N_READS>,
                num_blocks,
                block_dims,
                0,
                in.data<InType>(),
                out.data<OutType>(),
                out.data_size());
          } else {
            // Non-contiguous input: strided kernel on a 2D launch, with x
            // over the innermost dimension and y over everything else.
            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
            auto [shape, strides] = collapse_contiguous_dims(in);
            auto ndim = shape.size();
            int work_per_thread = 1;
            auto kernel = cu::unary_g<Op, InType, OutType, IdxT, 1>;
            auto dim0 = ndim > 0 ? shape.back() : 1;
            auto rest = out.size() / dim0;
            // Process 4 elements per thread when the innermost dimension has
            // at least 4 elements.
            if (dim0 >= 4) {
              kernel = cu::unary_g<Op, InType, OutType, IdxT, 4>;
              work_per_thread = 4;
            }
            // Number of threads needed along x (ceiling division).
            dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
            auto block_dims = get_block_dims(dim0, rest, 1);
            uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
            uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
            encoder.add_kernel_node(
                kernel,
                {num_blocks_x, num_blocks_y},
                block_dims,
                0,
                in.data<InType>(),
                out.data<OutType>(),
                rest,
                const_param(shape),
                const_param(strides),
                ndim);
          }
        });
      } else {
        throw std::runtime_error(fmt::format(
            "Can not do unary op {} on input of {} with output of {}.",
            op,
            dtype_to_string(in.dtype()),
            dtype_to_string(out.dtype())));
      }
    });
  });
 }
 template <typename Op>
 void unary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  set_unary_output_data(inputs[0], out);
  unary_op_gpu_inplace<Op>(inputs, out, op, s);
 }
 // Defines `func::eval_gpu` for a unary primitive whose device functor is
 // `cu::func`: opens an NVTX range named "<func>::eval_gpu", takes the stream
 // from the output's primitive, and forwards to unary_op_gpu with name() as
 // the op label.
 #define UNARY_GPU(func)                                               \
  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
    nvtx3::scoped_range r(#func "::eval_gpu");                        \
    auto& s = out.primitive().stream();                               \
    unary_op_gpu<cu::func>(inputs, out, name(), s);                   \
  }
 // Primitives that map one-to-one onto a device functor. Log, Round and Sqrt
 // are not listed here; they have hand-written eval_gpu definitions below.
 UNARY_GPU(Abs)
 UNARY_GPU(ArcCos)
 UNARY_GPU(ArcCosh)
 UNARY_GPU(ArcSin)
 UNARY_GPU(ArcSinh)
 UNARY_GPU(ArcTan)
 UNARY_GPU(ArcTanh)
 UNARY_GPU(BitwiseInvert)
 UNARY_GPU(Ceil)
 UNARY_GPU(Conjugate)
 UNARY_GPU(Cos)
 UNARY_GPU(Cosh)
 UNARY_GPU(Erf)
 UNARY_GPU(ErfInv)
 UNARY_GPU(Exp)
 UNARY_GPU(Expm1)
 UNARY_GPU(Floor)
 UNARY_GPU(Imag)
 UNARY_GPU(Log1p)
 UNARY_GPU(LogicalNot)
 UNARY_GPU(Negative)
 UNARY_GPU(Real)
 UNARY_GPU(Sigmoid)
 UNARY_GPU(Sign)
 UNARY_GPU(Sin)
 UNARY_GPU(Sinh)
 UNARY_GPU(Square)
 UNARY_GPU(Tan)
 UNARY_GPU(Tanh)
 // GPU evaluation of Log: picks the device functor that matches the
 // primitive's logarithm base (e, 2 or 10) and runs it as a unary op.
 void Log::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Log::eval_gpu");
  auto& gpu_stream = out.primitive().stream();
  if (base_ == Base::e) {
    // Natural logarithm.
    unary_op_gpu<cu::Log>(inputs, out, name(), gpu_stream);
  } else if (base_ == Base::two) {
    unary_op_gpu<cu::Log2>(inputs, out, name(), gpu_stream);
  } else if (base_ == Base::ten) {
    unary_op_gpu<cu::Log10>(inputs, out, name(), gpu_stream);
  }
 }
 // GPU evaluation of Round. Inexact inputs go through the Round kernel;
 // every other dtype is passed through by sharing the input buffer.
 void Round::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Round::eval_gpu");
  assert(inputs.size() == 1);
  const auto& src = inputs[0];
  auto& gpu_stream = out.primitive().stream();
  if (!issubdtype(src.dtype(), inexact)) {
    // No-op integer types
    out.copy_shared_buffer(src);
    return;
  }
  unary_op_gpu<cu::Round>(inputs, out, name(), gpu_stream);
 }
 // GPU evaluation of Sqrt. The same primitive covers both the square root and
 // its reciprocal: when recip_ is set the Rsqrt functor is used, otherwise
 // the Sqrt functor. The op label passed along is only used for error
 // reporting.
 void Sqrt::eval_gpu(const std::vector<array>& inputs, array& out) {
  // The range label must name this primitive so profiler traces attribute
  // the work correctly (it previously read "Sort::eval_gpu").
  nvtx3::scoped_range r("Sqrt::eval_gpu");
  auto& s = out.primitive().stream();
  if (recip_) {
    unary_op_gpu<cu::Rsqrt>(inputs, out, "Rsqrt", s);
  } else {
    unary_op_gpu<cu::Sqrt>(inputs, out, "Sqrt", s);
  }
 }
 } // namespace mlx::core
--- a/mlx/backend/cuda/unary/unary.cuh
+++ b/mlx/backend/cuda/unary/unary.cuh
@@ -108,6 +108,12 @@ constexpr bool supports_unary_op() {
  if (std::is_same_v<Op, LogicalNot>) {
    return std::is_same_v<In, Out> && std::is_same_v<In, bool>;
  }
  if (std::is_same_v<Op, ToFP8>) {
    return std::is_same_v<Out, uint8_t> && is_floating_v<In>;
  }
  if (std::is_same_v<Op, FromFP8>) {
    return std::is_same_v<In, uint8_t> && is_floating_v<Out>;
  }
  return false;
 }
--- a/mlx/backend/cuda/utils.h
+++ b/mlx/backend/cuda/utils.h
@@ -1,6 +1,6 @@
 // Copyright © 2025 Apple Inc.
-// This file include utilies that are used by C++ code (i.e. .cpp files).
+// This file includes utilities that are used by C++ code (i.e. .cpp files).
 #pragma once
@@ -12,6 +12,7 @@ namespace mlx::core {
 namespace cu {
 class Device;
 }
 struct Dtype;
@@ -86,4 +87,17 @@ class CudaStream : public CudaHandle<cudaStream_t, cudaStreamDestroy> {
  explicit CudaStream(cu::Device& device);
 };
 // Returns the block size reported by the CUDA occupancy API for `kernel`.
 // Driver-API handles (CUfunction) are queried through
 // cuOccupancyMaxPotentialBlockSize; any other kernel type goes through
 // cudaOccupancyMaxPotentialBlockSize. The minimum grid size that the API
 // also reports is discarded.
 template <typename T>
 inline uint max_occupancy_block_dim(T kernel) {
  int _;
  int block_dim;
  if constexpr (!std::is_same_v<T, CUfunction>) {
    // Runtime-API kernel.
    CHECK_CUDA_ERROR(
        cudaOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel));
  } else {
    // Driver-API function handle.
    CHECK_CUDA_ERROR(
        cuOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel, 0, 0, 0));
  }
  return block_dim;
 }
 } // namespace mlx::core
--- a/mlx/backend/cuda/worker.cpp
+++ b/mlx/backend/cuda/worker.cpp
@@ -5,8 +5,9 @@
 namespace mlx::core::cu {
-Worker::Worker()
+Worker::Worker(Device& d)
-    : signal_stream_(device(mlx::core::Device::gpu)),
+    : signal_stream_(d),
      signal_event_(d, cudaEventDisableTiming | cudaEventBlockingSync),
      worker_(&Worker::thread_fn, this) {}
 Worker::~Worker() {
--- a/mlx/backend/cuda/worker.h
+++ b/mlx/backend/cuda/worker.h
@@ -3,7 +3,6 @@
 #pragma once
 #include "mlx/backend/cuda/event.h"
 #include "mlx/backend/cuda/utils.h"
 #include <condition_variable>
 #include <functional>
@@ -16,7 +15,7 @@ namespace mlx::core::cu {
 // Run tasks in worker thread, synchronized with cuda stream.
 class Worker {
 public:
-  Worker();
+  explicit Worker(Device& d);
  ~Worker();
  Worker(const Worker&) = delete;
--- a/mlx/backend/gpu/copy.h
+++ b/mlx/backend/gpu/copy.h
@@ -20,8 +20,8 @@ void copy_gpu_inplace(
    int64_t o_offset,
    CopyType ctype,
    const Stream& s,
-    const std::optional<array>& dynamic_i_offset = std::nullopt,
+    std::optional<array> dynamic_i_offset = std::nullopt,
-    const std::optional<array>& dynamic_o_offset = std::nullopt);
+    std::optional<array> dynamic_o_offset = std::nullopt);
 void copy_gpu(const array& src, array& out, CopyType ctype, const Stream& s);
 void copy_gpu(const array& src, array& out, CopyType ctype);
--- a/mlx/backend/gpu/primitives.cpp
+++ b/mlx/backend/gpu/primitives.cpp
@@ -80,6 +80,74 @@ void Depends::eval_gpu(
  eval(inputs, outputs);
 }
 void DynamicSlice::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("DynamicSlice::eval_gpu");
  if (out.size() == 0) {
    out.set_data(nullptr);
    return;
  }
  auto& in = inputs[0];
  auto& start = inputs[1];
  out.set_data(allocator::malloc(out.nbytes()));
  auto s = stream();
  auto in_offset = compute_dynamic_offset(start, in.strides(), axes_, s);
  copy_gpu_inplace(
      /* const array& src = */ in,
      /* array& dst = */ out,
      /* const Shape& data_shape = */ out.shape(),
      /* const Strides& i_strides = */ in.strides(),
      /* const Strides& o_strides = */ out.strides(),
      /* int64_t i_offset = */ 0,
      /* int64_t o_offset = */ 0,
      /* CopyType ctype = */ CopyType::GeneralGeneral,
      /* const Stream& s = */ s,
      /* std::optional<array> dynamic_i_offset = */ std::move(in_offset),
      /* std::optional<array> dynamic_o_offset = */ std::nullopt);
 }
 void DynamicSliceUpdate::eval_gpu(
    const std::vector<array>& inputs,
    array& out) {
  MLX_PROFILER_RANGE("DynamicSliceUpdate::eval_gpu");
  if (out.size() == 0) {
    out.set_data(nullptr);
    return;
  }
  auto& in = inputs[0];
  auto& upd = inputs[1];
  auto& start_indices = inputs[2];
  if (upd.size() == 0) {
    out.copy_shared_buffer(in);
    return;
  }
  // Copy or donate input to output
  auto s = stream();
  auto ctype = in.flags().contiguous && in.size() == in.data_size()
      ? CopyType::Vector
      : CopyType::General;
  copy_gpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, s);
  auto out_offset =
      compute_dynamic_offset(start_indices, out.strides(), axes_, s);
  copy_gpu_inplace(
      /* const array& src = */ upd,
      /* array& dst = */ out,
      /* const Shape& data_shape = */ upd.shape(),
      /* const Strides& i_strides = */ upd.strides(),
      /* const Strides& o_strides = */ out.strides(),
      /* int64_t i_offset = */ 0,
      /* int64_t o_offset = */ 0,
      /* CopyType ctype = */ CopyType::GeneralGeneral,
      /* const Stream& s = */ s,
      /* std::optional<array> dynamic_i_offset = */ std::nullopt,
      /* std::optional<array> dynamic_o_offset = */ std::move(out_offset));
 }
 void ExpandDims::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("ExpandDims::eval_gpu");
  eval(inputs, out);
--- a/mlx/backend/gpu/slicing.h
+++ b/mlx/backend/gpu/slicing.h
@@ -27,4 +27,10 @@ void pad_gpu(
    const Shape& low_pad_size,
    const Stream& s);
 // Returns an array holding the element offset described by `indices`:
 // each index is paired with the stride of the corresponding entry in `axes`.
 // The work is issued on stream `s`.
 array compute_dynamic_offset(
    const array& indices,
    const Strides& strides,
    const std::vector<int>& axes,
    const Stream& s);
 } // namespace mlx::core
--- a/mlx/backend/metal/CMakeLists.txt
+++ b/mlx/backend/metal/CMakeLists.txt
@@ -29,14 +29,15 @@ make_jit_source(
  kernels/bf16_math.h
  kernels/complex.h
  kernels/defines.h)
-make_jit_source(unary_ops kernels/erf.h kernels/expm1f.h)
+make_jit_source(unary_ops kernels/erf.h kernels/expm1f.h kernels/fp8.h)
 make_jit_source(binary_ops)
 make_jit_source(ternary_ops)
 make_jit_source(reduce_utils kernels/atomic.h kernels/reduction/ops.h)
-make_jit_source(scatter kernels/indexing.h)
+make_jit_source(indexing/scatter kernels/indexing/indexing.h)
-make_jit_source(gather kernels/indexing.h)
+make_jit_source(indexing/gather kernels/indexing/indexing.h)
-make_jit_source(gather_axis)
+make_jit_source(indexing/gather_front kernels/indexing/indexing.h)
-make_jit_source(scatter_axis)
+make_jit_source(indexing/gather_axis)
 make_jit_source(indexing/scatter_axis)
 make_jit_source(hadamard)
 if(MLX_METAL_JIT)
@@ -77,7 +78,11 @@ if(MLX_METAL_JIT)
  make_jit_source(steel/conv/kernels/steel_conv)
  make_jit_source(steel/conv/kernels/steel_conv_general kernels/steel/defines.h
                  kernels/steel/conv/loaders/loader_general.h)
-  make_jit_source(quantized)
+
  make_jit_source(quantized_utils)
  make_jit_source(quantized kernels/quantized_utils.h)
  make_jit_source(fp_quantized kernels/quantized_utils.h kernels/fp8.h
                  kernels/fp4.h)
  make_jit_source(gemv_masked)
 else()
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/nojit_kernels.cpp)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Awni Hannun	39b04ce638	use faster dequant for fp4 qmv (#2720 ) Some checks failed Nightly Build / build_linux_release (3.10) (push) Has been cancelled Details Nightly Build / build_linux_release (3.14) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.10) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.11) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.12) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.13) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.14) (push) Has been cancelled Details Nightly Build / build_mac_release (3.10) (push) Has been cancelled Details Nightly Build / build_mac_release (3.13) (push) Has been cancelled Details Nightly Build / build_cuda_with_tests (push) Has been cancelled Details Nightly Build / build_cuda_release (push) Has been cancelled Details	2025-10-31 11:49:59 -07:00
Mike Drob	d9e6349657	fix docs path (#2719 )	2025-10-30 19:12:49 -05:00
Angelos Katharopoulos	b901a9f311	Fix the order of hosts in the ring (#2718 ) Some checks failed Nightly Build / build_linux_release (3.10) (push) Has been cancelled Details Nightly Build / build_linux_release (3.14) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.10) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.11) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.12) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.13) (push) Has been cancelled Details Nightly Build / build_linux_with_tests (3.14) (push) Has been cancelled Details Nightly Build / build_mac_release (3.10) (push) Has been cancelled Details Nightly Build / build_mac_release (3.13) (push) Has been cancelled Details Nightly Build / build_cuda_with_tests (push) Has been cancelled Details Nightly Build / build_cuda_release (push) Has been cancelled Details	2025-10-30 15:02:39 -07:00
Awni Hannun	68c5fa1c95	fix memory count bug (#2717 )	2025-10-30 14:27:15 -07:00
Christopher Webb	793a31eeb6	Fix missing domain_uuid_key in thunderbolt ring setup (#2682 )	2025-10-30 13:17:20 -07:00
Mike Drob	74c1ed25bb	Migrate CircleCI to GitHub Actions (#2716 ) Co-authored-by: Joseph Heck <j_heck@apple.com>	2025-10-30 12:26:55 -05:00
Awni Hannun	ec72b44417	Add quantize/dequantize for mxfp8 and nvfp4 (#2688 ) * Add quantize/dequantize slow path for mxfp8 and nvfp4 * fast cuda kernel for mx/nv quantization * fallback for cuda < 12.8 (#2697) * format (#2700) * fix (#2701) * metal kernels * docs * fix jit * add default bits and group sizes * improve quant docs * fix output type of mxfp4 matmuls	2025-10-28 16:23:12 -07:00
Melissa Kilby	460691a0e8	fix: linux-{fedora}x86_64-build (#2707 ) Signed-off-by: Melissa Kilby <mkilby@apple.com>	2025-10-27 16:36:08 -07:00
Awni Hannun	969924cc69	Fp8 conversion (#2686 ) * add fp8 e4m3 converters * add cuda * default saturate to min/max * fix for older OS * fix no gpu/cpu * fix saturate * fix compile	2025-10-27 16:35:50 -07:00
Awni Hannun	d1e06117e8	bump python (#2694 )	2025-10-27 11:34:31 -07:00
Awni Hannun	539d8322d1	add median op (#2705 )	2025-10-27 11:33:42 -07:00
Awni Hannun	c4767d110f	fix addmm cpu (#2699 )	2025-10-27 11:33:32 -07:00
David Koski	895217f25b	optionally load metallib from framework (#2702 ) * optionally load metallib from framework * pre-commit * adjust logic	2025-10-27 07:52:03 -07:00
Manuel Villanueva	0cfeeb60ca	Einsum error msg improvement (#2690 ) * Improved error message for Einsum * Modifications via pre-commit * format * nits --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-10-27 06:31:47 -07:00
Ronan Collobert	8f8af61a37	fix warnings showing up with -Wall (#2692 )	2025-10-24 11:43:35 -07:00
Manuel Villanueva	233384161e	Improved mx.split() docs (#2689 ) * Improved mx.split() documentation * Fix typo in docstring for array split function * add example --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-10-24 09:48:41 -07:00
Awni Hannun	5bcf3a6794	format	2025-10-22 16:08:47 -07:00
wickedcoder	7707196297	Merge commit from fork * add length validation to the header * fix accessing out of bound index with .at()	2025-10-22 15:31:25 -07:00
wickedcoder	7e3471c987	Merge commit from fork * add tensor->weights_data validation * add null pointer check for tensor	2025-10-22 15:31:03 -07:00
Awni Hannun	9f0ba3ddf1	patch bump (#2680 )	2025-10-17 12:12:07 -07:00
Awni Hannun	4bce5f9b2d	suppress gcc 10.1 warnings (#2679 ) * suppress gcc 10.1 warnings * suppress gcc 10.1 warnings	2025-10-17 12:09:21 -07:00
Anastasiia Filippova	e9eab527eb	Nccl timeout (#2673 ) * print the error & delete nccl group * timeout for nccl binding * typo * revert error * fixed a typo	2025-10-14 12:29:54 -07:00
Awni Hannun	36ca62dba8	remove unused unary file (#2672 )	2025-10-13 19:36:26 -07:00
Manuel Villanueva	9cbb1b0148	Modified sort behavior when running CPU or Metal to match NumPy/JAX (#2667 ) * Modified sort behavior when running CPU or Metal to match NumPy/JAX sorting behavior. * Modified sort behavior when running CPU or Metal to match NumPy/JAX * nits --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-10-13 14:36:45 -07:00
Fabrizio Milo	9bfc476d72	Normalize README bullet formatting (#2671 )	2025-10-13 12:13:30 -07:00
Awni Hannun	25e2356316	speed up scalars (#2669 )	2025-10-13 12:10:15 -07:00
Awni Hannun	226a1d24e0	Debug cuda conv (#2662 ) * use t4 * use t4	2025-10-10 16:12:47 -07:00
Awni Hannun	630350ad3e	Precise sigmoid (#2659 ) * bump patch * Sigmoid matches PyTorch and is more precise on tails	2025-10-10 10:05:23 -07:00
Awni Hannun	380aeb58ae	enable admm low-precision cpu (#2661 )	2025-10-10 09:50:54 -07:00
Awni Hannun	f37389d100	bump patch (#2658 )	2025-10-10 08:36:41 -07:00
Awni Hannun	e89e8b4272	Export with callback (#2612 ) * export with callback * export with callback * Add types, fix kwarg ordering bug + test * cleanup, test, fix * typos	2025-10-08 19:24:33 -07:00
AN Long	85a8824a8c	Fix cumulative operations when axis=None (#2653 )	2025-10-08 15:25:38 -07:00
Awni Hannun	f5d4397e5c	Fix fast synch when fence is waited before a command buffer is created (#2657 )	2025-10-08 11:23:46 -07:00
Awni Hannun	343e33b6d5	fix all_gather vjp (#2654 )	2025-10-07 06:05:23 -07:00
Angelos Katharopoulos	0073096dd1	Split name into directories for cuda jit (#2656 )	2025-10-07 01:52:58 -07:00
Angelos Katharopoulos	e3d004fed9	Fix and refactor row-reduce (#2650 )	2025-10-07 01:51:08 -07:00
Awni Hannun	a393435d28	Speed up compile for node with many parents (#2649 )	2025-10-03 19:30:36 -07:00
Awni Hannun	a7a94b29d7	Fix compile when outputs change (#2648 )	2025-10-03 08:40:57 -07:00
Daniel Yeh	22a5da76c8	Faster complex matmul (#2571 )	2025-10-02 23:33:15 -07:00
Andrey Portnoy	287c63a093	Configure CMake to export compile_commands.json (#2645 ) This helps enable LSP for code navigation using clangd.	2025-10-02 15:40:32 -07:00
Awni Hannun	1c9ae1eaa1	cuda fix flaky test (#2646 )	2025-10-02 15:40:04 -07:00
Angelos Katharopoulos	c2c3e0b0a2	[CUDA] Add a small column specialization to reduce (#2642 )	2025-10-02 14:41:05 -07:00
Awni Hannun	b0cc71ae71	Faster triu, tril, where with scalar (#2644 )	2025-10-02 12:21:27 -07:00
Awni Hannun	e88f2d4a8e	fix cross entropy axis param (#2641 ) * fix cross entropy axis param * faster grad clipping	2025-10-01 16:49:55 -07:00
Angelos Katharopoulos	9cee557423	Fix status message (#2638 )	2025-10-01 16:43:45 -07:00
Awni Hannun	bbf1423953	wait for tasks in cuda (#2636 )	2025-09-30 16:08:46 -07:00
Angelos Katharopoulos	eb24267b56	Compile now can attach arbitrary data to an entry (#2634 )	2025-09-30 13:33:27 -07:00
Awni Hannun	dc371ae7a5	fix for max block dim (#2631 )	2025-09-29 08:59:25 -07:00
AN Long	e76a8dd5c5	Fix incorrect path and typos (#2630 )	2025-09-28 06:03:04 -07:00
Cheng	b466dea982	[CUDA] Make CudaEvent work with multi-device (#2614 ) * Set current device when creating cuda event * Separate cuda events by device * Avoid race condition in pool	2025-09-27 11:27:17 +09:00
Angelos Katharopoulos	7a6adda1e6	Bump the version (#2627 )	2025-09-26 15:15:28 -07:00
Angelos Katharopoulos	1a9f820af6	Compiled should not end in broadcast (#2622 )	2025-09-26 13:36:09 -07:00
Awni Hannun	d4f4ff3c5e	Allow None input to compiled functions (#2621 ) * Allow None input to compiled functions * Allow None input to compiled functions	2025-09-25 08:42:23 -07:00
Jagrit Digani	7c7e48dbd1	New tuning for small K gemv (#2620 ) * New tuning for small K gemv	2025-09-23 12:28:35 -07:00
Daniel Yeh	fbbf3b9b3e	Support pickling array for bfloat16 (#2586 ) * add bfloat16 pickling * Improvements * improve --------- Co-authored-by: Chen-Chen Yeh <ge96noj@mytum.de>	2025-09-22 20:12:15 -07:00
Daniel Yeh	bf01ad9367	fix (#2613 ) Co-authored-by: Chen-Chen Yeh <ge96noj@mytum.de>	2025-09-22 20:12:04 -07:00
Cheng	ae438d05fa	[CUDA] Recycle CUDA events (#2604 ) * Make CudaEvent a CudaHandle * Add caching for CudaEvent * Make sure cuda events are destroyed at last * Fix headers * SharedEvent => AtomicEvent * RawCudaEvent => CudaEventHandle, CudaEventWrapper => CopyableCudaEvent * Remove unneeded asserts	2025-09-23 10:42:03 +09:00
Awni Hannun	711a645807	avoid producing NaN in attention (#2608 )	2025-09-22 13:10:43 -07:00
Josh Bleecher Snyder	aa9d44b3d4	implement Convolution::output_shape (#2601 ) - pull conv_out_shape out for re-use - add Conv::output_shape - add e2e python tests confirming shapeless=True support and correctness Updates #2599	2025-09-22 10:09:45 -07:00
Awni Hannun	ec2ab42888	Lower sorted QMM gather threshold (#2609 )	2025-09-19 18:22:55 -07:00
Cheng	787c0d90cd	Detect cache thrashing in LRUCache (#2600 ) * Detect cache thrashing in LRUCache * Do not check cache thrashing in tests	2025-09-19 09:12:14 +09:00
Oleksandr Bilous	e8b604a6a3	fix: library loading for swift dynamic frameworks (#2568 )	2025-09-18 13:54:59 -07:00
Awni Hannun	50cc09887f	expose depends (#2606 )	2025-09-18 10:06:15 -07:00
Umberto Mignozzetti	3f730e77aa	Update export function example for array input (#2598 ) After changing the shape to conform (same shapes for all objects), the example works.	2025-09-16 14:38:05 -07:00
Awni Hannun	caecbe876a	no copy batch rope (#2595 )	2025-09-15 14:23:48 -07:00
Umberto Mignozzetti	8afb6d62f2	Fix typo in average_gradients function call (#2594 )	2025-09-15 11:29:21 -07:00
Awni Hannun	6ccfa603cd	fix metal scan (#2591 )	2025-09-15 11:01:57 -07:00
Umberto Mignozzetti	36cad99a11	Refactor code examples to use 'gelu' (#2592 ) Updated code examples to use 'gelu' directly instead of 'nn.gelu'.	2025-09-15 09:47:02 -07:00
Awni Hannun	ee18e1cbf0	patch bump (#2588 )	2025-09-11 17:10:09 -07:00
Awni Hannun	af120c2bc0	set nccl ABI version (#2587 )	2025-09-11 16:55:53 -07:00
Cheng	6a3acf2301	[CUDA] Set bias as input when using bias epilogue (#2584 )	2025-09-11 15:31:09 +09:00
Awni Hannun	d6977f2a57	Add sdpa with sinks (#2558 ) * add sdpa with sinks * fix 2 pass * fix matrix sdpa * fix perf regression * add to cuda (#2580)	2025-09-10 14:53:00 -07:00
Gökdeniz Gülmez	db5443e831	Adding Relu2 (#2582 ) * in. com. * upd. ackn. * update __init__ * nits * nits + format * used mx.maximum(x, 0) instead of calling the function and moves relu6 under relu2 to make it nicer * same with _make_activation_module * Update python/mlx/nn/layers/activations.py upd Co-authored-by: Awni Hannun <awni.hannun@gmail.com> * update funct.rst * upd. layers.rst --------- Co-authored-by: Awni Hannun <awni.hannun@gmail.com>	2025-09-10 07:24:30 -07:00
Cheng	52b8384d10	Fix flaky addmm tests (#2581 )	2025-09-10 14:22:22 +09:00
Cheng	44cc5da4bc	[CUDA] Fix alpha not respected when using bias epilogue (#2578 )	2025-09-10 09:08:01 +09:00
Cheng	dde3682b69	[CUDA] Use GEMM with epilogue instead of AddMM (#2569 )	2025-09-09 13:18:49 +09:00
Awni Hannun	17310d91a6	Add batch offsets for mx.fast.rope (#2564 ) * implement batch rope for Metal * cuda rope (#2576)	2025-09-08 17:35:07 -07:00
Cheng	b194d65a6a	Some tweaks in cmake files (#2574 ) * Do proper check of Metal lib * Update doctest to get rid of cmake version hack	2025-09-09 08:27:18 +09:00
Cheng	a44b27f5f8	Fix a few ccache cache miss (#2573 ) * Fix ccache cache miss * Do not define _VERSION_ in python bindings	2025-09-09 07:41:05 +09:00
Awni Hannun	e5a33f2223	faster depthwise 1D conv (#2567 )	2025-09-08 11:37:23 -07:00
Cheng	c1e3340b23	Set ccache size before building (#2570 )	2025-09-07 09:00:31 +09:00
XXXXRT666	8f163a367d	typing: add type hints to mlx.core.array, linalg, distributed, and random (#2565 ) * Add type annotations to mlx methods * Missing list_or_scalar	2025-09-04 09:08:11 -07:00
Manuel Villanueva	89a3df9014	Fixed several type annotations in the MLX stubs which degraded to Unknown/Any (#2560 ) * Added scalar to stubs to fix Unkown Type Hint ### Proposed changes Issue #2478 reports that several type annotations in the MLX stubs degrade to Unknown/Any in editors like VS Code with Pylance, due to missing imports (Union, Optional, Tuple) and an undefined scalar type alias. This PR updates the stub generation patterns to: • Add missing typing imports in mlx.core.__prefix__ so that Union, Optional, Tuple, etc. are always available. • Define and export scalar: TypeAlias = Union[int, float, bool] in mlx.core.__suffix__ so that functions typed with Union[scalar, array] resolve correctly instead of falling back to Any. • Update submodule stub prefixes (distributed, fast, linalg, metal, random) to import scalar alongside array, Device, and Stream, ensuring type checkers resolve the union consistently across modules. With these changes, functions like mlx.add now display rich type signatures such as: ``` def add( a: scalar \| array, b: scalar \| array, stream: Stream \| Device \| None = None ) -> array ``` instead of degrading to Any. ### Checklist • I have read the CONTRIBUTING document • I have run pre-commit run --all-files to format my code / installed pre-commit prior to committing changes • I have added tests that prove my fix is effective or that my feature works (n/a — stub generation only) • I have updated the necessary documentation (if needed) * add bool to patterns --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-09-03 12:52:08 -07:00
Krishi Saripalli	c5d2937aa5	chore: Update Docs With Slice Copy Example (#2559 ) * chore: updated docs with slice copy example * nits --------- Co-authored-by: Awni Hannun <awni@apple.com>	2025-09-02 22:07:02 -07:00
Awni Hannun	b61a65e313	fix copies in sdpa (#2563 )	2025-09-02 11:00:36 -07:00
wrmsr	04cbb4191c	Fix dequantize python sig (#2562 )	2025-09-01 11:50:20 -07:00
Artur Antonov	c5460762e7	Fix AdamW weight_decay default value in docstring (#2557 )	2025-08-31 21:29:30 -07:00
Awni Hannun	8ce49cd39e	fix quantized vjp for mxfp4 (#2555 )	2025-08-29 10:06:15 -07:00
Awni Hannun	9c68b50853	version bump (#2554 )	2025-08-29 06:54:17 -07:00
Awni Hannun	111f1e71af	Faster contiguous gather for indices in the first axis (#2552 ) * faster contiguous gather for indices in the first axis * work per thread > 1 * angelos suggestion for scales / biases	2025-08-28 21:26:30 -07:00
Awni Hannun	827003d568	fix METAL quantization in JIT (#2553 )	2025-08-28 18:26:25 -07:00
Awni Hannun	d363a76aa4	Bump xcode in circle (#2551 ) * bump xcode in circle * bump xcode in circle * bump xcode in circle	2025-08-28 13:13:34 -07:00
Awni Hannun	70560b6bd5	Add mode parameter for quantization (#2499 ) * add mode parameter for quantization * mxfp4 quantize/dequantize + start of optional biases * mxfp4 works * speedup * cpu mxfp4 * fix * fix test tol * fix * refactor * add quant mode enum	2025-08-28 06:45:26 -07:00
Awni Hannun	7ef8a6f2d5	[CUDA] fix sort (#2550 ) * [CUDA] fix sort * fix test	2025-08-27 19:48:43 -07:00
Cheng	31c6f6e33f	[CUDA] Use ConcurrentContext in concatenate_gpu (#2549 )	2025-08-28 09:30:08 +09:00
Awni Hannun	584d48458e	link with nccl (#2546 )	2025-08-27 10:01:07 -07:00
Cheng	5cf984ca87	Separate cpu compilation cache by versions (#2548 )	2025-08-27 11:25:15 +09:00
Cheng	a9bac3d9e5	Run CPP tests for CUDA build in CI (#2544 )	2025-08-27 08:06:46 +09:00
Awni Hannun	5458d43247	add load with path tests (#2543 )	2025-08-26 14:24:47 -07:00
Awni Hannun	a4dba65220	Enable cuda graph toggle (#2545 ) * enable cuda graph toggle * increase cache size	2025-08-26 12:50:38 -07:00
Awni Hannun	3dcb286baf	Remove stream from average grads so it uses default (#2532 ) * Remove stream from average grads so it uses default * comment	2025-08-25 15:56:29 -07:00
Cheng	4822c3dbe9	[CUDA] Implement DynamicSlice/DynamicSliceUpdate (#2533 ) * Move DynamicSlice to gpu/primitives * Implement compute_dynamic_offset in CUDA	2025-08-26 07:31:39 +09:00
Awni Hannun	2ca75bb529	Remove nccl install in release (#2542 )	2025-08-25 15:20:18 -07:00
Awni Hannun	db14e29a0b	allow pathlib.Path to save/load functions (#2541 )	2025-08-25 14:58:49 -07:00
Awni Hannun	d2f540f4e0	Use nccl header only when nccl is not present (#2539 ) * use nccl header only when nccl is not present * larger machine for cuda build	2025-08-25 14:17:25 -07:00
Cheng	333ffea273	[CUDA] Remove thrust in arange (#2535 )	2025-08-24 16:22:36 +09:00
Cheng	f55b6f1f2f	Enable COMPILE_WARNING_AS_ERROR for linux builds in CI (#2534 )	2025-08-24 15:33:08 +09:00
Awni Hannun	30561229c7	Fix allocation bug in NCCL (#2530 )	2025-08-22 14:39:43 -07:00
Awni Hannun	068a4612e9	nccl default for backend=any (#2528 ) * nccl default for backend=any * check num gpus + ensure row contiguous for all reduce * comment	2025-08-22 12:24:27 -07:00
Andrey Portnoy	5722c147de	[CUDA] Update calls to `cudaMemAdvise` and `cudaGraphAddDependencies` for CUDA 13 (#2525 ) * [CUDA] Update cudaMemAdvise and cudaGraphAddDependencies for CUDA 13 These functions' signatures changed in CUDA 13, so we differentiate between CUDA 13 and preceding releases at compile time. * Mention NVIDIA in ACKNOWLEDGMENTS.md	2025-08-21 19:57:20 -07:00
Cheng	f6819a1f26	Fix warning 186-D from nvcc (#2527 )	2025-08-22 10:29:55 +09:00
Awni Hannun	f93f87c802	nccl dep + default for cuda (#2526 )	2025-08-21 17:57:49 -07:00
Anastasiia Filippova	9392fc3f88	NCCL backend (#2476 )	2025-08-21 11:56:15 -07:00
Awni Hannun	e843c4d8d5	fix power (#2523 )	2025-08-21 06:46:01 -07:00