Support unaligned M

Add dynamic shared memory and improve qmm
Refactor the matmul a bit
2025-12-16 01:49:05 +08:00 · 2025-07-23 00:40:27 -07:00 · 2025-07-22 23:36:53 -07:00 · 2025-07-21 23:38:21 -07:00 · 2025-07-21 04:47:42 -07:00 · 2025-07-21 04:47:42 -07:00
63 changed files with 1569 additions and 655 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7,9 +7,6 @@ parameters:
  nightly_build:
    type: boolean
    default: false
-  test_release:
-    type: boolean
-    default: false

 jobs:
  build_documentation:
@@ -369,27 +366,22 @@ jobs:
        type: string
        default: ""
    machine:
-      image: ubuntu-2204:current
-      resource_class: large
+      image: linux-cuda-12:2024.11.1
+      resource_class: gpu.nvidia.small.gen2
    steps:
      - checkout
      - run:
          name: Build wheel
          command: |
-            export DEBIAN_FRONTEND=noninteractive
-            export NEEDRESTART_MODE=a
-            wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
-            sudo dpkg -i cuda-keyring_1.1-1_all.deb
            sudo apt-get update
-            sudo apt install cuda-toolkit-12-9
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
            sudo apt-get install zip
+            python -m venv env
+            source env/bin/activate
            pip install auditwheel
            pip install patchelf
            pip install build
            pip install twine
-            export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
-            export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
            << parameters.build_env >> MLX_BUILD_STAGE=2 \
              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
              python -m build -w
@@ -400,6 +392,7 @@ jobs:
            - run:
                name: Upload package
                command: |
+                  source env/bin/activate
                  twine upload wheelhouse/*.whl
      - store_artifacts:
          path: wheelhouse/
@@ -412,7 +405,6 @@ workflows:
            pattern: "^(?!pull/)[-\\w]+$"
            value: << pipeline.git.branch >>
        - not: << pipeline.parameters.nightly_build >>
-        - not: << pipeline.parameters.test_release >>
    jobs:
      - mac_build_and_test:
          matrix:
@@ -426,7 +418,6 @@ workflows:
    when:
      and:
        - not: << pipeline.parameters.nightly_build >>
-        - not: << pipeline.parameters.test_release >>
    jobs:
      - build_release:
          filters:
@@ -610,87 +601,3 @@ workflows:
            parameters:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
      - build_cuda_release
-
-  build_dev_release:
-    when:
-      and:
-        - equal: [ main, << pipeline.git.branch >> ]
-        - << pipeline.parameters.test_release >>
-    jobs:
-      - build_release:
-          matrix:
-            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-              macosx_deployment_target: ["13.5", "14.0", "15.0"]
-              build_env: ["DEV_RELEASE=1"]
-              xcode_version: ["16.2.0", "15.0.0"]
-            exclude:
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.9"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.10"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.11"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.12"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "13.5"
-                xcode_version: "16.2.0"
-                python_version: "3.13"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "14.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.9"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.10"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.11"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.12"
-                build_env: "DEV_RELEASE=1"
-              - macosx_deployment_target: "15.0"
-                xcode_version: "15.0.0"
-                python_version: "3.13"
-                build_env: "DEV_RELEASE=1"
-      - build_linux_release:
-          matrix:
-            parameters:
-              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-              build_env: ["DEV_RELEASE=1"]
-      - build_cuda_release:
-          matrix:
-            parameters:
-              build_env: ["DEV_RELEASE=1"]
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,7 +22,7 @@ project(

 # ----------------------------- Setup -----------------------------
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_INSTALL_MESSAGE NEVER)
--- a/mlx/backend/cpu/copy.cpp
+++ b/mlx/backend/cpu/copy.cpp
@@ -377,10 +377,4 @@ void copy_cpu_inplace(
      });
 }

-array contiguous_copy_cpu(const array& arr, Stream stream) {
-  array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-  copy_cpu(arr, arr_copy, CopyType::General, stream);
-  return arr_copy;
-}
-
 } // namespace mlx::core
--- a/mlx/backend/cpu/copy.h
+++ b/mlx/backend/cpu/copy.h
@@ -30,7 +30,4 @@ void copy_cpu_inplace(
    const std::optional<array>& dynamic_i_offset = std::nullopt,
    const std::optional<array>& dynamic_o_offset = std::nullopt);

-// Return a contiguous array with same shape that copies the data of |arr|.
-array contiguous_copy_cpu(const array& arr, Stream stream);
-
 } // namespace mlx::core
--- a/mlx/backend/cpu/distributed.cpp
+++ b/mlx/backend/cpu/distributed.cpp
@@ -13,7 +13,9 @@ std::pair<array, bool> ensure_row_contiguous(const array& arr, Stream stream) {
  if (arr.flags().row_contiguous) {
    return {arr, false};
  } else {
-    return {contiguous_copy_cpu(arr, stream), true};
+    array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
+    copy_cpu(arr, arr_copy, CopyType::General, stream);
+    return {arr_copy, true};
  }
 };

@@ -32,7 +34,8 @@ void AllReduce::eval_cpu(
      }
      return in;
    } else {
-      array arr_copy = contiguous_copy_cpu(in, s);
+      array arr_copy(in.shape(), in.dtype(), nullptr, {});
+      copy_cpu(in, arr_copy, CopyType::General, s);
      out.copy_shared_buffer(arr_copy);
      return arr_copy;
    }
--- a/mlx/backend/cpu/logsumexp.cpp
+++ b/mlx/backend/cpu/logsumexp.cpp
@@ -87,7 +87,8 @@ void LogSumExp::eval_cpu(const std::vector<array>& inputs, array& out) {
    if (x.flags().contiguous && x.strides()[x.ndim() - 1] == 1) {
      return x;
    } else {
-      array x_copy = contiguous_copy_cpu(x, s);
+      auto x_copy = array(x.shape(), x.dtype(), nullptr, {});
+      copy_cpu(x, x_copy, CopyType::General, s);
      encoder.add_temporary(x_copy);
      return x_copy;
    }
--- a/mlx/backend/cpu/masked_mm.cpp
+++ b/mlx/backend/cpu/masked_mm.cpp
@@ -136,8 +136,9 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
          }
          return std::make_tuple(true, sty, arr, false);
        } else {
+          array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
+          copy_cpu(arr, arr_copy, CopyType::General, s);
          int64_t stx = arr.shape(-1);
-          array arr_copy = contiguous_copy_cpu(arr, s);
          return std::make_tuple(false, stx, arr_copy, true);
        }
      };
--- a/mlx/backend/cpu/quantized.cpp
+++ b/mlx/backend/cpu/quantized.cpp
@@ -712,7 +712,9 @@ void fast::AffineQuantize::eval_cpu(
    if (arr.flags().row_contiguous) {
      return std::make_pair(arr, false);
    } else {
-      return std::make_pair(contiguous_copy_cpu(arr, s), true);
+      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
+      copy_cpu(arr, arr_copy, CopyType::General, s);
+      return std::make_pair(arr_copy, true);
    }
  };

--- a/mlx/backend/cpu/scan.cpp
+++ b/mlx/backend/cpu/scan.cpp
@@ -250,8 +250,10 @@ void Scan::eval_cpu(const std::vector<array>& inputs, array& out) {
  // Ensure contiguity
  auto in = inputs[0];
  if (!in.flags().row_contiguous) {
-    in = contiguous_copy_cpu(in, stream());
-    encoder.add_temporary(in);
+    array arr_copy(in.shape(), in.dtype(), nullptr, {});
+    copy_cpu(in, arr_copy, CopyType::General, stream());
+    in = arr_copy;
+    encoder.add_temporary(arr_copy);
  }
  out.set_data(allocator::malloc(out.nbytes()));

--- a/mlx/backend/cpu/softmax.cpp
+++ b/mlx/backend/cpu/softmax.cpp
@@ -131,7 +131,8 @@ void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
      }
      return x;
    } else {
-      array x_copy = contiguous_copy_cpu(x, s);
+      array x_copy(x.shape(), x.dtype(), nullptr, {});
+      copy_cpu(x, x_copy, CopyType::General, s);
      out.copy_shared_buffer(x_copy);
      return x_copy;
    }
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -20,7 +20,6 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/event.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/gemv.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/jit_module.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/kernel_utils.cu
@@ -43,7 +42,9 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qmm.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)

 target_compile_definitions(mlx PRIVATE MLX_USE_CUDA)
@@ -88,17 +89,10 @@ endif()
 target_compile_options(
  mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--Wno-deprecated-gpu-targets>")

-# Use stronger binaries compression. This feature was introduced in CUDA 12.8
-# and requires drivers released after CUDA 12.4.
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
-  target_compile_options(
-    mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--compress-mode=size>")
-endif()
-
 # Compute capability 7 is required for synchronization between CPU/GPU with
 # managed memory. TODO: Add more architectures for potential performance gain.
 set(MLX_CUDA_ARCHITECTURES
-    "70;80"
+    "80"
    CACHE STRING "CUDA architectures")
 message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
 set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
@@ -138,3 +132,12 @@ target_compile_options(mlx PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe
 # Install CCCL headers for JIT.
 install(DIRECTORY ${cccl_SOURCE_DIR}/include/cuda
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cccl)
+
+# Make ThunderKittens available
+FetchContent_Declare(
+  kittens
+  GIT_REPOSITORY https://github.com/HazyResearch/ThunderKittens.git
+  GIT_TAG aaab847f430ed313ed466e64b25b9177babd1db8
+  GIT_SHALLOW TRUE)
+FetchContent_MakeAvailable(kittens)
+target_include_directories(mlx BEFORE PRIVATE "${kittens_SOURCE_DIR}/include")
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -2,6 +2,7 @@

 #include "mlx/backend/cuda/allocator.h"
 #include "mlx/backend/cuda/utils.h"
+#include "mlx/backend/cuda/worker.h"
 #include "mlx/utils.h"

 #include <cuda_runtime.h>
@@ -24,58 +25,52 @@ constexpr int small_block_size = 8;
 constexpr int small_pool_size = 4 * page_size;

 SmallSizePool::SmallSizePool() {
+  CHECK_CUDA_ERROR(cudaMallocManaged(&buffer_, small_pool_size));
+  end_ = reinterpret_cast<void*>(
+      reinterpret_cast<char*>(buffer_) + small_pool_size);
+  next_free_ = reinterpret_cast<Block*>(buffer_);
+
  auto num_blocks = small_pool_size / small_block_size;
-  buffer_ = new Block[num_blocks];
-
-  next_free_ = buffer_;
-
-  CHECK_CUDA_ERROR(cudaMallocManaged(&data_, small_pool_size));
-  CHECK_CUDA_ERROR(
-      cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetReadMostly, 0));
-
  auto curr = next_free_;
-  for (size_t i = 1; i < num_blocks; ++i) {
-    curr->next = buffer_ + i;
+  for (size_t i = 0; i < num_blocks - 1; ++i) {
+    curr->next = reinterpret_cast<Block*>(
+        reinterpret_cast<char*>(buffer_) + (i + 1) * small_block_size);
    curr = curr->next;
  }
  curr->next = nullptr;
 }

 SmallSizePool::~SmallSizePool() {
-  CHECK_CUDA_ERROR(cudaFree(data_));
-  delete[] buffer_;
+  CHECK_CUDA_ERROR(cudaFree(buffer_));
 }

-CudaBuffer* SmallSizePool::malloc() {
+void* SmallSizePool::malloc() {
  if (next_free_ == nullptr) {
    return nullptr;
  }
  Block* b = next_free_;
-  uint64_t i = next_free_ - buffer_;
  next_free_ = next_free_->next;
-  b->buf.data = static_cast<char*>(data_) + i * small_block_size;
-  b->buf.size = small_block_size;
-  return &b->buf;
+  return static_cast<void*>(b);
 }

-void SmallSizePool::free(CudaBuffer* buf) {
-  auto b = reinterpret_cast<Block*>(buf);
+void SmallSizePool::free(void* p) {
+  auto b = static_cast<Block*>(p);
  b->next = next_free_;
  next_free_ = b;
 }

-bool SmallSizePool::in_pool(CudaBuffer* buf) {
-  constexpr int num_blocks = (small_pool_size / small_block_size);
-  auto b = reinterpret_cast<Block*>(buf);
-  int64_t block_num = b - buffer_;
-  return block_num >= 0 && block_num < num_blocks;
+bool SmallSizePool::in_pool(void* p) {
+  return (p >= buffer_) && (p < end_);
 }

 CudaAllocator::CudaAllocator()
    : buffer_cache_(
          page_size,
          [](CudaBuffer* buf) { return buf->size; },
-          [this](CudaBuffer* buf) { cuda_free(buf); }) {
+          [this](CudaBuffer* buf) {
+            cuda_free(buf->data);
+            delete buf;
+          }) {
  // TODO: Set memory limit for multi-device.
  size_t free, total;
  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total));
@@ -97,26 +92,28 @@ Buffer CudaAllocator::malloc(size_t size) {

  CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
  if (!buf) {
-    // If we have a lot of memory pressure try to reclaim memory from the cache.
-    int64_t mem_to_free =
-        get_active_memory() + get_cache_memory() + size - memory_limit_;
-    if (mem_to_free > 0) {
-      buffer_cache_.release_cached_buffers(mem_to_free);
+    // If we have a lot of memory pressure or are over the maximum cache size,
+    // try to reclaim memory from the cache.
+    size_t mem_required = get_active_memory() + get_cache_memory() + size;
+    if (mem_required >= memory_limit_) {
+      buffer_cache_.release_cached_buffers(mem_required - memory_limit_);
    }

+    lock.unlock();
+    buf = new CudaBuffer{nullptr, size};
+
    // Try the scalar pool first
    if (size <= small_block_size) {
-      buf = scalar_pool_.malloc();
+      buf->data = scalar_pool_.malloc();
    }
-    lock.unlock();
-    if (!buf) {
-      buf = new CudaBuffer{nullptr, size};
+    if (!buf->data) {
      cudaError_t err = cudaMallocManaged(&buf->data, size);
      if (err != cudaSuccess && err != cudaErrorMemoryAllocation) {
        throw std::runtime_error(fmt::format(
            "cudaMallocManaged failed: {}.", cudaGetErrorString(err)));
      }
    }
+
    lock.lock();
  }
  active_memory_ += size;
@@ -126,6 +123,7 @@ Buffer CudaAllocator::malloc(size_t size) {
  if (get_cache_memory() > max_pool_size_) {
    buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
  }
+
  return Buffer{buf};
 }

@@ -140,7 +138,9 @@ void CudaAllocator::free(Buffer buffer) {
  if (get_cache_memory() < max_pool_size_) {
    buffer_cache_.recycle_to_cache(buf);
  } else {
-    cuda_free(buf);
+    lock.unlock();
+    cuda_free(buf->data);
+    delete buf;
  }
 }

@@ -152,13 +152,30 @@ size_t CudaAllocator::size(Buffer buffer) const {
  return buf->size;
 }

-// This must be called with mutex_ aquired
-void CudaAllocator::cuda_free(CudaBuffer* buf) {
+void CudaAllocator::register_this_thread() {
+  std::lock_guard lock(worker_mutex_);
+  allowed_threads_.insert(std::this_thread::get_id());
+}
+
+void CudaAllocator::cuda_free(void* buf) {
+  // If cuda_free() is called from an unregistered thread, reschedule the call
+  // to the worker.
+  {
+    std::lock_guard lock(worker_mutex_);
+    if (allowed_threads_.count(std::this_thread::get_id()) == 0) {
+      if (!worker_) {
+        worker_.reset(new Worker);
+      }
+      worker_->add_task([this, buf]() { this->cuda_free(buf); });
+      worker_->end_batch();
+      worker_->commit();
+      return;
+    }
+  }
  if (scalar_pool_.in_pool(buf)) {
    scalar_pool_.free(buf);
  } else {
-    cudaFree(buf->data);
-    delete buf;
+    cudaFree(buf);
  }
 }

--- a/mlx/backend/cuda/allocator.h
+++ b/mlx/backend/cuda/allocator.h
@@ -7,10 +7,13 @@

 #include <mutex>
 #include <set>
+#include <thread>
 #include <utility>

 namespace mlx::core::cu {

+class Worker;
+
 using allocator::Buffer;

 // Stores cuda-managed unified memory.
@@ -21,14 +24,13 @@ struct CudaBuffer {

 class SmallSizePool {
 private:
-  union Block {
+  struct Block {
    Block* next;
-    CudaBuffer buf;
  };

-  Block* buffer_{nullptr};
-  void* data_{nullptr};
+  void* buffer_{nullptr};
  Block* next_free_{nullptr};
+  void* end_{nullptr};

 public:
  SmallSizePool();
@@ -37,9 +39,9 @@ class SmallSizePool {
  SmallSizePool(const SmallSizePool&) = delete;
  SmallSizePool& operator=(const SmallSizePool&) = delete;

-  CudaBuffer* malloc();
-  void free(CudaBuffer* buf);
-  bool in_pool(CudaBuffer* buf);
+  void* malloc();
+  void free(void* p);
+  bool in_pool(void* p);
 };

 class CudaAllocator : public allocator::Allocator {
@@ -48,6 +50,15 @@ class CudaAllocator : public allocator::Allocator {
  void free(Buffer buffer) override;
  size_t size(Buffer buffer) const override;

+  // Register the current thread as safe to free buffers.
+  // In CUDA, freeing a buffer implicitly synchronizes the stream, so freeing
+  // from a thread that a GPU stream may be waiting on (for example a CPU
+  // stream thread) would result in a deadlock.
+  void register_this_thread();
+
+  // Call cudaFree in the safe thread.
+  void cuda_free(void* buf);
+
  size_t get_active_memory() const;
  size_t get_peak_memory() const;
  void reset_peak_memory();
@@ -58,11 +69,13 @@ class CudaAllocator : public allocator::Allocator {
  void clear_cache();

 private:
-  void cuda_free(CudaBuffer* buf);
-
  CudaAllocator();
  friend CudaAllocator& allocator();

+  std::mutex worker_mutex_;
+  std::unique_ptr<Worker> worker_;
+  std::set<std::thread::id> allowed_threads_;
+
  std::mutex mutex_;
  size_t memory_limit_;
  size_t max_pool_size_;
--- a/mlx/backend/cuda/arg_reduce.cu
+++ b/mlx/backend/cuda/arg_reduce.cu
@@ -1,8 +1,8 @@
 // Copyright © 2025 Apple Inc.
-
 #include "mlx/backend/common/utils.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/device/fp16_math.cuh"
+#include "mlx/backend/cuda/iterators/strided_iterator.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"
@@ -115,7 +115,7 @@ __global__ void arg_reduce_general(
    T vals[N_READS];
    auto tid = r * BLOCK_DIM + block.thread_index().x;
    cub::LoadDirectBlocked(
-        tid, StridedIterator(in + in_idx, axis_stride), vals, axis_size, init);
+        tid, strided_iterator(in + in_idx, axis_stride), vals, axis_size, init);
    best = op.reduce_many(best, vals, tid * N_READS);
  }

@@ -166,6 +166,7 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
          kernel,
          num_blocks,
          block_dim(),
+          0,
          in.data<T>(),
          out.data<uint32_t>(),
          out.size(),
--- a/mlx/backend/cuda/binary.cu
+++ b/mlx/backend/cuda/binary.cu
@@ -128,7 +128,7 @@ __global__ void binary_g(
    int ndim) {
  IdxT index = cg::this_grid().thread_rank();
  if (index < size) {
-    auto [a_idx, b_idx] = elem_to_loc(
+    auto [a_idx, b_idx] = elem_to_loc_4d(
        index, shape.data(), a_strides.data(), b_strides.data(), ndim);
    out[index] = Op{}(a[a_idx], b[b_idx]);
  }
@@ -219,6 +219,7 @@ void binary_op_gpu_inplace(
                        kernel,
                        num_blocks,
                        block_dims,
+                        0,
                        a.data<InType>(),
                        b.data<InType>(),
                        out.data<OutType>(),
@@ -235,6 +236,7 @@ void binary_op_gpu_inplace(
                      kernel,
                      num_blocks,
                      block_dims,
+                      0,
                      a.data<InType>(),
                      b.data<InType>(),
                      out.data<OutType>(),
@@ -269,6 +271,7 @@ void binary_op_gpu_inplace(
                kernel,
                num_blocks,
                block_dims,
+                0,
                a.data<InType>(),
                b.data<InType>(),
                out.data<OutType>(),
--- a/mlx/backend/cuda/binary_two.cu
+++ b/mlx/backend/cuda/binary_two.cu
@@ -160,7 +160,7 @@ __global__ void binary_two_g(
    int ndim) {
  IdxT index = cg::this_grid().thread_rank();
  if (index < size) {
-    auto [a_idx, b_idx] = elem_to_loc(
+    auto [a_idx, b_idx] = elem_to_loc_4d(
        index, shape.data(), a_strides.data(), b_strides.data(), ndim);
    auto out = Op{}(a[a_idx], b[b_idx]);
    out_a[index] = out[0];
@@ -239,6 +239,7 @@ void binary_two_op_gpu_inplace(
                        kernel,
                        num_blocks,
                        block_dims,
+                        0,
                        a.data<InType>(),
                        b.data<InType>(),
                        out_a.data<OutType>(),
@@ -256,6 +257,7 @@ void binary_two_op_gpu_inplace(
                      kernel,
                      num_blocks,
                      block_dims,
+                      0,
                      a.data<InType>(),
                      b.data<InType>(),
                      out_a.data<OutType>(),
@@ -291,6 +293,7 @@ void binary_two_op_gpu_inplace(
                kernel,
                num_blocks,
                block_dims,
+                0,
                a.data<InType>(),
                b.data<InType>(),
                out_a.data<OutType>(),
--- a/mlx/backend/cuda/compiled.cpp
+++ b/mlx/backend/cuda/compiled.cpp
@@ -295,7 +295,7 @@ void Compiled::eval_gpu(
  auto kernel = mod.get_kernel(kernel_name);
  auto [num_blocks, block_dims] =
      get_launch_args(kernel, outputs[0], large, work_per_thread);
-  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, 0, args.args());
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/copy/copy_contiguous.cu
+++ b/mlx/backend/cuda/copy/copy_contiguous.cu
@@ -82,6 +82,7 @@ void copy_contiguous(
            kernel,
            num_blocks,
            block_dims,
+            0,
            in.data<InType>() + in_offset,
            out.data<OutType>() + out_offset,
            out.data_size());
--- a/mlx/backend/cuda/copy/copy_general.cu
+++ b/mlx/backend/cuda/copy/copy_general.cu
@@ -37,7 +37,7 @@ __global__ void copy_gg(
    int ndim) {
  IdxT index = cg::this_grid().thread_rank();
  if (index < size) {
-    auto [idx_in, idx_out] = elem_to_loc(
+    auto [idx_in, idx_out] = elem_to_loc_4d(
        index, shape.data(), strides_in.data(), strides_out.data(), ndim);
    out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
  }
@@ -79,6 +79,7 @@ void copy_general(
                    kernel,
                    num_blocks,
                    block_dims,
+                    0,
                    in_ptr,
                    out_ptr,
                    data_size,
@@ -94,6 +95,7 @@ void copy_general(
                  kernel,
                  num_blocks,
                  block_dims,
+                  0,
                  in_ptr,
                  out_ptr,
                  data_size,
--- a/mlx/backend/cuda/copy/copy_general_dynamic.cu
+++ b/mlx/backend/cuda/copy/copy_general_dynamic.cu
@@ -41,7 +41,7 @@ __global__ void copy_gg_dynamic(
    const int64_t* offset_out) {
  IdxT index = cg::this_grid().thread_rank();
  if (index < size) {
-    auto [idx_in, idx_out] = elem_to_loc(
+    auto [idx_in, idx_out] = elem_to_loc_4d(
        index, shape.data(), strides_in.data(), strides_out.data(), ndim);
    out[idx_out + *offset_out] = CastOp<In, Out>{}(in[idx_in + *offset_in]);
  }
@@ -82,6 +82,7 @@ void copy_general_dynamic(
                    kernel,
                    num_blocks,
                    block_dims,
+                    0,
                    in_ptr,
                    out_ptr,
                    out.size(),
@@ -99,6 +100,7 @@ void copy_general_dynamic(
                  kernel,
                  num_blocks,
                  block_dims,
+                  0,
                  in_ptr,
                  out_ptr,
                  out.size(),
--- a/mlx/backend/cuda/copy/copy_general_input.cu
+++ b/mlx/backend/cuda/copy/copy_general_input.cu
@@ -34,7 +34,7 @@ __global__ void copy_g(
    int ndim) {
  IdxT index = cg::this_grid().thread_rank();
  if (index < size) {
-    IdxT idx_in = elem_to_loc(index, shape.data(), strides_in.data(), ndim);
+    IdxT idx_in = elem_to_loc_4d(index, shape.data(), strides_in.data(), ndim);
    out[index] = CastOp<In, Out>{}(in[idx_in]);
  }
 }
@@ -71,6 +71,7 @@ void copy_general_input(
                    kernel,
                    num_blocks,
                    block_dims,
+                    0,
                    in_ptr,
                    out_ptr,
                    out.size(),
@@ -85,6 +86,7 @@ void copy_general_input(
                  kernel,
                  num_blocks,
                  block_dims,
+                  0,
                  in_ptr,
                  out_ptr,
                  out.size(),
--- a/mlx/backend/cuda/device.cpp
+++ b/mlx/backend/cuda/device.cpp
@@ -215,12 +215,14 @@ void CommandEncoder::add_kernel_node(
    void* func,
    dim3 grid_dim,
    dim3 block_dim,
+    uint32_t smem_bytes,
    void** params) {
  cudaKernelNodeParams kernel_params = {0};
  kernel_params.func = func;
  kernel_params.gridDim = grid_dim;
  kernel_params.blockDim = block_dim;
  kernel_params.kernelParams = params;
+  kernel_params.sharedMemBytes = smem_bytes;
  cudaGraphNode_t node;
  CHECK_CUDA_ERROR(
      cudaGraphAddKernelNode(&node, graph_, NULL, 0, &kernel_params));
@@ -231,6 +233,7 @@ void CommandEncoder::add_kernel_node(
    CUfunction func,
    dim3 grid_dim,
    dim3 block_dim,
+    uint32_t smem_bytes,
    void** params) {
  CUDA_KERNEL_NODE_PARAMS kernel_params = {0};
  kernel_params.func = func;
@@ -241,6 +244,7 @@ void CommandEncoder::add_kernel_node(
  kernel_params.blockDimY = block_dim.y;
  kernel_params.blockDimZ = block_dim.z;
  kernel_params.kernelParams = params;
+  kernel_params.sharedMemBytes = smem_bytes;
  CUgraphNode node;
  CHECK_CUDA_ERROR(
      cuGraphAddKernelNode(&node, graph_, NULL, 0, &kernel_params));
@@ -306,6 +310,7 @@ void CommandEncoder::commit() {
  }

  // Put completion handlers in a batch.
+  worker_.end_batch();
  worker_.commit(stream_);
 }

@@ -314,6 +319,7 @@ void CommandEncoder::synchronize() {
  auto p = std::make_shared<std::promise<void>>();
  std::future<void> f = p->get_future();
  add_completed_handler([p = std::move(p)]() { p->set_value(); });
+  worker_.end_batch();
  commit();
  f.wait();
 }
--- a/mlx/backend/cuda/device.h
+++ b/mlx/backend/cuda/device.h
@@ -45,25 +45,34 @@ class CommandEncoder {
  void set_output_array(const array& arr);

  template <typename F, typename... Params>
-  void
-  add_kernel_node(F* func, dim3 grid_dim, dim3 block_dim, Params&&... params) {
+  void add_kernel_node(
+      F* func,
+      dim3 grid_dim,
+      dim3 block_dim,
+      uint32_t smem_bytes,
+      Params&&... params) {
    constexpr size_t num = sizeof...(Params);
    void* ptrs[num];
    size_t i = 0;
    ([&](auto&& p) { ptrs[i++] = static_cast<void*>(&p); }(
         std::forward<Params>(params)),
     ...);
-    add_kernel_node((void*)func, grid_dim, block_dim, ptrs);
+    add_kernel_node((void*)func, grid_dim, block_dim, smem_bytes, ptrs);
  }

  void add_kernel_node(
      CUfunction func,
      dim3 grid_dim,
      dim3 block_dim,
+      uint32_t smem_bytes,
      void** params);

-  void
-  add_kernel_node(void* func, dim3 grid_dim, dim3 block_dim, void** params);
+  void add_kernel_node(
+      void* func,
+      dim3 grid_dim,
+      dim3 block_dim,
+      uint32_t smem_bytes,
+      void** params);

  void add_temporary(const array& arr) {
    temporaries_.push_back(arr.data_shared_ptr());
--- a/mlx/backend/cuda/device/utils.cuh
+++ b/mlx/backend/cuda/device/utils.cuh
@@ -49,20 +49,6 @@ store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) {
  to[offset] = vec;
 }

-// Helper for accessing strided data.
-template <typename T>
-struct StridedIterator {
-  T it;
-  int64_t stride;
-
-  __host__ __device__ StridedIterator(T it, int64_t stride)
-      : it(it), stride(stride) {}
-
-  __host__ __device__ auto operator[](int i) const {
-    return it[i * stride];
-  }
-};
-
 ///////////////////////////////////////////////////////////////////////////////
 // Type limits utils
 ///////////////////////////////////////////////////////////////////////////////
@@ -218,8 +204,20 @@ inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_nd(
  return cuda::std::make_tuple(a_loc, b_loc, c_loc);
 }

+// Optimized version when ndim is larger than 4.
 template <typename IdxT = int64_t>
-inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc(
+inline __host__ __device__ IdxT
+elem_to_loc_4d(IdxT elem, const int* shape, const int64_t* strides, int ndim) {
+  IdxT loc = 0;
+  for (int i = ndim - 1; i >= 0; --i) {
+    loc += (elem % shape[i]) * IdxT(strides[i]);
+    elem /= shape[i];
+  }
+  return loc;
+}
+
+template <typename IdxT = int64_t>
+inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc_4d(
    IdxT elem,
    const int* shape,
    const int64_t* a_strides,
@@ -237,7 +235,7 @@ inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc(
 }

 template <typename IdxT = int64_t>
-inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc(
+inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_4d(
    IdxT elem,
    const int* shape,
    const int64_t* a_strides,
--- a/mlx/backend/cuda/eval.cpp
+++ b/mlx/backend/cuda/eval.cpp
@@ -19,6 +19,8 @@ void new_stream(Stream s) {
  cudaFree(nullptr);
  // Ensure the static stream objects get created.
  cu::get_command_encoder(s);
+  // The main thread is safe to free buffers.
+  cu::allocator().register_this_thread();
 }

 void eval(array& arr) {
--- a/mlx/backend/cuda/event.cu
+++ b/mlx/backend/cuda/event.cu
@@ -110,26 +110,24 @@ __global__ void event_signal_kernel(SharedEvent::Atomic* ac, uint64_t value) {
  event_signal(ac, value);
 }

-SharedEvent::Atomic* to_atomic(std::shared_ptr<Buffer> buf) {
-  return static_cast<SharedEvent::Atomic*>(buf->raw_ptr());
-}
-
 SharedEvent::SharedEvent() {
-  buf_ = std::shared_ptr<Buffer>(
-      new Buffer{allocator().malloc(sizeof(Atomic))}, [](Buffer* ptr) {
-        allocator().free(*ptr);
-        delete ptr;
-      });
-  *static_cast<uint64_t*>(buf_->raw_ptr()) = 0;
+  // Allocate cuda::atomic on managed memory.
+  Atomic* ac;
+  CHECK_CUDA_ERROR(cudaMallocManaged(&ac, sizeof(Atomic)));
+  new (ac) Atomic(0);
+  ac_ = std::shared_ptr<Atomic>(ac, [](Atomic* ptr) {
+    ptr->~Atomic();
+    allocator().cuda_free(ptr);
+  });
 }

 void SharedEvent::wait(uint64_t value) {
  nvtx3::scoped_range r("cu::SharedEvent::wait");
-  event_wait(to_atomic(buf_), value);
+  event_wait(ac_.get(), value);
 }

 void SharedEvent::wait(cudaStream_t stream, uint64_t value) {
-  event_wait_kernel<<<1, 1, 0, stream>>>(to_atomic(buf_), value);
+  event_wait_kernel<<<1, 1, 0, stream>>>(ac_.get(), value);
 }

 void SharedEvent::wait(Stream s, uint64_t value) {
@@ -140,17 +138,17 @@ void SharedEvent::wait(Stream s, uint64_t value) {
    auto& encoder = get_command_encoder(s);
    encoder.commit();
    wait(encoder.stream(), value);
-    encoder.add_completed_handler([buf = buf_]() {});
+    encoder.add_completed_handler([ac = ac_]() {});
  }
 }

 void SharedEvent::signal(uint64_t value) {
  nvtx3::scoped_range r("cu::SharedEvent::signal");
-  event_signal(to_atomic(buf_), value);
+  event_signal(ac_.get(), value);
 }

 void SharedEvent::signal(cudaStream_t stream, uint64_t value) {
-  event_signal_kernel<<<1, 1, 0, stream>>>(to_atomic(buf_), value);
+  event_signal_kernel<<<1, 1, 0, stream>>>(ac_.get(), value);
 }

 void SharedEvent::signal(Stream s, uint64_t value) {
@@ -164,18 +162,18 @@ void SharedEvent::signal(Stream s, uint64_t value) {
    auto& encoder = get_command_encoder(s);
    encoder.commit();
    signal(encoder.stream(), value);
-    encoder.add_completed_handler([buf = buf_]() {});
+    encoder.add_completed_handler([ac = ac_]() {});
  }
 }

 bool SharedEvent::is_signaled(uint64_t value) const {
  nvtx3::scoped_range r("cu::SharedEvent::is_signaled");
-  return to_atomic(buf_)->load() >= value;
+  return ac_->load() >= value;
 }

 uint64_t SharedEvent::value() const {
  nvtx3::scoped_range r("cu::SharedEvent::value");
-  return to_atomic(buf_)->load();
+  return ac_->load();
 }

 } // namespace cu
--- a/mlx/backend/cuda/event.h
+++ b/mlx/backend/cuda/event.h
@@ -2,7 +2,6 @@

 #pragma once

-#include "mlx/allocator.h"
 #include "mlx/stream.h"

 #include <cuda_runtime.h>
@@ -56,8 +55,12 @@ class SharedEvent {
  bool is_signaled(uint64_t value) const;
  uint64_t value() const;

+  const std::shared_ptr<Atomic>& atomic() const {
+    return ac_;
+  }
+
 private:
-  std::shared_ptr<mlx::core::allocator::Buffer> buf_;
+  std::shared_ptr<Atomic> ac_;
 };

 } // namespace mlx::core::cu
--- a/mlx/backend/cuda/gemv.cu
+++ b/mlx/backend/cuda/gemv.cu
@@ -1,147 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cuda/gemv.h"
-#include "mlx/backend/cuda/kernel_utils.cuh"
-#include "mlx/dtype_utils.h"
-
-#include <cooperative_groups.h>
-#include <cooperative_groups/reduce.h>
-
-namespace mlx::core::cu {
-
-namespace cg = cooperative_groups;
-
-static constexpr int n_per_thread = 4;
-static constexpr int rows_per_block = 8;
-
-template <typename T, int rows_per_block, int n_per_thread>
-__device__ void
-gemv_impl(const T* mat, const T* vec, T* out, int rows, int cols) {
-  auto block = cg::this_thread_block();
-  auto warp = cg::tiled_partition<WARP_SIZE>(block);
-
-  auto g_idx = block.group_index();
-  auto t_idx = block.thread_index();
-  int row = g_idx.x * rows_per_block + t_idx.y;
-
-  if (row < rows) {
-    float sum = 0.0f;
-    for (int col = n_per_thread * warp.thread_rank(); col < cols;
-         col += (WARP_SIZE * n_per_thread)) {
-      auto local_mat = load_vector<n_per_thread>(mat + row * cols + col, 0);
-      auto local_vec = load_vector<n_per_thread>(vec + col, 0);
-#pragma unroll
-      for (int j = 0; j < n_per_thread; ++j) {
-        sum += static_cast<float>(local_mat.val[j]) *
-            static_cast<float>(local_vec.val[j]);
-      }
-    }
-
-    sum = cg::reduce(warp, sum, cg::plus<float>{});
-    if (warp.thread_rank() == 0) {
-      out[row] = static_cast<T>(sum);
-    }
-  }
-}
-
-template <typename T, int rows_per_block, int n_per_thread>
-__global__ void
-gemv_single(const T* mat, const T* vec, T* out, int rows, int cols) {
-  gemv_impl<T, rows_per_block, n_per_thread>(mat, vec, out, rows, cols);
-}
-
-template <typename T, int rows_per_block, int n_per_thread>
-__global__ void gemv_batched(
-    const T* mat,
-    const T* vec,
-    T* out,
-    int rows,
-    int cols,
-    const __grid_constant__ Shape batch_shape,
-    const __grid_constant__ Strides mat_batch_strides,
-    const __grid_constant__ Strides vec_batch_strides,
-    int batch_ndim) {
-  auto block = cg::this_thread_block();
-  auto batch_idx = block.group_index().y;
-  auto [vec_offset, mat_offset] = elem_to_loc(
-      batch_idx,
-      batch_shape.data(),
-      vec_batch_strides.data(),
-      mat_batch_strides.data(),
-      batch_ndim);
-  gemv_impl<T, rows_per_block, n_per_thread>(
-      mat + mat_offset, vec + vec_offset, out + batch_idx * rows, rows, cols);
-}
-
-bool can_use_gemv(int M, int N, int K, bool a_transposed, bool b_transposed) {
-  return K % (WARP_SIZE * n_per_thread) == 0 &&
-      ((M == 1 && b_transposed) || (N == 1 && !a_transposed));
-}
-
-void gemv(
-    const array& a,
-    const array& b,
-    array& out,
-    int M,
-    int N,
-    int K,
-    uint32_t batch_count,
-    const mlx::core::Shape& batch_shape,
-    const mlx::core::Strides& a_batch_strides,
-    const mlx::core::Strides& b_batch_strides,
-    CommandEncoder& encoder) {
-  encoder.set_input_array(a);
-  encoder.set_input_array(b);
-  encoder.set_output_array(out);
-  dispatch_float_types(out.dtype(), "gemv", [&](auto type_tag) {
-    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-    dim3 block_dims{WARP_SIZE, rows_per_block};
-    const DataType* mat;
-    const DataType* vec;
-    int rows;
-    int cols = K;
-    auto mat_strides = const_param(a_batch_strides);
-    auto vec_strides = const_param(b_batch_strides);
-
-    if (M == 1) {
-      mat = b.data<DataType>();
-      vec = a.data<DataType>();
-      rows = N;
-      std::swap(mat_strides, vec_strides);
-    } else {
-      mat = a.data<DataType>();
-      vec = b.data<DataType>();
-      rows = M;
-    }
-    uint32_t num_blocks_x = (rows + rows_per_block - 1) / rows_per_block;
-    if (batch_count == 1) {
-      auto kernel = gemv_single<DataType, rows_per_block, n_per_thread>;
-      encoder.add_kernel_node(
-          kernel,
-          num_blocks_x,
-          block_dims,
-          mat,
-          vec,
-          out.data<DataType>(),
-          rows,
-          cols);
-    } else {
-      auto kernel = gemv_batched<DataType, rows_per_block, n_per_thread>;
-      encoder.add_kernel_node(
-          kernel,
-          dim3{num_blocks_x, batch_count},
-          block_dims,
-          mat,
-          vec,
-          out.data<DataType>(),
-          rows,
-          cols,
-          const_param(batch_shape),
-          mat_strides,
-          vec_strides,
-          batch_shape.size());
-    }
-  });
-}
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/gemv.h
+++ b/mlx/backend/cuda/gemv.h
@@ -1,24 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-
-#include "mlx/backend/cuda/device.h"
-
-namespace mlx::core::cu {
-
-bool can_use_gemv(int M, int N, int K, bool a_transposed, bool b_transposed);
-
-void gemv(
-    const array& a,
-    const array& b,
-    array& out,
-    int M,
-    int N,
-    int K,
-    uint32_t batch_count,
-    const mlx::core::Shape& batch_shape,
-    const mlx::core::Strides& a_batch_strides,
-    const mlx::core::Strides& b_batch_strides,
-    CommandEncoder& encoder);
-
-} // namespace mlx::core::cu
--- a/mlx/backend/cuda/indexing.cpp
+++ b/mlx/backend/cuda/indexing.cpp
@@ -129,7 +129,7 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {

  auto kernel = mod.get_kernel(kernel_name);
  auto [num_blocks, block_dims] = get_launch_args(kernel, out, large);
-  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, 0, args.args());
 }

 void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -230,7 +230,7 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
  encoder.set_output_array(out);
  auto kernel = mod.get_kernel(kernel_name);
  auto [num_blocks, block_dims] = get_launch_args(kernel, upd, large);
-  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, 0, args.args());
 }

 void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -318,7 +318,7 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  encoder.set_output_array(out);
  auto kernel = mod.get_kernel(kernel_name);
  auto [num_blocks, block_dims] = get_launch_args(kernel, idx, large);
-  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, 0, args.args());
 }

 void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -422,7 +422,7 @@ void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  encoder.set_output_array(out);
  auto kernel = mod.get_kernel(kernel_name);
  auto [num_blocks, block_dims] = get_launch_args(kernel, idx, large);
-  encoder.add_kernel_node(kernel, num_blocks, block_dims, args.args());
+  encoder.add_kernel_node(kernel, num_blocks, block_dims, 0, args.args());
 }

 } // namespace mlx::core
--- a/mlx/backend/cuda/iterators/general_iterator.cuh
+++ b/mlx/backend/cuda/iterators/general_iterator.cuh
@@ -0,0 +1,121 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <thrust/iterator/iterator_adaptor.h>
+#include <cuda/std/utility>
+
+#include "mlx/backend/cuda/kernel_utils.cuh"
+
+namespace mlx::core::cu {
+
+// Iterator over the elements of a non-contiguous array.
+template <typename Iterator, typename IdxT = int64_t>
+class general_iterator
+    : public thrust::
+          iterator_adaptor<general_iterator<Iterator, IdxT>, Iterator> {
+ public:
+  using super_t =
+      thrust::iterator_adaptor<general_iterator<Iterator, IdxT>, Iterator>;
+
+  using reference = typename super_t::reference;
+  using difference_type = typename super_t::difference_type;
+
+  __host__ __device__ general_iterator(
+      Iterator it,
+      IdxT index,
+      int ndim,
+      Shape shape,
+      Strides strides)
+      : super_t(it),
+        index_(index),
+        ndim_(ndim),
+        shape_(cuda::std::move(shape)),
+        strides_(cuda::std::move(strides)) {}
+
+  __host__ __device__ IdxT index() const {
+    return index_;
+  }
+
+  __host__ __device__ const Shape& shape() const {
+    return shape_;
+  }
+
+  __host__ __device__ const Strides& strides() const {
+    return strides_;
+  }
+
+ private:
+  friend class thrust::iterator_core_access;
+
+  __host__ __device__ bool equal(const general_iterator& other) const {
+    return this->base() == other.base() && this->index() == other.index();
+  }
+
+  __host__ __device__ void advance(difference_type n) {
+    this->index_ += n;
+  }
+
+  __host__ __device__ void increment() {
+    this->index_ += 1;
+  }
+
+  __host__ __device__ void decrement() {
+    this->index_ -= 1;
+  }
+
+  __host__ __device__ difference_type
+  distance_to(const general_iterator& other) const {
+    _CCCL_ASSERT(
+        this->base() == other.base(),
+        "Underlying iterator must point to same base iterator");
+    return other.index() - this->index();
+  }
+
+  // Dereferencing is device-only to avoid accidentally running it on the host.
+  __device__ typename super_t::reference dereference() const {
+    IdxT offset = elem_to_loc(index_, shape_.data(), strides_.data(), ndim_);
+    return *(this->base() + offset);
+  }
+
+  IdxT index_;
+  int ndim_;
+  Shape shape_;
+  Strides strides_;
+};
+
+template <typename IdxT, typename Iterator>
+__host__ __device__ auto make_general_iterator(
+    Iterator it,
+    IdxT index,
+    int ndim,
+    Shape shape,
+    Strides strides) {
+  return general_iterator<Iterator, IdxT>(
+      it, index, ndim, cuda::std::move(shape), cuda::std::move(strides));
+}
+
+template <typename IdxT, typename Iterator>
+auto make_general_iterator(
+    Iterator it,
+    const std::vector<int32_t>& shape,
+    const std::vector<int64_t>& strides) {
+  return make_general_iterator<IdxT>(
+      it, 0, shape.size(), const_param(shape), const_param(strides));
+}
+
+template <typename IdxT, typename Iterator>
+auto make_general_iterators(
+    Iterator it,
+    IdxT size,
+    const std::vector<int32_t>& shape,
+    const std::vector<int64_t>& strides) {
+  auto ndim = shape.size();
+  auto shape_arg = const_param(shape);
+  auto strides_arg = const_param(strides);
+  return std::make_pair(
+      make_general_iterator<IdxT>(it, 0, ndim, shape_arg, strides_arg),
+      make_general_iterator<IdxT>(it, size, ndim, shape_arg, strides_arg));
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/iterators/strided_iterator.cuh
+++ b/mlx/backend/cuda/iterators/strided_iterator.cuh
@@ -0,0 +1,60 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/iterator/iterator_facade.h>
+
+namespace mlx::core::cu {
+
+// RandomAccessIterator for strided access to array entries.
+template <typename Iterator, typename Stride = int64_t>
+class strided_iterator
+    : public thrust::
+          iterator_adaptor<strided_iterator<Iterator, Stride>, Iterator> {
+ public:
+  using super_t =
+      thrust::iterator_adaptor<strided_iterator<Iterator, Stride>, Iterator>;
+
+  using reference = typename super_t::reference;
+  using difference_type = typename super_t::difference_type;
+
+  __host__ __device__ strided_iterator(Iterator it, Stride stride)
+      : super_t(it), stride_(stride) {}
+
+  __host__ __device__ Stride stride() const {
+    return stride_;
+  }
+
+ private:
+  friend class thrust::iterator_core_access;
+
+  __host__ __device__ bool equal(const strided_iterator& other) const {
+    return this->base() == other.base();
+  }
+
+  __host__ __device__ void advance(difference_type n) {
+    this->base_reference() += n * stride_;
+  }
+
+  __host__ __device__ void increment() {
+    this->base_reference() += stride_;
+  }
+
+  __host__ __device__ void decrement() {
+    this->base_reference() -= stride_;
+  }
+
+  __host__ __device__ difference_type
+  distance_to(const strided_iterator& other) const {
+    const difference_type dist = other.base() - this->base();
+    _CCCL_ASSERT(
+        dist % stride() == 0,
+        "Underlying iterator difference must be divisible by the stride");
+    return dist / stride();
+  }
+
+  Stride stride_;
+};
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/layer_norm.cu
+++ b/mlx/backend/cuda/layer_norm.cu
@@ -1,6 +1,7 @@
 // Copyright © 2025 Apple Inc.

 #include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/iterators/strided_iterator.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/backend/cuda/reduce/reduce.cuh"
 #include "mlx/backend/gpu/copy.h"
@@ -104,8 +105,8 @@ __global__ void layer_norm(
    T wn[N_READS];
    T bn[N_READS];
    cub::LoadDirectBlocked(index, x, xn, axis_size);
-    cub::LoadDirectBlocked(index, StridedIterator(w, w_stride), wn, axis_size);
-    cub::LoadDirectBlocked(index, StridedIterator(b, b_stride), bn, axis_size);
+    cub::LoadDirectBlocked(index, strided_iterator(w, w_stride), wn, axis_size);
+    cub::LoadDirectBlocked(index, strided_iterator(b, b_stride), bn, axis_size);
    for (int i = 0; i < N_READS; ++i) {
      float norm = (static_cast<float>(xn[i]) - mean) * normalizer;
      xn[i] = wn[i] * static_cast<T>(norm) + bn[i];
@@ -161,7 +162,7 @@ __global__ void layer_norm_vjp(
    auto index = r * BLOCK_DIM + block.thread_rank();
    cub::LoadDirectBlocked(index, x, xn, axis_size, mean);
    cub::LoadDirectBlocked(index, g, gn, axis_size);
-    cub::LoadDirectBlocked(index, StridedIterator(w, w_stride), wn, axis_size);
+    cub::LoadDirectBlocked(index, strided_iterator(w, w_stride), wn, axis_size);
    for (int i = 0; i < N_READS; i++) {
      float t = static_cast<float>(xn[i]) - mean;
      float wi = wn[i];
@@ -184,7 +185,7 @@ __global__ void layer_norm_vjp(
    T gn[N_READS];
    cub::LoadDirectBlocked(index, x, xn, axis_size);
    cub::LoadDirectBlocked(index, g, gn, axis_size);
-    cub::LoadDirectBlocked(index, StridedIterator(w, w_stride), wn, axis_size);
+    cub::LoadDirectBlocked(index, strided_iterator(w, w_stride), wn, axis_size);
    for (int i = 0; i < N_READS; i++) {
      float xi = (static_cast<float>(xn[i]) - mean) * normalizer;
      float wi = wn[i];
@@ -265,6 +266,7 @@ void LayerNorm::eval_gpu(
          kernel,
          n_rows,
          block_dim(),
+          0,
          x.data<DataType>(),
          w.data<DataType>(),
          b.data<DataType>(),
@@ -377,6 +379,7 @@ void LayerNormVJP::eval_gpu(
                kernel,
                n_rows,
                block_dim(),
+                0,
                x.data<DataType>(),
                w.data<DataType>(),
                g.data<DataType>(),
--- a/mlx/backend/cuda/logsumexp.cu
+++ b/mlx/backend/cuda/logsumexp.cu
@@ -151,6 +151,7 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
          kernel,
          n_rows,
          block_dim(),
+          0,
          in.data<DataType>(),
          out.data<DataType>(),
          axis_size);
--- a/mlx/backend/cuda/matmul.cpp
+++ b/mlx/backend/cuda/matmul.cpp
@@ -2,7 +2,6 @@

 #include "mlx/backend/common/matmul.h"
 #include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/gemv.h"
 #include "mlx/backend/gpu/copy.h"
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"
@@ -354,22 +353,6 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
    batch_shape = {1};
  }

-  if (cu::can_use_gemv(M, N, K, a_transposed, b_transposed)) {
-    cu::gemv(
-        a,
-        b,
-        out,
-        M,
-        N,
-        K,
-        batch_count,
-        batch_shape,
-        a_batch_strides,
-        b_batch_strides,
-        encoder);
-    return;
-  }
-
  /////////////////////////////////////////////////////////////////////////////
  // Invoke cublasLt

--- a/mlx/backend/cuda/matmul/mma.cuh
+++ b/mlx/backend/cuda/matmul/mma.cuh
@@ -0,0 +1,108 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/cuda/matmul/tiles.cuh"
+
+namespace mlx::core::cu {
+
+template <typename U, typename T>
+__device__ inline void
+mma_t(Tile16x16<U>& C, Tile16x16<T>& A, Tile16x16<T>& B) {}
+
+/**
+ * Multiply the 16x16 bfloat16 tiles and accumulate the result in one 16x16
+ * float tile.
+ *
+ * We actually perform C += A @ B.T
+ */
+__device__ inline void mma_t(
+    Tile16x16<float>& C,
+    Tile16x16<__nv_bfloat16>& A,
+    Tile16x16<__nv_bfloat16>& B) {
+  asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+      "{%0, %1, %2, %3}, "
+      "{%4, %5, %6, %7}, "
+      "{%8, %9}, "
+      "{%10, %11, %12, %13};"
+
+      // D matrix
+      : "+f"(C.values[0].x),
+        "+f"(C.values[0].y),
+        "+f"(C.values[1].x),
+        "+f"(C.values[1].y)
+
+      // A matrix
+      : "r"(*(uint32_t*)(&A.values[0])),
+        "r"(*(uint32_t*)(&A.values[1])),
+        "r"(*(uint32_t*)(&A.values[2])),
+        "r"(*(uint32_t*)(&A.values[3])),
+
+        // B matrix
+        "r"(*(uint32_t*)(&B.values[0])),
+        "r"(*(uint32_t*)(&B.values[2])),
+
+        // C matrix
+        "f"(C.values[0].x),
+        "f"(C.values[0].y),
+        "f"(C.values[1].x),
+        "f"(C.values[1].y));
+  asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+      "{%0, %1, %2, %3}, "
+      "{%4, %5, %6, %7}, "
+      "{%8, %9}, "
+      "{%10, %11, %12, %13};"
+
+      // D matrix
+      : "+f"(C.values[2].x),
+        "+f"(C.values[2].y),
+        "+f"(C.values[3].x),
+        "+f"(C.values[3].y)
+
+      // A matrix
+      : "r"(*(uint32_t*)(&A.values[0])),
+        "r"(*(uint32_t*)(&A.values[1])),
+        "r"(*(uint32_t*)(&A.values[2])),
+        "r"(*(uint32_t*)(&A.values[3])),
+
+        // B matrix
+        "r"(*(uint32_t*)(&B.values[1])),
+        "r"(*(uint32_t*)(&B.values[3])),
+
+        // C matrix
+        "f"(C.values[2].x),
+        "f"(C.values[2].y),
+        "f"(C.values[3].x),
+        "f"(C.values[3].y));
+}
+
+/**
+ * Multiply larger register tiles by delegating to mma_t.
+ */
+template <typename U, typename T, int M, int N, int K>
+__device__ inline void mma_t(
+    RegisterTile<U, M, N>& C,
+    RegisterTile<T, M, K>& A,
+    RegisterTile<T, N, K>& B) {
+  constexpr int TILES_M = RegisterTile<T, M, K>::TILES_Y;
+  constexpr int TILES_K = RegisterTile<T, M, K>::TILES_X;
+  constexpr int TILES_N = RegisterTile<T, N, K>::TILES_Y;
+
+  MLX_UNROLL
+  for (int k = 0; k < TILES_K; k++) {
+    MLX_UNROLL
+    for (int m = 0; m < TILES_M; m++) {
+      MLX_UNROLL
+      for (int n = 0; n < TILES_N; n++) {
+        mma_t(
+            C.data[m * TILES_N + n],
+            A.data[m * TILES_K + k],
+            B.data[n * TILES_K + k]);
+      }
+    }
+  }
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/matmul/tiles.cuh
+++ b/mlx/backend/cuda/matmul/tiles.cuh
@@ -0,0 +1,419 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#define MLX_UNROLL _Pragma("unroll")
+
+namespace mlx::core::cu {
+
+// Maps T to its 2-vector type: float -> float2, double -> double2, etc.
+template <typename T>
+struct Vector2;
+template <>
+struct Vector2<double> {
+  using type = double2;
+};
+template <>
+struct Vector2<float> {
+  using type = float2;
+};
+template <>
+struct Vector2<__half> {
+  using type = __half2;
+};
+template <>
+struct Vector2<__nv_bfloat16> {
+  using type = __nv_bfloat162;
+};
+template <typename T>
+using Vector2_t = typename Vector2<T>::type;
+
+/**
+ * The basic building block for Ampere mmas. A 16x16 tile distributed across
+ * the warp.
+ *
+ * Each thread holds 8 values. They are distributed according to
+ * https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-fragment-mma-16816-float
+ *
+ * For usage instructions see the individual methods, e.g. load().
+ */
+template <typename T>
+struct Tile16x16 {
+  using T2 = Vector2_t<T>;
+
+  T2 values[4];
+
+  __device__ inline void fill(T v) {
+    T2 v2 = {v, v};
+    for (int i = 0; i < 4; i++) {
+      values[i] = v2;
+    }
+  }
+
+  /**
+   * Load a 16x16 tile from shared memory.
+   *
+   * The instruction is a bit weird in the sense that the address provided by
+   * each thread and the elements loaded are not the same.
+   *
+   * We load 4 8x8 tiles. The tile rows are stored contiguously in memory. As a
+   * result the warp provides 4*8 = 32 addresses one per row.
+   *
+   * Threads 0-7 provide the addresses for the first tile, 8-15 for the second
+   * and so on. For instance to load a non swizzled tile we would do
+   *
+   *    base_addr + (laneid % 16) * BK + (laneid / 16) * 8
+   *
+   * See
+   * https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-ldmatrix
+   */
+  __device__ inline void load(uint32_t row_address) {
+    if constexpr (
+        std::is_same_v<T2, __nv_bfloat162> || std::is_same_v<T2, __half2>) {
+      asm volatile(
+          "ldmatrix.sync.aligned.m8n8.x4.shared::cta.b16 {%0, %1, %2, %3}, [%4];\n"
+          : "=r"(*(uint32_t*)&(values[0])),
+            "=r"(*(uint32_t*)&(values[1])),
+            "=r"(*(uint32_t*)&(values[2])),
+            "=r"(*(uint32_t*)&(values[3]))
+          : "r"(row_address));
+    }
+  }
+
+  /**
+   * Store the tile to the address pointed to by `x`.
+   *
+   * The provided pointer is a generic pointer but this is meant to be used to
+   * store to global memory. For storing to shared memory we should use
+   * `stmatrix`.
+   *
+   * This also showcases the format of the tile quite nicely. Each register is
+   * holding two adjacent values. The indices are
+   *
+   *    row + 0, col + 0
+   *    row + 8, col + 0
+   *    row + 0, col + 8
+   *    row + 8, col + 8
+   *
+   * Given that we are dealing with Vector2_t<U> the column offsets are 4
+   * instead of 8.
+   */
+  template <typename U>
+  __device__ inline void store_global(U* x, int N) {
+    using U2 = Vector2_t<U>;
+    U2* x2 = reinterpret_cast<U2*>(x);
+    const int laneid = threadIdx.x % 32;
+    const int row = laneid / 4;
+    const int col = laneid % 4;
+    if constexpr (std::is_same_v<U2, T2>) {
+      x2[(row + 0) * (N / 2) + col + 0] = values[0];
+      x2[(row + 0) * (N / 2) + col + 4] = values[2];
+      x2[(row + 8) * (N / 2) + col + 0] = values[1];
+      x2[(row + 8) * (N / 2) + col + 4] = values[3];
+    } else if constexpr (
+        std::is_same_v<T2, float2> && std::is_same_v<U, __nv_bfloat16>) {
+      x2[(row + 0) * (N / 2) + col + 0] =
+          __floats2bfloat162_rn(values[0].x, values[0].y);
+      x2[(row + 0) * (N / 2) + col + 4] =
+          __floats2bfloat162_rn(values[2].x, values[2].y);
+      x2[(row + 8) * (N / 2) + col + 0] =
+          __floats2bfloat162_rn(values[1].x, values[1].y);
+      x2[(row + 8) * (N / 2) + col + 4] =
+          __floats2bfloat162_rn(values[3].x, values[3].y);
+    }
+  }
+
+  template <typename U>
+  __device__ inline void store_global_safe(U* x, int N, int max_rows) {
+    const int laneid = threadIdx.x % 32;
+    const int row = laneid / 4;
+    const int col = laneid % 4;
+    if (row < max_rows) {
+      x[(row + 0) * N + 2 * col + 0] = static_cast<U>(values[0].x);
+      x[(row + 0) * N + 2 * col + 1] = static_cast<U>(values[0].y);
+      x[(row + 0) * N + 2 * col + 8] = static_cast<U>(values[2].x);
+      x[(row + 0) * N + 2 * col + 9] = static_cast<U>(values[2].y);
+    }
+    if (row + 8 < max_rows) {
+      x[(row + 8) * N + 2 * col + 0] = static_cast<U>(values[1].x);
+      x[(row + 8) * N + 2 * col + 1] = static_cast<U>(values[1].y);
+      x[(row + 8) * N + 2 * col + 8] = static_cast<U>(values[3].x);
+      x[(row + 8) * N + 2 * col + 9] = static_cast<U>(values[3].y);
+    }
+  }
+};
+
+/**
+ * A simple container of multiple Tile16x16.
+ *
+ * Provides utility functions for loading and manipulating collections of basic
+ * tiles.
+ */
+template <typename T, int ROWS_, int COLS_>
+struct RegisterTile {
+  static constexpr int ROWS = ROWS_;
+  static constexpr int COLS = COLS_;
+  static constexpr int TILES_X = COLS / 16;
+  static constexpr int TILES_Y = ROWS / 16;
+
+  Tile16x16<T> data[TILES_X * TILES_Y];
+
+  __device__ inline void fill(T v) {
+    MLX_UNROLL
+    for (int i = 0; i < TILES_Y; i++) {
+      MLX_UNROLL
+      for (int j = 0; j < TILES_X; j++) {
+        data[i * TILES_X + j].fill(v);
+      }
+    }
+  }
+
+  template <typename Tile>
+  __device__ inline void
+  load(Tile& tile, uint32_t base_address, int row, int col) {
+    MLX_UNROLL
+    for (int i = 0; i < TILES_Y; i++) {
+      MLX_UNROLL
+      for (int j = 0; j < TILES_X; j++) {
+        data[i * TILES_X + j].load(
+            tile.loc(base_address, row + i * 16, col + j * 16));
+      }
+    }
+  }
+
+  template <typename U>
+  __device__ inline void store_global(U* x, int N, int row, int col) {
+    MLX_UNROLL
+    for (int i = 0; i < TILES_Y; i++) {
+      MLX_UNROLL
+      for (int j = 0; j < TILES_X; j++) {
+        data[i * TILES_X + j].store_global(
+            x + (row + i * 16) * N + col + j * 16, N);
+      }
+    }
+  }
+
+  template <typename U>
+  __device__ inline void
+  store_global_safe(U* x, int N, int row, int col, int max_rows) {
+    MLX_UNROLL
+    for (int i = 0; i < TILES_Y; i++) {
+      MLX_UNROLL
+      for (int j = 0; j < TILES_X; j++) {
+        data[i * TILES_X + j].store_global_safe(
+            x + (row + i * 16) * N + col + j * 16, N, max_rows - row - i * 16);
+      }
+    }
+  }
+};
+
+template <typename T, int ROWS_, int COLS_>
+struct SharedTile {
+  static constexpr int ROWS = ROWS_;
+  static constexpr int COLS = COLS_;
+  static constexpr int TILES_X = COLS / 16;
+  static constexpr int TILES_Y = ROWS / 16;
+  static constexpr int NUMEL = ROWS * COLS;
+
+  // Swizzle taken from ThunderKittens.
+  //
+  // See include/types/shared/st.cuh
+  //
+  // I do feel that it is too math heavy and can be improved. Also the math is
+  // done every time although the addresses don't change from load to load. I
+  // guess we are expecting the compiler to figure that out.
+  static constexpr int swizzle_bytes =
+      (sizeof(T) == 2 ? (TILES_X % 4 == 0 ? 128 : (TILES_X % 2 == 0 ? 64 : 32))
+                      : (sizeof(T) == 4 ? (TILES_X % 2 == 0 ? 128 : 64) : 0));
+
+  T data[ROWS * COLS];
+
+  // Return a pointer to the element at (row, col) using the swizzle.
+  __device__ static inline T* ptr(T* ptr, int row, int col) {
+    if constexpr (swizzle_bytes > 0) {
+      static constexpr int swizzle_repeat = swizzle_bytes * 8;
+      static constexpr int subtile_cols = swizzle_bytes / sizeof(T);
+      const int outer_idx = col / subtile_cols;
+      const uint64_t addr =
+          (uint64_t)(&ptr
+                         [outer_idx * ROWS * subtile_cols + row * subtile_cols +
+                          col % subtile_cols]);
+      const int swizzle = ((addr % swizzle_repeat) >> 7) << 4;
+      return (T*)(addr ^ swizzle);
+    } else {
+      return ptr + row * COLS + col;
+    }
+  }
+
+  // Return the location of the element at (row, col) using the swizzle.
+  __device__ static inline uint32_t loc(uint32_t ptr, int row, int col) {
+    if constexpr (swizzle_bytes > 0) {
+      static constexpr int swizzle_repeat = swizzle_bytes * 8;
+      static constexpr int subtile_cols = swizzle_bytes / sizeof(T);
+      const int outer_idx = col / subtile_cols;
+      const uint32_t addr = ptr +
+          sizeof(T) *
+              (outer_idx * ROWS * subtile_cols + row * subtile_cols +
+               col % subtile_cols);
+      const int swizzle = ((addr % swizzle_repeat) >> 7) << 4;
+      return (addr ^ swizzle);
+    } else {
+      return ptr + sizeof(T) * (row * COLS + col);
+    }
+  }
+
+  // Convenience functions to edit elements going through the swizzle.
+  __device__ inline T& operator()(int row, int col) {
+    return *ptr(data, row, col);
+  }
+  __device__ inline void store(float4& v, int row, int col) {
+    *(reinterpret_cast<float4*>(ptr(data, row, col))) = v;
+  }
+  __device__ inline void store(float2& v, int row, int col) {
+    *(reinterpret_cast<float2*>(ptr(data, row, col))) = v;
+  }
+  __device__ inline void store(float& v, int row, int col) {
+    *(reinterpret_cast<float*>(ptr(data, row, col))) = v;
+  }
+  // Store the N elements of v in row `row` starting at column `col`. Arrays
+  // that are exactly 16, 8 or 4 bytes wide are written with a single float4 /
+  // float2 / float store; any other size is written element by element, each
+  // one going through the swizzle.
+  template <int N>
+  __device__ inline void store(T (&v)[N], int row, int col) {
+    constexpr auto num_bytes = sizeof(T) * N;
+    if constexpr (num_bytes == 16) {
+      store(*(reinterpret_cast<float4*>(&v[0])), row, col);
+    } else if constexpr (num_bytes == 8) {
+      store(*(reinterpret_cast<float2*>(&v[0])), row, col);
+    } else if constexpr (num_bytes == 4) {
+      store(*(reinterpret_cast<float*>(&v[0])), row, col);
+    } else {
+      MLX_UNROLL
+      for (int k = 0; k < N; k++) {
+        T* dst = ptr(data, row, col + k);
+        *dst = v[k];
+      }
+    }
+  }
+};
+
+/**
+ * Synchronously fill a tile from global memory.
+ *
+ * Every thread reads 16 bytes at a time from x (a row-major matrix whose rows
+ * are N elements apart) and immediately stores them in the tile. The threads
+ * are spread over the tile using threadIdx.x, so the block is expected to run
+ * NUM_WARPS * 32 threads.
+ */
+template <int NUM_WARPS, typename T, typename Tile>
+__device__ inline void load(Tile& tile, const T* x, int N) {
+  constexpr int NUM_THREADS = NUM_WARPS * 32;
+  constexpr int ELEMENTS_PER_LOAD = sizeof(float4) / sizeof(T);
+  constexpr int NUM_LOADS = Tile::NUMEL / ELEMENTS_PER_LOAD;
+  constexpr int NUM_LOADS_PER_THREAD = NUM_LOADS / NUM_THREADS;
+  constexpr int NUM_LOADS_PER_ROW = Tile::COLS / ELEMENTS_PER_LOAD;
+  constexpr int STEP_ROWS = NUM_THREADS / NUM_LOADS_PER_ROW;
+
+  // First (row, chunk) handled by this thread.
+  const int row = threadIdx.x / NUM_LOADS_PER_ROW;
+  const int col = threadIdx.x % NUM_LOADS_PER_ROW;
+  const int first_col = col * ELEMENTS_PER_LOAD;
+
+  const T* src = x + (row * N + first_col);
+
+  MLX_UNROLL
+  for (int i = 0; i < NUM_LOADS_PER_THREAD; i++) {
+    float4 chunk = *(reinterpret_cast<const float4*>(src + i * STEP_ROWS * N));
+    tile.store(chunk, row + i * STEP_ROWS, first_col);
+  }
+}
+
+/**
+ * Copy 16 bytes from the global memory address pointed to by x to the smem
+ * address pointed to by row_address.
+ *
+ * A simple wrapper over the PTX cp.async instruction. The copy is only issued
+ * here; it is not guaranteed to have landed in shared memory until the thread
+ * waits for it (see cp_async_wait_all).
+ *
+ * @param row_address 32-bit shared memory address of the destination.
+ * @param x           Global memory source.
+ *
+ * NOTE(review): PTX presumably requires both addresses to be 16-byte aligned
+ * for a 16-byte cp.async -- confirm against the PTX ISA.
+ */
+template <typename T>
+__device__ inline void cp_async_16(uint32_t row_address, const T* x) {
+  asm volatile(
+      "cp.async.ca.shared::cta.global [%0], [%1], 16;\n" ::"r"(row_address),
+      "l"(reinterpret_cast<const int4*>(x)));
+}
+
+/**
+ * Submit all the previous async copies to be executed.
+ *
+ * Emits the PTX cp.async.commit_group instruction, which closes the group of
+ * cp.async operations this thread has issued so far.
+ */
+__device__ inline void cp_async_commit() {
+  asm volatile("cp.async.commit_group;\n" ::);
+}
+
+/**
+ * Wait for all the async copies to finish.
+ *
+ * Emits the PTX cp.async.wait_all instruction. This only covers the copies
+ * issued by the calling thread; a block-level barrier is still needed before
+ * reading data that other threads copied.
+ */
+__device__ inline void cp_async_wait_all() {
+  asm volatile("cp.async.wait_all;\n" ::);
+}
+
+/**
+ * The asynchronous equivalent of load.
+ *
+ * Fills the tile from global memory by issuing one 16-byte async copy per
+ * chunk. Nothing is guaranteed to have arrived in shared memory until the
+ * copies have been committed and waited for.
+ *
+ * It should be used as follows
+ *
+ *    load_async(...)
+ *    load_async(...)
+ *    cp_async_commit()
+ *    do_other_stuff()
+ *    cp_async_wait_all()
+ *    do_stuff_with_shmem()
+ *
+ * base_address is the 32-bit shared memory address of the tile's data; x is a
+ * row-major matrix whose rows are N elements apart.
+ */
+template <int NUM_WARPS, typename T, typename Tile>
+__device__ inline void
+load_async(Tile& tile, uint32_t base_address, const T* x, int N) {
+  constexpr int NUM_THREADS = NUM_WARPS * 32;
+  constexpr int ELEMENTS_PER_LOAD = sizeof(float4) / sizeof(T);
+  constexpr int NUM_LOADS = Tile::NUMEL / ELEMENTS_PER_LOAD;
+  constexpr int NUM_LOADS_PER_THREAD = NUM_LOADS / NUM_THREADS;
+  constexpr int NUM_LOADS_PER_ROW = Tile::COLS / ELEMENTS_PER_LOAD;
+  constexpr int STEP_ROWS = NUM_THREADS / NUM_LOADS_PER_ROW;
+
+  // First (row, chunk) handled by this thread.
+  const int row = threadIdx.x / NUM_LOADS_PER_ROW;
+  const int col = threadIdx.x % NUM_LOADS_PER_ROW;
+  const int first_col = col * ELEMENTS_PER_LOAD;
+
+  const T* src = x + (row * N + first_col);
+
+  MLX_UNROLL
+  for (int i = 0; i < NUM_LOADS_PER_THREAD; i++) {
+    const uint32_t dst = tile.loc(base_address, row + i * STEP_ROWS, first_col);
+    cp_async_16(dst, src + i * STEP_ROWS * N);
+  }
+}
+
+/**
+ * Same as load_async, but only the first max_rows rows of the tile are read
+ * from x. The remaining rows of the tile are filled with zeros using regular
+ * (synchronous) stores, so x is never read for rows >= max_rows.
+ */
+template <int NUM_WARPS, typename T, typename Tile>
+__device__ inline void load_async_safe(
+    Tile& tile,
+    uint32_t base_address,
+    const T* x,
+    int N,
+    int max_rows) {
+  constexpr int NUM_THREADS = NUM_WARPS * 32;
+  constexpr int ELEMENTS_PER_LOAD = sizeof(float4) / sizeof(T);
+  constexpr int NUM_LOADS = Tile::NUMEL / ELEMENTS_PER_LOAD;
+  constexpr int NUM_LOADS_PER_THREAD = NUM_LOADS / NUM_THREADS;
+  constexpr int NUM_LOADS_PER_ROW = Tile::COLS / ELEMENTS_PER_LOAD;
+  constexpr int STEP_ROWS = NUM_THREADS / NUM_LOADS_PER_ROW;
+
+  // First (row, chunk) handled by this thread.
+  const int row = threadIdx.x / NUM_LOADS_PER_ROW;
+  const int col = threadIdx.x % NUM_LOADS_PER_ROW;
+  const int first_col = col * ELEMENTS_PER_LOAD;
+
+  const T* src = x + (row * N + first_col);
+
+  MLX_UNROLL
+  for (int i = 0; i < NUM_LOADS_PER_THREAD; i++) {
+    const int tile_row = row + i * STEP_ROWS;
+    if (tile_row >= max_rows) {
+      // Past the end of the input: pad the tile with zeros.
+      float4 zeros = {0, 0, 0, 0};
+      tile.store(zeros, tile_row, first_col);
+    } else {
+      cp_async_16(
+          tile.loc(base_address, tile_row, first_col), src + i * STEP_ROWS * N);
+    }
+  }
+}
+
+} // namespace mlx::core::cu
--- a/mlx/backend/cuda/primitives.cu
+++ b/mlx/backend/cuda/primitives.cu
@@ -81,7 +81,6 @@ NO_GPU(Hadamard)
 NO_GPU(Load)
 NO_GPU_MULTI(LUF)
 NO_GPU_MULTI(QRF)
-NO_GPU(QuantizedMatmul)
 NO_GPU(SegmentedMM)
 NO_GPU_MULTI(SVD)
 NO_GPU(Inverse)
--- a/mlx/backend/cuda/quantized/affine_quantize.cu
+++ b/mlx/backend/cuda/quantized/affine_quantize.cu
@@ -2,30 +2,17 @@

 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/kernel_utils.cuh"
-#include "mlx/backend/gpu/copy.h"
+#include "mlx/backend/cuda/quantized/quantized_utils.cuh"
 #include "mlx/dtype_utils.h"
-#include "mlx/fast_primitives.h"

 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
-#include <nvtx3/nvtx3.hpp>

 namespace mlx::core {
 namespace cu {

 namespace cg = cooperative_groups;

-template <int bits, int wsize = 8>
-inline constexpr __device__ short get_pack_factor() {
-  return (bits == 3 || bits == 5) ? 8 : (bits == 6 ? 4 : wsize / bits);
-}
-
-template <int bits, int wsize = 8>
-inline constexpr __device__ short get_bytes_per_pack() {
-  constexpr int power_of_2_bits = (bits & (bits - 1)) == 0;
-  return power_of_2_bits ? (wsize / 8) : (bits == 5 ? 5 : 3);
-}
-
 template <typename T, int group_size, int bits>
 __global__ void
 affine_quantize(const T* w, uint8_t* out, T* scales, T* biases, size_t size) {
@@ -240,144 +227,102 @@ __global__ void affine_dequantize(
 }

 } // namespace cu
-namespace {

-inline array ensure_row_contiguous(
-    const array& x,
+void affine_quantize(
+    const array& w,
+    array& wq,
+    array& scales,
+    array& biases,
+    int group_size_,
+    int bits_,
    cu::CommandEncoder& enc,
    const Stream& s) {
-  if (!x.flags().row_contiguous) {
-    array x_copy = contiguous_copy_gpu(x, s);
-    enc.add_temporary(x_copy);
-    return x_copy;
-  } else {
-    return x;
-  }
-}
-
-} // namespace
-
-template <typename F>
-void dispatch_groups(int group_size, F&& f) {
-  switch (group_size) {
-    case 32:
-      f(std::integral_constant<int, 32>{});
-      break;
-    case 64:
-      f(std::integral_constant<int, 64>{});
-      break;
-    case 128:
-      f(std::integral_constant<int, 128>{});
-      break;
-  }
-}
-
-template <typename F>
-void dispatch_bits(int bits, F&& f) {
-  switch (bits) {
-    case 2:
-      f(std::integral_constant<int, 2>{});
-      break;
-    case 3:
-      f(std::integral_constant<int, 3>{});
-      break;
-    case 4:
-      f(std::integral_constant<int, 4>{});
-      break;
-    case 5:
-      f(std::integral_constant<int, 5>{});
-      break;
-    case 6:
-      f(std::integral_constant<int, 6>{});
-      break;
-    case 8:
-      f(std::integral_constant<int, 8>{});
-      break;
-  }
-}
-
-void fast::AffineQuantize::eval_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  auto& w_pre = inputs[0];
-  auto& out = outputs[0];
-  out.set_data(allocator::malloc(out.nbytes()));
-
-  auto& s = stream();
-  auto& d = cu::device(s.device);
-  auto& enc = d.get_command_encoder(s);
-
-  auto w = ensure_row_contiguous(w_pre, enc, s);
-  enc.set_input_array(w);
-  if (dequantize_) {
-    auto scales = ensure_row_contiguous(inputs[1], enc, s);
-    auto biases = ensure_row_contiguous(inputs[2], enc, s);
-    enc.set_input_array(scales);
-    enc.set_input_array(biases);
-    enc.set_output_array(out);
-  } else {
-    auto& scales = outputs[1];
-    auto& biases = outputs[2];
-    scales.set_data(allocator::malloc(scales.nbytes()));
-    biases.set_data(allocator::malloc(biases.nbytes()));
-    enc.set_output_array(out);
-    enc.set_output_array(scales);
-    enc.set_output_array(biases);
-  }
-
-  auto dtype = dequantize_ ? outputs[0].dtype() : inputs[0].dtype();
-
-  // Treat uint32 as uint8 in kernel
-  int uint8_per_uint32 = 4;
-  int packs_per_int = (bits_ == 3 || bits_ == 5) ? 8
-      : bits_ == 6                               ? 4
-                                                 : 8 / bits_;
-  int per_thread = dequantize_ ? packs_per_int : group_size_ / WARP_SIZE;
-  size_t size =
-      dequantize_ ? out.size() / packs_per_int : w.size() / per_thread;
+  // Calculate the number of elements per thread
+  int per_thread = group_size_ / WARP_SIZE;
+  size_t size = w.size() / per_thread;

+  // Calculate the thread grid that we need to launch
  bool large = size > UINT_MAX;
  auto grid_shape = w.shape();
+  grid_shape.back() /= per_thread;

-  if (dequantize_) {
-    grid_shape.back() *= uint8_per_uint32;
-  } else {
-    grid_shape.back() /= per_thread;
-  }
-
-  dispatch_float_types(dtype, "affine_quantize", [&](auto type_tag) {
+  enc.set_input_array(w);
+  enc.set_output_array(wq);
+  enc.set_output_array(scales);
+  enc.set_output_array(biases);
+  dispatch_float_types(w.dtype(), "affine_quantize", [&](auto type_tag) {
    dispatch_groups(group_size_, [&](auto group_size) {
      dispatch_bits(bits_, [&](auto bits) {
-        using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
-        if (dequantize_) {
-          auto kernel =
-              cu::affine_dequantize<DataType, group_size.value, bits.value>;
-          auto [num_blocks, block_dims] =
-              get_launch_args(kernel, size, grid_shape, w.strides(), large);
-          enc.add_kernel_node(
-              kernel,
-              num_blocks,
-              block_dims,
-              w.data<uint8_t>(),
-              inputs[1].data<DataType>(),
-              inputs[2].data<DataType>(),
-              out.data<DataType>(),
-              out.size());
-        } else {
-          auto kernel =
-              cu::affine_quantize<DataType, group_size.value, bits.value>;
-          auto [num_blocks, block_dims] =
-              get_launch_args(kernel, size, grid_shape, w.strides(), large);
-          enc.add_kernel_node(
-              kernel,
-              num_blocks,
-              block_dims,
-              w.data<DataType>(),
-              out.data<uint8_t>(),
-              outputs[1].data<DataType>(),
-              outputs[2].data<DataType>(),
-              w.size());
-        }
+        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        auto kernel = cu::affine_quantize<T, group_size.value, bits.value>;
+        auto [num_blocks, block_dims] =
+            get_launch_args(kernel, size, grid_shape, w.strides(), large);
+        enc.add_kernel_node(
+            kernel,
+            num_blocks,
+            block_dims,
+            0,
+            w.data<T>(),
+            wq.data<uint8_t>(),
+            scales.data<T>(),
+            biases.data<T>(),
+            w.size());
+      });
+    });
+  });
+}
+
+void affine_dequantize(
+    const array& wq,
+    const array& scales,
+    const array& biases,
+    array& w,
+    int group_size_,
+    int bits_,
+    cu::CommandEncoder& enc,
+    const Stream& s) {
+  // Calculate how many numbers we pack together. For 2, 4, 8 bits we pack in
+  // one uint8, for 3, 6 in 3 uint8 and for 5 in 5 uint8.
+  constexpr int uint8_per_uint32 = 4;
+  int packs_per_int;
+  switch (bits_) {
+    case 3:
+    case 5:
+      packs_per_int = 8;
+      break;
+    case 6:
+      packs_per_int = 4;
+      break;
+    default:
+      packs_per_int = 8 / bits_;
+  }
+
+  size_t size = w.size() / packs_per_int;
+  bool large = size > UINT_MAX;
+  auto grid_shape = w.shape();
+  grid_shape.back() *= uint8_per_uint32;
+
+  enc.set_input_array(wq);
+  enc.set_input_array(scales);
+  enc.set_input_array(biases);
+  enc.set_output_array(w);
+  dispatch_float_types(w.dtype(), "affine_quantize", [&](auto type_tag) {
+    dispatch_groups(group_size_, [&](auto group_size) {
+      dispatch_bits(bits_, [&](auto bits) {
+        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+        auto kernel = cu::affine_dequantize<T, group_size.value, bits.value>;
+        auto [num_blocks, block_dims] =
+            get_launch_args(kernel, size, grid_shape, w.strides(), large);
+        enc.add_kernel_node(
+            kernel,
+            num_blocks,
+            block_dims,
+            0,
+            wq.data<uint8_t>(),
+            scales.data<T>(),
+            biases.data<T>(),
+            w.data<T>(),
+            w.size());
      });
    });
  });
--- a/mlx/backend/cuda/quantized/qmm.cu
+++ b/mlx/backend/cuda/quantized/qmm.cu
@@ -0,0 +1,228 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/backend/cuda/matmul/mma.cuh"
+#include "mlx/backend/cuda/matmul/tiles.cuh"
+#include "mlx/backend/cuda/quantized/quantized_utils.cuh"
+#include "mlx/dtype_utils.h"
+
+namespace mlx::core {
+
+namespace cu {
+
+/**
+ * Dequantize a tile of packed weights into shared memory.
+ *
+ * Each thread repeatedly reads one 32-bit word of packed values from x,
+ * unpacks it `bits` bits at a time, applies value * scale + bias with the
+ * scale and bias read for that word, and stores the results in the tile.
+ *
+ * N is the number of (unpacked) elements per row of the quantized matrix; the
+ * scales and biases hold one entry per group_size elements of a row.
+ *
+ * NOTE(review): the unpacking shifts a single uint32 by j * bits, so this
+ * looks like it only supports bit widths that divide 8 -- confirm before
+ * using it with 3, 5 or 6 bits.
+ */
+template <int NUM_WARPS, int group_size, int bits, typename T, typename Tile>
+__device__ inline void load_quantized(
+    Tile& tile,
+    const uint8_t* x,
+    const T* scales,
+    const T* biases,
+    int N) {
+  constexpr int NUM_THREADS = NUM_WARPS * 32;
+  constexpr int ELEMENTS_PER_LOAD = sizeof(uint32_t) * get_pack_factor<bits>();
+  constexpr int NUM_LOADS = Tile::NUMEL / ELEMENTS_PER_LOAD;
+  constexpr int NUM_LOADS_PER_THREAD = NUM_LOADS / NUM_THREADS;
+  constexpr int NUM_LOADS_PER_ROW = Tile::COLS / ELEMENTS_PER_LOAD;
+  constexpr int STEP_ROWS = NUM_THREADS / NUM_LOADS_PER_ROW;
+  constexpr int MASK = (1 << bits) - 1;
+
+  // First (row, word) handled by this thread.
+  const int row = threadIdx.x / NUM_LOADS_PER_ROW;
+  const int col = threadIdx.x % NUM_LOADS_PER_ROW;
+
+  // Row strides of the packed data and of the scales / biases.
+  const int Nx = N / get_pack_factor<bits>();
+  const int Ng = N / group_size;
+
+  const uint8_t* packed =
+      x + (row * Nx + col * (ELEMENTS_PER_LOAD / get_pack_factor<bits>()));
+  const int group_offset = row * Ng + col * ELEMENTS_PER_LOAD / group_size;
+  const T* scale_ptr = scales + group_offset;
+  const T* bias_ptr = biases + group_offset;
+
+  MLX_UNROLL
+  for (int i = 0; i < NUM_LOADS_PER_THREAD; i++) {
+    const uint32_t word =
+        *reinterpret_cast<const uint32_t*>(packed + i * STEP_ROWS * Nx);
+    const T scale = scale_ptr[i * STEP_ROWS * Ng];
+    const T bias = bias_ptr[i * STEP_ROWS * Ng];
+
+    T values[ELEMENTS_PER_LOAD];
+    MLX_UNROLL
+    for (int j = 0; j < ELEMENTS_PER_LOAD; j++) {
+      values[j] = static_cast<T>((word >> (j * bits)) & MASK) * scale + bias;
+    }
+    tile.store(values, row + i * STEP_ROWS, col * ELEMENTS_PER_LOAD);
+  }
+}
+
+/**
+ * Quantized matmul kernel for a transposed weight matrix.
+ *
+ * Every block computes one BM x BN tile of y. x is read as an M x K row-major
+ * matrix, w holds N rows of K packed values each (with K / group_size scales
+ * and biases per row) and y is written as an M x N row-major matrix.
+ *
+ * The block runs WARPS_M * WARPS_N warps and needs
+ * sizeof(T) * (BM * BK + BN * BK) bytes of dynamic shared memory: one tile of
+ * x followed by one tile of dequantized w.
+ *
+ * When aligned_M is false and the block is the last one along M, rows past M
+ * are zero-filled on load and skipped on store. There is no such handling
+ * along N or K.
+ */
+template <
+    typename T,
+    int BM,
+    int BN,
+    int BK,
+    int group_size,
+    int bits,
+    bool aligned_M>
+__global__ void qmm_t(
+    const T* x,
+    const uint8_t* w,
+    const T* scales,
+    const T* biases,
+    T* y,
+    int M,
+    int N,
+    int K) {
+  constexpr int WARPS_M = 2;
+  constexpr int WARPS_N = 4;
+  constexpr int NUM_WARPS = WARPS_M * WARPS_N;
+  constexpr int WARP_STEP_M = BM / WARPS_M;
+  constexpr int WARP_STEP_N = BN / WARPS_N;
+
+  // Position of this warp in the WARPS_M x WARPS_N grid of warps and the
+  // offset of its sub-tile inside the block tile.
+  const int warpid = threadIdx.x / 32;
+  const int laneid = threadIdx.x % 32;
+  const int wm = warpid / WARPS_N;
+  const int wn = warpid % WARPS_N;
+  const int offset_m = wm * WARP_STEP_M;
+  const int offset_n = wn * WARP_STEP_N;
+
+  // Carve the x tile and the w tile out of the dynamic shared memory.
+  extern __shared__ char shmem[];
+  SharedTile<T, BM, BK>(&xs)[1] = *(SharedTile<T, BM, BK>(*)[1])(&shmem[0]);
+  SharedTile<T, BN, BK>(&ws)[1] =
+      *(SharedTile<T, BN, BK>(*)[1])(&shmem[1 * sizeof(T) * BM * BK]);
+
+  RegisterTile<float, BM / WARPS_M, BN / WARPS_N> C;
+  RegisterTile<T, BM / WARPS_M, 16> A;
+  RegisterTile<T, BN / WARPS_N, 16> B;
+
+  // Number of valid rows of x (and y) left for this block.
+  const int max_rows = M - blockIdx.y * BM;
+
+  // Move all the pointers to the start of this block's tile.
+  x += blockIdx.y * BM * K;
+  w += blockIdx.x * BN * K / get_pack_factor<bits>();
+  scales += blockIdx.x * BN * K / group_size;
+  biases += blockIdx.x * BN * K / group_size;
+  y += blockIdx.y * BM * N + blockIdx.x * BN;
+
+  C.fill(0);
+
+  // Only one shared memory buffer is used so tic stays 0.
+  int tic = 0;
+  uint32_t base_addr_xs[1], base_addr_ws[1];
+  base_addr_xs[0] = __cvta_generic_to_shared(&xs[0].data[0]);
+  base_addr_ws[0] = __cvta_generic_to_shared(&ws[0].data[0]);
+
+  if (aligned_M || max_rows >= BM) {
+    for (int k_block = 0; k_block < K; k_block += BK) {
+      // Start the async copy of x and dequantize w while it is in flight.
+      load_async<NUM_WARPS>(xs[tic], base_addr_xs[tic], x + k_block, K);
+      cp_async_commit();
+      load_quantized<NUM_WARPS, group_size, bits>(
+          ws[tic],
+          w + k_block / get_pack_factor<bits>(),
+          scales + k_block / group_size,
+          biases + k_block / group_size,
+          K);
+      cp_async_wait_all();
+      __syncthreads();
+
+      MLX_UNROLL
+      for (int k = 0; k < BK / 16; k++) {
+        A.load(
+            xs[tic],
+            base_addr_xs[tic],
+            offset_m + laneid % 16,
+            k * 16 + laneid / 16 * 8);
+        B.load(
+            ws[tic],
+            base_addr_ws[tic],
+            offset_n + laneid % 16,
+            k * 16 + laneid / 16 * 8);
+        mma_t(C, A, B);
+      }
+
+      // The tiles are single-buffered: every warp must be done reading xs
+      // and ws before any thread overwrites them in the next iteration.
+      __syncthreads();
+    }
+    C.store_global(y, N, offset_m, offset_n);
+  } else {
+    for (int k_block = 0; k_block < K; k_block += BK) {
+      // Same as above but rows >= max_rows of the x tile are zero-filled.
+      load_async_safe<NUM_WARPS>(
+          xs[tic], base_addr_xs[tic], x + k_block, K, max_rows);
+      cp_async_commit();
+      load_quantized<NUM_WARPS, group_size, bits>(
+          ws[tic],
+          w + k_block / get_pack_factor<bits>(),
+          scales + k_block / group_size,
+          biases + k_block / group_size,
+          K);
+      cp_async_wait_all();
+      __syncthreads();
+
+      MLX_UNROLL
+      for (int k = 0; k < BK / 16; k++) {
+        A.load(
+            xs[tic],
+            base_addr_xs[tic],
+            offset_m + laneid % 16,
+            k * 16 + laneid / 16 * 8);
+        B.load(
+            ws[tic],
+            base_addr_ws[tic],
+            offset_n + laneid % 16,
+            k * 16 + laneid / 16 * 8);
+        mma_t(C, A, B);
+      }
+
+      // The tiles are single-buffered: every warp must be done reading xs
+      // and ws before any thread overwrites them in the next iteration.
+      __syncthreads();
+    }
+    C.store_global_safe(y, N, offset_m, offset_n, max_rows);
+  }
+}
+
+} // namespace cu
+
+/**
+ * Launch the quantized matmul out = x @ dequantize(w)^T.
+ *
+ * x is treated as an M x K matrix, w as N rows of K packed values and out as
+ * an M x N matrix. out must already be allocated.
+ *
+ * Throws std::invalid_argument for configurations the kernel cannot handle:
+ * anything but bfloat16 inputs, non-transposed weights, bit widths other than
+ * 2, 4 and 8, or N / K that are not multiples of the kernel tile sizes.
+ */
+void qmm(
+    const array& x,
+    const array& w,
+    const array& scales,
+    const array& biases,
+    array& out,
+    bool transpose_,
+    int group_size_,
+    int bits_,
+    int M,
+    int N,
+    int K,
+    cu::CommandEncoder& enc,
+    const Stream& s) {
+  if (x.dtype() != bfloat16) {
+    throw std::invalid_argument("[qmm] Only bfloat16 is supported for now");
+  }
+  if (!transpose_) {
+    throw std::invalid_argument(
+        "[qmm] Only transposed matmul is supported for now");
+  }
+  // load_quantized unpacks a 32-bit word with fixed shifts of `bits`, which
+  // is only valid when the values do not straddle byte boundaries.
+  if (bits_ != 2 && bits_ != 4 && bits_ != 8) {
+    throw std::invalid_argument(
+        "[qmm] Only 2, 4 and 8 bits are supported for now");
+  }
+
+  dispatch_float_types(x.dtype(), "qmm", [&](auto type_tag) {
+    dispatch_groups(group_size_, [&](auto group_size) {
+      dispatch_bits(bits_, [&](auto bits) {
+        using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
+
+        constexpr int BM = 128;
+        constexpr int BN = 128;
+        constexpr int BK = 32;
+
+        // The kernel only guards the rows of x and out, it reads and writes
+        // full tiles along N and K.
+        if (N % BN != 0 || K % BK != 0) {
+          throw std::invalid_argument(
+              "[qmm] N and K must be multiples of the tile size for now");
+        }
+
+        // Use the variant without row checks when every block is full.
+        auto kernel =
+            cu::qmm_t<DataType, BM, BN, BK, group_size.value, bits.value, true>;
+        if (M % BM != 0) {
+          kernel = cu::
+              qmm_t<DataType, BM, BN, BK, group_size.value, bits.value, false>;
+        }
+
+        dim3 grid((N + BN - 1) / BN, (M + BM - 1) / BM);
+
+        // Register the arrays the kernel touches with the encoder, like the
+        // other quantized kernels do, before adding the kernel node.
+        enc.set_input_array(x);
+        enc.set_input_array(w);
+        enc.set_input_array(scales);
+        enc.set_input_array(biases);
+        enc.set_output_array(out);
+
+        // 2 x 4 warps per block and one shared memory tile for x and for w.
+        enc.add_kernel_node(
+            kernel,
+            grid,
+            2 * 4 * 32,
+            1 * sizeof(DataType) * (BM * BK + BN * BK),
+            x.data<DataType>(),
+            w.data<uint8_t>(),
+            scales.data<DataType>(),
+            biases.data<DataType>(),
+            out.data<DataType>(),
+            M,
+            N,
+            K);
+      });
+    });
+  });
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/quantized/quantized.cu
+++ b/mlx/backend/cuda/quantized/quantized.cu
@@ -0,0 +1,113 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/backend/cuda/quantized/quantized.cuh"
+#include "mlx/backend/gpu/copy.h"
+#include "mlx/dtype_utils.h"
+#include "mlx/fast_primitives.h"
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+#include <nvtx3/nvtx3.hpp>
+
+namespace mlx::core {
+
+namespace {
+
+// Return x itself when it is flagged row contiguous, otherwise return a
+// contiguous copy of it. The copy is registered as a temporary with the
+// encoder.
+inline array ensure_row_contiguous(
+    const array& x,
+    cu::CommandEncoder& enc,
+    const Stream& s) {
+  if (x.flags().row_contiguous) {
+    return x;
+  }
+  array contiguous = contiguous_copy_gpu(x, s);
+  enc.add_temporary(contiguous);
+  return contiguous;
+}
+
+// Return x itself when its last two dimensions are row contiguous (unit
+// stride on the last axis and a stride of shape(-1) on the one before it),
+// otherwise return a contiguous copy registered as a temporary with the
+// encoder. Arrays with fewer than two dimensions only need a unit stride.
+inline array ensure_row_contiguous_matrix(
+    const array& x,
+    cu::CommandEncoder& enc,
+    const Stream& s) {
+  bool contiguous;
+  if (x.ndim() < 2) {
+    // There is no row stride to inspect; indexing strides()[ndim() - 2]
+    // would be out of bounds here.
+    contiguous = x.ndim() == 0 || x.strides()[0] == 1;
+  } else {
+    auto stride_0 = x.strides()[x.ndim() - 2];
+    auto stride_1 = x.strides()[x.ndim() - 1];
+    contiguous = stride_0 == x.shape(-1) && stride_1 == 1;
+  }
+
+  if (contiguous) {
+    return x;
+  }
+  array x_copy = contiguous_copy_gpu(x, s);
+  enc.add_temporary(x_copy);
+  return x_copy;
+}
+
+} // namespace
+
+// Evaluate the quantized matmul on the GPU: allocate the output, make the
+// inputs contiguous in their last two dimensions and hand over to qmm.
+void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
+  auto& s = stream();
+  auto& enc = cu::device(s.device).get_command_encoder(s);
+
+  out.set_data(allocator::malloc(out.nbytes()));
+
+  // The last two dims of x, w, scales and biases must be contiguous. This
+  // requirement should eventually be relaxed for x.
+  array x = ensure_row_contiguous_matrix(inputs[0], enc, s);
+  array w = ensure_row_contiguous_matrix(inputs[1], enc, s);
+  array scales = ensure_row_contiguous_matrix(inputs[2], enc, s);
+  array biases = ensure_row_contiguous_matrix(inputs[3], enc, s);
+
+  // Matmul shapes. With a 2D w and a fully contiguous x all the leading
+  // dimensions of x are folded into M.
+  const bool non_batched = w.ndim() == 2 && x.flags().row_contiguous;
+  const int K = x.shape(-1);
+  int M;
+  if (non_batched) {
+    M = x.size() / K;
+  } else {
+    M = x.shape(-2);
+  }
+  const int N = out.shape(-1);
+
+  qmm(x, w, scales, biases, out, transpose_, group_size_, bits_, M, N, K, enc, s);
+}
+
+// Evaluate affine (de)quantization on the GPU. Depending on dequantize_ this
+// either turns (wq, scales, biases) back into w or quantizes w into
+// (wq, scales, biases). Inputs are copied if they are not row contiguous and
+// the outputs are allocated here before the kernels are launched.
+void fast::AffineQuantize::eval_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  auto& s = stream();
+  auto& enc = cu::device(s.device).get_command_encoder(s);
+
+  if (!dequantize_) {
+    auto w = ensure_row_contiguous(inputs[0], enc, s);
+    auto& wq = outputs[0];
+    auto& scales = outputs[1];
+    auto& biases = outputs[2];
+
+    wq.set_data(allocator::malloc(wq.nbytes()));
+    scales.set_data(allocator::malloc(scales.nbytes()));
+    biases.set_data(allocator::malloc(biases.nbytes()));
+
+    affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
+    return;
+  }
+
+  auto wq = ensure_row_contiguous(inputs[0], enc, s);
+  auto scales = ensure_row_contiguous(inputs[1], enc, s);
+  auto biases = ensure_row_contiguous(inputs[2], enc, s);
+  auto& w = outputs[0];
+
+  w.set_data(allocator::malloc(w.nbytes()));
+
+  affine_dequantize(wq, scales, biases, w, group_size_, bits_, enc, s);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/quantized/quantized.cuh
+++ b/mlx/backend/cuda/quantized/quantized.cuh
@@ -0,0 +1,42 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include "mlx/backend/cuda/device.h"
+
+namespace mlx::core {
+
+// Quantize w into the packed weights wq and the per-group scales and biases.
+// The three outputs must already be allocated; the arrays are registered
+// with the encoder and the kernel is added to it.
+void affine_quantize(
+    const array& w,
+    array& wq,
+    array& scales,
+    array& biases,
+    int group_size_,
+    int bits_,
+    cu::CommandEncoder& enc,
+    const Stream& s);
+
+// Reconstruct w from the packed weights wq and the per-group scales and
+// biases. w must already be allocated.
+void affine_dequantize(
+    const array& wq,
+    const array& scales,
+    const array& biases,
+    array& w,
+    int group_size_,
+    int bits_,
+    cu::CommandEncoder& enc,
+    const Stream& s);
+
+// Quantized matmul of the M x K matrix x with the quantized matrix
+// (w, scales, biases), written to the M x N matrix out, which must already
+// be allocated. Throws std::invalid_argument for unsupported configurations.
+void qmm(
+    const array& x,
+    const array& w,
+    const array& scales,
+    const array& biases,
+    array& out,
+    bool transpose_,
+    int group_size_,
+    int bits_,
+    int M,
+    int N,
+    int K,
+    cu::CommandEncoder& enc,
+    const Stream& s);
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/quantized/quantized_utils.cuh
+++ b/mlx/backend/cuda/quantized/quantized_utils.cuh
@@ -0,0 +1,59 @@
+// Copyright © 2025 Apple Inc.
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+
+namespace mlx::core {
+
+namespace cu {
+
+// Number of quantized values stored in one pack: 8 for 3 and 5 bits, 4 for 6
+// bits and wsize / bits for every other bit width.
+template <int bits, int wsize = 8>
+inline constexpr __device__ short get_pack_factor() {
+  return (bits == 3 || bits == 5) ? 8 : (bits == 6 ? 4 : wsize / bits);
+}
+
+// Number of bytes taken by one pack: wsize / 8 for power of 2 bit widths, 5
+// for 5 bits and 3 otherwise (3 and 6 bits).
+template <int bits, int wsize = 8>
+inline constexpr __device__ short get_bytes_per_pack() {
+  constexpr int power_of_2_bits = (bits & (bits - 1)) == 0;
+  return power_of_2_bits ? (wsize / 8) : (bits == 5 ? 5 : 3);
+}
+
+} // namespace cu
+
+// Call f with the runtime group_size turned into a std::integral_constant so
+// it can be used as a template argument. Supported values are 32, 64 and
+// 128; anything else throws std::invalid_argument instead of silently
+// skipping f.
+template <typename F>
+void dispatch_groups(int group_size, F&& f) {
+  switch (group_size) {
+    case 32:
+      f(std::integral_constant<int, 32>{});
+      break;
+    case 64:
+      f(std::integral_constant<int, 64>{});
+      break;
+    case 128:
+      f(std::integral_constant<int, 128>{});
+      break;
+    default:
+      throw std::invalid_argument(
+          "[dispatch_groups] Unsupported group size " +
+          std::to_string(group_size));
+  }
+}
+
+// Call f with the runtime bits turned into a std::integral_constant so it
+// can be used as a template argument. Supported values are 2, 3, 4, 5, 6 and
+// 8; anything else throws std::invalid_argument instead of silently skipping
+// f.
+template <typename F>
+void dispatch_bits(int bits, F&& f) {
+  switch (bits) {
+    case 2:
+      f(std::integral_constant<int, 2>{});
+      break;
+    case 3:
+      f(std::integral_constant<int, 3>{});
+      break;
+    case 4:
+      f(std::integral_constant<int, 4>{});
+      break;
+    case 5:
+      f(std::integral_constant<int, 5>{});
+      break;
+    case 6:
+      f(std::integral_constant<int, 6>{});
+      break;
+    case 8:
+      f(std::integral_constant<int, 8>{});
+      break;
+    default:
+      throw std::invalid_argument(
+          "[dispatch_bits] Unsupported number of bits " + std::to_string(bits));
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/cuda/random.cu
+++ b/mlx/backend/cuda/random.cu
@@ -170,6 +170,7 @@ void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
        cu::rbitsc,
        grid,
        block,
+        0,
        keys.data<uint32_t>(),
        out.data<uint8_t>(),
        grid_dims,
@@ -180,6 +181,7 @@ void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
        cu::rbits,
        grid,
        block,
+        0,
        keys.data<uint32_t>(),
        out.data<uint8_t>(),
        grid_dims,
--- a/mlx/backend/cuda/reduce/all_reduce.cu
+++ b/mlx/backend/cuda/reduce/all_reduce.cu
@@ -120,6 +120,7 @@ void all_reduce(
            kernel,
            blocks,
            threads,
+            0,
            static_cast<T*>(indata),
            intermediate.data<U>(),
            block_step,
@@ -146,6 +147,7 @@ void all_reduce(
          kernel,
          blocks,
          threads,
+          0,
          static_cast<T*>(indata),
          out.data<U>(),
          block_step,
--- a/mlx/backend/cuda/reduce/col_reduce.cu
+++ b/mlx/backend/cuda/reduce/col_reduce.cu
@@ -230,7 +230,7 @@ void col_reduce_looped(
        auto kernel =
            cu::col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS>;
        encoder.add_kernel_node(
-            kernel, grid, blocks, indata, out.data<U>(), args);
+            kernel, grid, blocks, 0, indata, out.data<U>(), args);
      });
    });
  });
--- a/mlx/backend/cuda/reduce/init_reduce.cu
+++ b/mlx/backend/cuda/reduce/init_reduce.cu
@@ -41,7 +41,8 @@ void init_reduce(
      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
      dim3 block(grid.x < 1024 ? grid.x : 1024, 1, 1);
      grid.x = (grid.x + 1023) / 1024;
-      encoder.add_kernel_node(kernel, grid, block, out.data<U>(), out.size());
+      encoder.add_kernel_node(
+          kernel, grid, block, 0, out.data<U>(), out.size());
    });
  });
 }
--- a/mlx/backend/cuda/reduce/row_reduce.cu
+++ b/mlx/backend/cuda/reduce/row_reduce.cu
@@ -269,7 +269,7 @@ void row_reduce_simple(

      int size = plan.shape.back();
      encoder.add_kernel_node(
-          kernel, grid, block, indata, out.data<U>(), out.size(), size);
+          kernel, grid, block, 0, indata, out.data<U>(), out.size(), size);
    });
  });
 }
@@ -322,7 +322,7 @@ void row_reduce_looped(
      });

      encoder.add_kernel_node(
-          kernel, grid, block, indata, out.data<U>(), out.size(), args);
+          kernel, grid, block, 0, indata, out.data<U>(), out.size(), args);
    });
  });
 }
--- a/mlx/backend/cuda/rms_norm.cu
+++ b/mlx/backend/cuda/rms_norm.cu
@@ -1,6 +1,7 @@
 // Copyright © 2025 Apple Inc.

 #include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/iterators/strided_iterator.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/backend/cuda/reduce/reduce.cuh"
 #include "mlx/backend/gpu/copy.h"
@@ -88,7 +89,7 @@ __global__ void rms_norm(
    T xn[N_READS];
    T wn[N_READS];
    cub::LoadDirectBlocked(index, x, xn, axis_size);
-    cub::LoadDirectBlocked(index, StridedIterator(w, w_stride), wn, axis_size);
+    cub::LoadDirectBlocked(index, strided_iterator(w, w_stride), wn, axis_size);
    for (int i = 0; i < N_READS; ++i) {
      float norm = static_cast<float>(xn[i]) * normalizer;
      xn[i] = wn[i] * static_cast<T>(norm);
@@ -131,7 +132,7 @@ __global__ void rms_norm_vjp(
    auto index = r * BLOCK_DIM + block.thread_rank();
    cub::LoadDirectBlocked(index, x, xn, axis_size, cast_to<T>(0));
    cub::LoadDirectBlocked(index, g, gn, axis_size);
-    cub::LoadDirectBlocked(index, StridedIterator(w, w_stride), wn, axis_size);
+    cub::LoadDirectBlocked(index, strided_iterator(w, w_stride), wn, axis_size);
    for (int i = 0; i < N_READS; i++) {
      float t = static_cast<float>(xn[i]);
      float wi = wn[i];
@@ -153,7 +154,7 @@ __global__ void rms_norm_vjp(
    T gn[N_READS];
    cub::LoadDirectBlocked(index, x, xn, axis_size);
    cub::LoadDirectBlocked(index, g, gn, axis_size);
-    cub::LoadDirectBlocked(index, StridedIterator(w, w_stride), wn, axis_size);
+    cub::LoadDirectBlocked(index, strided_iterator(w, w_stride), wn, axis_size);
    for (int i = 0; i < N_READS; i++) {
      float xi = xn[i];
      float wi = wn[i];
@@ -231,6 +232,7 @@ void RMSNorm::eval_gpu(
          kernel,
          n_rows,
          block_dim(),
+          0,
          x.data<DataType>(),
          w.data<DataType>(),
          out.data<DataType>(),
@@ -326,6 +328,7 @@ void RMSNormVJP::eval_gpu(
                kernel,
                n_rows,
                block_dim(),
+                0,
                x.data<DataType>(),
                w.data<DataType>(),
                g.data<DataType>(),
--- a/mlx/backend/cuda/rope.cu
+++ b/mlx/backend/cuda/rope.cu
@@ -325,6 +325,7 @@ void RoPE::eval_gpu(
              kernel,
              grid,
              block,
+              0,
              (donated ? out : in).data<DataType>(),
              out.data<DataType>(),
              offset.data<int32_t>(),
@@ -341,6 +342,7 @@ void RoPE::eval_gpu(
              kernel,
              grid,
              block,
+              0,
              (donated ? out : in).data<DataType>(),
              out.data<DataType>(),
              offset.data<int32_t>(),
@@ -360,6 +362,7 @@ void RoPE::eval_gpu(
              kernel,
              grid,
              block,
+              0,
              (donated ? out : in).data<DataType>(),
              out.data<DataType>(),
              offset.data<int32_t>(),
@@ -381,6 +384,7 @@ void RoPE::eval_gpu(
              kernel,
              grid,
              block,
+              0,
              (donated ? out : in).data<DataType>(),
              out.data<DataType>(),
              offset.data<int32_t>(),
--- a/mlx/backend/cuda/scan.cu
+++ b/mlx/backend/cuda/scan.cu
@@ -414,6 +414,7 @@ void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
                  kernel,
                  in.data_size() / axis_size,
                  block_dim,
+                  0,
                  in.data<T>(),
                  out.data<U>(),
                  axis_size);
@@ -443,6 +444,7 @@ void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
                  kernel,
                  num_blocks,
                  block_dim,
+                  0,
                  in.data<T>(),
                  out.data<U>(),
                  axis_size,
--- a/mlx/backend/cuda/softmax.cu
+++ b/mlx/backend/cuda/softmax.cu
@@ -152,6 +152,7 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
          kernel,
          n_rows,
          block_dim(),
+          0,
          in.data<DataType>(),
          out.data<DataType>(),
          axis_size);
--- a/mlx/backend/cuda/ternary.cu
+++ b/mlx/backend/cuda/ternary.cu
@@ -76,7 +76,7 @@ __global__ void ternary_g(
    int ndim) {
  IdxT index = cg::this_grid().thread_rank();
  if (index < size) {
-    auto [a_idx, b_idx, c_idx] = elem_to_loc(
+    auto [a_idx, b_idx, c_idx] = elem_to_loc_4d(
        index,
        shape.data(),
        a_strides.data(),
@@ -133,6 +133,7 @@ void ternary_op_gpu_inplace(
                    kernel,
                    num_blocks,
                    block_dims,
+                    0,
                    a.data<bool>(),
                    b.data<DType>(),
                    c.data<DType>(),
@@ -151,6 +152,7 @@ void ternary_op_gpu_inplace(
                  kernel,
                  num_blocks,
                  block_dims,
+                  0,
                  a.data<bool>(),
                  b.data<DType>(),
                  c.data<DType>(),
@@ -180,6 +182,7 @@ void ternary_op_gpu_inplace(
            kernel,
            num_blocks,
            block_dims,
+            0,
            a.data<bool>(),
            b.data<DType>(),
            c.data<DType>(),
--- a/mlx/backend/cuda/unary.cu
+++ b/mlx/backend/cuda/unary.cu
@@ -3,6 +3,7 @@
 #include "mlx/backend/common/unary.h"
 #include "mlx/backend/cuda/device.h"
 #include "mlx/backend/cuda/device/unary_ops.cuh"
+#include "mlx/backend/cuda/iterators/general_iterator.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/dtype_utils.h"
 #include "mlx/primitives.h"
@@ -47,7 +48,7 @@ __global__ void unary_g(
    int ndim) {
  IdxT index = cg::this_grid().thread_rank();
  if (index < size) {
-    auto idx = elem_to_loc(index, shape.data(), strides.data(), ndim);
+    auto idx = elem_to_loc_4d(index, shape.data(), strides.data(), ndim);
    out[index] = Op{}(in[idx]);
  }
 }
@@ -141,6 +142,7 @@ void unary_op_gpu_inplace(
                kernel,
                num_blocks,
                block_dims,
+                0,
                in.data<InType>(),
                out.data<OutType>(),
                out.data_size());
@@ -153,6 +155,7 @@ void unary_op_gpu_inplace(
                kernel,
                num_blocks,
                block_dims,
+                0,
                in.data<InType>(),
                out.data<OutType>(),
                out.data_size(),
--- a/mlx/backend/cuda/worker.cpp
+++ b/mlx/backend/cuda/worker.cpp
@@ -1,6 +1,7 @@
 // Copyright © 2025 Apple Inc.

 #include "mlx/backend/cuda/worker.h"
+#include "mlx/backend/cuda/allocator.h"
 #include "mlx/backend/cuda/device.h"

 namespace mlx::core::cu {
@@ -11,10 +12,10 @@ Worker::Worker()

 Worker::~Worker() {
  {
-    std::lock_guard lock(mtx_);
+    std::lock_guard lock(worker_mutex_);
    stop_ = true;
  }
-  cond_.notify_one();
+  worker_event_.signal(batch_ + 1);
  worker_.join();
 }

@@ -22,41 +23,53 @@ void Worker::add_task(std::function<void()> task) {
  pending_tasks_.push_back(std::move(task));
 }

-void Worker::signal(void* data) {
-  auto w = static_cast<Worker*>(data);
-  {
-    std::lock_guard lock(w->mtx_);
-    w->signaled_batch_++;
+void Worker::consume_in_this_thread() {
+  for (auto& task : pending_tasks_) {
+    task();
  }
-  w->cond_.notify_one();
+  pending_tasks_.clear();
+}
+
+void Worker::end_batch() {
+  batch_++;
+  {
+    std::lock_guard lock(worker_mutex_);
+    worker_tasks_[batch_] = std::move(pending_tasks_);
+  }
+  uncommited_batches_++;
+}
+
+void Worker::commit() {
+  if (uncommited_batches_ == 0) {
+    return;
+  }
+  uncommited_batches_ = 0;
+  worker_event_.signal(batch_);
 }

 void Worker::commit(cudaStream_t stream) {
-  // Move pending tasks into tasks
-  if (pending_tasks_.empty()) {
+  if (uncommited_batches_ == 0) {
    return;
  }
-  {
-    std::lock_guard lock(mtx_);
-    // Move pending tasks into ready tasks
-    worker_tasks_[++committed_batch_] = std::move(pending_tasks_);
-  }
+  uncommited_batches_ = 0;
+  // Signal the |worker_event_| in |signal_stream_| after the kernels in
+  // |stream| finish running.
  signal_event_.record(stream);
  signal_event_.wait(signal_stream_);
-  cudaLaunchHostFunc(signal_stream_, signal, this);
+  worker_event_.signal(signal_stream_, batch_);
 }

 void Worker::thread_fn() {
+  // The worker thread is safe to free buffers.
+  allocator().register_this_thread();
+
  while (!stop_) {
-    uint64_t current_batch = 0;
+    uint64_t batch = worker_event_.value();
    Tasks tasks;
    {
-      std::unique_lock<std::mutex> lk(mtx_);
-      cond_.wait(lk, [this, &current_batch] {
-        return this->signaled_batch_ > current_batch || this->stop_;
-      });
-      current_batch = signaled_batch_;
-      auto end = worker_tasks_.upper_bound(current_batch);
+      std::lock_guard lock(worker_mutex_);
+      // Move tasks in signaled batches.
+      auto end = worker_tasks_.upper_bound(batch);
      for (auto it = worker_tasks_.begin(); it != end; ++it) {
        if (tasks.empty()) {
          tasks = std::move(it->second);
@@ -72,6 +85,7 @@ void Worker::thread_fn() {
      auto task = std::move(tasks[i]);
      task();
    }
+    worker_event_.wait(batch + 1);
  }
 }

--- a/mlx/backend/cuda/worker.h
+++ b/mlx/backend/cuda/worker.h
@@ -5,7 +5,6 @@
 #include "mlx/backend/cuda/event.h"
 #include "mlx/backend/cuda/utils.h"

-#include <condition_variable>
 #include <functional>
 #include <map>
 #include <mutex>
@@ -25,24 +24,38 @@ class Worker {
  // Add a pending |task| that will run when consumed or commited.
  void add_task(std::function<void()> task);

+  // Run pending tasks immediately in current thread.
+  void consume_in_this_thread();
+
+  // Put pending tasks in a batch.
+  void end_batch();
+
+  // Inform worker thread to run current batches now.
+  void commit();
+
  // Inform worker thread to run current batches after kernels in |stream|
  // finish running.
  void commit(cudaStream_t stream);

+  // Return how many batches have been added but not committed yet.
+  size_t uncommited_batches() const {
+    return uncommited_batches_;
+  }
+
 private:
-  static void signal(void*);
-
  void thread_fn();
-  std::mutex mtx_;
-  std::condition_variable cond_;

-  uint64_t committed_batch_{0};
-  uint64_t signaled_batch_{0};
+  uint64_t batch_{0};
+  size_t uncommited_batches_{0};

  // Cuda stream and event for signaling kernel completion.
  CudaStream signal_stream_;
  CudaEvent signal_event_;

+  // Worker thread.
+  SharedEvent worker_event_;
+  std::thread worker_;
+  std::mutex worker_mutex_;
  bool stop_{false};

  // Tasks are put in |pending_tasks_| first, and then moved to
@@ -50,7 +63,6 @@ class Worker {
  using Tasks = std::vector<std::function<void()>>;
  Tasks pending_tasks_;
  std::map<uint64_t, Tasks> worker_tasks_;
-  std::thread worker_;
 };

 } // namespace mlx::core::cu
--- a/mlx/backend/metal/allocator.cpp
+++ b/mlx/backend/metal/allocator.cpp
@@ -128,7 +128,8 @@ Buffer MetalAllocator::malloc(size_t size) {

    auto pool = metal::new_scoped_memory_pool();

-    // If we have a lot of memory pressure try to reclaim memory from the cache
+    // If we have a lot of memory pressure or are over the maximum cache size,
+    // try to reclaim memory from the cache
    if (mem_required >= gc_limit_ || num_resources_ >= resource_limit_) {
      num_resources_ -=
          buffer_cache_.release_cached_buffers(mem_required - gc_limit_);
--- a/mlx/fast.cpp
+++ b/mlx/fast.cpp
@@ -708,10 +708,8 @@ array scaled_dot_product_attention(
      }
      if (mask.dtype() == bool_) {
+        // Keep the trailing |s| argument, matching the add() call below.
        scores = where(
-            mask,
-            scores,
-            array(-std::numeric_limits<float>::infinity(), scores.dtype()),
-            s);
+            mask, scores, array(finfo(scores.dtype()).min, scores.dtype()), s);
      } else {
        scores = add(scores, mask, s);
      }
--- a/mlx/primitives.cpp
+++ b/mlx/primitives.cpp
@@ -1271,6 +1271,19 @@ std::vector<array> Convolution::vjp(
        has_neg_padding |= (pd < 0);
      }

+      auto padding_lo_ = std::vector<int>(padding_lo);
+      auto padding_hi_ = std::vector<int>(padding_hi);
+
+      // Use negative padding on the gradient output
+      if (has_neg_padding) {
+        for (auto& p : padding_lo_) {
+          p = std::max(0, p);
+        }
+        for (auto& p : padding_hi_) {
+          p = std::max(0, p);
+        }
+      }
+
      auto wt_trans = group_transpose(wt, 0, 1, -1);
      auto grad = conv_general(
          /* const array& input = */ cotan,
@@ -1292,9 +1305,12 @@ std::vector<array> Convolution::vjp(
        for (int i = 0; i < grad.ndim() - 2; i++) {
          if (padding_lo[i] < 0) {
            starts[i + 1] -= padding_lo[i];
+            padding_lo[i] = 0;
          }
+
          if (padding_hi[i] < 0) {
            stops[i + 1] += padding_hi[i];
+            padding_hi[i] = 0;
          }
        }

--- a/python/scripts/repair_cuda.sh
+++ b/python/scripts/repair_cuda.sh
@@ -4,7 +4,6 @@ auditwheel repair dist/* \
  --plat manylinux_2_35_x86_64 \
  --exclude libcublas* \
  --exclude libnvrtc* \
-  --exclude libcuda* \
  -w wheel_tmp


--- a/python/src/ops.cpp
+++ b/python/src/ops.cpp
@@ -4022,9 +4022,8 @@ void init_ops(nb::module_& m) {
        Args:
            file (file, str): File in which the array is saved.
            arrays (dict(str, array)): The dictionary of names to arrays to
-              be saved.
-            metadata (dict(str, str), optional): The dictionary of
-              metadata to be saved.
+              be saved.
+            metadata (dict(str, str), optional): The dictionary of metadata to be saved.
      )pbdoc");
  m.def(
      "save_gguf",
@@ -4259,7 +4258,7 @@ void init_ops(nb::module_& m) {

        .. math::

-          w_i = s \hat{w_i} + \beta
+          w_i = s \hat{w_i} - \beta

        Args:
          w (array): Matrix to be quantized
--- a/python/tests/test_fast_sdpa.py
+++ b/python/tests/test_fast_sdpa.py
@@ -398,18 +398,6 @@ class TestFastSDPA(mlx_tests.MLXTestCase):
            )
            self.assertTrue(mx.allclose(ref, out, atol=1e-4, rtol=1e-4))

-    def test_fully_masked(self):
-        Lkv = 8
-        mask = mx.array(False)
-        for D in [4, 128]:
-            for Lq in [1, 8]:
-                q = mx.random.normal(shape=(1, 4, Lq, D))
-                k = mx.random.normal(shape=(1, 4, Lkv, D))
-                v = mx.random.normal(shape=(1, 4, Lkv, D))
-
-                out = mx.fast.scaled_dot_product_attention(q, k, v, mask=mask, scale=1)
-                self.assertTrue(mx.all(mx.isnan(out)))
-
    def test_fast_sdpa_few_query(self):
        D = 64
        L = 43
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@ from functools import partial
 from pathlib import Path
 from subprocess import run

-from setuptools import Command, Extension, find_namespace_packages, setup
+from setuptools import Command, Extension, setup
 from setuptools.command.bdist_wheel import bdist_wheel
 from setuptools.command.build_ext import build_ext

@@ -166,10 +166,6 @@ class GenerateStubs(Command):
        # Run again without recursive to specify output file name
        subprocess.run(["rm", f"{out_path}/mlx.pyi"])
        subprocess.run(stub_cmd + ["-o", f"{out_path}/__init__.pyi"])
-        # mx.bool_ gets filtered by nanobind because of the trailing
-        # underscore, add it manually:
-        with open(f"{out_path}/__init__.pyi", "a") as fid:
-            fid.write("\nbool_: Dtype = ...")


 class MLXBdistWheel(bdist_wheel):
@@ -188,23 +184,19 @@ with open(Path(__file__).parent / "README.md", encoding="utf-8") as f:

 if __name__ == "__main__":
    package_dir = {"": "python"}
-    packages = find_namespace_packages(
-        where="python",
-        exclude=[
-            "src",
-            "tests",
-            "scripts",
-            "mlx.lib",
-            "mlx.include",
-            "mlx.share",
-            "mlx.share.**",
-            "mlx.include.**",
-        ],
-    )
+    packages = [
+        "mlx",
+        "mlx.nn",
+        "mlx.nn.layers",
+        "mlx.optimizers",
+    ]

    build_macos = platform.system() == "Darwin"
    build_cuda = "MLX_BUILD_CUDA=ON" in os.environ.get("CMAKE_ARGS", "")

+    install_requires = []
+    if build_cuda:
+        install_requires = ["nvidia-cublas-cu12", "nvidia-cuda-nvrtc-cu12"]
    version = get_version()

    _setup = partial(
@@ -229,7 +221,7 @@ if __name__ == "__main__":
        },
    )

-    package_data = {"mlx.core": ["*.pyi"]}
+    package_data = {"mlx": ["lib/*", "include/*", "share/*"], "mlx.core": ["*.pyi"]}

    extras = {
        "dev": [
@@ -247,7 +239,6 @@ if __name__ == "__main__":
            "mlx.distributed_config = mlx.distributed_run:distributed_config",
        ]
    }
-    install_requires = []

    # Release builds for PyPi are in two stages.
    # Each stage should be run from a clean build:
@@ -267,11 +258,11 @@ if __name__ == "__main__":
    #  - Package name is back-end specific, e.g mlx-metal
    if build_stage != 2:
        if build_stage == 1:
-            install_requires.append(
-                f'mlx-metal=={version}; platform_system == "Darwin"'
-            )
-            extras["cuda"] = [f'mlx-cuda=={version}; platform_system == "Linux"']
-            extras["cpu"] = [f'mlx-cpu=={version}; platform_system == "Linux"']
+            if build_macos:
+                install_requires += [f"mlx-metal=={version}"]
+            else:
+                extras["cuda"] = [f"mlx-cuda=={version}"]
+                extras["cpu"] = [f"mlx-cpu=={version}"]

        _setup(
            name="mlx",
@@ -286,14 +277,9 @@ if __name__ == "__main__":
            name = "mlx-metal"
        elif build_cuda:
            name = "mlx-cuda"
-            install_requires += [
-                "nvidia-cublas-cu12==12.9.*",
-                "nvidia-cuda-nvrtc-cu12==12.9.*",
-            ]
        else:
            name = "mlx-cpu"
        _setup(
            name=name,
            packages=["mlx"],
-            install_requires=install_requires,
        )
Author	SHA1	Message	Date
Angelos Katharopoulos	8269c9d02d	Support unaligned M	2025-07-23 00:40:27 -07:00
Angelos Katharopoulos	903b40627c	Add dynamic shared memory and improve qmm	2025-07-22 23:36:53 -07:00
Angelos Katharopoulos	700f7dcf01	Refactor the matmul a bit	2025-07-21 23:38:21 -07:00
Angelos Katharopoulos	6c60bd1cbf	Fixed mma and working dequant	2025-07-21 04:47:42 -07:00
Angelos Katharopoulos	a64cc02a0c	Somewhat working matmul primitives	2025-07-21 04:47:42 -07:00
Angelos Katharopoulos	346ae5fdb5	Refactor quantized	2025-07-21 04:47:41 -07:00