From b2273733eaa0c08e4c4b51f93c76ee1710066967 Mon Sep 17 00:00:00 2001
From: Awni Hannun <awni@apple.com>
Date: Wed, 16 Jul 2025 13:00:37 -0700
Subject: [PATCH] Test with CUDA 12.2 (#2375)

* Test with CUDA 12.0

* try older image

* fix cpu sort
---
 .circleci/config.yml          |  4 ++--
 mlx/backend/cpu/sort.cpp      |  8 ++++++--
 mlx/backend/cuda/quantized.cu | 16 ++++++++++------
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index ef82298bc..1e6fea821 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -201,7 +201,7 @@ jobs:
 
   cuda_build_and_test:
     machine:
-      image: linux-cuda-12:default
+      image: linux-cuda-12:2023.11.1
     resource_class: gpu.nvidia.small.gen2
     steps:
       - checkout
@@ -210,7 +210,7 @@ jobs:
         command: |
           sudo apt-get update
           sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
-          python -m venv env
+          python3 -m venv env
           source env/bin/activate
           CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
           pip install -e ".[dev]"
diff --git a/mlx/backend/cpu/sort.cpp b/mlx/backend/cpu/sort.cpp
index f2243f60f..089f7c425 100644
--- a/mlx/backend/cpu/sort.cpp
+++ b/mlx/backend/cpu/sort.cpp
@@ -334,7 +334,9 @@ void Sort::eval_cpu(const std::vector<array>& inputs, array& out) {
   auto& in = inputs[0];
 
   // Copy input to output
-  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
+  CopyType ctype = (in.flags().contiguous && in.strides()[axis_] != 0)
+      ? CopyType::Vector
+      : CopyType::General;
   copy_cpu(in, out, ctype, stream());
 
   auto& encoder = cpu::get_command_encoder(stream());
@@ -426,7 +428,9 @@ void Partition::eval_cpu(const std::vector<array>& inputs, array& out) {
   auto& in = inputs[0];
 
   // Copy input to output
-  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
+  CopyType ctype = (in.flags().contiguous && in.strides()[axis_] != 0)
+      ? CopyType::Vector
+      : CopyType::General;
   copy_cpu(in, out, ctype, stream());
 
   auto& encoder = cpu::get_command_encoder(stream());
diff --git a/mlx/backend/cuda/quantized.cu b/mlx/backend/cuda/quantized.cu
index 12a1f6fe4..4424000d8 100644
--- a/mlx/backend/cuda/quantized.cu
+++ b/mlx/backend/cuda/quantized.cu
@@ -36,7 +36,8 @@ affine_quantize(const T* w, uint8_t* out, T* scales, T* biases, size_t size) {
   auto tidx = block_idx.x * block_size.x + idx_in_block.x;
   auto tidy = block_idx.y * block_size.y + idx_in_block.y;
 
-  auto grid_dim = cg::this_grid().dim_threads();
+  auto grid_dim_x =
+      cg::this_grid().dim_blocks().x * cg::this_thread_block().dim_threads().x;
   constexpr float eps = 1e-7;
   constexpr int simd_size = WARP_SIZE;
   constexpr float n_bins = (1 << bits) - 1;
@@ -48,7 +49,7 @@ affine_quantize(const T* w, uint8_t* out, T* scales, T* biases, size_t size) {
       writes_per_reduce > 1 ? 1 : values_per_reduce / pack_factor;
   constexpr int power_of_2_bits = (bits & (bits - 1)) == 0;
-  size_t offset = tidx + grid_dim.x * size_t(tidy);
+  size_t offset = tidx + grid_dim_x * size_t(tidy);
   size_t in_index = offset * values_per_reduce;
   if (in_index >= size) {
     return;
   }
@@ -153,12 +154,13 @@ __global__ void affine_dequantize(
   auto tidx = block_idx.x * block_size.x + idx_in_block.x;
   auto tidy = block_idx.y * block_size.y + idx_in_block.y;
 
-  auto grid_dim = cg::this_grid().dim_threads();
+  auto grid_dim_x =
+      cg::this_grid().dim_blocks().x * cg::this_thread_block().dim_threads().x;
   constexpr int pack_factor = get_pack_factor<bits, 8>();
   constexpr int bytes_per_pack = get_bytes_per_pack<bits>();
 
-  size_t offset = tidx + grid_dim.x * size_t(tidy);
+  size_t offset = tidx + grid_dim_x * size_t(tidy);
   size_t oindex = offset * pack_factor;
   if (oindex >= size) {
     return;
   }
@@ -349,7 +351,8 @@ void fast::AffineQuantize::eval_gpu(
     dispatch_bits(bits_, [&](auto bits) {
       using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
       if (dequantize_) {
-        auto kernel = cu::affine_dequantize<DataType, group_size(), bits()>;
+        auto kernel =
+            cu::affine_dequantize<DataType, group_size(), bits()>;
         auto [num_blocks, block_dims] =
             get_launch_args(kernel, size, grid_shape, w.strides(), large);
         enc.add_kernel_node(
@@ -362,7 +365,8 @@ void fast::AffineQuantize::eval_gpu(
             out.data<DataType>(),
             out.size());
       } else {
-        auto kernel = cu::affine_quantize<DataType, group_size(), bits()>;
+        auto kernel =
+            cu::affine_quantize<DataType, group_size(), bits()>;
         auto [num_blocks, block_dims] =
             get_launch_args(kernel, size, grid_shape, w.strides(), large);
         enc.add_kernel_node(
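
For context on the cooperative-groups change above: the kernels previously
called cg::this_grid().dim_threads() to get the grid extent in threads, which
apparently does not build on the older CUDA 12 toolkit in the pinned CI image,
so the patch rebuilds the grid's x-extent from dim_blocks() and the per-block
thread dimensions. Below is a minimal sketch of the same indexing pattern,
assuming a 2D kernel launch; demo_kernel and its parameters are illustrative
names, not MLX code:

    #include <cooperative_groups.h>

    namespace cg = cooperative_groups;

    __global__ void demo_kernel(const float* in, float* out, size_t size) {
      auto block_size = cg::this_thread_block().dim_threads();
      auto block_idx = cg::this_thread_block().group_index();
      auto idx_in_block = cg::this_thread_block().thread_index();

      // Global thread coordinates in the 2D launch grid.
      auto tidx = block_idx.x * block_size.x + idx_in_block.x;
      auto tidy = block_idx.y * block_size.y + idx_in_block.y;

      // Grid width in threads: blocks along x times threads per block in x.
      // Equivalent to cg::this_grid().dim_threads().x on toolkits that
      // provide it.
      auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x;

      // Row-major linearization of (tidx, tidy), as in the patched kernels.
      size_t offset = tidx + grid_dim_x * size_t(tidy);
      if (offset >= size) {
        return;
      }
      out[offset] = in[offset];
    }

The two forms agree because dim_blocks().x counts blocks along x, so
multiplying by the block width yields the row stride (in threads) needed to
linearize the y coordinate.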