From b2273733eaa0c08e4c4b51f93c76ee1710066967 Mon Sep 17 00:00:00 2001
From: Awni Hannun <awni@apple.com>
Date: Wed, 16 Jul 2025 13:00:37 -0700
Subject: [PATCH] Test with CUDA 12.2 (#2375)

* Test with CUDA 12.0

* try older image

* fix cpu sort
---
 .circleci/config.yml          |  4 ++--
 mlx/backend/cpu/sort.cpp      |  8 ++++++--
 mlx/backend/cuda/quantized.cu | 16 ++++++++++------
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index ef82298bc..1e6fea821 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -201,7 +201,7 @@ jobs:
 
   cuda_build_and_test:
     machine:
-      image: linux-cuda-12:default
+      image: linux-cuda-12:2023.11.1
     resource_class: gpu.nvidia.small.gen2
     steps:
       - checkout
@@ -210,7 +210,7 @@ jobs:
         command: |
           sudo apt-get update
           sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
-          python -m venv env
+          python3 -m venv env
           source env/bin/activate
           CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
           pip install -e ".[dev]"
diff --git a/mlx/backend/cpu/sort.cpp b/mlx/backend/cpu/sort.cpp
index f2243f60f..089f7c425 100644
--- a/mlx/backend/cpu/sort.cpp
+++ b/mlx/backend/cpu/sort.cpp
@@ -334,7 +334,9 @@ void Sort::eval_cpu(const std::vector<array>& inputs, array& out) {
   auto& in = inputs[0];
 
   // Copy input to output
-  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
+  CopyType ctype = (in.flags().contiguous && in.strides()[axis_] != 0)
+      ? CopyType::Vector
+      : CopyType::General;
   copy_cpu(in, out, ctype, stream());
 
   auto& encoder = cpu::get_command_encoder(stream());
@@ -426,7 +428,9 @@ void Partition::eval_cpu(const std::vector<array>& inputs, array& out) {
   auto& in = inputs[0];
 
   // Copy input to output
-  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
+  CopyType ctype = (in.flags().contiguous && in.strides()[axis_] != 0)
+      ? CopyType::Vector
+      : CopyType::General;
   copy_cpu(in, out, ctype, stream());
 
   auto& encoder = cpu::get_command_encoder(stream());
diff --git a/mlx/backend/cuda/quantized.cu b/mlx/backend/cuda/quantized.cu
index 12a1f6fe4..4424000d8 100644
--- a/mlx/backend/cuda/quantized.cu
+++ b/mlx/backend/cuda/quantized.cu
@@ -36,7 +36,8 @@ affine_quantize(const T* w, uint8_t* out, T* scales, T* biases, size_t size) {
   auto tidx = block_idx.x * block_size.x + idx_in_block.x;
   auto tidy = block_idx.y * block_size.y + idx_in_block.y;
 
-  auto grid_dim = cg::this_grid().dim_threads();
+  auto grid_dim_x =
+      cg::this_grid().dim_blocks().x * cg::this_thread_block().dim_threads().x;
   constexpr float eps = 1e-7;
   constexpr int simd_size = WARP_SIZE;
   constexpr float n_bins = (1 << bits) - 1;
@@ -48,7 +49,7 @@ affine_quantize(const T* w, uint8_t* out, T* scales, T* biases, size_t size) {
       writes_per_reduce > 1 ? 1 : values_per_reduce / pack_factor;
   constexpr int power_of_2_bits = (bits & (bits - 1)) == 0;
-  size_t offset = tidx + grid_dim.x * size_t(tidy);
+  size_t offset = tidx + grid_dim_x * size_t(tidy);
   size_t in_index = offset * values_per_reduce;
   if (in_index >= size) {
     return;
   }
@@ -153,12 +154,13 @@ __global__ void affine_dequantize(
   auto tidx = block_idx.x * block_size.x + idx_in_block.x;
   auto tidy = block_idx.y * block_size.y + idx_in_block.y;
 
-  auto grid_dim = cg::this_grid().dim_threads();
+  auto grid_dim_x =
+      cg::this_grid().dim_blocks().x * cg::this_thread_block().dim_threads().x;
   constexpr int pack_factor = get_pack_factor<bits, 8>();
   constexpr int bytes_per_pack = get_bytes_per_pack<bits>();
 
-  size_t offset = tidx + grid_dim.x * size_t(tidy);
+  size_t offset = tidx + grid_dim_x * size_t(tidy);
   size_t oindex = offset * pack_factor;
   if (oindex >= size) {
     return;
   }
@@ -349,7 +351,8 @@ void fast::AffineQuantize::eval_gpu(
     dispatch_bits(bits_, [&](auto bits) {
       using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
       if (dequantize_) {
-        auto kernel = cu::affine_dequantize<DataType, group_size(), bits()>;
+        auto kernel =
+            cu::affine_dequantize<DataType, group_size(), bits()>;
         auto [num_blocks, block_dims] =
             get_launch_args(kernel, size, grid_shape, w.strides(), large);
         enc.add_kernel_node(
@@ -362,7 +365,8 @@ void fast::AffineQuantize::eval_gpu(
             out.data<DataType>(),
             out.size());
       } else {
-        auto kernel = cu::affine_quantize<DataType, group_size(), bits()>;
+        auto kernel =
+            cu::affine_quantize<DataType, group_size(), bits()>;
         auto [num_blocks, block_dims] =
             get_launch_args(kernel, size, grid_shape, w.strides(), large);
         enc.add_kernel_node(
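
For context on the cooperative-groups change above: the kernels previously
called cg::this_grid().dim_threads() to get the grid extent in threads, which
apparently does not build on the older CUDA 12 toolkit in the pinned CI image,
so the patch rebuilds the grid's x-extent from dim_blocks() and the per-block
thread dimensions. Below is a minimal sketch of the same indexing pattern,
assuming a 2D kernel launch; demo_kernel and its parameters are illustrative
names, not MLX code:

    #include <cooperative_groups.h>

    namespace cg = cooperative_groups;

    __global__ void demo_kernel(const float* in, float* out, size_t size) {
      auto block_size = cg::this_thread_block().dim_threads();
      auto block_idx = cg::this_thread_block().group_index();
      auto idx_in_block = cg::this_thread_block().thread_index();

      // Global thread coordinates in the 2D launch grid.
      auto tidx = block_idx.x * block_size.x + idx_in_block.x;
      auto tidy = block_idx.y * block_size.y + idx_in_block.y;

      // Grid width in threads: blocks along x times threads per block in x.
      // Equivalent to cg::this_grid().dim_threads().x on toolkits that
      // provide it.
      auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x;

      // Row-major linearization of (tidx, tidy), as in the patched kernels.
      size_t offset = tidx + grid_dim_x * size_t(tidy);
      if (offset >= size) {
        return;
      }
      out[offset] = in[offset];
    }

The two forms agree because dim_blocks().x counts blocks along x, so
multiplying by the block width yields the row stride (in threads) needed to
linearize the y coordinate.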