From aaef4327b48c3c6d1202788bc0072801cdf9a16d Mon Sep 17 00:00:00 2001
From: Awni Hannun
Date: Tue, 15 Jul 2025 13:19:40 -0700
Subject: [PATCH] try older image

---
 .circleci/config.yml          | 10 ++--------
 mlx/backend/cuda/quantized.cu | 16 ++++++++++------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9143b5467..1e6fea821 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -201,22 +201,16 @@ jobs:
 
   cuda_build_and_test:
     machine:
-      image: linux-cuda-12:default
+      image: linux-cuda-12:2023.11.1
       resource_class: gpu.nvidia.small.gen2
     steps:
       - checkout
       - run:
           name: Install Python package
           command: |
-            wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb
-            sudo dpkg -i libtinfo5_6.3-2ubuntu0.1_amd64.deb
-            wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
-            sudo dpkg -i cuda-keyring_1.0-1_all.deb
             sudo apt-get update
-            sudo apt-get install cuda-toolkit-12-0
-            sudo update-alternatives --set cuda /usr/local/cuda-12.0
             sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
-            python -m venv env
+            python3 -m venv env
             source env/bin/activate
             CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
             pip install -e ".[dev]"
diff --git a/mlx/backend/cuda/quantized.cu b/mlx/backend/cuda/quantized.cu
index 12a1f6fe4..4424000d8 100644
--- a/mlx/backend/cuda/quantized.cu
+++ b/mlx/backend/cuda/quantized.cu
@@ -36,7 +36,8 @@ affine_quantize(const T* w, uint8_t* out, T* scales, T* biases, size_t size) {
   auto tidx = block_idx.x * block_size.x + idx_in_block.x;
   auto tidy = block_idx.y * block_size.y + idx_in_block.y;
 
-  auto grid_dim = cg::this_grid().dim_threads();
+  auto grid_dim_x =
+      cg::this_grid().dim_blocks().x * cg::this_grid().block_index().x;
   constexpr float eps = 1e-7;
   constexpr int simd_size = WARP_SIZE;
   constexpr float n_bins = (1 << bits) - 1;
@@ -48,7 +49,7 @@ affine_quantize(const T* w, uint8_t* out, T* scales, T* biases, size_t size) {
       writes_per_reduce > 1 ? 1 : values_per_reduce / pack_factor;
   constexpr int power_of_2_bits = (bits & (bits - 1)) == 0;
-  size_t offset = tidx + grid_dim.x * size_t(tidy);
+  size_t offset = tidx + grid_dim_x * size_t(tidy);
   size_t in_index = offset * values_per_reduce;
   if (in_index >= size) {
     return;
   }
@@ -153,12 +154,13 @@ __global__ void affine_dequantize(
   auto tidx = block_idx.x * block_size.x + idx_in_block.x;
   auto tidy = block_idx.y * block_size.y + idx_in_block.y;
 
-  auto grid_dim = cg::this_grid().dim_threads();
+  auto grid_dim_x =
+      cg::this_grid().dim_blocks().x * cg::this_grid().block_index().x;
 
   constexpr int pack_factor = get_pack_factor();
   constexpr int bytes_per_pack = get_bytes_per_pack();
 
-  size_t offset = tidx + grid_dim.x * size_t(tidy);
+  size_t offset = tidx + grid_dim_x * size_t(tidy);
   size_t oindex = offset * pack_factor;
 
   if (oindex >= size) {
@@ -349,7 +351,8 @@ void fast::AffineQuantize::eval_gpu(
       dispatch_bits(bits_, [&](auto bits) {
         using DataType = cuda_type_t;
         if (dequantize_) {
-          auto kernel = cu::affine_dequantize;
+          auto kernel =
+              cu::affine_dequantize;
           auto [num_blocks, block_dims] =
               get_launch_args(kernel, size, grid_shape, w.strides(), large);
           enc.add_kernel_node(
@@ -362,7 +365,8 @@ void fast::AffineQuantize::eval_gpu(
               out.data(),
               out.size());
         } else {
-          auto kernel = cu::affine_quantize;
+          auto kernel =
+              cu::affine_quantize;
           auto [num_blocks, block_dims] =
               get_launch_args(kernel, size, grid_shape, w.strides(), large);
           enc.add_kernel_node(
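
Note on the kernel-side change (not part of the patch): the hunks above stop calling cg::this_grid().dim_threads() and compute the x extent by hand, presumably to keep quantized.cu building against the CUDA toolchain that ships with the pinned linux-cuda-12:2023.11.1 image. The standalone sketch below shows the same flat-offset indexing pattern using only the classic built-ins, where the grid width in threads is gridDim.x * blockDim.x; the kernel name flat_offset_demo, the output buffer, and the launch shape are illustrative assumptions, not code from this repository.

    #include <cstdio>
    #include <cuda_runtime.h>

    // Each thread computes its (x, y) position in the whole grid and flattens
    // it into a single offset, the same pattern affine_quantize and
    // affine_dequantize use to map threads onto elements.
    __global__ void flat_offset_demo(size_t* out, size_t size) {
      size_t tidx = blockIdx.x * blockDim.x + threadIdx.x;
      size_t tidy = blockIdx.y * blockDim.y + threadIdx.y;

      // Grid width measured in threads: blocks along x times threads per
      // block along x. On toolkits that provide it, this is what
      // cg::this_grid().dim_threads().x reports.
      size_t grid_dim_x = size_t(gridDim.x) * blockDim.x;

      size_t offset = tidx + grid_dim_x * tidy;
      if (offset >= size) {
        return; // threads past the end of the buffer do nothing
      }
      out[offset] = offset;
    }

    int main() {
      constexpr size_t size = 1 << 12;
      size_t* out = nullptr;
      cudaMallocManaged(&out, size * sizeof(size_t));

      dim3 block_dims(32, 8);
      dim3 num_blocks(4, 16); // 128 x 128 threads; extras exit via the size check
      flat_offset_demo<<<num_blocks, block_dims>>>(out, size);
      cudaDeviceSynchronize();

      printf("out[123] = %zu\n", out[123]); // expected: 123
      cudaFree(out);
      return 0;
    }

Compiled with nvcc -o flat_offset_demo flat_offset_demo.cu, this prints out[123] = 123; the early-return guard plays the same role as the in_index / oindex bounds checks in the patched kernels.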