From 1588659062ffd0d6d4dcc2268ab9e5369c63d0c7 Mon Sep 17 00:00:00 2001 From: Awni Hannun Date: Mon, 28 Jul 2025 09:09:41 -0700 Subject: [PATCH] no occupancy query for launch params (#2426) --- mlx/backend/cuda/kernel_utils.cuh | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/mlx/backend/cuda/kernel_utils.cuh b/mlx/backend/cuda/kernel_utils.cuh index 24c81f2fbc..bf10de6497 100644 --- a/mlx/backend/cuda/kernel_utils.cuh +++ b/mlx/backend/cuda/kernel_utils.cuh @@ -120,20 +120,6 @@ dim3 get_2d_grid_dims( size_t divisor); std::pair get_grid_and_block(int dim0, int dim1, int dim2); -// Return a block size that achieves maximum potential occupancy for kernel. -template -inline uint max_occupancy_block_dim(T kernel) { - int _, block_dim; - if constexpr (std::is_same_v) { - CHECK_CUDA_ERROR( - cuOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel, 0, 0, 0)); - } else { - CHECK_CUDA_ERROR( - cudaOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel)); - } - return block_dim; -} - // Get the num_blocks and block_dims that maximize occupancy for |kernel|, // assuming each thread handles |work_per_thread| elements of |arr|. template @@ -145,7 +131,7 @@ inline std::tuple get_launch_args( bool large, int work_per_thread = 1) { size_t nthreads = cuda::ceil_div(size, work_per_thread); - uint block_dim = max_occupancy_block_dim(kernel); + uint block_dim = 1024; if (block_dim > nthreads) { block_dim = nthreads; }