From 72e21b7d5145b0c92536246288aa8a5f6568af23 Mon Sep 17 00:00:00 2001
From: Awni Hannun
Date: Wed, 18 Jun 2025 16:42:39 -0700
Subject: [PATCH] perf tuning

---
 mlx/backend/cuda/allocator.cpp    | 12 +++++++++++-
 mlx/backend/cuda/copy.cu          |  1 -
 mlx/backend/cuda/device/utils.cuh | 20 ++++++++++----------
 mlx/backend/cuda/matmul.cpp       | 18 +++++++++++-------
 4 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/mlx/backend/cuda/allocator.cpp b/mlx/backend/cuda/allocator.cpp
index 1d17d7df5a..cf957bd023 100644
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -1,5 +1,6 @@
 // Copyright © 2025 Apple Inc.
 
+#include "mlx/utils.h"
 #include "mlx/backend/cuda/allocator.h"
 #include "mlx/backend/cuda/utils.h"
 #include "mlx/backend/cuda/worker.h"
@@ -14,9 +15,11 @@ namespace mlx::core {
 
 namespace cu {
 
+constexpr int page_size = 16384;
+
 CudaAllocator::CudaAllocator()
     : buffer_cache_(
-          getpagesize(),
+          page_size,
           [](CudaBuffer* buf) { return buf->size; },
           [this](CudaBuffer* buf) {
             cuda_free(buf->data);
@@ -31,7 +34,14 @@ CudaAllocator::CudaAllocator()
 
 Buffer CudaAllocator::malloc(size_t size) {
   // Find available buffer from cache.
+  auto orig_size = size;
   std::unique_lock lock(mutex_);
+  if (size < page_size) {
+    size = next_power_of_2(size);
+  } else {
+    size = page_size * ((size + page_size - 1) / page_size);
+  }
+
   CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
   if (!buf) {
     // If we have a lot of memory pressure or are over the maximum cache size,
diff --git a/mlx/backend/cuda/copy.cu b/mlx/backend/cuda/copy.cu
index 817860d0ac..3218067209 100644
--- a/mlx/backend/cuda/copy.cu
+++ b/mlx/backend/cuda/copy.cu
@@ -24,7 +24,6 @@ void copy_gpu_inplace(
   auto& encoder = cu::get_command_encoder(s);
   encoder.set_input_array(in);
   encoder.set_output_array(out);
-
   if (ctype == CopyType::Scalar || ctype == CopyType::Vector) {
     copy_contiguous(encoder, ctype, in, out, offset_in, offset_out);
     return;
diff --git a/mlx/backend/cuda/device/utils.cuh b/mlx/backend/cuda/device/utils.cuh
index 54d5519921..6e8abdd7c0 100644
--- a/mlx/backend/cuda/device/utils.cuh
+++ b/mlx/backend/cuda/device/utils.cuh
@@ -155,8 +155,8 @@ inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc_nd(
 #pragma unroll
   for (int i = NDIM - 1; i >= 0; --i) {
     int dim_idx = elem % shape[i];
-    a_loc += dim_idx * a_strides[i];
-    b_loc += dim_idx * b_strides[i];
+    a_loc += dim_idx * IdxT(a_strides[i]);
+    b_loc += dim_idx * IdxT(b_strides[i]);
     elem /= shape[i];
   }
   return cuda::std::make_tuple(a_loc, b_loc);
@@ -175,9 +175,9 @@ inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_nd(
 #pragma unroll
   for (int i = NDIM - 1; i >= 0; --i) {
     int dim_idx = elem % shape[i];
-    a_loc += dim_idx * a_strides[i];
-    b_loc += dim_idx * b_strides[i];
-    c_loc += dim_idx * c_strides[i];
+    a_loc += dim_idx * IdxT(a_strides[i]);
+    b_loc += dim_idx * IdxT(b_strides[i]);
+    c_loc += dim_idx * IdxT(c_strides[i]);
     elem /= shape[i];
   }
   return cuda::std::make_tuple(a_loc, b_loc, c_loc);
@@ -206,8 +206,8 @@ inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc_4d(
   IdxT b_loc = 0;
   for (int i = ndim - 1; i >= 0; --i) {
     int dim_idx = elem % shape[i];
-    a_loc += dim_idx * a_strides[i];
-    b_loc += dim_idx * b_strides[i];
+    a_loc += dim_idx * IdxT(a_strides[i]);
+    b_loc += dim_idx * IdxT(b_strides[i]);
     elem /= shape[i];
   }
   return cuda::std::make_tuple(a_loc, b_loc);
@@ -226,9 +226,9 @@ inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_4d(
   IdxT c_loc = 0;
   for (int i = ndim - 1; i >= 0; --i) {
     int dim_idx = elem % shape[i];
-    a_loc += dim_idx * a_strides[i];
-    b_loc += dim_idx * b_strides[i];
-    c_loc += dim_idx * c_strides[i];
+    a_loc += dim_idx * IdxT(a_strides[i]);
+    b_loc += dim_idx * IdxT(b_strides[i]);
+    c_loc += dim_idx * IdxT(c_strides[i]);
     elem /= shape[i];
   }
   return cuda::std::make_tuple(a_loc, b_loc, c_loc);
diff --git a/mlx/backend/cuda/matmul.cpp b/mlx/backend/cuda/matmul.cpp
index 5a5e6182e8..f7b7e60b7d 100644
--- a/mlx/backend/cuda/matmul.cpp
+++ b/mlx/backend/cuda/matmul.cpp
@@ -162,11 +162,15 @@ class MatMul {
       }
     }
 
-    array workspace(
-        allocator::malloc(heuristic_.workspaceSize),
-        {static_cast<int>(heuristic_.workspaceSize)},
-        int8);
-    encoder.add_temporary(workspace);
+    void* workspace_ptr = nullptr;
+    if (heuristic_.workspaceSize > 0) {
+      array workspace(
+          allocator::malloc(heuristic_.workspaceSize),
+          {static_cast<int>(heuristic_.workspaceSize)},
+          int8);
+      encoder.add_temporary(workspace);
+      workspace_ptr = workspace.data<void>();
+    }
 
     encoder.launch_kernel([&](cudaStream_t stream) {
       CHECK_CUBLAS_ERROR(cublasLtMatmul(
@@ -183,8 +187,8 @@ class MatMul {
           out,
           out_desc_,
           &heuristic_.algo,
-          workspace.data<void>(),
-          workspace.nbytes(),
+          workspace_ptr,
+          heuristic_.workspaceSize,
           stream));
     });
  }
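
Note on the allocator change: requests smaller than the 16 KB page are rounded
up to the next power of two, and larger requests to a whole number of pages, so
cached buffers fall into a small set of reusable bucket sizes instead of many
odd sizes that never match a later request. A minimal standalone sketch of that
policy (assuming next_power_of_2 rounds up to the nearest power of two, as the
helper in mlx/utils.h does; round_allocation is my name for illustration):

    // sketch_allocator_rounding.cpp — illustration only, not MLX code.
    #include <cstddef>
    #include <cstdio>

    constexpr size_t page_size = 16384;

    // Assumed behavior of the next_power_of_2 helper from mlx/utils.h.
    size_t next_power_of_2(size_t n) {
      size_t p = 1;
      while (p < n) {
        p <<= 1;
      }
      return p;
    }

    // The rounding policy the patch adds to CudaAllocator::malloc.
    size_t round_allocation(size_t size) {
      if (size < page_size) {
        return next_power_of_2(size); // small: power-of-two buckets
      }
      // large: round up to a whole number of 16 KB pages
      return page_size * ((size + page_size - 1) / page_size);
    }

    int main() {
      std::printf("%zu\n", round_allocation(3000));  // 4096
      std::printf("%zu\n", round_allocation(20000)); // 32768 (two pages)
      return 0;
    }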
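
Note on the indexing change in device/utils.cuh: strides are stored as int64_t,
so without a cast each product dim_idx * a_strides[i] (and the sum it feeds) is
promoted to 64-bit even when a kernel is instantiated with a 32-bit IdxT; the
IdxT(...) casts appear intended to keep the whole offset computation at the
kernel's chosen index width. A hypothetical illustration (the function name is
mine, not MLX's):

    // sketch_index_width.cpp — illustration only.
    #include <cstdint>
    #include <cstdio>

    // Casting the stride to IdxT before the multiply keeps the arithmetic
    // at IdxT width, e.g. 32-bit when the array is small enough that all
    // offsets fit in int32_t.
    template <typename IdxT>
    IdxT index_contribution(int dim_idx, const int64_t* strides, int i) {
      return dim_idx * IdxT(strides[i]);
    }

    int main() {
      int64_t strides[] = {128, 8, 1};
      std::printf("%d\n", (int)index_contribution<int32_t>(5, strides, 0)); // 640
      return 0;
    }

The matmul.cpp change similarly skips allocating and tracking a workspace array
when the cuBLASLt heuristic reports workspaceSize == 0, passing a null pointer
to cublasLtMatmul instead, which the API permits for a zero-sized workspace.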