[CUDA] speedup handling scalars (#2389 )

* speedup scalars in cuda * comment
fix cuda manylinux version to match others (#2388 )
2025-12-16 01:49:05 +08:00 · 2025-07-18 21:47:31 -07:00 · 2025-07-18 21:02:16 -07:00
4 changed files with 91 additions and 8 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -366,7 +366,7 @@ jobs:
        type: string
        default: ""
    machine:
-      image: linux-cuda-12:default
+      image: linux-cuda-12:2024.11.1
      resource_class: gpu.nvidia.small.gen2
    steps:
      - checkout
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -17,6 +17,52 @@ namespace cu {

 constexpr int page_size = 16384;

+// Allocations of at most this many bytes will try to use the small pool
+constexpr int small_block_size = 8;
+
+// The small pool size in bytes. This should be a multiple of the host page
+// size and small_block_size.
+constexpr int small_pool_size = 4 * page_size;
+
+// Allocates one managed buffer of small_pool_size bytes and threads every
+// small_block_size-byte slot of it onto a singly linked free list whose links
+// are stored inside the free slots themselves.
+SmallSizePool::SmallSizePool() {
+  // A free slot stores the link to the next free slot in its own bytes, so a
+  // slot must be able to hold a Block and the slots must tile the buffer.
+  static_assert(
+      sizeof(Block) <= static_cast<size_t>(small_block_size),
+      "small_block_size must be large enough to hold a free-list link");
+  static_assert(
+      small_pool_size % small_block_size == 0,
+      "small_pool_size must be a multiple of small_block_size");
+
+  CHECK_CUDA_ERROR(cudaMallocManaged(&buffer_, small_pool_size));
+  auto* begin = static_cast<char*>(buffer_);
+  end_ = begin + small_pool_size;
+  next_free_ = reinterpret_cast<Block*>(begin);
+
+  // Link each slot to the slot that follows it; the last slot ends the list.
+  constexpr size_t num_blocks = small_pool_size / small_block_size;
+  auto* curr = next_free_;
+  for (size_t i = 1; i < num_blocks; ++i) {
+    curr->next = reinterpret_cast<Block*>(begin + i * small_block_size);
+    curr = curr->next;
+  }
+  curr->next = nullptr;
+}
+
+SmallSizePool::~SmallSizePool() {
+  CHECK_CUDA_ERROR(cudaFree(buffer_)); // Frees the pool's backing buffer.
+}
+
+// Pops the head of the free list and hands it to the caller. Yields nullptr
+// when the free list is empty, i.e. every block of the pool is in use.
+void* SmallSizePool::malloc() {
+  Block* head = next_free_;
+  if (head != nullptr) {
+    // Advance the list past the block being handed out.
+    next_free_ = head->next;
+  }
+  return head;
+}
+
+// Pushes the block at p onto the front of the free list. The start of the
+// block is overwritten with the link to the previous head. p is not validated
+// here; callers are expected to check in_pool(p) first.
+void SmallSizePool::free(void* p) {
+  Block* node = static_cast<Block*>(p);
+  node->next = next_free_;
+  next_free_ = node;
+}
+
+// Reports whether p lies inside the pool's buffer, i.e. in the half-open
+// address range [buffer_, end_).
+bool SmallSizePool::in_pool(void* p) {
+  if (p < buffer_) {
+    return false;
+  }
+  return p < end_;
+}
+
 CudaAllocator::CudaAllocator()
    : buffer_cache_(
          page_size,
@@ -36,7 +82,9 @@ Buffer CudaAllocator::malloc(size_t size) {
  // Find available buffer from cache.
  auto orig_size = size;
  std::unique_lock lock(mutex_);
-  if (size < page_size) {
+  if (size <= small_block_size) {
+    size = 8;
+  } else if (size < page_size) {
    size = next_power_of_2(size);
  } else {
    size = page_size * ((size + page_size - 1) / page_size);
@@ -53,11 +101,19 @@ Buffer CudaAllocator::malloc(size_t size) {

    lock.unlock();
    buf = new CudaBuffer{nullptr, size};
-    cudaError_t err = cudaMallocManaged(&buf->data, size);
-    if (err != cudaSuccess && err != cudaErrorMemoryAllocation) {
-      throw std::runtime_error(fmt::format(
-          "cudaMallocManaged failed: {}.", cudaGetErrorString(err)));
+
+    // Try the scalar pool first
+    if (size <= small_block_size) {
+      buf->data = scalar_pool_.malloc();
    }
+    if (!buf->data) {
+      cudaError_t err = cudaMallocManaged(&buf->data, size);
+      if (err != cudaSuccess && err != cudaErrorMemoryAllocation) {
+        throw std::runtime_error(fmt::format(
+            "cudaMallocManaged failed: {}.", cudaGetErrorString(err)));
+      }
+    }
+
    lock.lock();
  }
  active_memory_ += size;
@@ -116,7 +172,11 @@ void CudaAllocator::cuda_free(void* buf) {
      return;
    }
  }
-  cudaFree(buf);
+  if (scalar_pool_.in_pool(buf)) {
+    scalar_pool_.free(buf);
+  } else {
+    cudaFree(buf);
+  }
 }

 size_t CudaAllocator::get_active_memory() const {
--- a/mlx/backend/cuda/allocator.h
+++ b/mlx/backend/cuda/allocator.h
@@ -22,6 +22,28 @@ struct CudaBuffer {
  size_t size;
 };

+// Pool of fixed-size blocks carved out of a single buffer. Free blocks are
+// chained into a singly linked list whose links live inside the free blocks.
+class SmallSizePool {
+ public:
+  SmallSizePool();
+  ~SmallSizePool();
+
+  // The pool owns its buffer, so it cannot be copied.
+  SmallSizePool(const SmallSizePool&) = delete;
+  SmallSizePool& operator=(const SmallSizePool&) = delete;
+
+  // Takes one block from the pool; nullptr when no block is free.
+  void* malloc();
+  // Gives a block previously obtained from malloc() back to the pool.
+  void free(void* p);
+  // Whether p points into the pool's buffer.
+  bool in_pool(void* p);
+
+ private:
+  // Free-list node overlaid on the storage of a free block.
+  struct Block {
+    Block* next;
+  };
+
+  // Start of the pool's buffer.
+  void* buffer_{nullptr};
+  // Head of the free list; nullptr when the pool is exhausted.
+  Block* next_free_{nullptr};
+  // One past the last byte of the pool's buffer.
+  void* end_{nullptr};
+};
+
 class CudaAllocator : public allocator::Allocator {
 public:
  Buffer malloc(size_t size) override;
@@ -60,6 +82,7 @@ class CudaAllocator : public allocator::Allocator {
  BufferCache<CudaBuffer> buffer_cache_;
  size_t active_memory_{0};
  size_t peak_memory_{0};
+  SmallSizePool scalar_pool_;
 };

 CudaAllocator& allocator();
--- a/python/scripts/repair_cuda.sh
+++ b/python/scripts/repair_cuda.sh
@@ -1,7 +1,7 @@
 #!/bin/bash

 auditwheel repair dist/* \
-  --plat manylinux_2_39_x86_64 \
+  --plat manylinux_2_35_x86_64 \
  --exclude libcublas* \
  --exclude libnvrtc* \
  -w wheel_tmp
Author	SHA1	Message	Date
Awni Hannun	93d70419e7	[CUDA] speedup handling scalars (#2389 ) * speedup scalars in cuda * comment	2025-07-18 21:47:31 -07:00
Awni Hannun	63f663d9c6	fix cuda manylinux version to match others (#2388 )	2025-07-18 21:02:16 -07:00