// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/allocator.h"
#include "mlx/backend/common/buffer_cache.h"
#include "mlx/backend/cuda/cuda_utils.h"

#include <cuda_runtime.h>

#include <mutex>
#include <set>
#include <utility>
#include <vector>

namespace mlx::core::cu {
class CommandEncoder;

using allocator::Buffer;

// Holds a single CUDA allocation. `device` is the CUDA device the memory
// lives on, or -1 when the buffer is CUDA managed (unified) memory.
struct CudaBuffer {
  void* data;
  size_t size;
  int device; // -1 for managed
};
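
// Fixed-size pool of CudaBuffer blocks kept on an intrusive free list (a free
// Block reuses its storage as the `next` pointer). Presumably used so that
// tiny scalar allocations avoid the main cache path; see `scalar_pool_` below.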
class SmallSizePool {
 private:
  union Block {
    Block* next;
    CudaBuffer buf;
  };

  Block* buffer_{nullptr};
  void* data_{nullptr};
  Block* next_free_{nullptr};

 public:
  SmallSizePool();
  ~SmallSizePool();

  SmallSizePool(const SmallSizePool&) = delete;
  SmallSizePool& operator=(const SmallSizePool&) = delete;

  CudaBuffer* malloc();
  void free(CudaBuffer* buf);
  bool in_pool(CudaBuffer* buf);
};
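
// Allocator for CUDA memory that caches freed buffers: releases go into
// `buffer_cache_` (up to the cache limit) so later requests can be served
// without a new CUDA allocation. Tracks active and peak memory, enforces a
// configurable memory limit, and scalar-sized requests appear to come from
// `scalar_pool_`.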
class CudaAllocator : public allocator::Allocator {
 public:
  Buffer malloc(size_t size) override;
  // Asynchronous allocation on a specific device and stream.
  Buffer malloc_async(size_t size, int device, cudaStream_t stream);
  void free(Buffer buffer) override;
  size_t size(Buffer buffer) const override;

  // Memory statistics and limits.
  size_t get_active_memory() const;
  size_t get_peak_memory() const;
  void reset_peak_memory();
  size_t get_memory_limit();
  size_t set_memory_limit(size_t limit);
  size_t get_cache_memory() const;
  size_t set_cache_limit(size_t limit);
  void clear_cache();

 private:
  void cuda_free(CudaBuffer* buf);

  CudaAllocator();
  friend CudaAllocator& allocator();

  std::mutex mutex_;
  size_t memory_limit_;
  size_t free_limit_;
  size_t total_memory_;
  size_t max_pool_size_;
  BufferCache<CudaBuffer> buffer_cache_;
  size_t active_memory_{0};
  size_t peak_memory_{0};
  std::vector<cudaStream_t> free_streams_;
  std::vector<cudaMemPool_t> mem_pools_;
  SmallSizePool scalar_pool_;
};
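
// Returns the global CudaAllocator; its constructor is private, so this
// accessor is the only place it gets created.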
CudaAllocator& allocator();
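
// Convenience wrapper that presumably forwards to CudaAllocator::malloc_async
// using the device and stream associated with `encoder`.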
Buffer malloc_async(size_t size, CommandEncoder& encoder);
} // namespace mlx::core::cu