Mirror of https://github.com/ml-explore/mlx.git (synced 2025-12-16 01:49:05 +08:00)
367 lines · 9.2 KiB · C++
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/allocator.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/utils.h"
#include "mlx/utils.h"

#include <cuda_runtime.h>
#include <fmt/format.h>
#include <unistd.h>

#include <cassert>

namespace mlx::core {

namespace cu {

constexpr int page_size = 16384;

// Any allocation of at most this many bytes will try to use the small pool.
constexpr int small_block_size = 8;

#if CUDART_VERSION >= 13000
inline cudaMemLocation cuda_mem_loc(int i) {
  cudaMemLocation loc;
  loc.type = cudaMemLocationTypeDevice;
  loc.id = i;
  return loc;
}
#else
inline int cuda_mem_loc(int i) {
  return i;
}
#endif // CUDART_VERSION >= 13000

// The small pool size in bytes. This should be a multiple of the host page
// size and of small_block_size.
constexpr int small_pool_size = 4 * page_size;

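// SmallSizePool carves a single managed (cudaMallocManaged) allocation into
// fixed-size blocks and hands them out through an intrusive free list, so
// scalar-sized requests avoid paying for a full CUDA allocation each time.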
SmallSizePool::SmallSizePool() {
  auto num_blocks = small_pool_size / small_block_size;
  buffer_ = new Block[num_blocks];

  next_free_ = buffer_;

  CHECK_CUDA_ERROR(cudaMallocManaged(&data_, small_pool_size));

  int device_count = 0;
  CHECK_CUDA_ERROR(cudaGetDeviceCount(&device_count));
  for (int i = 0; i < device_count; ++i) {
    auto loc = cuda_mem_loc(i);
    CHECK_CUDA_ERROR(
        cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetAccessedBy, loc));
  }

  auto curr = next_free_;
  for (size_t i = 1; i < num_blocks; ++i) {
    curr->next = buffer_ + i;
    curr = curr->next;
  }
  curr->next = nullptr;
}

SmallSizePool::~SmallSizePool() {
  CHECK_CUDA_ERROR(cudaFree(data_));
  delete[] buffer_;
}

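// Pop a block off the free list and point it at its slice of the managed
// buffer. Returns nullptr when the pool is exhausted, in which case the
// caller falls back to a regular allocation.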
CudaBuffer* SmallSizePool::malloc() {
  if (next_free_ == nullptr) {
    return nullptr;
  }
  Block* b = next_free_;
  uint64_t i = next_free_ - buffer_;
  next_free_ = next_free_->next;
  b->buf.data = static_cast<char*>(data_) + i * small_block_size;
  b->buf.size = small_block_size;
  b->buf.device = -1;
  return &b->buf;
}

void SmallSizePool::free(CudaBuffer* buf) {
  auto b = reinterpret_cast<Block*>(buf);
  b->next = next_free_;
  next_free_ = b;
}

bool SmallSizePool::in_pool(CudaBuffer* buf) {
  constexpr int num_blocks = (small_pool_size / small_block_size);
  auto b = reinterpret_cast<Block*>(buf);
  int64_t block_num = b - buffer_;
  return block_num >= 0 && block_num < num_blocks;
}

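// The allocator caps usable memory at 95% of device memory, keeps one
// non-blocking stream per device for deferred frees, and records each
// device's default memory pool so its reserved memory can be checked later.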
CudaAllocator::CudaAllocator()
    : buffer_cache_(
          page_size,
          [](CudaBuffer* buf) { return buf->size; },
          [this](CudaBuffer* buf) { cuda_free(buf); }) {
  size_t free;
  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total_memory_));
  memory_limit_ = total_memory_ * 0.95;
  free_limit_ = total_memory_ - memory_limit_;
  max_pool_size_ = memory_limit_;

  int device_count = 0;
  CHECK_CUDA_ERROR(cudaGetDeviceCount(&device_count));
  int curr;
  CHECK_CUDA_ERROR(cudaGetDevice(&curr));
  for (int i = 0; i < device_count; ++i) {
    CHECK_CUDA_ERROR(cudaSetDevice(i));
    cudaStream_t s;
    CHECK_CUDA_ERROR(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));
    free_streams_.push_back(s);

    cudaMemPool_t mem_pool;
    CHECK_CUDA_ERROR(cudaDeviceGetDefaultMemPool(&mem_pool, i));
    mem_pools_.push_back(mem_pool);
  }
  CHECK_CUDA_ERROR(cudaSetDevice(curr));
}

void copy_to_managed(CudaBuffer& buf) {
  // TODO: maybe make this async on an I/O stream to avoid synchronizing the
  // device on malloc and free.
  void* new_data;
  CHECK_CUDA_ERROR(cudaMallocManaged(&new_data, buf.size));
  buf.device = -1;
  CHECK_CUDA_ERROR(cudaMemcpy(new_data, buf.data, buf.size, cudaMemcpyDefault));
  CHECK_CUDA_ERROR(cudaFree(buf.data));
  buf.data = new_data;
}

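// Requests are bucketed before hitting the cache: anything up to
// small_block_size becomes an 8-byte slot, sizes below a page are rounded up
// to the next power of two, and larger sizes are rounded up to whole pages.
// Tiny or stream-less requests are served from managed memory (device == -1).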
Buffer
CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
  if (size == 0) {
    return Buffer{new CudaBuffer{nullptr, 0, -1}};
  }

  // Find an available buffer in the cache.
  std::unique_lock lock(mutex_);
  if (size <= small_block_size) {
    size = 8;
  } else if (size < page_size) {
    size = next_power_of_2(size);
  } else {
    size = page_size * ((size + page_size - 1) / page_size);
  }

  if (size <= small_block_size || stream == nullptr) {
    device = -1;
  }

  CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
  if (!buf) {
    // Under memory pressure, try to reclaim memory from the cache first.
    int64_t mem_to_free =
        get_active_memory() + get_cache_memory() + size - memory_limit_;
    if (mem_to_free > 0) {
      buffer_cache_.release_cached_buffers(mem_to_free);
    }

    // Try the scalar pool first.
    if (size <= small_block_size) {
      buf = scalar_pool_.malloc();
    }
    lock.unlock();
    if (!buf) {
      void* data = nullptr;
      if (device == -1) {
        CHECK_CUDA_ERROR(cudaMallocManaged(&data, size));
      } else {
        CHECK_CUDA_ERROR(cudaMallocAsync(&data, size, stream));
      }
      if (!data) {
        std::ostringstream msg;
        msg << "[malloc] Unable to allocate " << size << " bytes.";
        throw std::runtime_error(msg.str());
      }
      buf = new CudaBuffer{data, size, device};
    }
    lock.lock();

    // If any CUDA memory pool has too much reserved memory, clear some
    // memory from the cache. This prevents graph / kernel execution from
    // failing with an OOM.
    if (get_cache_memory() > 0) {
      for (auto p : mem_pools_) {
        size_t used = 0;
        CHECK_CUDA_ERROR(cudaMemPoolGetAttribute(
            p, cudaMemPoolAttrReservedMemCurrent, &used));
        if (used > (total_memory_ - free_limit_)) {
          buffer_cache_.release_cached_buffers(free_limit_);
          break;
        }
      }
    }
  }
  active_memory_ += buf->size;
  peak_memory_ = std::max(active_memory_, peak_memory_);

  // Maintain the cache below the requested limit.
  if (get_cache_memory() > max_pool_size_) {
    buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
  }
  // Copy to managed memory here if the buffer is not on the right device.
  if (buf->device >= 0 && buf->device != device) {
    copy_to_managed(*buf);
  }
  return Buffer{buf};
}

Buffer CudaAllocator::malloc(size_t size) {
  return malloc_async(size, -1, nullptr);
}

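// Freed buffers are recycled into the cache while it is under the cache
// limit; otherwise they are released immediately via cuda_free.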
void CudaAllocator::free(Buffer buffer) {
  auto* buf = static_cast<CudaBuffer*>(buffer.ptr());
  if (!buf) {
    return;
  }
  if (buf->size == 0) {
    delete buf;
    return;
  }

  std::unique_lock lock(mutex_);
  active_memory_ -= buf->size;
  if (get_cache_memory() < max_pool_size_) {
    buffer_cache_.recycle_to_cache(buf);
  } else {
    cuda_free(buf);
  }
}

size_t CudaAllocator::size(Buffer buffer) const {
  auto* buf = static_cast<CudaBuffer*>(buffer.ptr());
  if (!buf) {
    return 0;
  }
  return buf->size;
}

// This must be called with mutex_ acquired.
void CudaAllocator::cuda_free(CudaBuffer* buf) {
  if (scalar_pool_.in_pool(buf)) {
    scalar_pool_.free(buf);
  } else {
    if (buf->device >= 0) {
      CHECK_CUDA_ERROR(cudaFreeAsync(buf->data, free_streams_[buf->device]));
    } else {
      CHECK_CUDA_ERROR(cudaFree(buf->data));
    }
    delete buf;
  }
}

size_t CudaAllocator::get_active_memory() const {
  return active_memory_;
}

size_t CudaAllocator::get_peak_memory() const {
  return peak_memory_;
}

void CudaAllocator::reset_peak_memory() {
  std::lock_guard lock(mutex_);
  peak_memory_ = 0;
}

size_t CudaAllocator::get_memory_limit() {
  return memory_limit_;
}

size_t CudaAllocator::set_memory_limit(size_t limit) {
  std::lock_guard lock(mutex_);
  std::swap(limit, memory_limit_);
  return limit;
}

size_t CudaAllocator::get_cache_memory() const {
  return buffer_cache_.cache_size();
}

size_t CudaAllocator::set_cache_limit(size_t limit) {
  std::lock_guard lk(mutex_);
  std::swap(limit, max_pool_size_);
  return limit;
}

void CudaAllocator::clear_cache() {
  std::lock_guard lk(mutex_);
  buffer_cache_.clear();
}

CudaAllocator& allocator() {
  // By creating the |allocator_| on the heap, the destructor of CudaAllocator
  // will not be called on exit, and buffers in the cache will be leaked. This
  // can save some time at program exit.
  static CudaAllocator* allocator_ = new CudaAllocator;
  return *allocator_;
}

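// Allocate on the encoder's device and stream; throws if a non-zero-sized
// allocation fails.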
Buffer malloc_async(size_t size, CommandEncoder& encoder) {
  auto buffer = allocator().malloc_async(
      size, encoder.device().cuda_device(), encoder.stream());
  if (size && !buffer.ptr()) {
    std::ostringstream msg;
    msg << "[malloc_async] Unable to allocate " << size << " bytes.";
    throw std::runtime_error(msg.str());
  }
  return buffer;
}

} // namespace cu

namespace allocator {

Allocator& allocator() {
  return cu::allocator();
}

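// Host access migrates device-resident buffers to managed memory so the
// returned pointer is valid on the CPU.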
void* Buffer::raw_ptr() {
  if (!ptr_) {
    return nullptr;
  }
  auto& cbuf = *static_cast<cu::CudaBuffer*>(ptr_);
  if (cbuf.device != -1) {
    copy_to_managed(cbuf);
  }
  return cbuf.data;
}

} // namespace allocator

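// The functions below are the CUDA implementations of the top-level memory
// helpers in mlx::core. A minimal usage sketch (values are illustrative):
//
//   size_t old_limit = mlx::core::set_cache_limit(0); // disable caching
//   mlx::core::clear_cache();                         // drop cached buffers
//   mlx::core::set_cache_limit(old_limit);            // restore the limit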
size_t get_active_memory() {
  return cu::allocator().get_active_memory();
}

size_t get_peak_memory() {
  return cu::allocator().get_peak_memory();
}

void reset_peak_memory() {
  return cu::allocator().reset_peak_memory();
}

size_t set_memory_limit(size_t limit) {
  return cu::allocator().set_memory_limit(limit);
}

size_t get_memory_limit() {
  return cu::allocator().get_memory_limit();
}

size_t get_cache_memory() {
  return cu::allocator().get_cache_memory();
}

size_t set_cache_limit(size_t limit) {
  return cu::allocator().set_cache_limit(limit);
}

void clear_cache() {
  cu::allocator().clear_cache();
}

// Not supported in CUDA.
size_t set_wired_limit(size_t) {
  return 0;
}

} // namespace mlx::core