mirror of
https://github.com/ml-explore/mlx.git
synced 2025-12-16 01:49:05 +08:00
[CUDA] Reduce use of managed memory (#2725)
Some checks failed
Nightly Build / build_linux_release (3.10) (push) Has been cancelled
Nightly Build / build_linux_release (3.14) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.10) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.11) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.12) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.13) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.14) (push) Has been cancelled
Nightly Build / build_mac_release (3.10) (push) Has been cancelled
Nightly Build / build_mac_release (3.13) (push) Has been cancelled
Nightly Build / build_cuda_with_tests (push) Has been cancelled
Nightly Build / build_cuda_release (push) Has been cancelled
Nightly Build / Linux Fedora CPP Build (aarch64) (push) Has been cancelled
Nightly Build / Linux Fedora CPP Build (x86_64) (push) Has been cancelled
Some checks failed
Nightly Build / build_linux_release (3.10) (push) Has been cancelled
Nightly Build / build_linux_release (3.14) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.10) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.11) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.12) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.13) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.14) (push) Has been cancelled
Nightly Build / build_mac_release (3.10) (push) Has been cancelled
Nightly Build / build_mac_release (3.13) (push) Has been cancelled
Nightly Build / build_cuda_with_tests (push) Has been cancelled
Nightly Build / build_cuda_release (push) Has been cancelled
Nightly Build / Linux Fedora CPP Build (aarch64) (push) Has been cancelled
Nightly Build / Linux Fedora CPP Build (x86_64) (push) Has been cancelled
* Use async cuda malloc managed with cuda 13 * add pool threshold * refactor for regular cuda malloc * load eval gpu for cuda * remove use of cuda pool, use cuda free async * fix * fix * fix * fix * fix + comment
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/fence.h"
|
||||
#include "mlx/backend/cuda/allocator.h"
|
||||
#include "mlx/backend/cuda/device.h"
|
||||
#include "mlx/backend/cuda/event.h"
|
||||
|
||||
namespace mlx::core {
|
||||
@@ -20,8 +22,24 @@ void Fence::wait(Stream s, const array&) {
|
||||
fence->event.wait(fence->count);
|
||||
}
|
||||
|
||||
void Fence::update(Stream s, const array&) {
|
||||
void Fence::update(Stream s, const array& a, bool cross_device) {
|
||||
auto* fence = static_cast<FenceImpl*>(fence_.get());
|
||||
if (cross_device) {
|
||||
// Move to managed memory if there is a device switch
|
||||
auto& cbuf =
|
||||
*static_cast<cu::CudaBuffer*>(const_cast<array&>(a).buffer().ptr());
|
||||
if (cbuf.device != -1) {
|
||||
void* new_data;
|
||||
CHECK_CUDA_ERROR(cudaMallocManaged(&new_data, cbuf.size));
|
||||
cbuf.device = -1;
|
||||
auto& encoder = cu::device(s.device).get_command_encoder(s);
|
||||
encoder.commit();
|
||||
CHECK_CUDA_ERROR(cudaMemcpyAsync(
|
||||
new_data, cbuf.data, cbuf.size, cudaMemcpyDefault, encoder.stream()));
|
||||
CHECK_CUDA_ERROR(cudaFreeAsync(cbuf.data, encoder.stream()));
|
||||
cbuf.data = new_data;
|
||||
}
|
||||
}
|
||||
fence->count++;
|
||||
fence->event.signal(s, fence->count);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user