cuda synch properly waits for all tasks to finish and clear

This commit is contained in:
Awni Hannun 2025-06-17 07:20:05 -07:00
parent b8022c578a
commit 3d94859ea2
6 changed files with 18 additions and 4 deletions

View File

@ -106,7 +106,6 @@ void CudaAllocator::cuda_free(void* buf) {
return;
}
}
cudaFree(buf);
}

View File

@ -6,6 +6,7 @@
#include <fmt/format.h>
#include <nvtx3/nvtx3.hpp>
#include <future>
namespace mlx::core {
@ -107,6 +108,16 @@ void CommandEncoder::commit() {
worker_.commit(stream_.last_cuda_stream());
}
// Block the calling thread until the stream has been synchronized and a
// completion handler registered by this call has been executed.
void CommandEncoder::synchronize() {
  // Synchronize the encoder's stream first.
  stream().synchronize();

  // The promise is shared with the handler below; the handler fulfils it and
  // this thread waits on the matching future.
  auto done = std::make_shared<std::promise<void>>();
  auto finished = done->get_future();
  add_completed_handler([done = std::move(done)]() { done->set_value(); });

  // Close the current batch and commit it to the worker so the handler gets
  // a chance to run.
  worker_.end_batch();
  worker_.commit();

  // Returns only once the handler has set the promise.
  finished.wait();
}
Device& device(mlx::core::Device device) {
static std::unordered_map<int, Device> devices;
auto it = devices.find(device.index);

View File

@ -123,6 +123,9 @@ class CommandEncoder {
return has_gpu_work_;
}
// Block the caller until the kernels and the completion handlers have
// finished.
void synchronize();
private:
Device& device_;
DeviceStream& stream_;

View File

@ -62,7 +62,7 @@ void finalize(Stream s) {
// Synchronize the GPU work associated with stream `s`.
void synchronize(Stream s) {
// Scoped NVTX range labelled "gpu::synchronize"; it lasts until `r` goes out
// of scope at the end of this function.
nvtx3::scoped_range r("gpu::synchronize");
// Synchronize the stream object looked up for `s`.
cu::get_stream(s).synchronize();
// Synchronize the command encoder looked up for `s`.
// NOTE(review): the encoder-level call may already cover the stream-level
// call above -- confirm whether both calls are intended here.
cu::get_command_encoder(s).synchronize();
}
} // namespace mlx::core::gpu

View File

@ -80,7 +80,9 @@ void Worker::thread_fn() {
}
worker_tasks_.erase(worker_tasks_.begin(), end);
}
for (auto& task : tasks) {
// Make sure tasks are cleared before the next wait
for (int i = 0; i < tasks.size(); ++i) {
auto task = std::move(tasks[i]);
task();
}
worker_event_.wait(batch + 1);

View File

@ -6,7 +6,6 @@ cuda_skip = {
"TestEinsum.test_ellipses",
"TestEinsum.test_opt_einsum_test_cases",
"TestLoad.test_load_f8_e4m3",
"TestMemory.test_memory_info",
"TestLayers.test_group_norm",
"TestLayers.test_pooling",
"TestLayers.test_quantized_embedding",