Faster synchronization Fence primitive (#1773)

* try faster synchronization

move event

fixes

update bench

fix

fix

* non-functioning kernel

* try alternative fence

* cleanup barrier

* get rid of event_fence

* update benchmarks

* doc string in metal fence
This commit is contained in:
Awni Hannun
2025-01-17 18:42:19 -08:00
committed by GitHub
parent 0c259961ac
commit a4667da1eb
11 changed files with 362 additions and 31 deletions

View File

@@ -134,6 +134,13 @@ CommandEncoder::~CommandEncoder() {
enc_->release();
}
// Bind a raw Metal buffer on the wrapped encoder (enc_).
//
// buf:    the buffer to bind; forwarded unchanged.
// idx:    the index the buffer is bound at.
// offset: offset forwarded to setBuffer (presumably in bytes -- confirm
//         against the Metal API). The inline comment below indicates the
//         declaration gives it a default of 0.
void CommandEncoder::set_buffer(
const MTL::Buffer* buf,
int idx,
int64_t offset /* = 0 */) {
// Argument order differs from this method's signature: setBuffer takes the
// offset before the index.
enc_->setBuffer(buf, offset, idx);
}
void CommandEncoder::set_input_array(
const array& a,
int idx,
@@ -155,6 +162,10 @@ void CommandEncoder::set_output_array(
int64_t offset /* = 0 */) {
// Add barriers before adding the output to the output set
set_input_array(a, idx, offset);
register_output_array(a);
}
void CommandEncoder::register_output_array(array& a) {
all_outputs_.insert(a.buffer().ptr());
auto buf = static_cast<MTL::Resource*>(a.buffer().ptr());
if (concurrent_) {
@@ -189,6 +200,10 @@ void CommandEncoder::dispatch_threads(
enc_->dispatchThreads(grid_dims, group_dims);
}
// Insert a memory barrier on the wrapped encoder (enc_), scoped to buffers
// via MTL::BarrierScopeBuffers.
void CommandEncoder::barrier() {
enc_->memoryBarrier(MTL::BarrierScopeBuffers);
}
Device::Device() {
auto pool = new_scoped_memory_pool();
device_ = load_device();