[CUDA] Reduce use of managed memory (#2725)

* Use async CUDA managed malloc with CUDA 13

* Add pool release threshold

* Refactor for regular cudaMalloc

* Load eval GPU for CUDA

* Remove use of the CUDA pool; use cudaFreeAsync (see the sketch below)

* fix

* fix

* fix

* fix

* fix + comment
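A minimal sketch of the stream-ordered allocation pattern these bullets refer to, using only the plain CUDA runtime API rather than MLX's `cu::` wrappers; the release-threshold value is an arbitrary illustration, not the value used in this change:

```cpp
#include <cuda_runtime.h>
#include <cstdint>

int main() {
  // Keep freed blocks cached in the default memory pool instead of
  // returning them to the driver at every synchronization point.
  int device = 0;
  cudaMemPool_t pool;
  cudaDeviceGetDefaultMemPool(&pool, device);
  uint64_t threshold = UINT64_MAX;  // illustrative: cache everything
  cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Stream-ordered allocation and free: ordered with respect to `stream`,
  // avoiding the device-wide synchronization implied by cudaFree.
  float* buf = nullptr;
  cudaMallocAsync(reinterpret_cast<void**>(&buf), 1024 * sizeof(float), stream);
  cudaMemsetAsync(buf, 0, 1024 * sizeof(float), stream);
  cudaFreeAsync(buf, stream);

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return 0;
}
```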
Author: Awni Hannun
Date: 2025-11-05 16:05:23 -08:00 (committed by GitHub)
Parent: 27778156dc
Commit: df58b4133a
79 changed files with 795 additions and 515 deletions


@@ -168,10 +168,10 @@ void ternary_op_gpu_inplace(
           num_blocks,
           block_dims,
           0,
-          a.data<bool>(),
-          b.data<DType>(),
-          c.data<DType>(),
-          out.data<DType>(),
+          gpu_ptr<bool>(a),
+          gpu_ptr<DType>(b),
+          gpu_ptr<DType>(c),
+          gpu_ptr<DType>(out),
           out.data_size());
     });
   } else {
@@ -211,10 +211,10 @@ void ternary_op_gpu_inplace(
           {num_blocks_x, num_blocks_y},
           block_dims,
           0,
-          a.data<bool>(),
-          b.data<DType>(),
-          c.data<DType>(),
-          out.data<DType>(),
+          gpu_ptr<bool>(a),
+          gpu_ptr<DType>(b),
+          gpu_ptr<DType>(c),
+          gpu_ptr<DType>(out),
           rest,
           const_param<dims_constant()>(shape),
           const_param<dims_constant()>(a_strides),
@@ -231,10 +231,10 @@ void ternary_op_gpu_inplace(
           {num_blocks_x, num_blocks_y},
           block_dims,
           0,
-          a.data<bool>(),
-          b.data<DType>(),
-          c.data<DType>(),
-          out.data<DType>(),
+          gpu_ptr<bool>(a),
+          gpu_ptr<DType>(b),
+          gpu_ptr<DType>(c),
+          gpu_ptr<DType>(out),
           rest,
           const_param(shape),
           const_param(a_strides),
@@ -256,7 +256,10 @@ void ternary_op_gpu(
   auto& b = inputs[1];
   auto& c = inputs[2];
   auto topt = get_ternary_op_type(a, b, c);
-  set_ternary_op_output_data(a, b, c, out, topt);
+  auto& encoder = cu::get_command_encoder(s);
+  set_ternary_op_output_data(a, b, c, out, topt, [&](auto n) {
+    return cu::malloc_async(n, encoder.stream());
+  });
   ternary_op_gpu_inplace<Op>(inputs, out, s);
 }
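For context on the last hunk, a hedged sketch of the allocator-callback pattern it introduces: the output-data helper takes a lambda so the buffer is allocated stream-ordered on the encoder's stream rather than via managed memory. `Encoder` and `set_output_data` below are hypothetical stand-ins, not MLX's actual internals.

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical stand-ins for the command encoder and output-data helper
// shown in the diff; the real MLX types differ.
struct Encoder {
  cudaStream_t stream;
};

// Defer allocation to a caller-supplied callable, so the CUDA backend can
// pass in a stream-ordered allocator instead of a managed-memory one.
template <typename Alloc>
void* set_output_data(size_t nbytes, Alloc&& alloc) {
  return alloc(nbytes);
}

int main() {
  Encoder enc;
  cudaStreamCreate(&enc.stream);

  // Mirrors the pattern in ternary_op_gpu: allocate the output buffer
  // asynchronously on the encoder's stream.
  void* out = set_output_data(256, [&](size_t n) {
    void* p = nullptr;
    cudaMallocAsync(&p, n, enc.stream);
    return p;
  });

  cudaFreeAsync(out, enc.stream);
  cudaStreamSynchronize(enc.stream);
  cudaStreamDestroy(enc.stream);
  return 0;
}
```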