From c9934fe8a46ffff2485fcecd2ae697db6b634c6c Mon Sep 17 00:00:00 2001 From: Awni Hannun Date: Thu, 11 Jan 2024 11:57:24 -0800 Subject: [PATCH] Metal validation (#432) * tests clear metal validation * add cpp test with metal validation to circleci * nit --- .circleci/config.yml | 7 +++ mlx/backend/metal/allocator.cpp | 5 +++ mlx/backend/metal/copy.cpp | 3 ++ mlx/backend/metal/indexing.cpp | 75 +++++++++++++++++++++++--------- mlx/backend/metal/primitives.cpp | 30 +++++++++++-- mlx/backend/metal/reduce.cpp | 19 +++++--- mlx/ops.cpp | 1 - tests/allocator_tests.cpp | 2 - tests/arg_reduce_tests.cpp | 2 - tests/metal_tests.cpp | 33 ++++++++++++++ 10 files changed, 142 insertions(+), 35 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 13cebce75..b5636fa6c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,6 +80,13 @@ jobs: DEVICE=gpu python -m xmlrunner discover -v python/tests -o test-results/gpu - store_test_results: path: test-results + - run: + name: Build CPP only + command: | + mkdir -p build && cd build && cmake .. && make -j + - run: + name: Run CPP tests + command: METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./build/tests/tests build_release: machine: true diff --git a/mlx/backend/metal/allocator.cpp b/mlx/backend/metal/allocator.cpp index 3debc4742..3e856ac15 100644 --- a/mlx/backend/metal/allocator.cpp +++ b/mlx/backend/metal/allocator.cpp @@ -153,6 +153,11 @@ MetalAllocator::MetalAllocator() gc_limit_(0.95 * device_->recommendedMaxWorkingSetSize()) {} Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) { + // Metal doesn't like empty buffers + if (size == 0) { + return Buffer{nullptr}; + } + // Align up memory if (size > vm_page_size) { size = vm_page_size * ((size + vm_page_size - 1) / vm_page_size); diff --git a/mlx/backend/metal/copy.cpp b/mlx/backend/metal/copy.cpp index c14c1022e..67e839481 100644 --- a/mlx/backend/metal/copy.cpp +++ b/mlx/backend/metal/copy.cpp @@ -20,6 +20,9 @@ void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) { } else { out.set_data(allocator::malloc_or_wait(out.nbytes())); } + if (out.size() == 0) { + return; + } if (ctype == CopyType::GeneralGeneral) { ctype = CopyType::General; } diff --git a/mlx/backend/metal/indexing.cpp b/mlx/backend/metal/indexing.cpp index 1f905db1a..ce543bd67 100644 --- a/mlx/backend/metal/indexing.cpp +++ b/mlx/backend/metal/indexing.cpp @@ -1,5 +1,4 @@ // Copyright © 2023 Apple Inc. - #include #include #include @@ -33,6 +32,9 @@ void Gather::eval_gpu(const std::vector& inputs, array& out) { } out.set_data(allocator::malloc_or_wait(out.nbytes())); + if (out.size() == 0) { + return; + } auto& s = stream(); auto& d = metal::device(s.device); @@ -110,14 +112,18 @@ void Gather::eval_gpu(const std::vector& inputs, array& out) { for (int i = 0; i < nidx; ++i) { set_array_buffer(compute_encoder, arg_enc, inputs[i + 1], i); } - arg_enc->setBuffer( - static_cast(idx_shapes_buf.ptr()), 0, nidx + 1); - compute_encoder->useResource( - static_cast(idx_shapes_buf.ptr()), MTL::ResourceUsageRead); - arg_enc->setBuffer( - static_cast(idx_strides_buf.ptr()), 0, nidx + 2); - compute_encoder->useResource( - static_cast(idx_strides_buf.ptr()), MTL::ResourceUsageRead); + if (idx_ndim > 0) { + arg_enc->setBuffer( + static_cast(idx_shapes_buf.ptr()), 0, nidx + 1); + compute_encoder->useResource( + static_cast(idx_shapes_buf.ptr()), + MTL::ResourceUsageRead); + arg_enc->setBuffer( + static_cast(idx_strides_buf.ptr()), 0, nidx + 2); + compute_encoder->useResource( + static_cast(idx_strides_buf.ptr()), + MTL::ResourceUsageRead); + } *static_cast(arg_enc->constantData(nidx + 3)) = idx_ndim; // Set all the buffers @@ -163,6 +169,11 @@ void Scatter::eval_gpu(const std::vector& inputs, array& out) { inputs[0].data_size() == 1 ? CopyType::Scalar : CopyType::General; copy_gpu(inputs[0], out, copy_type); + // Empty update + if (inputs.back().size() == 0) { + return; + } + // Get stream auto& s = stream(); auto& d = metal::device(s.device); @@ -254,14 +265,18 @@ void Scatter::eval_gpu(const std::vector& inputs, array& out) { for (int i = 0; i < nidx; ++i) { set_array_buffer(compute_encoder, arg_enc, inputs[i + 1], i); } - arg_enc->setBuffer( - static_cast(idx_shapes_buf.ptr()), 0, nidx + 1); - compute_encoder->useResource( - static_cast(idx_shapes_buf.ptr()), MTL::ResourceUsageRead); - arg_enc->setBuffer( - static_cast(idx_strides_buf.ptr()), 0, nidx + 2); - compute_encoder->useResource( - static_cast(idx_strides_buf.ptr()), MTL::ResourceUsageRead); + if (idx_ndim > 0) { + arg_enc->setBuffer( + static_cast(idx_shapes_buf.ptr()), 0, nidx + 1); + compute_encoder->useResource( + static_cast(idx_shapes_buf.ptr()), + MTL::ResourceUsageRead); + arg_enc->setBuffer( + static_cast(idx_strides_buf.ptr()), 0, nidx + 2); + compute_encoder->useResource( + static_cast(idx_strides_buf.ptr()), + MTL::ResourceUsageRead); + } *static_cast(arg_enc->constantData(nidx + 3)) = idx_ndim; compute_encoder->setBuffer(static_cast(arg_buf.ptr()), 0, 0); @@ -272,14 +287,32 @@ void Scatter::eval_gpu(const std::vector& inputs, array& out) { } set_array_buffer(compute_encoder, upd, 1); set_array_buffer(compute_encoder, out, 2); - compute_encoder->setBytes(upd.shape().data(), upd_ndim * sizeof(int), 3); - compute_encoder->setBytes(upd.strides().data(), upd_ndim * sizeof(size_t), 4); + if (upd_ndim == 0) { + // Need placeholders so Metal doesn't compalain + int shape_ = 0; + size_t stride_ = 0; + compute_encoder->setBytes(&shape_, sizeof(int), 3); + compute_encoder->setBytes(&stride_, sizeof(size_t), 4); + } else { + compute_encoder->setBytes(upd.shape().data(), upd_ndim * sizeof(int), 3); + compute_encoder->setBytes( + upd.strides().data(), upd_ndim * sizeof(size_t), 4); + } compute_encoder->setBytes(&upd_ndim, sizeof(size_t), 5); compute_encoder->setBytes(&upd_size, sizeof(size_t), 6); size_t out_ndim = out.ndim(); - compute_encoder->setBytes(out.shape().data(), out_ndim * sizeof(int), 7); - compute_encoder->setBytes(out.strides().data(), out_ndim * sizeof(size_t), 8); + if (out_ndim == 0) { + // Need placeholders so Metal doesn't compalain + int shape_ = 0; + size_t stride_ = 0; + compute_encoder->setBytes(&shape_, sizeof(int), 7); + compute_encoder->setBytes(&stride_, sizeof(size_t), 8); + } else { + compute_encoder->setBytes(out.shape().data(), out_ndim * sizeof(int), 7); + compute_encoder->setBytes( + out.strides().data(), out_ndim * sizeof(size_t), 8); + } compute_encoder->setBytes(&out_ndim, sizeof(size_t), 9); compute_encoder->setBytes(axes_.data(), axes_.size() * sizeof(int), 10); diff --git a/mlx/backend/metal/primitives.cpp b/mlx/backend/metal/primitives.cpp index e8dac264b..ecea66e85 100644 --- a/mlx/backend/metal/primitives.cpp +++ b/mlx/backend/metal/primitives.cpp @@ -31,6 +31,9 @@ void binary_op( set_binary_op_output_data(a, b, outputs[1], bopt); auto& out = outputs[0]; + if (out.size() == 0) { + return; + } // Try to collapse contiguous dims auto [shape, strides] = collapse_contiguous_dims(a, b, out); @@ -120,6 +123,9 @@ void binary_op( auto& b = inputs[1]; auto bopt = get_binary_op_type(a, b); set_binary_op_output_data(a, b, out, bopt); + if (out.size() == 0) { + return; + } // Try to collapse contiguous dims auto [shape, strides] = collapse_contiguous_dims(a, b, out); @@ -214,6 +220,9 @@ void unary_op( } else { out.set_data(allocator::malloc_or_wait(out.nbytes())); } + if (in.size() == 0) { + return; + } auto& s = out.primitive().stream(); auto& d = metal::device(s.device); @@ -263,6 +272,9 @@ void arange_set_scalars(T start, T next, MTL::ComputeCommandEncoder* enc) { void Arange::eval_gpu(const std::vector& inputs, array& out) { assert(inputs.size() == 0); out.set_data(allocator::malloc_or_wait(out.nbytes())); + if (out.size() == 0) { + return; + } auto& s = stream(); auto& d = metal::device(s.device); auto kernel = d.get_kernel("arange" + type_to_name(out)); @@ -390,9 +402,18 @@ void ArgReduce::eval_gpu(const std::vector& inputs, array& out) { compute_encoder->setComputePipelineState(kernel); set_array_buffer(compute_encoder, in, 0); set_array_buffer(compute_encoder, out, 1); - compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 2); - compute_encoder->setBytes(in_strides.data(), ndim * sizeof(size_t), 3); - compute_encoder->setBytes(out_strides.data(), ndim * sizeof(size_t), 4); + if (ndim == 0) { + // Pass place holders so metal doesn't complain + int shape_ = 0; + size_t stride_ = 0; + compute_encoder->setBytes(&shape_, sizeof(int), 2); + compute_encoder->setBytes(&stride_, sizeof(size_t), 3); + compute_encoder->setBytes(&stride_, sizeof(size_t), 4); + } else { + compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 2); + compute_encoder->setBytes(in_strides.data(), ndim * sizeof(size_t), 3); + compute_encoder->setBytes(out_strides.data(), ndim * sizeof(size_t), 4); + } compute_encoder->setBytes(&ndim, sizeof(size_t), 5); compute_encoder->setBytes(&axis_stride, sizeof(size_t), 6); compute_encoder->setBytes(&axis_size, sizeof(size_t), 7); @@ -629,6 +650,9 @@ void RandomBits::eval_gpu(const std::vector& inputs, array& out) { size_t elems_per_key = out.size() / num_keys; size_t bytes_per_key = out.itemsize() * elems_per_key; out.set_data(allocator::malloc_or_wait(out.nbytes())); + if (out.size() == 0) { + return; + } size_t out_per_key = (bytes_per_key + 4 - 1) / 4; size_t half_size = out_per_key / 2; diff --git a/mlx/backend/metal/reduce.cpp b/mlx/backend/metal/reduce.cpp index 012038178..601ae4b61 100644 --- a/mlx/backend/metal/reduce.cpp +++ b/mlx/backend/metal/reduce.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include "mlx/backend/common/reduce.h" @@ -21,10 +20,14 @@ namespace mlx::core { namespace { -inline auto safe_divup(size_t n, size_t m) { +inline auto safe_div(size_t n, size_t m) { return m == 0 ? 0 : (n + m - 1) / m; } +inline auto safe_divup(size_t n, size_t m) { + return safe_div(n, m) * m; +} + // All Reduce void all_reduce_dispatch( const array& in, @@ -56,7 +59,7 @@ void all_reduce_dispatch( mod_in_size > thread_group_size ? thread_group_size : mod_in_size; // If the number of thread groups needed exceeds 1024, we reuse threads groups - uint n_thread_groups = safe_divup(mod_in_size, thread_group_size); + uint n_thread_groups = safe_div(mod_in_size, thread_group_size); n_thread_groups = std::min(n_thread_groups, 1024u); uint nthreads = n_thread_groups * thread_group_size; @@ -204,7 +207,8 @@ void strided_reduce_general_dispatch( // if we ever come to doubles. In that case, we should also cut // down the number of threads we launch in a threadgroup compute_encoder->setThreadgroupMemoryLength( - threadgroup_dim_x * threadgroup_dim_y * out.itemsize(), 0); + safe_divup(threadgroup_dim_x * threadgroup_dim_y * out.itemsize(), 16), + 0); compute_encoder->dispatchThreadgroups(grid_dims, group_dims); } @@ -231,7 +235,10 @@ void Reduce::eval_gpu(const std::vector& inputs, array& out) { assert(!axes_.empty()); // Continue with reduction operation - out.set_data(allocator::malloc_or_wait(out.nbytes())); + // Minimum of 4 bytes since we use size 4 structs for all reduce + // and metal will complain o/w + size_t min_bytes = std::max(out.nbytes(), 4ul); + out.set_data(allocator::malloc_or_wait(min_bytes)); std::string op_name; switch (reduce_type_) { case Reduce::And: @@ -273,7 +280,7 @@ void Reduce::eval_gpu(const std::vector& inputs, array& out) { } // Reduce - { + if (in.size() > 0) { std::vector copies; ReductionPlan plan = get_reduction_plan(in, axes_); diff --git a/mlx/ops.cpp b/mlx/ops.cpp index 98a2bd889..60a02bb86 100644 --- a/mlx/ops.cpp +++ b/mlx/ops.cpp @@ -1,5 +1,4 @@ // Copyright © 2023 Apple Inc. - #include #include #include diff --git a/tests/allocator_tests.cpp b/tests/allocator_tests.cpp index 0c1f30008..db155d23e 100644 --- a/tests/allocator_tests.cpp +++ b/tests/allocator_tests.cpp @@ -38,6 +38,4 @@ TEST_CASE("test large allocations") { auto buffer = allocator::malloc(size); allocator::free(buffer); } - // Shouldn't be able to allocate an exabyte anytime soon. - CHECK_THROWS_AS(allocator::malloc(1ull << 60), std::runtime_error); } diff --git a/tests/arg_reduce_tests.cpp b/tests/arg_reduce_tests.cpp index b571c8f61..f10cf5285 100644 --- a/tests/arg_reduce_tests.cpp +++ b/tests/arg_reduce_tests.cpp @@ -1,7 +1,5 @@ // Copyright © 2023 Apple Inc. -#include - #include "doctest/doctest.h" #include "mlx/mlx.h" diff --git a/tests/metal_tests.cpp b/tests/metal_tests.cpp index adeeb742a..38cf47a45 100644 --- a/tests/metal_tests.cpp +++ b/tests/metal_tests.cpp @@ -438,3 +438,36 @@ TEST_CASE("test metal matmul") { CHECK(array_equal(out, full({3, 3, 2, 2}, 2.0f), Device::cpu).item()); } } + +TEST_CASE("test metal validation") { + // Run this test with Metal validation enabled + // METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./tests/tests \ + // -tc="test metal validation" \ + + auto x = array({}); + eval(exp(x)); + + auto y = array({}); + eval(add(x, y)); + + eval(sum(x)); + + x = array({1, 2, 3}); + y = array(0); + eval(gather(x, y, 0, {0})); + eval(gather(x, y, 0, {2})); + + eval(gather(x, y, 0, {0})); + eval(gather(x, y, 0, {2})); + + eval(scatter(x, y, array({2}), 0)); + + x = arange(0, -3, 1); + eval(x); + array_equal(x, array({})).item(); + + x = array({1.0, 0.0}); + eval(argmax(x)); + + eval(scatter_max(array(1), {}, array(2), std::vector{})); +}