// mlx/mlx/backend/metal/compiled.cpp

// Copyright © 2023-2024 Apple Inc.

#include <sstream>

#include "mlx/backend/common/compiled.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/graph_utils.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

// Number of consecutive elements along the innermost (collapsed) dimension
// that a single thread processes in the strided kernels generated for more
// than 3 dimensions and in the dynamic-dims kernel. The same value is used
// when generating those kernels and when sizing the dispatch grid for them.
constexpr int WORK_PER_THREAD = 4;

// Write the Metal source of one fused elementwise kernel to `os`.
//
// The kernel evaluates every node in `tape` once per output element, reading
// from `inputs` and storing into `outputs`.
//
// Arguments:
//   os              - stream receiving the generated Metal source.
//   kernel_name     - used both as the function name and as its host_name.
//   inputs          - kernel inputs; those whose id is in `constant_ids` are
//                     inlined as literals and get no buffer argument.
//   outputs         - kernel outputs; all are indexed with the same linear
//                     `index`.
//   tape            - the intermediate computations, in evaluation order.
//   constant_ids    - ids of the inputs that are compile-time constants.
//   contiguous      - if true, every non-scalar input is read at `index` and
//                     no shape/stride arguments are emitted.
//   ndim            - number of dimensions baked into a strided kernel
//                     (ignored by the indexing code when `dynamic_dims`).
//   dynamic_dims    - if true, the kernel takes `ndim` as a runtime argument.
//   use_big_index   - compute `index` from a 2D grid (only pos.x / pos.y).
//   work_per_thread - number of elements handled per thread along the last
//                     dimension; values > 1 emit a per-thread loop.
//
// NOTE(review): the generated code for `ndim > 3` and for `dynamic_dims`
// references `N_` and `xshape`, which are only declared when
// `work_per_thread > 1` and `use_big_index` is false. The callers in this
// file always satisfy that; confirm before adding new call sites.
//
// Throws std::runtime_error if the kernel would need more than 31 buffer
// arguments. The source has already been written to `os` at that point.
inline void build_kernel(
    std::ostream& os,
    const std::string& kernel_name,
    const std::vector<array>& inputs,
    const std::vector<array>& outputs,
    const std::vector<array>& tape,
    const std::unordered_set<uintptr_t>& constant_ids,
    bool contiguous,
    int ndim,
    bool dynamic_dims,
    bool use_big_index = false,
    int work_per_thread = 1) {
  // Constants are scalars that are captured by value and cannot change
  auto is_constant = [&constant_ids](const array& x) {
    return constant_ids.find(x.id()) != constant_ids.end();
  };

  NodeNamer namer;
  bool add_indices = false;
  int cnt = 0; // next free [[buffer(n)]] slot

  // Start the kernel
  os << "[[host_name(\"" << kernel_name << "\")]]\n"
     << "[[kernel]] void " << kernel_name << "(\n";

  // Add the input arguments
  for (auto& x : inputs) {
    // Request the name before the constant check so that constants are named
    // in input order too; the statement order here is deliberate.
    auto& xname = namer.get_name(x);

    // Skip constants from the input list
    if (is_constant(x)) {
      continue;
    }

    // Scalars and contiguous inputs need no strides; everything else is
    // addressed through the shared in_strides buffer.
    if (!(is_scalar(x) || contiguous)) {
      add_indices = true;
    }
    os << "    device const " << get_type_string(x.dtype()) << "* " << xname
       << " [[buffer(" << cnt++ << ")]],\n";
  }

  if (add_indices) {
    os << "    constant const size_t* in_strides [[buffer(" << cnt++
       << ")]],\n";
  }

  // Add the output arguments
  for (auto& x : outputs) {
    os << "    device " << get_type_string(x.dtype()) << "* "
       << namer.get_name(x) << " [[buffer(" << cnt++ << ")]],\n";
  }
  // Add output strides and shape to extract the indices.
  if (!contiguous) {
    os << "    constant const size_t* output_strides [[buffer(" << cnt++
       << ")]],\n"
       << "    constant const int* output_shape [[buffer(" << cnt++ << ")]],\n";
  }
  if (dynamic_dims) {
    os << "    constant const int& ndim [[buffer(" << cnt++ << ")]],\n";
  }

  // The thread index in the whole grid
  os << "    uint3 pos [[thread_position_in_grid]],\n"
     << "    uint3 grid [[threads_per_grid]]) {\n";

  if (use_big_index) {
    // This is only used for contiguous kernels which don't have
    // a third grid dimension
    os << "  size_t index = pos.x + grid.x * size_t(pos.y);\n";
  } else if (work_per_thread > 1) {
    // Each thread covers N_ consecutive elements of the last dimension, so
    // the row length comes from output_shape rather than from grid.x.
    os << "  constexpr int N_ = " << std::to_string(work_per_thread) << ";\n"
       << "  int xshape = output_shape["
       << (dynamic_dims ? "ndim - 1" : std::to_string(ndim - 1)) << "];\n"
       << "  size_t index = N_ * pos.x + xshape * (pos.y + size_t(grid.y) * pos.z);\n";
  } else {
    os << " size_t index = pos.x + grid.x * (pos.y + size_t(grid.y) * pos.z);\n";
  }

  // Read constant / contiguous inputs in tmps and collect the inputs that
  // need strided indexing.
  std::vector<array> nc_inputs;
  for (auto& x : inputs) {
    auto& xname = namer.get_name(x);

    if (is_constant(x)) {
      os << "  auto tmp_" << xname << " = static_cast<"
         << get_type_string(x.dtype()) << ">(";
      print_constant(os, x);
      os << ");\n";
    } else if (is_scalar(x)) {
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
         << xname << "[0];\n";
    } else if (contiguous) {
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
         << xname << "[index];\n";
    } else {
      nc_inputs.push_back(x);
    }
  }

  // Initialize the indices for non-contiguous inputs. The strides of the
  // i-th such input occupy in_strides[i * ndim, (i + 1) * ndim).
  for (int i = 0; i < nc_inputs.size(); ++i) {
    auto& xname = namer.get_name(nc_inputs[i]);
    const int offset = i * ndim;
    if (ndim == 1) {
      os << "  size_t index_" << xname << " = elem_to_loc_1(pos.x, "
         << "in_strides[" << offset << "]);\n";
    } else if (ndim == 2) {
      os << "  size_t index_" << xname << " = elem_to_loc_2({pos.x, pos.y}, "
         << "in_strides + " << offset << ");\n";
    } else if (ndim == 3) {
      os << "  size_t index_" << xname << " = elem_to_loc_3(pos, "
         << "in_strides + " << offset << ");\n";
    } else if (!dynamic_dims) {
      // Only the two innermost dims are resolved here; the remaining ones
      // are unrolled from pos.z by the loop emitted below.
      os << "  size_t index_" << xname << " = N_ * pos.x * in_strides["
         << offset + ndim - 1 << "]"
         << " + pos.y * in_strides[" << offset + ndim - 2 << "];\n";
    } else {
      os << "  size_t index_" << xname << " = N_ * pos.x * in_strides[ndim * "
         << i << " + ndim - 1]"
         << " + pos.y * in_strides[ndim * " << i << " + ndim - 2];\n";
    }
  }
  if (!nc_inputs.empty() && (ndim > 3 || dynamic_dims)) {
    // Decompose pos.z into coordinates of the leading dims and accumulate
    // their contribution to every strided input index.
    os << "  uint zpos = pos.z;\n";
    if (dynamic_dims) {
      os << "  for (int d = ndim - 3; d >= 0; --d) {\n";
    } else {
      os << "  for (int d = " << ndim - 3 << "; d >= 0; --d) {\n";
    }
    os << "    uint l = zpos % output_shape[d];\n";
    for (int i = 0; i < nc_inputs.size(); ++i) {
      auto& xname = namer.get_name(nc_inputs[i]);
      os << "    index_" << xname << " += ";
      if (dynamic_dims) {
        os << "l * in_strides[" << i << " * ndim + d];\n";
      } else {
        os << "l * in_strides[" << i * ndim << " + d];\n";
      }
    }
    os << "    zpos /= output_shape[d];\n  }\n";
  }

  // Open per-thread loop
  if (work_per_thread > 1) {
    os << "  for (int i = 0; i < N_ && (int(N_ * pos.x) + i) < xshape; ++i) {\n";
  }

  // Read non-contiguous inputs into tmps
  for (auto& x : nc_inputs) {
    auto& xname = namer.get_name(x);
    os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
       << xname << "[index_" << xname << "];\n";
  }

  // Actually write the computation
  for (auto& x : tape) {
    os << "  " << get_type_string(x.dtype()) << " tmp_" << namer.get_name(x)
       << " = ";
    if (is_static_cast(x.primitive())) {
      os << "static_cast<" << get_type_string(x.dtype()) << ">(tmp_"
         << namer.get_name(x.inputs()[0]) << ");\n";
    } else {
      // Emit `Op()(tmp_a, tmp_b, ...)` using the primitive's printed name.
      x.primitive().print(os);
      os << "()(";
      for (int i = 0; i < x.inputs().size() - 1; i++) {
        os << "tmp_" << namer.get_name(x.inputs()[i]) << ", ";
      }
      os << "tmp_" << namer.get_name(x.inputs().back()) << ");\n";
    }
  }

  // Write the outputs from tmps
  for (auto& x : outputs) {
    os << "  " << namer.get_name(x) << "[index] = tmp_" << namer.get_name(x)
       << ";\n";
  }
  // Increment indices and close per thread loop
  if (work_per_thread > 1) {
    for (int i = 0; i < nc_inputs.size(); ++i) {
      auto& xname = namer.get_name(nc_inputs[i]);
      // Step each strided input by its innermost stride.
      if (!dynamic_dims) {
        os << "  index_" << xname << " += "
           << "in_strides[" << i * ndim + ndim - 1 << "];\n";
      } else {
        os << "  index_" << xname << " += "
           << "in_strides[" << i << " * ndim + ndim - 1];\n";
      }
    }
    os << "  index++;\n  }\n";
  }

  // Finish the kernel
  os << "}\n";

  // cnt is only known once every argument has been emitted, so the limit on
  // buffer arguments is checked last.
  if (cnt > 31) {
    std::ostringstream msg;
    msg << "[compile] Too many inputs/outputs fused in the Metal Compiled "
        << "primitive which exhausted the available argument buffers for "
        << "the kernel. Please file an issue with the function that results "
        << "in this error. The name of the kernel is '" << kernel_name << "'";
    throw std::runtime_error(msg.str());
  }
}

void Compiled::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  // Make the name for the kernel library
  if (kernel_lib_.empty()) {
    kernel_lib_ = build_lib_name(inputs_, outputs_, tape_, constant_ids_);
  }

  // Get the kernel if someone else built it already
  auto& s = stream();
  auto& d = metal::device(s.device);
  auto lib = d.get_library(kernel_lib_, [&]() {
    std::ostringstream kernel;
    kernel << metal::utils() << metal::unary_ops() << metal::binary_ops()
           << metal::ternary_ops();
    build_kernel(
        kernel,
        kernel_lib_ + "_contiguous",
        inputs_,
        outputs_,
        tape_,
        constant_ids_,
        /* contiguous = */ true,
        /* ndim = */ 0,
        /* dynamic_dims = */ false);
    build_kernel(
        kernel,
        kernel_lib_ + "_contiguous_big",
        inputs_,
        outputs_,
        tape_,
        constant_ids_,
        /* contiguous = */ true,
        /* ndim = */ 0,
        /* dynamic_dims = */ false,
        /* use_big_index = */ true);
    for (int i = 1; i < 8; i++) {
      build_kernel(
          kernel,
          kernel_lib_ + "_strided_" + std::to_string(i),
          inputs_,
          outputs_,
          tape_,
          constant_ids_,
          /* contiguous = */ false,
          /* ndim = */ i,
          /* dynamic_dims = */ false,
          /* use_big_index = */ false,
          /* work_per_thread = */ i > 3 ? WORK_PER_THREAD : 1);
    }
    build_kernel(
        kernel,
        kernel_lib_ + "_strided_dynamic",
        inputs_,
        outputs_,
        tape_,
        constant_ids_,
        /* contiguous = */ false,
        /* ndim = */ 0,
        /* dynamic_dims = */ true,
        /* use_big_index = */ false,
        /* work_per_thread = */ WORK_PER_THREAD);
    return kernel.str();
  });

  // Figure out which kernel we are using
  auto& output_shape = outputs[0].shape();
  bool contiguous = compiled_check_contiguity(inputs, output_shape);

  // Collapse contiguous dims to route to a faster kernel if possible. Also
  // handle all broadcasting.
  std::vector<std::vector<size_t>> initial_strides;
  initial_strides.push_back(outputs[0].strides());
  std::vector<int> shape;
  std::vector<std::vector<size_t>> strides;
  if (!contiguous) {
    for (int i = 0; i < inputs.size(); i++) {
      // Skip constants.
      if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
        continue;
      }
      auto& x = inputs[i];

      // Skip scalar inputs.
      if (is_scalar(x)) {
        continue;
      }

      // Broadcast the inputs to the output shape.
      std::vector<size_t> xstrides;
      int j = 0;
      for (; j < output_shape.size() - x.ndim(); j++) {
        if (output_shape[j] == 1) {
          xstrides.push_back(outputs[0].strides()[j]);
        } else {
          xstrides.push_back(0);
        }
      }
      for (int i = 0; i < x.ndim(); i++, j++) {
        if (x.shape(i) == 1) {
          if (output_shape[j] == 1) {
            xstrides.push_back(outputs[0].strides()[j]);
          } else {
            xstrides.push_back(0);
          }
        } else {
          xstrides.push_back(x.strides()[i]);
        }
      }
      initial_strides.push_back(std::move(xstrides));
    }
    std::tie(shape, strides) =
        collapse_contiguous_dims(output_shape, initial_strides, INT32_MAX);
  }

  bool use_2d = false;
  if (contiguous) {
    size_t max_size = 0;
    for (auto& in : inputs) {
      max_size = std::max(max_size, in.data_size());
    }
    use_2d = (max_size > UINT32_MAX);
  }

  // Get the kernel from the lib
  int ndim = shape.size();
  bool dynamic = ndim >= 8;
  auto kernel_name = kernel_lib_ + (contiguous ? "_contiguous" : "_strided_");
  if (!contiguous) {
    if (dynamic) {
      kernel_name += "dynamic";
    } else {
      kernel_name += std::to_string(shape.size());
    }
  } else if (use_2d) {
    kernel_name += "_big";
  }
  auto kernel = d.get_kernel(kernel_name, lib);
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Put the inputs in
  int cnt = 0;
  int stride_idx = 1; // idx 0 is the output strides
  std::vector<size_t> in_strides;
  for (int i = 0; i < inputs.size(); i++) {
    if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
      continue;
    }
    auto& x = inputs[i];
    compute_encoder.set_input_array(x, cnt++);
    if (!contiguous && !is_scalar(x)) {
      in_strides.insert(
          in_strides.end(),
          strides[stride_idx].begin(),
          strides[stride_idx].end());
      stride_idx++;
    }
  }
  if (!in_strides.empty()) {
    compute_encoder.set_vector_bytes(in_strides, cnt++);
  }

  compiled_allocate_outputs(
      inputs, outputs, inputs_, constant_ids_, contiguous, true);

  // Put the outputs in
  for (auto& x : outputs) {
    compute_encoder.set_output_array(x, cnt++);
  }

  // Put the output shape and strides in
  if (!contiguous) {
    compute_encoder.set_vector_bytes(strides[0], cnt++);
    compute_encoder.set_vector_bytes(shape, cnt++);
  }

  // Put the number of dims in if it is dynamic
  if (dynamic) {
    compute_encoder.set_bytes(ndim, cnt++);
  }

  // Launch the kernel
  if (contiguous) {
    size_t nthreads = outputs[0].data_size();
    MTL::Size group_dims(
        std::min(nthreads, kernel->maxTotalThreadsPerThreadgroup()), 1, 1);

    MTL::Size grid_dims = use_2d
        ? get_2d_grid_dims(outputs[0].shape(), outputs[0].strides())
        : MTL::Size(nthreads, 1, 1);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  } else {
    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
    size_t rest = outputs[0].size() / (dim0 * dim1);
    int work_per_thread = ndim > 3 ? WORK_PER_THREAD : 1;
    dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
    int pow2;
    if (thread_group_size == 1024) {
      pow2 = 10;
    } else if (thread_group_size > 512) {
      pow2 = 9;
    } else {
      throw std::runtime_error("[Metal::compiled] Must use > 512 sized block");
    }
    auto group_dims = get_block_dims(dim0, dim1, rest, pow2);
    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }
}

} // namespace mlx::core