mirror of
https://github.com/ml-explore/mlx.git
synced 2025-06-24 17:31:16 +08:00
441 lines
13 KiB
C++
441 lines
13 KiB
C++
// Copyright © 2023-2024 Apple Inc.
|
|
#include <cstdlib>
|
|
#include <map>
|
|
#include <unordered_map>
|
|
#include <unordered_set>
|
|
|
|
#include "mlx/allocator.h"
|
|
#include "mlx/primitives.h"
|
|
#include "mlx/transforms.h"
|
|
#include "mlx/transforms_impl.h"
|
|
|
|
namespace mlx::core {
|
|
|
|
namespace detail {
|
|
|
|
bool& compiler_disabled() {
|
|
auto get_val = []() {
|
|
if (const char* buff_str = std::getenv("MLX_DISABLE_COMPILE")) {
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
};
|
|
static bool compiler_disabled_ = get_val();
|
|
return compiler_disabled_;
|
|
}
|
|
|
|
#define MAX_OPS_PER_BUFFER max_ops_per_buffer()
|
|
|
|
using CompileFn = std::function<std::vector<array>(const std::vector<array>&)>;
|
|
using ParentsMap =
|
|
std::unordered_map<std::uintptr_t, std::vector<std::pair<array, int>>>;
|
|
|
|
template <typename T, typename... U>
|
|
size_t getAddress(std::function<T(U...)> f) {
|
|
typedef T(fnType)(U...);
|
|
fnType** fnPointer = f.template target<fnType*>();
|
|
if (fnPointer == nullptr) {
|
|
throw std::invalid_argument(
|
|
"[compile] Cannot compile a non-addressable function.");
|
|
}
|
|
return (size_t)*fnPointer;
|
|
}
|
|
|
|
struct CompilerCache {
|
|
struct CacheEntry {
|
|
std::vector<array> inputs;
|
|
std::vector<array> outputs;
|
|
std::vector<array> tape;
|
|
bool empty{true};
|
|
};
|
|
|
|
// Returns a reference to a CacheEntry which can be updated
|
|
// by the caller to avoid copying large tapes / inputs / outputs
|
|
CacheEntry& find(size_t fun_id, const std::vector<array>& inputs) {
|
|
// Try to find the entry
|
|
auto inserted = cache_.insert({fun_id, {}});
|
|
auto& entries = inserted.first->second;
|
|
auto is_match = [](const std::vector<array>& in1,
|
|
const std::vector<array>& in2) {
|
|
if (in1.size() != in2.size()) {
|
|
throw std::runtime_error(
|
|
"[compiler] Got different number of inputs to function,"
|
|
" this should never happen.");
|
|
}
|
|
for (int i = 0; i < in1.size(); ++i) {
|
|
if (in1[i].shape() != in2[i].shape()) {
|
|
return false;
|
|
}
|
|
if (in1[i].dtype() != in2[i].dtype()) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
};
|
|
|
|
// Loop over entries and check inputs match i.e. shapes and types must be
|
|
// equal. Note this could get really slow if one compiles the same
|
|
// function with many different shapes. May want to store entries in a
|
|
// more easily searchable structure.
|
|
for (auto& entry : entries) {
|
|
// Check the inputs match and return if so
|
|
if (is_match(inputs, entry.inputs)) {
|
|
return entry;
|
|
}
|
|
}
|
|
// Otherwise append a new cache entry
|
|
entries.push_back(CacheEntry{});
|
|
return entries.back();
|
|
};
|
|
|
|
void erase(size_t fun_id) {
|
|
cache_.erase(fun_id);
|
|
}
|
|
|
|
private:
|
|
CompilerCache() {
|
|
// Make sure the allocator is fully
|
|
// initialized before the compiler cache
|
|
allocator::allocator();
|
|
}
|
|
friend CompilerCache& compiler_cache();
|
|
std::unordered_map<size_t, std::vector<CacheEntry>> cache_;
|
|
};
|
|
|
|
CompilerCache& compiler_cache() {
|
|
static CompilerCache compiler_cache_;
|
|
return compiler_cache_;
|
|
}
|
|
|
|
std::pair<std::vector<array>, std::vector<array>> compile_trace(
|
|
const std::function<std::vector<array>(const std::vector<array>&)>& fun,
|
|
const std::vector<array>& inputs) {
|
|
// Set the global tracing flag.
|
|
detail::InTracing in_tracing;
|
|
|
|
// Run the function on placeholder inputs
|
|
// to get compute graph
|
|
std::vector<array> tracer_inputs;
|
|
for (int i = 0; i < inputs.size(); ++i) {
|
|
array in(inputs[i].shape(), inputs[i].dtype(), nullptr, {});
|
|
in.set_tracer(true);
|
|
tracer_inputs.push_back(std::move(in));
|
|
}
|
|
return {tracer_inputs, fun(tracer_inputs)};
|
|
}
|
|
|
|
// Traverses the graph to build a tape and a map of array ids to their parents
|
|
std::pair<std::vector<array>, ParentsMap> compile_dfs(
|
|
const std::vector<array>& inputs,
|
|
const std::vector<array>& outputs) {
|
|
std::function<void(const array&)> recurse;
|
|
std::vector<array> tape;
|
|
std::unordered_set<std::uintptr_t> input_set;
|
|
std::unordered_map<std::uintptr_t, std::vector<std::pair<array, int>>>
|
|
parents_map;
|
|
for (int i = 0; i < inputs.size(); ++i) {
|
|
auto in = inputs[i];
|
|
input_set.insert(in.id());
|
|
}
|
|
|
|
// DFS the graph to build the tape, and log parents and scalars
|
|
std::unordered_set<std::uintptr_t> cache;
|
|
recurse = [&](const array& a) {
|
|
auto id = a.id();
|
|
if (cache.find(id) != cache.end()) {
|
|
return;
|
|
}
|
|
for (int i = 0; i < a.inputs().size(); i++) {
|
|
auto& in = a.inputs()[i];
|
|
parents_map[in.id()].push_back({a, i});
|
|
for (auto& s : a.siblings()) {
|
|
parents_map[in.id()].push_back({s, i});
|
|
}
|
|
// Don't recurse on inputs (but add them to the tape for the purpose
|
|
// of future optimizations)
|
|
if (input_set.find(a.id()) == input_set.end()) {
|
|
recurse(in);
|
|
}
|
|
}
|
|
cache.insert(id);
|
|
for (auto& s : a.siblings()) {
|
|
cache.insert(s.id());
|
|
}
|
|
tape.push_back(a);
|
|
};
|
|
for (auto& a : outputs) {
|
|
recurse(a);
|
|
}
|
|
return {tape, parents_map};
|
|
}
|
|
|
|
// Simplify the tape. Note, this function modifies in-place both the tape and
|
|
// the parents map to remove orphaned arrays
|
|
void compile_simplify(
|
|
std::vector<array>& tape,
|
|
ParentsMap& parents_map,
|
|
const std::vector<array>& outputs,
|
|
int passes) {
|
|
// Helpers to identify identical scalars
|
|
std::map<std::pair<uint64_t, Dtype::Val>, array> scalars;
|
|
auto is_scalar = [](const array& a) {
|
|
return a.is_evaled() && a.ndim() == 0;
|
|
};
|
|
auto get_scalar_rep = [](const array& a) {
|
|
uint64_t v = 0;
|
|
int dtype;
|
|
switch (a.dtype().size) {
|
|
case 1:
|
|
v = *a.data<uint8_t>();
|
|
break;
|
|
case 4:
|
|
v = *a.data<uint32_t>();
|
|
break;
|
|
case 8:
|
|
v = *a.data<uint64_t>();
|
|
break;
|
|
}
|
|
return std::make_pair(v, a.dtype().val);
|
|
};
|
|
|
|
for (auto& a : tape) {
|
|
if (is_scalar(a)) {
|
|
scalars.insert({get_scalar_rep(a), a});
|
|
}
|
|
}
|
|
|
|
// Helper that fuses two arrays in the graph by setting the parents of the
|
|
// source to point to the destination
|
|
auto fuse = [&](array& dst, array& src) {
|
|
// Canonicalize the order of the primitives outputs
|
|
auto sources = src.outputs();
|
|
auto dests = dst.outputs();
|
|
// For each src parent, point it to the corresponding dest
|
|
for (int i = 0; i < sources.size(); ++i) {
|
|
auto src_parents = parents_map.find(sources[i].id());
|
|
if (src_parents == parents_map.end()) {
|
|
continue;
|
|
}
|
|
auto& pairs = parents_map[dests[i].id()];
|
|
for (auto& parent : src_parents->second) {
|
|
parent.first.inputs()[parent.second] = dests[i];
|
|
pairs.push_back(parent);
|
|
}
|
|
// Remove the source from the map to avoid fusing with it again
|
|
parents_map.erase(src_parents);
|
|
}
|
|
};
|
|
|
|
// Depth-1 array equivalence check.
|
|
auto array_equivalent = [](const array& a, const array& b) {
|
|
if (!a.has_primitive() || !b.has_primitive()) {
|
|
return false;
|
|
}
|
|
if (a.primitive_id() == b.primitive_id()) {
|
|
return false;
|
|
}
|
|
const auto& pa = a.primitive();
|
|
const auto& pb = b.primitive();
|
|
if (typeid(pa) != typeid(pb)) {
|
|
return false;
|
|
}
|
|
|
|
if (a.inputs().size() != b.inputs().size()) {
|
|
return false;
|
|
}
|
|
|
|
for (int i = 0; i < a.inputs().size(); i++) {
|
|
if (a.inputs()[i].id() != b.inputs()[i].id()) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return pa.is_equivalent(pb);
|
|
};
|
|
|
|
// Pass 0: fuse scalars
|
|
std::vector<array> new_tape;
|
|
for (auto& arr : tape) {
|
|
// Check if we can fuse scalars
|
|
if (is_scalar(arr)) {
|
|
auto scalar = scalars.find(get_scalar_rep(arr));
|
|
if (scalar->second.id() != arr.id()) {
|
|
fuse(scalar->second, arr);
|
|
// Don't keep orphaned scalars in the tape
|
|
continue;
|
|
}
|
|
}
|
|
new_tape.push_back(std::move(arr));
|
|
}
|
|
|
|
tape = std::move(new_tape);
|
|
|
|
std::unordered_set<uintptr_t> output_set;
|
|
for (auto& o : outputs) {
|
|
output_set.insert(o.id());
|
|
}
|
|
// Pass 1..passes: fuse only keeping non-orphaned arrays in the tape
|
|
for (int pass = 0; pass < passes; ++pass) {
|
|
for (auto& arr : tape) {
|
|
// Helper to check if we can fuse the parents of the
|
|
// given array
|
|
auto maybe_fuse_parents = [&](auto& a) {
|
|
auto parents = parents_map.find(a.id());
|
|
if (parents != parents_map.end()) {
|
|
auto N = parents->second.size();
|
|
std::vector<bool> mask(N, false);
|
|
for (int i = 0; i < N; i++) {
|
|
if (mask[i]) {
|
|
continue;
|
|
}
|
|
for (int j = i + 1; j < N; j++) {
|
|
if (mask[j]) {
|
|
continue;
|
|
}
|
|
auto& src = parents->second[j].first;
|
|
auto& dst = parents->second[i].first;
|
|
if (src.id() != dst.id() && array_equivalent(src, dst)) {
|
|
fuse(dst, src);
|
|
mask[j] = true;
|
|
}
|
|
}
|
|
}
|
|
// Erase orphaned parents so we don't keep fusing with them
|
|
for (int i = N - 1; i > 0; --i) {
|
|
if (mask[i]) {
|
|
parents->second.erase(parents->second.begin() + i);
|
|
}
|
|
}
|
|
return false;
|
|
} else {
|
|
return output_set.find(a.id()) == output_set.end();
|
|
}
|
|
};
|
|
|
|
bool discard = maybe_fuse_parents(arr);
|
|
for (auto& s : arr.siblings()) {
|
|
discard &= maybe_fuse_parents(s);
|
|
}
|
|
// If an array and its siblings have no parents, and none of them are
|
|
// outputs, it is safe to remove it from the tape
|
|
if (!discard) {
|
|
new_tape.push_back(std::move(arr));
|
|
}
|
|
}
|
|
tape = std::move(new_tape);
|
|
}
|
|
}
|
|
|
|
std::vector<array> compile_replace(
|
|
const std::vector<array>& tape,
|
|
const std::vector<array>& trace_inputs,
|
|
const std::vector<array>& trace_outputs,
|
|
const std::vector<array>& inputs) {
|
|
std::unordered_map<uintptr_t, array> trace_to_real;
|
|
for (int i = 0; i < inputs.size(); ++i) {
|
|
trace_to_real.insert({trace_inputs[i].id(), inputs[i]});
|
|
}
|
|
|
|
for (auto& a : tape) {
|
|
// Arrays in the tape without primitives are constants
|
|
// and can be used directly
|
|
if (!a.has_primitive()) {
|
|
trace_to_real.insert({a.id(), a});
|
|
} else {
|
|
// Find real inputs
|
|
std::vector<array> real_inputs;
|
|
for (auto& in : a.inputs()) {
|
|
real_inputs.push_back(trace_to_real.at(in.id()));
|
|
}
|
|
if (a.siblings().empty()) {
|
|
auto real_a = array(
|
|
a.shape(), a.dtype(), a.primitive_ptr(), std::move(real_inputs));
|
|
trace_to_real.insert({a.id(), std::move(real_a)});
|
|
} else {
|
|
// Ensure the order is correct for multi-output primitives
|
|
std::vector<std::vector<int>> shapes;
|
|
std::vector<Dtype> types;
|
|
auto trace_out = a.outputs();
|
|
for (auto& o : trace_out) {
|
|
shapes.push_back(o.shape());
|
|
types.push_back(o.dtype());
|
|
}
|
|
auto real_out =
|
|
array::make_arrays(shapes, types, a.primitive_ptr(), real_inputs);
|
|
for (int i = 0; i < trace_out.size(); ++i) {
|
|
trace_to_real.insert({trace_out[i].id(), std::move(real_out[i])});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
std::vector<array> outputs;
|
|
for (auto& o : trace_outputs) {
|
|
outputs.push_back(trace_to_real.at(o.id()));
|
|
}
|
|
return outputs;
|
|
}
|
|
|
|
std::function<std::vector<array>(const std::vector<array>&)> compile(
|
|
const std::function<std::vector<array>(const std::vector<array>&)>& fun,
|
|
size_t fun_id) {
|
|
if (compiler_disabled()) {
|
|
return fun;
|
|
}
|
|
return [fun, fun_id](const std::vector<array>& inputs) {
|
|
// Find a cache entry with the correct inputs
|
|
auto& entry = compiler_cache().find(fun_id, inputs);
|
|
|
|
// No matching cache entry existed, so compile
|
|
if (entry.empty) {
|
|
// Mark the entry as not empty since we are about to fill it
|
|
entry.empty = false;
|
|
// Trace to build the graph
|
|
std::tie(entry.inputs, entry.outputs) = compile_trace(fun, inputs);
|
|
|
|
// DFS the graph and get a tape, and a map of array id to (parent,
|
|
// position in parent inputs)
|
|
std::unordered_map<uintptr_t, std::vector<std::pair<array, int>>>
|
|
parents_map;
|
|
std::tie(entry.tape, parents_map) =
|
|
compile_dfs(entry.inputs, entry.outputs);
|
|
|
|
// Simplify the tape
|
|
compile_simplify(entry.tape, parents_map, entry.outputs, /* passes */ 3);
|
|
|
|
// This is a good point to do more optimizations, e.g. kernel fusion to
|
|
// generate new primitives. The tape needs to be updated accordingly
|
|
}
|
|
|
|
// At this point we must have a tape, now replace the placeholders
|
|
// with real arrays that can be evaluated
|
|
return compile_replace(entry.tape, entry.inputs, entry.outputs, inputs);
|
|
};
|
|
}
|
|
|
|
void compile_erase(size_t fun_id) {
|
|
detail::compiler_cache().erase(fun_id);
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
std::function<std::vector<array>(const std::vector<array>&)> compile(
|
|
const std::function<std::vector<array>(const std::vector<array>&)>& fun) {
|
|
if (detail::compiler_disabled()) {
|
|
return fun;
|
|
}
|
|
auto fun_id = detail::getAddress(fun);
|
|
return detail::compile(fun, fun_id);
|
|
}
|
|
|
|
void disable_compile() {
|
|
detail::compiler_disabled() = true;
|
|
}
|
|
|
|
void enable_compile() {
|
|
detail::compiler_disabled() = false;
|
|
}
|
|
|
|
} // namespace mlx::core
|