MLX
Loading...
Searching...
No Matches
binary.h
Go to the documentation of this file.
1// Copyright © 2023 Apple Inc.
2
3#pragma once
#include <cassert>
#include <type_traits>

#include "mlx/allocator.h"
#include "mlx/array.h"
9
10namespace mlx::core {
11
12namespace {
13
// Classification of how a binary operation's two inputs are laid out in
// memory; used to select the fastest kernel.
enum class BinaryOpType {
  ScalarScalar, // both inputs have a single element
  ScalarVector, // a has a single element, b is contiguous
  VectorScalar, // a is contiguous, b has a single element
  VectorVector, // both row-contiguous or both col-contiguous
  General, // arbitrary strides; use the general strided path
};
21
22BinaryOpType get_binary_op_type(const array& a, const array& b) {
23 BinaryOpType bopt;
24 if (a.data_size() == 1 && b.data_size() == 1) {
25 bopt = BinaryOpType::ScalarScalar;
26 } else if (a.data_size() == 1 && b.flags().contiguous) {
27 bopt = BinaryOpType::ScalarVector;
28 } else if (b.data_size() == 1 && a.flags().contiguous) {
29 bopt = BinaryOpType::VectorScalar;
30 } else if (
31 a.flags().row_contiguous && b.flags().row_contiguous ||
32 a.flags().col_contiguous && b.flags().col_contiguous) {
33 bopt = BinaryOpType::VectorVector;
34 } else {
35 bopt = BinaryOpType::General;
36 }
37 return bopt;
38}
39
40void set_binary_op_output_data(
41 const array& a,
42 const array& b,
43 array& out,
44 BinaryOpType bopt,
45 bool donate_with_move = false) {
46 switch (bopt) {
47 case BinaryOpType::ScalarScalar:
48 out.set_data(
49 allocator::malloc_or_wait(out.itemsize()), 1, a.strides(), a.flags());
50 break;
51 case BinaryOpType::ScalarVector:
52 if (b.is_donatable() && b.itemsize() == out.itemsize()) {
53 if (donate_with_move) {
54 out.move_shared_buffer(b);
55 } else {
56 out.copy_shared_buffer(b);
57 }
58 } else {
59 out.set_data(
60 allocator::malloc_or_wait(b.data_size() * out.itemsize()),
61 b.data_size(),
62 b.strides(),
63 b.flags());
64 }
65 break;
66 case BinaryOpType::VectorScalar:
67 if (a.is_donatable() && a.itemsize() == out.itemsize()) {
68 if (donate_with_move) {
69 out.move_shared_buffer(a);
70 } else {
71 out.copy_shared_buffer(a);
72 }
73 } else {
74 out.set_data(
75 allocator::malloc_or_wait(a.data_size() * out.itemsize()),
76 a.data_size(),
77 a.strides(),
78 a.flags());
79 }
80 break;
81 case BinaryOpType::VectorVector:
82 if (a.is_donatable() && a.itemsize() == out.itemsize()) {
83 if (donate_with_move) {
84 out.move_shared_buffer(a);
85 } else {
86 out.copy_shared_buffer(a);
87 }
88 } else if (b.is_donatable() && b.itemsize() == out.itemsize()) {
89 if (donate_with_move) {
90 out.move_shared_buffer(b);
91 } else {
92 out.copy_shared_buffer(b);
93 }
94 } else {
95 out.set_data(
96 allocator::malloc_or_wait(a.data_size() * out.itemsize()),
97 a.data_size(),
98 a.strides(),
99 a.flags());
100 }
101 break;
102 case BinaryOpType::General:
103 if (a.is_donatable() && a.flags().row_contiguous &&
104 a.itemsize() == out.itemsize() && a.size() == out.size()) {
105 if (donate_with_move) {
106 out.move_shared_buffer(a);
107 } else {
108 out.copy_shared_buffer(a);
109 }
110 } else if (
111 b.is_donatable() && b.flags().row_contiguous &&
112 b.itemsize() == out.itemsize() && b.size() == out.size()) {
113 if (donate_with_move) {
114 out.move_shared_buffer(b);
115 } else {
116 out.copy_shared_buffer(b);
117 }
118 } else {
119 out.set_data(allocator::malloc_or_wait(out.nbytes()));
120 }
121 break;
122 }
123}
124
// Sentinel functor type: passing this in place of a specialized vectorized
// kernel tells binary_op to substitute the corresponding Default* functor.
// Its call operators are never meant to execute.
struct UseDefaultBinaryOp {
  template <typename T, typename U>
  void operator()(const T* a, const T* b, U* dst, int size) {
    // Should we throw? This should normally never be called.
    assert(false);
  }

  // Two-output variant for ops whose result is a pair.
  template <typename T, typename U>
  void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
    // Should we throw? This should normally never be called.
    assert(false);
  }
};
138
// Fallback vector-scalar kernel: applies `op` to each element of `a` against
// the single value `*b`.
template <typename T, typename U, typename Op>
struct DefaultVectorScalar {
  Op op;

  DefaultVectorScalar(Op op_) : op(op_) {}

  // Single-output form: dst[i] = op(a[i], *b).
  void operator()(const T* a, const T* b, U* dst, int size) {
    T scalar = *b;
    for (int i = 0; i < size; ++i) {
      dst[i] = op(a[i], scalar);
    }
  }

  // Two-output form for ops returning a pair of results.
  void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
    T scalar = *b;
    for (int i = 0; i < size; ++i) {
      auto res = op(a[i], scalar);
      dst_a[i] = res.first;
      dst_b[i] = res.second;
    }
  }
};
166
// Fallback scalar-vector kernel: applies `op` with the single value `*a` on
// the left against each element of `b`.
template <typename T, typename U, typename Op>
struct DefaultScalarVector {
  Op op;

  DefaultScalarVector(Op op_) : op(op_) {}

  // Single-output form: dst[i] = op(*a, b[i]).
  void operator()(const T* a, const T* b, U* dst, int size) {
    T scalar = *a;
    for (int i = 0; i < size; ++i) {
      dst[i] = op(scalar, b[i]);
    }
  }

  // Two-output form for ops returning a pair of results.
  void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
    T scalar = *a;
    for (int i = 0; i < size; ++i) {
      auto res = op(scalar, b[i]);
      dst_a[i] = res.first;
      dst_b[i] = res.second;
    }
  }
};
194
// Fallback vector-vector kernel: applies `op` elementwise over two equally
// sized contiguous inputs.
template <typename T, typename U, typename Op>
struct DefaultVectorVector {
  Op op;

  DefaultVectorVector(Op op_) : op(op_) {}

  // Single-output form: dst[i] = op(a[i], b[i]).
  void operator()(const T* a, const T* b, U* dst, int size) {
    for (int i = 0; i < size; ++i) {
      dst[i] = op(a[i], b[i]);
    }
  }

  // Two-output form for ops returning a pair of results.
  void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
    for (int i = 0; i < size; ++i) {
      auto res = op(a[i], b[i]);
      dst_a[i] = res.first;
      dst_b[i] = res.second;
    }
  }
};
222
223template <typename T, typename U, typename Op>
224void binary_op_dims1(const array& a, const array& b, array& out, Op op) {
225 const T* a_ptr = a.data<T>();
226 const T* b_ptr = b.data<T>();
227 U* dst = out.data<U>();
228 size_t a_idx = 0;
229 size_t b_idx = 0;
230 for (size_t i = 0; i < out.size(); ++i) {
231 dst[i] = op(a_ptr[a_idx], b_ptr[b_idx]);
232 a_idx += a.strides()[0];
233 b_idx += b.strides()[0];
234 }
235}
236
237template <typename T, typename U, typename Op>
238void binary_op_dims1(
239 const array& a,
240 const array& b,
241 array& out,
242 Op op,
243 int stride) {
244 const T* a_ptr = a.data<T>();
245 const T* b_ptr = b.data<T>();
246 U* dst = out.data<U>();
247 size_t a_idx = 0;
248 size_t b_idx = 0;
249 for (size_t i = 0; i < a.shape()[0]; i++) {
250 op(a_ptr + a_idx, b_ptr + b_idx, dst, stride);
251 a_idx += a.strides()[0];
252 b_idx += b.strides()[0];
253 dst += stride;
254 }
255}
256
257template <typename T, typename U, typename Op>
258void binary_op_dims2(const array& a, const array& b, array& out, Op op) {
259 const T* a_ptr = a.data<T>();
260 const T* b_ptr = b.data<T>();
261 U* dst = out.data<U>();
262 size_t a_idx = 0;
263 size_t b_idx = 0;
264 size_t out_idx = 0;
265 for (size_t i = 0; i < a.shape()[0]; ++i) {
266 for (size_t j = 0; j < a.shape()[1]; ++j) {
267 dst[out_idx++] = op(a_ptr[a_idx], b_ptr[b_idx]);
268 a_idx += a.strides()[1];
269 b_idx += b.strides()[1];
270 }
271 a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
272 b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
273 }
274}
275
276template <typename T, typename U, typename Op>
277void binary_op_dims2(
278 const array& a,
279 const array& b,
280 array& out,
281 Op op,
282 int stride) {
283 const T* a_ptr = a.data<T>();
284 const T* b_ptr = b.data<T>();
285 U* dst = out.data<U>();
286 size_t a_idx = 0;
287 size_t b_idx = 0;
288 for (size_t i = 0; i < a.shape()[0]; ++i) {
289 for (size_t j = 0; j < a.shape()[1]; ++j) {
290 op(a_ptr + a_idx, b_ptr + b_idx, dst, stride);
291 a_idx += a.strides()[1];
292 b_idx += b.strides()[1];
293 dst += stride;
294 }
295 a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
296 b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
297 }
298}
299
// Elementwise binary op over 3-D strided inputs writing a dense output.
// Input offsets are maintained incrementally: the innermost loop advances by
// the last-axis stride, and after each inner loop the offset is adjusted by
// strides[d-1] - strides[d] * shape[d] to land on the start of the next
// row/plane.
template <typename T, typename U, typename Op>
void binary_op_dims3(const array& a, const array& b, array& out, Op op) {
  const T* a_ptr = a.data<T>();
  const T* b_ptr = b.data<T>();
  U* dst = out.data<U>();
  size_t a_idx = 0;
  size_t b_idx = 0;
  size_t out_idx = 0;
  for (size_t i = 0; i < a.shape()[0]; ++i) {
    for (size_t j = 0; j < a.shape()[1]; ++j) {
      for (size_t k = 0; k < a.shape()[2]; ++k) {
        dst[out_idx++] = op(a_ptr[a_idx], b_ptr[b_idx]);
        a_idx += a.strides()[2];
        b_idx += b.strides()[2];
      }
      // Rewind the axis-2 walk and step to the next axis-1 row.
      a_idx += a.strides()[1] - a.strides()[2] * a.shape()[2];
      b_idx += b.strides()[1] - b.strides()[2] * b.shape()[2];
    }
    // Rewind the axis-1 walk and step to the next axis-0 plane.
    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
  }
}
322
// Elementwise binary op over 4-D strided inputs writing a dense output.
// Same incremental-offset scheme as binary_op_dims3, one level deeper: after
// each inner loop, strides[d-1] - strides[d] * shape[d] rewinds the finished
// axis and steps to the next position along the enclosing axis.
template <typename T, typename U, typename Op>
void binary_op_dims4(const array& a, const array& b, array& out, Op op) {
  const T* a_ptr = a.data<T>();
  const T* b_ptr = b.data<T>();
  U* dst = out.data<U>();
  size_t a_idx = 0;
  size_t b_idx = 0;
  size_t out_idx = 0;
  for (size_t i = 0; i < a.shape()[0]; ++i) {
    for (size_t j = 0; j < a.shape()[1]; ++j) {
      for (size_t k = 0; k < a.shape()[2]; ++k) {
        for (size_t ii = 0; ii < a.shape()[3]; ++ii) {
          dst[out_idx++] = op(a_ptr[a_idx], b_ptr[b_idx]);
          a_idx += a.strides()[3];
          b_idx += b.strides()[3];
        }
        a_idx += a.strides()[2] - a.strides()[3] * a.shape()[3];
        b_idx += b.strides()[2] - b.strides()[3] * b.shape()[3];
      }
      a_idx += a.strides()[1] - a.strides()[2] * a.shape()[2];
      b_idx += b.strides()[1] - b.strides()[2] * b.shape()[2];
    }
    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
  }
}
349
350template <typename T, typename U, typename Op>
351void binary_op_dispatch_dims(
352 const array& a,
353 const array& b,
354 array& out,
355 Op op) {
356 switch (out.ndim()) {
357 case 1:
358 binary_op_dims1<T, U, Op>(a, b, out, op);
359 return;
360 case 2:
361 binary_op_dims2<T, U, Op>(a, b, out, op);
362 return;
363 case 3:
364 binary_op_dims3<T, U, Op>(a, b, out, op);
365 return;
366 case 4:
367 binary_op_dims4<T, U, Op>(a, b, out, op);
368 return;
369 }
370
371 const T* a_ptr = a.data<T>();
372 const T* b_ptr = b.data<T>();
373 U* dst = out.data<U>();
374 for (size_t i = 0; i < out.size(); i++) {
375 int a_idx = elem_to_loc(i, a.shape(), a.strides());
376 int b_idx = elem_to_loc(i, b.shape(), b.strides());
377 dst[i] = op(a_ptr[a_idx], b_ptr[b_idx]);
378 }
379}
380
381template <typename T, typename U, typename Op>
382void binary_op_dispatch_dims(
383 const array& a,
384 const array& b,
385 array& out,
386 Op op,
387 int dim,
388 int stride) {
389 // Number of dimensions to loop over for vectorized ops
390 switch (dim) {
391 case 1:
392 binary_op_dims1<T, U, Op>(a, b, out, op, stride);
393 return;
394 case 2:
395 binary_op_dims2<T, U, Op>(a, b, out, op, stride);
396 return;
397 }
398
399 const T* a_ptr = a.data<T>();
400 const T* b_ptr = b.data<T>();
401 U* dst = out.data<U>();
402 for (size_t i = 0; i < out.size(); i += stride) {
403 int a_idx = elem_to_loc(i, a.shape(), a.strides());
404 int b_idx = elem_to_loc(i, b.shape(), b.strides());
405 op(a_ptr + a_idx, b_ptr + b_idx, dst, stride);
406 dst += stride;
407 }
408}
409
// Core binary op driver. Classifies the inputs, sets up the output buffer,
// and dispatches to the appropriate kernel:
//   op    - scalar op applied to a single pair of elements
//   opsv  - scalar-vector kernel
//   opvs  - vector-scalar kernel
//   opvv  - vector-vector kernel
// For the General case it analyzes trailing dimensions to recover partially
// vectorizable structure before falling back to fully strided iteration.
template <
    typename T,
    typename U,
    typename Op,
    typename OpSV,
    typename OpVS,
    typename OpVV>
void binary_op(
    const array& a,
    const array& b,
    array& out,
    Op op,
    OpSV opsv,
    OpVS opvs,
    OpVV opvv) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);

  // The full computation is scalar scalar so call the base op once
  if (bopt == BinaryOpType::ScalarScalar) {
    *(out.data<U>()) = op(*a.data<T>(), *b.data<T>());
    return;
  }

  // The full computation is scalar vector so delegate to the op
  if (bopt == BinaryOpType::ScalarVector) {
    opsv(a.data<T>(), b.data<T>(), out.data<U>(), b.data_size());
    return;
  }

  // The full computation is vector scalar so delegate to the op
  if (bopt == BinaryOpType::VectorScalar) {
    opvs(a.data<T>(), b.data<T>(), out.data<U>(), a.data_size());
    return;
  }

  // The full computation is vector vector so delegate to the op
  if (bopt == BinaryOpType::VectorVector) {
    opvv(a.data<T>(), b.data<T>(), out.data<U>(), out.size());
    return;
  }

  // General computation so let's try to optimize

  // Get the left-most dim such that the array is row contiguous after
  // (i.e. its strides match the output's from dimension d onward).
  auto& strides = out.strides();
  auto leftmost_rc_dim = [&strides](const array& arr) {
    int d = arr.ndim() - 1;
    for (; d >= 0 && arr.strides()[d] == strides[d]; d--) {
    }
    return d + 1;
  };
  auto a_rc_dim = leftmost_rc_dim(a);
  auto b_rc_dim = leftmost_rc_dim(b);

  // Get the left-most dim such that the array is a broadcasted "scalar" after
  // (zero strides mean the same element repeats along those trailing dims).
  auto leftmost_s_dim = [](const array& arr) {
    int d = arr.ndim() - 1;
    for (; d >= 0 && arr.strides()[d] == 0; d--) {
    }
    return d + 1;
  };
  auto a_s_dim = leftmost_s_dim(a);
  auto b_s_dim = leftmost_s_dim(b);

  auto ndim = out.ndim();

  // Case 1: LxM and FxM where L and F are broadcastable and M is row contiguous
  int dim = ndim;
  if (int d = std::max(a_rc_dim, b_rc_dim); d < ndim) {
    bopt = BinaryOpType::VectorVector;
    dim = d;
    // Case 2: LxM and Fx1 where L and F are broadcastable and M is row
    // contiguous
  } else if (int d = std::max(a_rc_dim, b_s_dim); d < ndim) {
    bopt = BinaryOpType::VectorScalar;
    dim = d;
    // Case 3: Lx1 and FxM where L and F are broadcastable and M is row
    // contiguous
  } else if (int d = std::max(a_s_dim, b_rc_dim); d < ndim) {
    bopt = BinaryOpType::ScalarVector;
    dim = d;
  }

  // Can be sure dim > 0 since otherwise we would have used one of the fully
  // contiguous methods above. Except for the case that the flags do not
  // correspond to the underlying contiguity.
  // NOTE(review): runs shorter than 16 elements fall back to the general
  // path — presumably a tuned threshold where vectorized dispatch stops
  // paying off; confirm before changing.
  size_t stride;
  if (dim == 0 || strides[dim - 1] < 16) {
    stride = 1;
    bopt = BinaryOpType::General;
    dim = ndim;
  } else {
    stride = strides[dim - 1];
  }

  switch (bopt) {
    case BinaryOpType::VectorVector:
      binary_op_dispatch_dims<T, U>(a, b, out, opvv, dim, stride);
      break;
    case BinaryOpType::VectorScalar:
      binary_op_dispatch_dims<T, U>(a, b, out, opvs, dim, stride);
      break;
    case BinaryOpType::ScalarVector:
      binary_op_dispatch_dims<T, U>(a, b, out, opsv, dim, stride);
      break;
    default:
      binary_op_dispatch_dims<T, U>(a, b, out, op);
      break;
  }
}
521
522template <typename T, typename Op, typename OpSV, typename OpVS, typename OpVV>
523void binary_op(
524 const array& a,
525 const array& b,
526 array& out,
527 Op op,
528 OpSV opsv,
529 OpVS opvs,
530 OpVV opvv) {
531 // TODO: The following mess of constexpr evaluations can probably be achieved
532 // with template specializations and overloading. Would it be simpler?
533
534 if (std::is_same<decltype(opsv), UseDefaultBinaryOp>::value) {
535 if (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
536 if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
537 // All ops are UseDefaultBinaryOp (why oh why would someone call that?)
538 binary_op<T, T>(
539 a,
540 b,
541 out,
542 op,
543 DefaultScalarVector<T, T, Op>(op),
544 DefaultVectorScalar<T, T, Op>(op),
545 DefaultVectorVector<T, T, Op>(op));
546 } else {
547 // opsv and opvs were UseDefaultBinaryOp
548 binary_op<T, T>(
549 a,
550 b,
551 out,
552 op,
553 DefaultScalarVector<T, T, Op>(op),
554 DefaultVectorScalar<T, T, Op>(op),
555 opvv);
556 }
557 } else if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
558 // opsv and opvv were UseDefaultBinaryOp
559 binary_op<T, T>(
560 a,
561 b,
562 out,
563 op,
564 DefaultScalarVector<T, T, Op>(op),
565 opvs,
566 DefaultVectorVector<T, T, Op>(op));
567 } else {
568 // opsv was UseDefaultBinaryOp
569 binary_op<T, T>(
570 a, b, out, op, DefaultScalarVector<T, T, Op>(op), opvs, opvv);
571 }
572 } else if (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
573 if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
574 // opvs and opvv were UseDefaultBinaryOp
575 binary_op<T, T>(
576 a,
577 b,
578 out,
579 op,
580 opsv,
581 DefaultVectorScalar<T, T, Op>(op),
582 DefaultVectorVector<T, T, Op>(op));
583 } else {
584 // opvs was UseDefaultBinaryOp
585 binary_op<T, T>(
586 a, b, out, op, opsv, DefaultVectorScalar<T, T, Op>(op), opvv);
587 }
588 } else if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
589 // opvv was UseDefaultBinaryOp
590 binary_op<T, T>(
591 a, b, out, op, opsv, opvs, DefaultVectorVector<T, T, Op>(op));
592 } else {
593 // All ops provided
594 binary_op<T, T>(a, b, out, op, opsv, opvs, opvv);
595 }
596}
597
598template <typename T, typename Op>
599void binary_op(const array& a, const array& b, array& out, Op op) {
600 DefaultScalarVector<T, T, Op> opsv(op);
601 DefaultVectorScalar<T, T, Op> opvs(op);
602 DefaultVectorVector<T, T, Op> opvv(op);
603 binary_op<T, T>(a, b, out, op, opsv, opvs, opvv);
604}
605
// Type-dispatch entry point: switch on the output dtype and instantiate
// binary_op with the matching C++ element type. `ops` forwards the base op
// plus any specialized vectorized kernels.
template <typename... Ops>
void binary(const array& a, const array& b, array& out, Ops... ops) {
  switch (out.dtype()) {
    case bool_:
      binary_op<bool>(a, b, out, ops...);
      break;
    case uint8:
      binary_op<uint8_t>(a, b, out, ops...);
      break;
    case uint16:
      binary_op<uint16_t>(a, b, out, ops...);
      break;
    case uint32:
      binary_op<uint32_t>(a, b, out, ops...);
      break;
    case uint64:
      binary_op<uint64_t>(a, b, out, ops...);
      break;
    case int8:
      binary_op<int8_t>(a, b, out, ops...);
      break;
    case int16:
      binary_op<int16_t>(a, b, out, ops...);
      break;
    case int32:
      binary_op<int32_t>(a, b, out, ops...);
      break;
    case int64:
      binary_op<int64_t>(a, b, out, ops...);
      break;
    case float16:
      binary_op<float16_t>(a, b, out, ops...);
      break;
    case float32:
      binary_op<float>(a, b, out, ops...);
      break;
    case bfloat16:
      binary_op<bfloat16_t>(a, b, out, ops...);
      break;
    case complex64:
      binary_op<complex64_t>(a, b, out, ops...);
      break;
  }
}
650
651} // namespace
652
653} // namespace mlx::core
Op op
Definition binary.h:141
Buffer malloc_or_wait(size_t size)
const char * binary()
Definition allocator.h:7
constexpr Dtype bool_
Definition dtype.h:60
constexpr Dtype uint64
Definition dtype.h:65
constexpr Dtype uint16
Definition dtype.h:63
stride_t elem_to_loc(int elem, const std::vector< int > &shape, const std::vector< stride_t > &strides)
Definition utils.h:12
constexpr Dtype bfloat16
Definition dtype.h:74
constexpr Dtype int32
Definition dtype.h:69
constexpr Dtype float32
Definition dtype.h:73
constexpr Dtype int16
Definition dtype.h:68
constexpr Dtype int8
Definition dtype.h:67
constexpr Dtype int64
Definition dtype.h:70
constexpr Dtype uint8
Definition dtype.h:62
constexpr Dtype float16
Definition dtype.h:72
constexpr Dtype uint32
Definition dtype.h:64
constexpr Dtype complex64
Definition dtype.h:75