// Copyright © 2023-2024 Apple Inc.

// Required for using M_2_SQRTPI in MSVC.
#define _USE_MATH_DEFINES

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <numeric>
#include <sstream>

#include "mlx/backend/common/utils.h"
#include "mlx/fft.h"
#include "mlx/linalg.h"
#include "mlx/ops.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

namespace {

std::tuple<array, array, int> vmap_binary_op(
    const std::vector<array>& inputs,
    const std::vector<int>& axes,
    const Stream& stream) {
  assert(inputs.size() == 2);
  assert(axes.size() == 2);

  if (axes[0] == -1 && axes[1] == -1) {
    return {inputs[0], inputs[1], -1};
  }

  auto a = inputs[0];
  auto b = inputs[1];
  int ndim = std::max(a.ndim() + (axes[0] == -1), b.ndim() + (axes[1] == -1));

  auto expand_dims = [stream, ndim](auto in) {
    auto shape = in.shape();
    shape.insert(shape.begin(), ndim - shape.size(), 1);
    return reshape(in, shape, stream);
  };

  int to_ax = (ndim - a.ndim()) + axes[0];
  int from_ax = (ndim - b.ndim()) + axes[1];
  a = expand_dims(a);
  b = expand_dims(b);

  if (from_ax != to_ax) {
    std::vector<int> tdims(b.ndim());
    std::iota(tdims.begin(), tdims.end(), 0);
    tdims.erase(tdims.begin() + from_ax);
    tdims.insert(tdims.begin() + to_ax, from_ax);
    b = transpose(b, tdims, stream);
  }
  return {a, b, to_ax};
}

std::tuple<array, array, array, int> vmap_ternary_op(
    const std::vector<array>& inputs,
    const std::vector<int>& axes,
    const Stream& stream) {
  assert(inputs.size() == 3);
  assert(axes.size() == 3);

  if (axes[0] == -1 && axes[1] == -1 && axes[2] == -1) {
    return {inputs[0], inputs[1], inputs[2], -1};
  }

  auto a = inputs[0];
  auto b = inputs[1];
  auto c = inputs[2];
  int ndim = std::max(
      {a.ndim() + (axes[0] == -1),
       b.ndim() + (axes[1] == -1),
       c.ndim() + (axes[2] == -1)});

  auto expand_dims = [stream, ndim](auto in) {
    auto shape = in.shape();
    shape.insert(shape.begin(), ndim - shape.size(), 1);
    return reshape(in, shape, stream);
  };

  int to_ax = (ndim - a.ndim()) + axes[0];
  int from_ax1 = (ndim - b.ndim()) + axes[1];
  int from_ax2 = (ndim - c.ndim()) + axes[2];
  a = expand_dims(a);
  b = expand_dims(b);
  c = expand_dims(c);

  auto find_tdims = [](auto x, int to_ax, int from_ax) {
    std::vector<int> tdims(x.ndim());
    std::iota(tdims.begin(), tdims.end(), 0);
    tdims.erase(tdims.begin() + from_ax);
    tdims.insert(tdims.begin() + to_ax, from_ax);
    return tdims;
  };

  if (to_ax != from_ax1) {
    std::vector<int> tdims = find_tdims(b, to_ax, from_ax1);
    b = transpose(b, tdims, stream);
  }

  if (to_ax != from_ax2) {
    std::vector<int> tdims = find_tdims(c, to_ax, from_ax2);
    c = transpose(c, tdims, stream);
  }
  return {a, b, c, to_ax};
}

} // namespace
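
// Editor's note (illustrative walk-through, not upstream documentation):
// given a of shape (5, 3) vmapped on axis 0 and b of shape (3, 5) vmapped
// on axis 1, vmap_binary_op leaves a alone, transposes b to (5, 3) so both
// inputs carry the mapped dimension on the same axis, and returns to_ax = 0
// for the caller to report as the output's vmap axis.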

std::vector<array> Primitive::jvp(
    const std::vector<array>&,
    const std::vector<array>&,
    const std::vector<int>&) {
  std::ostringstream msg;
  msg << "[Primitive::jvp] Not implemented for ";
  print(msg);
  msg << ".";
  throw std::invalid_argument(msg.str());
}

std::vector<array> Primitive::vjp(
    const std::vector<array>&,
    const std::vector<array>&,
    const std::vector<int>&,
    const std::vector<array>&) {
  std::ostringstream msg;
  msg << "[Primitive::vjp] Not implemented for ";
  print(msg);
  msg << ".";
  throw std::invalid_argument(msg.str());
}

std::pair<std::vector<array>, std::vector<int>> Primitive::vmap(
    const std::vector<array>&,
    const std::vector<int>&) {
  std::ostringstream msg;
  msg << "[Primitive::vmap] Not implemented for ";
  print(msg);
  msg << ".";
  throw std::invalid_argument(msg.str());
}

std::vector<Shape> Primitive::output_shapes(const std::vector<array>&) {
  std::ostringstream msg;
  msg << "[Primitive::output_shapes] ";
  this->print(msg);
  msg << " cannot infer output shapes.";
  throw std::invalid_argument(msg.str());
}

std::vector<array> Abs::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> Abs::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {multiply(tangents[0], sign(primals[0], stream()), stream())};
}

std::pair<std::vector<array>, std::vector<int>> Abs::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{abs(inputs[0], stream())}, axes};
}

std::vector<array> Add::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  return {
      tangents.size() > 1 ? add(tangents[0], tangents[1], stream())
                          : tangents[0]};
}

std::vector<array> Add::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  if (argnums.size() == 1) {
    return cotangents;
  } else {
    return {cotangents[0], cotangents[0]};
  }
}

std::pair<std::vector<array>, std::vector<int>> Add::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{add(a, b, stream())}, {to_ax}};
}

std::vector<array> AddMM::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  auto& cotan = cotangents[0];
  std::vector<int> reorder(cotan.ndim());
  std::iota(reorder.begin(), reorder.end(), 0);
  std::iter_swap(reorder.end() - 1, reorder.end() - 2);
  for (auto arg : argnums) {
    if (arg == 0) {
      // M X N * (K X N).T -> M X K
      auto cotan_scaled = cotan;
      if (alpha_ != 1.) {
        auto alpha_arr = array(alpha_, cotan.dtype());
        cotan_scaled = (multiply(alpha_arr, cotan_scaled, stream()));
      }
      vjps.push_back(matmul(
          cotan_scaled, transpose(primals[1], reorder, stream()), stream()));
    } else if (arg == 1) {
      // (M X K).T * M X N -> K X N
      auto cotan_scaled = cotan;
      if (alpha_ != 1.) {
        auto alpha_arr = array(alpha_, cotan.dtype());
        cotan_scaled = (multiply(alpha_arr, cotan_scaled, stream()));
      }
      vjps.push_back(matmul(
          transpose(primals[0], reorder, stream()), cotan_scaled, stream()));
    } else {
      auto cotan_scaled = cotan;
      if (beta_ != 1.) {
        auto beta_arr = array(beta_, cotan.dtype());
        cotan_scaled = (multiply(beta_arr, cotan_scaled, stream()));
      }
      vjps.push_back(cotan_scaled);
    }
  }
  return vjps;
}

bool AddMM::is_equivalent(const Primitive& other) const {
  const AddMM& a_other = static_cast<const AddMM&>(other);
  return (alpha_ == a_other.alpha_ && beta_ == a_other.beta_);
}
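
// Editor's note: for out = alpha * (a @ b) + beta * c with cotangent G, the
// gradients are d a = alpha * G @ b^T, d b = alpha * a^T @ G, and
// d c = beta * G, which is why AddMM::vjp above scales the cotangent before
// (or instead of) the transposed matmuls.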

std::pair<std::vector<array>, std::vector<int>> AddMM::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto maybe_move_ax = [this](auto& arr, auto ax) {
    return ax > 0 ? moveaxis(arr, ax, 0, stream()) : arr;
  };
  auto a = maybe_move_ax(inputs[0], axes[0]);
  auto b = maybe_move_ax(inputs[1], axes[1]);
  auto c = maybe_move_ax(inputs[2], axes[2]);
  return {{addmm(c, a, b, alpha_, beta_, stream())}, {0}};
}

bool Arange::is_equivalent(const Primitive& other) const {
  const Arange& a_other = static_cast<const Arange&>(other);
  return (
      start_ == a_other.start_ && stop_ == a_other.stop_ &&
      step_ == a_other.step_);
}

std::vector<Shape> Arange::output_shapes(const std::vector<array>&) {
  auto real_size = std::ceil((stop_ - start_) / step_);
  return {{std::max(static_cast<int>(real_size), 0)}};
}

std::vector<array> ArcCos::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> ArcCos::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array one = array(1., primals[0].dtype());
  array t = subtract(one, square(primals[0], stream()), stream());
  array denom = negative(rsqrt(t, stream()), stream());
  return {multiply(tangents[0], denom, stream())};
}

std::pair<std::vector<array>, std::vector<int>> ArcCos::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{arccos(inputs[0], stream())}, axes};
}

std::vector<array> ArcCosh::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> ArcCosh::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array one = array(1., primals[0].dtype());
  array t = subtract(square(primals[0], stream()), one, stream());
  return {multiply(tangents[0], rsqrt(t, stream()), stream())};
}

std::pair<std::vector<array>, std::vector<int>> ArcCosh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{arccosh(inputs[0], stream())}, axes};
}

std::vector<array> ArcSin::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> ArcSin::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array one = array(1., primals[0].dtype());
  array t = subtract(one, square(primals[0], stream()), stream());
  return {multiply(tangents[0], rsqrt(t, stream()), stream())};
}

std::pair<std::vector<array>, std::vector<int>> ArcSin::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{arcsin(inputs[0], stream())}, axes};
}

std::vector<array> ArcSinh::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> ArcSinh::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array one = array(1., primals[0].dtype());
  array t = add(square(primals[0], stream()), one, stream());
  return {multiply(tangents[0], rsqrt(t, stream()), stream())};
}

std::pair<std::vector<array>, std::vector<int>> ArcSinh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{arcsinh(inputs[0], stream())}, axes};
}
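
// Editor's note: the inverse-trig JVPs above and below follow directly from
// d/dx arcsin(x) = 1/sqrt(1 - x^2), d/dx arccos(x) = -1/sqrt(1 - x^2),
// d/dx arcsinh(x) = 1/sqrt(x^2 + 1), d/dx arccosh(x) = 1/sqrt(x^2 - 1),
// d/dx arctan(x) = 1/(1 + x^2), and d/dx arctanh(x) = 1/(1 - x^2).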

std::vector<array> ArcTan::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> ArcTan::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array one = array(1., primals[0].dtype());
  array t = add(one, square(primals[0], stream()), stream());
  return {divide(tangents[0], t, stream())};
}

std::pair<std::vector<array>, std::vector<int>> ArcTan::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{arctan(inputs[0], stream())}, axes};
}

std::vector<array> ArcTan2::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> ArcTan2::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 2);
  assert(argnums.size() == 2);
  array t =
      add(square(primals[0], stream()), square(primals[1], stream()), stream());
  return {
      divide(tangents[0], t, stream()),
      divide(negative(tangents[1], stream()), t, stream())};
}

std::pair<std::vector<array>, std::vector<int>> ArcTan2::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 2);
  assert(axes.size() == 2);
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{arctan2(a, b, stream())}, {to_ax}};
}

std::vector<array> ArcTanh::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> ArcTanh::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array one = array(1., primals[0].dtype());
  array t = subtract(one, square(primals[0], stream()), stream());
  return {divide(tangents[0], t, stream())};
}

std::pair<std::vector<array>, std::vector<int>> ArcTanh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{arctanh(inputs[0], stream())}, axes};
}

std::pair<std::vector<array>, std::vector<int>> ArgPartition::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  int axis_left = axes[0] >= 0 && axes[0] <= axis_;
  return {{argpartition(inputs[0], kth_, axis_ + axis_left, stream())}, axes};
}

std::vector<array> ArgPartition::vjp(
    const std::vector<array>& primals,
    const std::vector<array>&,
    const std::vector<int>&,
    const std::vector<array>&) {
  return {zeros_like(primals[0], stream())};
}

std::vector<array> ArgPartition::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  return {zeros_like(tangents[0], stream())};
}

bool ArgPartition::is_equivalent(const Primitive& other) const {
  const ArgPartition& r_other = static_cast<const ArgPartition&>(other);
  return axis_ == r_other.axis_ && kth_ == r_other.kth_;
}

bool ArgReduce::is_equivalent(const Primitive& other) const {
  const ArgReduce& r_other = static_cast<const ArgReduce&>(other);
  return reduce_type_ == r_other.reduce_type_ && axis_ == r_other.axis_;
}

std::pair<std::vector<array>, std::vector<int>> ArgReduce::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  int reduce_ax = axis_ + (axes[0] >= 0 && axis_ >= axes[0]);
  auto& in = inputs[0];
  std::vector<array> out;
  if (reduce_type_ == ArgReduce::ArgMin) {
    out.push_back(argmin(in, reduce_ax, true, stream()));
  } else {
    out.push_back(argmax(in, reduce_ax, true, stream()));
  }
  return {out, axes};
}
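
// Editor's note: ArgPartition, ArgReduce, and ArgSort return integer
// indices, which are piecewise constant in the inputs, so their vjp/jvp is
// identically zero; their vmap rules only shift the operation axis past the
// newly inserted batch axis.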

std::vector<array> ArgReduce::vjp(
    const std::vector<array>& primals,
    const std::vector<array>&,
    const std::vector<int>&,
    const std::vector<array>&) {
  return {zeros_like(primals[0], stream())};
}

std::vector<array> ArgReduce::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  return {zeros_like(tangents[0], stream())};
}

std::pair<std::vector<array>, std::vector<int>> ArgSort::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  int axis_left = axes[0] >= 0 && axes[0] <= axis_;
  return {{argsort(inputs[0], axis_ + axis_left, stream())}, axes};
}

std::vector<Shape> ArgReduce::output_shapes(const std::vector<array>& inputs) {
  auto out_shape = inputs[0].shape();
  out_shape[axis_] = 1;
  return {std::move(out_shape)};
}

bool ArgSort::is_equivalent(const Primitive& other) const {
  const ArgSort& r_other = static_cast<const ArgSort&>(other);
  return axis_ == r_other.axis_;
}

std::vector<array> AsType::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  if (cotangents[0].dtype() != dtype_) {
    throw std::invalid_argument(
        "[astype] Type of cotangents does not match primal output type.");
  }
  return {astype(cotangents[0], primals[0].dtype(), stream())};
}

std::vector<array> AsType::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  return {astype(tangents[0], dtype_, stream())};
}

std::pair<std::vector<array>, std::vector<int>> AsType::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  return {{astype(inputs[0], dtype_, stream())}, axes};
}

bool AsType::is_equivalent(const Primitive& other) const {
  const AsType& a_other = static_cast<const AsType&>(other);
  return dtype_ == a_other.dtype_;
}

std::vector<array> AsStrided::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(argnums.size() == 1);

  // Extract the sizes and cast them to ints
  int grad_size = primals[0].size();
  int cotangents_size = cotangents[0].size();

  // Make a flat container to hold the gradients
  auto grad = zeros_like(primals[0], stream());
  grad = reshape(grad, {grad_size}, stream());

  // Create the indices that map output to input
  auto idx = arange(grad_size, stream());
  idx = as_strided(idx, shape_, strides_, offset_, stream());
  idx = reshape(idx, {cotangents_size}, stream());

  // Reshape the cotangent for use with scatter
  auto flat_cotangents = reshape(cotangents[0], {cotangents_size, 1}, stream());

  // Finally accumulate the gradients and reshape them to look like the input
  grad = scatter_add(grad, idx, flat_cotangents, 0, stream());
  grad = reshape(grad, primals[0].shape(), stream());

  return {grad};
}

std::vector<array> AsStrided::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  return {as_strided(tangents[0], shape_, strides_, offset_, stream())};
}

bool AsStrided::is_equivalent(const Primitive& other) const {
  const AsStrided& a_other = static_cast<const AsStrided&>(other);
  return shape_ == a_other.shape_ && strides_ == a_other.strides_ &&
      offset_ == a_other.offset_;
}
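
// Editor's note: AsStrided::vjp above scatters rather than reshapes because
// a strided view may visit the same input element several times. Viewing an
// arange over the flat input with (shape_, strides_, offset_) recovers, for
// every output element, the flat input index it came from, and scatter_add
// then accumulates the cotangent into each such index.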

bool BitwiseBinary::is_equivalent(const Primitive& other) const {
  const BitwiseBinary& a_other = static_cast<const BitwiseBinary&>(other);
  return op_ == a_other.op_;
}

void BitwiseBinary::print(std::ostream& os) {
  switch (op_) {
    case BitwiseBinary::And:
      os << "BitwiseAnd";
      break;
    case BitwiseBinary::Or:
      os << "BitwiseOr";
      break;
    case BitwiseBinary::Xor:
      os << "BitwiseXor";
      break;
    case BitwiseBinary::LeftShift:
      os << "LeftShift";
      break;
    case BitwiseBinary::RightShift:
      os << "RightShift";
      break;
  }
}

std::pair<std::vector<array>, std::vector<int>> BitwiseBinary::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {
      {array(
          a.shape(),
          a.dtype(),
          std::make_shared<BitwiseBinary>(stream(), op_),
          {a, b})},
      {to_ax}};
}

std::vector<array> BitwiseBinary::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 2);
  std::vector<array> vjps = {zeros_like(tangents[0], stream())};
  if (argnums.size() > 1) {
    vjps.push_back(vjps.back());
  }
  return vjps;
}

std::vector<array> BitwiseBinary::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> Broadcast::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(argnums.size() == 1);

  // Reduce cotangents to the shape of the primal
  auto& shape = primals[0].shape();
  auto& cotan = cotangents[0];
  int diff = cotan.ndim() - shape.size();
  std::vector<int> reduce_axes;
  for (int i = 0; i < cotan.ndim(); ++i) {
    if (i < diff) {
      reduce_axes.push_back(i);
    } else if (shape[i - diff] != cotan.shape(i)) {
      reduce_axes.push_back(i);
    }
  }
  return {reshape(sum(cotan, reduce_axes, true, stream()), shape, stream())};
}

std::vector<array> Broadcast::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(argnums.size() == 1);
  return {broadcast_to(tangents[0], shape_, stream())};
}

std::pair<std::vector<array>, std::vector<int>> Broadcast::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto ax = axes[0];
  auto in = inputs[0];
  if (ax >= 0) {
    auto in_shape = in.shape();
    int diff = shape_.size() - in.ndim() + 1;
    assert(diff >= 0);
    in_shape.insert(in_shape.begin(), diff, 1);
    ax += diff;
    shape_.insert(shape_.begin() + ax, in_shape[ax]);
    in = reshape(in, in_shape, stream());
  }
  return {{broadcast_to(in, shape_, stream())}, {ax}};
}

bool Broadcast::is_equivalent(const Primitive& other) const {
  const Broadcast& b_other = static_cast<const Broadcast&>(other);
  return shape_ == b_other.shape_;
}
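
// Editor's note, a worked example for Broadcast::vjp above: broadcasting
// (3, 1) to (2, 3, 4) adds axis 0 and stretches axis 2, so the cotangent of
// shape (2, 3, 4) is summed over axes {0, 2} (with keepdims) and reshaped
// back to (3, 1).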

std::vector<array> Ceil::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> Ceil::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {zeros_like(primals[0], stream())};
}

std::pair<std::vector<array>, std::vector<int>> Ceil::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{ceil(inputs[0], stream())}, axes};
}

std::pair<std::vector<array>, std::vector<int>> Cholesky::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto ax = axes[0] >= 0 ? 0 : -1;
  auto a = axes[0] > 0 ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0];
  return {{linalg::cholesky(a, upper_, stream())}, {ax}};
}

std::pair<std::vector<array>, std::vector<int>> Eigh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  bool needs_move = axes[0] >= (inputs[0].ndim() - 2);
  auto a = needs_move ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0];
  auto ax = needs_move ? 0 : axes[0];

  std::vector<array> outputs;
  if (compute_eigenvectors_) {
    auto [values, vectors] = linalg::eigh(a, uplo_, stream());
    outputs = {values, vectors};
  } else {
    outputs = {linalg::eigvalsh(a, uplo_, stream())};
  }

  return {outputs, std::vector<int>(outputs.size(), ax)};
}

std::vector<Shape> Eigh::output_shapes(const std::vector<array>& inputs) {
  auto shape = inputs[0].shape();
  shape.pop_back(); // Remove last dimension for eigenvalues
  if (compute_eigenvectors_) {
    return {
        std::move(shape), inputs[0].shape()}; // Eigenvalues and eigenvectors
  } else {
    return {std::move(shape)}; // Only eigenvalues
  }
}

bool Eigh::is_equivalent(const Primitive& other) const {
  auto& e_other = static_cast<const Eigh&>(other);
  return uplo_ == e_other.uplo_ &&
      compute_eigenvectors_ == e_other.compute_eigenvectors_;
}

std::vector<array> Concatenate::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto& cotan = cotangents[0];
  Shape start(cotan.ndim(), 0);
  Shape stop = cotan.shape();

  std::vector<int> sizes;
  sizes.push_back(0);
  for (auto& p : primals) {
    sizes.push_back(p.shape(axis_));
  }
  std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());

  std::vector<array> grads;
  for (auto i : argnums) {
    start[axis_] = sizes[i];
    stop[axis_] = sizes[i + 1];
    grads.push_back(slice(cotan, start, stop, stream()));
  }
  return grads;
}

std::vector<array> Concatenate::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  std::vector<int> argidx(argnums.size());
  std::iota(argidx.begin(), argidx.end(), 0);
  std::sort(argidx.begin(), argidx.end(), [&argnums](int a, int b) {
    return argnums[a] < argnums[b];
  });

  std::vector<array> vals;
  for (int i = 0, j = 0; i < primals.size(); ++i) {
    if (j < argnums.size() && argnums[argidx[j]] == i) {
      vals.push_back(tangents[argidx[j++]]);
    } else {
      vals.push_back(zeros_like(primals[i], stream()));
    }
  }
  return {concatenate(vals, axis_, stream())};
}
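
// Editor's note: in Concatenate::vjp above, the partial sums of the input
// sizes along axis_ give each primal's [start, stop) offsets into the
// concatenated output, so every requested gradient is just a slice of the
// cotangent.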

std::pair<std::vector<array>, std::vector<int>> Concatenate::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  int out_ax = -1;
  int first_vmap = -1;

  // Find the first vmapped input
  for (int i = 0; i < axes.size(); i++) {
    if (axes[i] >= 0) {
      out_ax = axes[i];
      first_vmap = i;
      break;
    }
  }

  // No vmap, should we even be in here?
  if (out_ax < 0) {
    return {{concatenate(inputs, axis_, stream())}, {out_ax}};
  }

  // Make sure vmapped arrays have all vmapped axes in the same location and
  // expand non-vmapped arrays to be compatible with the vmapped ones.
  std::vector<array> t_inputs;
  int N = inputs[first_vmap].shape(out_ax);
  int axis = axis_ + (axis_ >= out_ax);
  auto cat_shape = inputs[first_vmap].shape();
  for (int i = 0; i < axes.size(); i++) {
    if (axes[i] >= 0) {
      if (out_ax != axes[i]) {
        t_inputs.push_back(moveaxis(inputs[i], axes[i], out_ax, stream()));
      } else {
        t_inputs.push_back(inputs[i]);
      }
    } else {
      cat_shape[axis] = inputs[i].shape(axis_);
      t_inputs.push_back(broadcast_to(
          expand_dims(inputs[i], out_ax, stream()), cat_shape, stream()));
    }
  }
  return {{concatenate(t_inputs, axis, stream())}, {out_ax}};
}

bool Concatenate::is_equivalent(const Primitive& other) const {
  const Concatenate& c_other = static_cast<const Concatenate&>(other);
  return axis_ == c_other.axis_;
}

std::vector<Shape> Concatenate::output_shapes(
    const std::vector<array>& inputs) {
  auto shape = inputs[0].shape();
  for (int i = 1; i < inputs.size(); ++i) {
    shape[axis_] += inputs[i].shape(axis_);
  }
  return {std::move(shape)};
}

std::pair<std::vector<array>, std::vector<int>> Conjugate::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{conjugate(inputs[0], stream())}, axes};
}

std::vector<array> Contiguous::vjp(
    const std::vector<array>&,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  return cotangents;
}

std::vector<array> Contiguous::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  return tangents;
}

std::pair<std::vector<array>, std::vector<int>> Contiguous::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  return {{contiguous(inputs[0], allow_col_major_, stream())}, axes};
}

bool Contiguous::is_equivalent(const Primitive& other) const {
  const Contiguous& c_other = static_cast<const Contiguous&>(other);
  return allow_col_major_ == c_other.allow_col_major_;
}
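
// Editor's note: the helper below is an im2col-style fast path for the
// weight gradient. It pads the input, uses as_strided to materialize one
// patch per output location (a view, no copy), and then reduces the whole
// weight gradient to a single matmul between the reshaped cotangent and the
// patch matrix.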

array conv_weight_backward_patches(
    const array& in,
    const array& wt,
    const array& cotan,
    const std::vector<int>& kernel_strides,
    const std::vector<int>& padding,
    StreamOrDevice s) {
  // Resolve padded input shapes and strides
  Shape padding_starts(in.ndim(), 0);
  Shape padding_ends = in.shape();
  Shape in_padded_shape = in.shape();

  // padded shape
  for (int i = 1; i < in.ndim() - 1; i++) {
    in_padded_shape[i] += 2 * padding[i - 1];
    padding_ends[i] += padding[i - 1];
    padding_starts[i] += padding[i - 1];
  }

  // padded strides (contiguous)
  Strides in_padded_strides(in.ndim(), 1);
  for (int i = in.ndim() - 2; i >= 0; --i) {
    in_padded_strides[i] = in_padded_strides[i + 1] * in_padded_shape[i + 1];
  }

  // Pad input
  std::vector<int> padded_axes(in.ndim() - 2, 0);
  std::iota(padded_axes.begin(), padded_axes.end(), 1);
  auto in_padded = pad(
      in, padded_axes, padding, padding, array(0, in.dtype()), "constant", s);

  // Resolve strided patches

  // patches are shaped as
  // (batch_dim, out_spatial_dims, weight_spatial_dims, in_channels)
  Shape patches_shape{cotan.shape().begin(), cotan.shape().end() - 1};
  patches_shape.insert(
      patches_shape.end(), wt.shape().begin() + 1, wt.shape().end());

  // Resolve patch strides
  int n_spatial_dim = in.ndim() - 2;
  Strides patches_strides(patches_shape.size(), 1);
  patches_strides[0] = in_padded_strides[0];
  for (int i = 1; i < n_spatial_dim + 1; i++) {
    patches_strides[i] = in_padded_strides[i] * kernel_strides[i - 1];
  }
  for (int i = 1; i < in.ndim(); i++) {
    patches_strides[n_spatial_dim + i] = in_padded_strides[i];
  }

  // Make patches from in
  auto in_patches = as_strided(in_padded, patches_shape, patches_strides, 0, s);

  // Prepare for matmul
  int O = wt.shape(0);
  auto cotan_mat = reshape(cotan, {-1, O}, s);
  in_patches = reshape(in_patches, {cotan_mat.shape(0), -1}, s);

  auto grad = matmul(transpose(cotan_mat, {1, 0}, s), in_patches, s);
  grad = reshape(grad, wt.shape(), s);
  return grad;
}

std::vector<array> Convolution::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 2);
  std::vector<array> grads;

  // Collect info
  auto& in = primals[0];
  auto& wt = primals[1];
  auto& cotan = cotangents[0];

  auto group_transpose =
      [this](const array& x, int group_dim, int ax_a, int ax_b) {
        if (groups_ > 1) {
          auto shape = x.shape();
          if (group_dim < 0) {
            group_dim += shape.size();
          }
          shape.insert(shape.begin() + group_dim, groups_);
          shape[group_dim + 1] = shape[group_dim + 1] / groups_;
          auto x_trans = swapaxes(
              reshape(x, std::move(shape), stream()), ax_a, ax_b, stream());
          return flatten(x_trans, group_dim, group_dim + 1, stream());
        } else {
          return swapaxes(x, 0, -1, stream());
        }
      };

  for (int a : argnums) {
    // Grads for input
    if (a == 0) {
      std::vector<int> padding_lo = padding_;
      std::vector<int> padding_hi = padding_;

      for (int i = 0; i < padding_lo.size(); ++i) {
        int wt_size = 1 + kernel_dilation_[i] * (wt.shape(1 + i) - 1);
        padding_lo[i] = wt_size - padding_[i] - 1;

        int in_size = 1 + input_dilation_[i] * (in.shape(1 + i) - 1);
        int out_size = 1 + kernel_strides_[i] * (cotan.shape(1 + i) - 1);
        padding_hi[i] = in_size - out_size + padding_[i];
      }

      // Check for negative padding
      bool has_neg_padding = false;
      for (auto& pd : padding_lo) {
        has_neg_padding |= (pd < 0);
      }
      for (auto& pd : padding_hi) {
        has_neg_padding |= (pd < 0);
      }

      auto padding_lo_ = std::vector<int>(padding_lo);
      auto padding_hi_ = std::vector<int>(padding_hi);

      // Clamp the padding used for the gradient convolution; negative
      // entries are handled by slicing the result below.
      if (has_neg_padding) {
        for (auto& p : padding_lo_) {
          p = std::max(0, p);
        }
        for (auto& p : padding_hi_) {
          p = std::max(0, p);
        }
      }

      auto wt_trans = group_transpose(wt, 0, 1, -1);
      auto grad = conv_general(
          /* const array& input = */ cotan,
          /* const array& weight = */ wt_trans,
          /* std::vector<int> stride = */ input_dilation_,
          /* std::vector<int> padding_lo = */ padding_lo_,
          /* std::vector<int> padding_hi = */ padding_hi_,
          /* std::vector<int> kernel_dilation = */ kernel_dilation_,
          /* std::vector<int> input_dilation = */ kernel_strides_,
          /* int groups = */ groups_,
          /* bool flip = */ !flip_,
          stream());

      // Handle negative padding
      if (has_neg_padding) {
        Shape starts(grad.ndim(), 0);
        auto stops = grad.shape();
        for (int i = 0; i < grad.ndim() - 2; i++) {
          if (padding_lo[i] < 0) {
            starts[i + 1] -= padding_lo[i];
            padding_lo[i] = 0;
          }
          if (padding_hi[i] < 0) {
            stops[i + 1] += padding_hi[i];
            padding_hi[i] = 0;
          }
        }
        grad = slice(grad, std::move(starts), std::move(stops), stream());
      }

      grads.push_back(grad);
    }
    // Grads for weight
    else if (a == 1) {
      bool no_dilation = true;

      for (int i = 0; i < input_dilation_.size(); i++) {
        no_dilation &= (input_dilation_[i] == 1) && (kernel_dilation_[i] == 1);
      }

      if (no_dilation && !flip_ && groups_ == 1) {
        auto grad = conv_weight_backward_patches(
            in, wt, cotan, kernel_strides_, padding_, stream());
        grads.push_back(grad);
      } else {
        if (flip_) {
          auto padding = padding_;
          for (int i = 0; i < padding.size(); i++) {
            int wt_size = 1 + kernel_dilation_[i] * (wt.shape(1 + i) - 1);
            padding[i] = wt_size - padding_[i] - 1;
          }

          auto cotan_trans = group_transpose(cotan, -1, 0, -1);
          auto in_trans = swapaxes(in, 0, -1, stream());

          auto grad_trans = conv_general(
              /* const array& input = */ cotan_trans,
              /* const array& weight = */ in_trans,
              /* std::vector<int> stride = */ kernel_dilation_,
              /* std::vector<int> padding_lo = */ padding,
              /* std::vector<int> padding_hi = */ padding,
              /* std::vector<int> kernel_dilation = */ input_dilation_,
              /* std::vector<int> input_dilation = */ kernel_strides_,
              /* int groups = */ groups_,
              /* bool flip = */ false,
              stream());
          if (groups_ > 1) {
            grads.push_back(group_transpose(grad_trans, -1, 0, -2));
          } else {
            grads.push_back(grad_trans);
          }
        } else {
          std::vector<int> padding_lo = padding_;
          std::vector<int> padding_hi = padding_;

          for (int i = 0; i < padding_hi.size(); ++i) {
            int in_size = 1 + input_dilation_[i] * (in.shape(1 + i) - 1);
            int out_size = 1 + kernel_strides_[i] * (cotan.shape(1 + i) - 1);
            int wt_size = 1 + kernel_dilation_[i] * (wt.shape(1 + i) - 1);
            padding_hi[i] = out_size - in_size + wt_size - padding_[i] - 1;
          }

          auto cotan_trans = swapaxes(cotan, 0, -1, stream());
          auto in_trans = group_transpose(in, -1, 0, -1);

          auto grad_trans = conv_general(
              /* const array& input = */ in_trans,
              /* const array& weight = */ cotan_trans,
              /* std::vector<int> stride = */ kernel_dilation_,
              /* std::vector<int> padding_lo = */ padding_lo,
              /* std::vector<int> padding_hi = */ padding_hi,
              /* std::vector<int> kernel_dilation = */ kernel_strides_,
              /* std::vector<int> input_dilation = */ input_dilation_,
              /* int groups = */ groups_,
              /* bool flip = */ false,
              stream());
          grads.push_back(swapaxes(grad_trans, 0, -1, stream()));
        }
      }
    }
  }

  return grads;
}

bool Convolution::is_equivalent(const Primitive& other) const {
  const Convolution& c_other = static_cast<const Convolution&>(other);
  return padding_ == c_other.padding_ &&
      kernel_strides_ == c_other.kernel_strides_ &&
      kernel_dilation_ == c_other.kernel_dilation_ &&
      input_dilation_ == c_other.input_dilation_ &&
      groups_ == c_other.groups_ && flip_ == c_other.flip_;
}

std::vector<array> Copy::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return cotangents;
}

std::vector<array> Copy::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return tangents;
}

std::pair<std::vector<array>, std::vector<int>> Copy::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{copy(inputs[0], stream())}, axes};
}

std::vector<array> Cos::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> Cos::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {multiply(
      tangents[0], negative(sin(primals[0], stream()), stream()), stream())};
}

std::pair<std::vector<array>, std::vector<int>> Cos::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{cos(inputs[0], stream())}, axes};
}

std::vector<array> Cosh::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> Cosh::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {multiply(tangents[0], sinh(primals[0], stream()), stream())};
}

std::pair<std::vector<array>, std::vector<int>> Cosh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{cosh(inputs[0], stream())}, axes};
}
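
// Editor's note (this reading is the editor's, not upstream docs): in the
// CustomTransforms convention below, the trailing num_outputs_ primals are
// the function's own outputs threaded back in as inputs, so after asking
// vjp_fun_ for the gradients of the real inputs, the cotangents themselves
// are appended to serve as the vjps of those pass-through entries.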

std::vector<array> CustomTransforms::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  // Extract the inputs to the VJP function
  std::vector<array> inputs(primals.begin(), primals.end() - num_outputs_);

  // Compute all the vjps
  auto all_vjps = vjp_fun_(inputs, cotangents, outputs);
  for (const auto& cot : cotangents) {
    all_vjps.emplace_back(cot);
  }

  // Select the vjps requested
  std::vector<array> vjps;
  vjps.reserve(argnums.size());
  for (auto arg : argnums) {
    vjps.push_back(all_vjps[arg]);
  }

  return vjps;
}

std::vector<array> CustomTransforms::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  // Extract the inputs to the JVP function
  std::vector<array> inputs(primals.begin(), primals.end() - num_outputs_);

  // Compute the jvps
  return jvp_fun_(inputs, tangents, argnums);
}

std::pair<std::vector<array>, std::vector<int>> CustomTransforms::vmap(
    const std::vector<array>& inputs_,
    const std::vector<int>& axes_) {
  // Extract the inputs to the vmap function
  std::vector<array> inputs(inputs_.begin(), inputs_.end() - num_outputs_);
  std::vector<int> axes(axes_.begin(), axes_.end() - num_outputs_);
  return vmap_fun_(inputs, axes);
}

std::vector<array> Depends::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    if (arg < cotangents.size()) {
      vjps.push_back(cotangents[arg]);
    } else {
      vjps.push_back(zeros_like(primals[arg]));
    }
  }
  return vjps;
}

std::vector<array> Divide::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    if (arg == 0) {
      vjps.push_back(divide(cotangents[0], primals[1], stream()));
    } else {
      vjps.push_back(negative(
          divide(
              multiply(cotangents[0], primals[0], stream()),
              square(primals[1], stream()),
              stream()),
          stream()));
    }
  }
  return vjps;
}

std::vector<array> DivMod::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

std::vector<array> DivMod::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  return {zeros_like(primals[0], stream())};
}

std::pair<std::vector<array>, std::vector<int>> DivMod::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  // divmod has two outputs, so report the vmap axis for each of them.
  return {divmod(a, b, stream()), {to_ax, to_ax}};
}

std::vector<array> Divide::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto jvp_fun = [&](int i) {
    int arg = argnums[i];
    if (arg == 0) {
      return divide(tangents[i], primals[1], stream());
    } else {
      return negative(
          divide(
              multiply(tangents[i], primals[0], stream()),
              square(primals[1], stream()),
              stream()),
          stream());
    }
  };
  auto out = jvp_fun(0);
  if (argnums.size() > 1) {
    out = add(out, jvp_fun(1), stream());
  }
  return {out};
}

std::pair<std::vector<array>, std::vector<int>> Divide::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{divide(a, b, stream())}, {to_ax}};
}

std::vector<array> Remainder::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    if (arg == 0) {
      vjps.push_back(cotangents[0]);
    } else {
      auto x_over_y = divide(primals[0], primals[1], stream());
      x_over_y = floor(x_over_y, stream());
      vjps.push_back(
          negative(multiply(x_over_y, cotangents[0], stream()), stream()));
    }
  }
  return vjps;
}
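
// Editor's note: with remainder(x, y) = x - floor(x / y) * y and floor
// treated as locally constant, d/dx = 1 and d/dy = -floor(x / y), which is
// exactly the pair of terms Remainder::vjp above and Remainder::jvp below
// compute.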

std::vector<array> Remainder::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto jvp_fun = [&](int i) {
    int arg = argnums[i];
    if (arg == 0) {
      return tangents[i];
    } else {
      auto x_over_y = divide(primals[0], primals[1], stream());
      x_over_y = floor(x_over_y, stream());
      return negative(multiply(x_over_y, tangents[i], stream()), stream());
    }
  };
  auto out = jvp_fun(0);
  if (argnums.size() > 1) {
    out = add(out, jvp_fun(1), stream());
  }
  return {out};
}

std::pair<std::vector<array>, std::vector<int>> Remainder::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{remainder(a, b, stream())}, {to_ax}};
}

std::pair<std::vector<array>, std::vector<int>> Equal::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{equal(a, b, stream())}, {to_ax}};
}

std::vector<array> Equal::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

std::vector<array> Equal::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto shape = broadcast_shapes(primals[0].shape(), primals[1].shape());
  return {zeros(shape, bool_, stream())};
}

std::vector<array> Erf::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> Erf::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto dtype = primals[0].dtype();
  auto scale = multiply(array(M_2_SQRTPI, dtype), tangents[0], stream());
  return {multiply(
      scale,
      exp(negative(square(primals[0], stream()), stream()), stream()),
      stream())};
}

std::pair<std::vector<array>, std::vector<int>> Erf::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{erf(inputs[0], stream())}, axes};
}

std::vector<array> ErfInv::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  auto dtype = primals[0].dtype();
  auto scale =
      multiply(array(1.0 / M_2_SQRTPI, dtype), cotangents[0], stream());
  return {
      multiply(scale, exp(square(outputs[0], stream()), stream()), stream())};
}

std::vector<array> ErfInv::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto dtype = primals[0].dtype();
  auto scale = multiply(array(1.0 / M_2_SQRTPI, dtype), tangents[0], stream());
  return {multiply(
      scale,
      exp(square(erfinv(primals[0], stream()), stream()), stream()),
      stream())};
}

std::pair<std::vector<array>, std::vector<int>> ErfInv::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{erfinv(inputs[0], stream())}, axes};
}
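
// Editor's note: the Erf/ErfInv rules above use d/dx erf(x) =
// (2/sqrt(pi)) * exp(-x^2), i.e. M_2_SQRTPI * exp(-x^2), and by the inverse
// function rule d/dy erfinv(y) = (sqrt(pi)/2) * exp(erfinv(y)^2), hence the
// 1/M_2_SQRTPI scale.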

std::vector<array> Exp::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  return {multiply(cotangents[0], outputs[0], stream())};
}

std::vector<array> Exp::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {multiply(tangents[0], exp(primals[0], stream()), stream())};
}

std::pair<std::vector<array>, std::vector<int>> Exp::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{exp(inputs[0], stream())}, axes};
}

std::vector<array> Expm1::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  return {multiply(
      cotangents[0],
      add(outputs[0], array(1.0f, outputs[0].dtype()), stream()),
      stream())};
}

std::vector<array> Expm1::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {multiply(tangents[0], exp(primals[0], stream()), stream())};
}

std::pair<std::vector<array>, std::vector<int>> Expm1::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{expm1(inputs[0], stream())}, axes};
}

std::vector<array> ExpandDims::vjp(
    const std::vector<array>&,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  return {squeeze(cotangents[0], axes_, stream())};
}

std::vector<array> ExpandDims::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  return {expand_dims(tangents[0], axes_, stream())};
}

std::pair<std::vector<array>, std::vector<int>> ExpandDims::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto ax = axes[0];
  auto expand_axes = axes_;
  for (auto& s : expand_axes) {
    if (s >= axes[0]) {
      s++;
    } else {
      ax++;
    }
  }
  return {{expand_dims(inputs[0], std::move(expand_axes), stream())}, {ax}};
}

bool ExpandDims::is_equivalent(const Primitive& other) const {
  const ExpandDims& a_other = static_cast<const ExpandDims&>(other);
  return (axes_ == a_other.axes_);
}

Shape ExpandDims::output_shape(
    const array& input,
    const std::vector<int>& axes) {
  auto shape = input.shape();
  for (auto ax : axes) {
    shape.insert(shape.begin() + ax, 1);
  }
  return shape;
}

std::vector<Shape> ExpandDims::output_shapes(const std::vector<array>& inputs) {
  return {ExpandDims::output_shape(inputs[0], axes_)};
}

std::vector<array> Flatten::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  auto& in = primals[0];
  Shape unflatten_shape(
      in.shape().begin() + start_axis_, in.shape().begin() + end_axis_ + 1);
  return {unflatten(
      cotangents[0], start_axis_, std::move(unflatten_shape), stream())};
}

std::vector<array> Flatten::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  return {flatten(tangents[0], start_axis_, end_axis_, stream())};
}

std::pair<std::vector<array>, std::vector<int>> Flatten::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto ax = axes[0];
  auto start_axis = start_axis_;
  auto end_axis = end_axis_;
  if (ax <= start_axis) {
    start_axis++;
    end_axis++;
  } else {
    ax -= (end_axis - start_axis);
  }
  return {{flatten(inputs[0], start_axis, end_axis, stream())}, {ax}};
}

bool Flatten::is_equivalent(const Primitive& other) const {
  const Flatten& a_other = static_cast<const Flatten&>(other);
  return start_axis_ == a_other.start_axis_ && end_axis_ == a_other.end_axis_;
}

Shape Flatten::output_shape(const array& input, int start_axis, int end_axis) {
  Shape shape = input.shape();
  auto flat_size = input.shape(start_axis);
  for (int ax = start_axis + 1; ax <= end_axis; ++ax) {
    flat_size *= input.shape(ax);
  }
  shape.erase(shape.begin() + start_axis + 1, shape.begin() + end_axis + 1);
  shape[start_axis] = flat_size;
  return shape;
}

std::vector<Shape> Flatten::output_shapes(const std::vector<array>& inputs) {
  return {Flatten::output_shape(inputs[0], start_axis_, end_axis_)};
}

bool FFT::is_equivalent(const Primitive& other) const {
  const FFT& r_other = static_cast<const FFT&>(other);
  return axes_ == r_other.axes_ && inverse_ == r_other.inverse_ &&
      real_ == r_other.real_;
}
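
// Editor's note: flatten and unflatten are exact inverses on shapes, so each
// one's vjp is simply the other applied to the cotangent; flatten collapses
// [start_axis, end_axis] into one dimension and unflatten splits one
// dimension back into shape_.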

std::vector<array> Unflatten::vjp(
    const std::vector<array>&,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  return {flatten(cotangents[0], axis_, axis_ + shape_.size(), stream())};
}

std::vector<array> Unflatten::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  return {unflatten(tangents[0], axis_, shape_, stream())};
}

std::pair<std::vector<array>, std::vector<int>> Unflatten::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto ax = axes[0];
  auto axis = axis_;
  if (ax <= axis_) {
    axis++;
  } else {
    ax += (shape_.size() - 1);
  }
  return {{unflatten(inputs[0], axis, shape_, stream())}, {ax}};
}

bool Unflatten::is_equivalent(const Primitive& other) const {
  const auto& a_other = static_cast<const Unflatten&>(other);
  return axis_ == a_other.axis_ && shape_ == a_other.shape_;
}

Shape Unflatten::output_shape(
    const array& input,
    int axis,
    const Shape& shape) {
  Shape out_shape = input.shape();
  out_shape[axis] = shape[0];
  out_shape.insert(
      out_shape.begin() + axis + 1, shape.begin() + 1, shape.end());
  return out_shape;
}

std::vector<Shape> Unflatten::output_shapes(const std::vector<array>& inputs) {
  return {Unflatten::output_shape(inputs[0], axis_, shape_)};
}

std::pair<std::vector<array>, std::vector<int>> FFT::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto& in = inputs[0];
  int ax = axes[0];
  auto fft_axes = axes_;
  auto out_shape = in.shape();
  if (ax >= 0) {
    for (auto& fft_ax : fft_axes) {
      if (fft_ax >= ax) {
        fft_ax++;
      }
      if (real_) {
        auto n = out_shape[fft_ax];
        out_shape[fft_ax] = inverse_ ? 2 * (n - 1) : n / 2 + 1;
      }
    }
  }
  return {
      {array(
          out_shape,
          real_ && inverse_ ? float32 : complex64,
          std::make_shared<FFT>(stream(), fft_axes, inverse_, real_),
          {in})},
      {ax}};
}

std::vector<array> FFT::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto& in = primals[0];
  std::vector<int> axes(axes_.begin(), axes_.end());
  if (real_ && inverse_) {
    auto out = fft::fftn(cotangents[0], axes, stream());
    auto start = Shape(out.ndim(), 0);
    auto stop = in.shape();
    out = slice(out, start, stop, stream());
    auto mask_shape = out.shape();
    mask_shape[axes_.back()] -= 2;
    auto mask = full(mask_shape, 2.0f, stream());
    auto pad_shape = out.shape();
    pad_shape[axes_.back()] = 1;
    auto pad = full(pad_shape, 1.0f, stream());
    mask = concatenate({pad, mask, pad}, axes_.back(), stream());
    return {multiply(mask, out, stream())};
  } else if (real_) {
    std::vector<int> n;
    for (auto ax : axes_) {
      n.push_back(in.shape()[ax]);
    }
    return {astype(
        fft::fftn(cotangents[0], n, axes, stream()), in.dtype(), stream())};
  } else if (inverse_) {
    return {fft::ifftn(cotangents[0], axes, stream())};
  } else {
    return {fft::fftn(cotangents[0], axes, stream())};
  }
}

std::vector<array> FFT::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto& tan = tangents[0];
  if (real_ && inverse_) {
    return {fft::irfftn(tan, stream())};
  } else if (real_) {
    return {fft::rfftn(tan, stream())};
  } else if (inverse_) {
    return {fft::ifftn(tan, stream())};
  } else {
    return {fft::fftn(tan, stream())};
  }
}
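
// Editor's note on the real_ && inverse_ branch of FFT::vjp above: an irfft
// output uses every interior positive-frequency bin twice (once conjugated),
// so the cotangent pulled back through fftn is doubled everywhere except the
// first and last bins along the transformed axis, which is what the
// concatenated {1, 2, ..., 2, 1} mask implements.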

std::vector<array> Floor::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> Floor::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {zeros_like(primals[0], stream())};
}

std::pair<std::vector<array>, std::vector<int>> Floor::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{floor(inputs[0], stream())}, axes};
}

std::vector<array> Full::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {multiply(cotangents[0], primals[0], stream())};
}

std::vector<array> Full::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return tangents;
}

std::pair<std::vector<array>, std::vector<int>> Full::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto& in = inputs[0];
  auto out =
      array(in.shape(), in.dtype(), std::make_shared<Full>(stream()), {in});
  return {{out}, axes};
}

std::pair<std::vector<array>, std::vector<int>> Gather::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto& src = inputs[0];
  std::vector<array> indices(inputs.begin() + 1, inputs.end());
  auto gather_axes = axes_;
  auto slice_sizes = slice_sizes_;
  auto src_vmapped = axes[0] >= 0;
  auto ind_vmap_ax_ptr = std::find_if(
      axes.begin() + 1, axes.end(), [](int a) { return a >= 0; });
  int out_ax = -1;
  bool indices_vmapped = (ind_vmap_ax_ptr != axes.end());
  if (indices_vmapped) {
    out_ax = *ind_vmap_ax_ptr;
  } else if (src_vmapped) {
    out_ax = axes[0];
  }

  // Reorder all the index arrays so the vmap axis is in the same spot.
  if (indices_vmapped) {
    for (int i = 1; i < axes.size(); ++i) {
      if (out_ax != axes[i] && axes[i] >= 0) {
        indices[i - 1] = moveaxis(indices[i - 1], axes[i], out_ax, stream());
      } else if (axes[i] < 0) {
        indices[i - 1] = expand_dims(indices[i - 1], out_ax, stream());
      }
    }
  }

  int idx_dims = indices.empty() ? 0 : indices[0].ndim();

  if (src_vmapped) {
    for (auto& ax : gather_axes) {
      if (ax >= axes[0]) {
        ax++;
      }
    }
    if (indices_vmapped) {
      // Make a new index array for the vmapped dimension
      auto vmap_inds = arange(0, src.shape(axes[0]), stream());
      // Reshape it so it broadcasts with other index arrays
      {
        auto shape = Shape(idx_dims, 1);
        shape[out_ax] = vmap_inds.size();
        vmap_inds = reshape(vmap_inds, std::move(shape), stream());
      }
      // Update gather axes and slice sizes accordingly
      slice_sizes.insert(slice_sizes.begin() + axes[0], 1);
      gather_axes.push_back(axes[0]);
      indices.push_back(vmap_inds);
    } else {
      slice_sizes.insert(slice_sizes.begin() + out_ax, src.shape(out_ax));
      out_ax += idx_dims;
    }
  }
  auto out = gather(src, indices, gather_axes, slice_sizes, stream());
  if (src_vmapped && indices_vmapped) {
    out = squeeze(out, idx_dims + axes[0], stream());
  }
  return {{out}, {out_ax}};
}
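
// Editor's note: gather and scatter_add are adjoint to each other, so
// Gather::vjp below scatters the cotangent back into a zero array at the
// gathered locations, while the index arrays themselves get zero gradients.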

std::vector<array> Gather::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (int argnum : argnums) {
    if (argnum > 0) {
      // Grads w.r.t. indices are zero
      vjps.push_back(
          zeros(primals[argnum].shape(), primals[argnum].dtype(), stream()));
    } else {
      auto src = zeros_like(primals[0], stream());
      std::vector<array> inds(primals.begin() + 1, primals.end());
      vjps.push_back(scatter_add(src, inds, cotangents[0], axes_, stream()));
    }
  }
  return vjps;
}

std::vector<array> Gather::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  if (argnums.size() > 1 || argnums[0] != 0) {
    throw std::invalid_argument(
        "[gather] Cannot calculate JVP with respect to indices.");
  }
  std::vector<array> inds(primals.begin() + 1, primals.end());
  return {gather(tangents[0], inds, axes_, slice_sizes_, stream())};
}

bool Gather::is_equivalent(const Primitive& other) const {
  const Gather& g_other = static_cast<const Gather&>(other);
  return axes_ == g_other.axes_ && slice_sizes_ == g_other.slice_sizes_;
}

std::vector<Shape> Gather::output_shapes(const std::vector<array>& inputs) {
  Shape out_shape;
  if (inputs.size() > 1) {
    out_shape = inputs[1].shape();
  }
  out_shape.insert(out_shape.end(), slice_sizes_.begin(), slice_sizes_.end());
  return {std::move(out_shape)};
}

std::pair<std::vector<array>, std::vector<int>> Greater::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{greater(a, b, stream())}, {to_ax}};
}

std::vector<array> Greater::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

std::vector<array> Greater::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto shape = broadcast_shapes(primals[0].shape(), primals[1].shape());
  return {zeros(shape, bool_, stream())};
}

std::pair<std::vector<array>, std::vector<int>> GreaterEqual::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{greater_equal(a, b, stream())}, {to_ax}};
}

std::vector<array> GreaterEqual::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

std::vector<array> GreaterEqual::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto shape = broadcast_shapes(primals[0].shape(), primals[1].shape());
  return {zeros(shape, bool_, stream())};
}

std::vector<array> Imag::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {multiply(
      array(complex64_t{0.0f, -1.0f}, primals[0].dtype()),
      cotangents[0],
      stream())};
}

std::vector<array> Imag::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {imag(tangents[0], stream())};
}

std::pair<std::vector<array>, std::vector<int>> Imag::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{imag(inputs[0], stream())}, axes};
}

std::pair<std::vector<array>, std::vector<int>> Less::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{less(a, b, stream())}, {to_ax}};
}

std::vector<array> Less::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

std::vector<array> Less::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto shape = broadcast_shapes(primals[0].shape(), primals[1].shape());
  return {zeros(shape, bool_, stream())};
}

std::pair<std::vector<array>, std::vector<int>> LessEqual::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{less_equal(a, b, stream())}, {to_ax}};
}

std::vector<array> LessEqual::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

std::vector<array> LessEqual::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto shape = broadcast_shapes(primals[0].shape(), primals[1].shape());
  return {zeros(shape, bool_, stream())};
}

std::vector<array> Log::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> Log::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto out = divide(tangents[0], primals[0], stream());
  if (base_ != Base::e) {
    auto scale = 1 / std::log(base_ == Base::ten ? 10.0f : 2.0f);
    out = multiply(array(scale, out.dtype()), out, stream());
  }
  return {out};
}

std::pair<std::vector<array>, std::vector<int>> Log::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto& in = inputs[0];
  return {
      {array(
          in.shape(),
          in.dtype(),
          std::make_shared<Log>(stream(), base_),
          {in})},
      axes};
}
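
// Editor's note: for bases other than e, Log::jvp above uses
// log_b(x) = ln(x) / ln(b), so the tangent t / x is scaled by the constant
// 1 / ln(10) or 1 / ln(2).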
{zeros_like(primals[0], stream())}; } std::pair, std::vector> LogicalAnd::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 2); assert(axes.size() == 2); auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream()); return {{logical_and(a, b, stream())}, {to_ax}}; } std::vector LogicalOr::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { assert(primals.size() == 2); std::vector vjps = {zeros_like(cotangents[0], stream())}; if (argnums.size() > 1) { vjps.push_back(vjps.back()); } return vjps; } std::vector LogicalOr::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 2); assert(argnums.size() <= 2); return {zeros_like(primals[0], stream())}; } std::pair, std::vector> LogicalOr::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 2); assert(axes.size() == 2); auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream()); return {{logical_or(a, b, stream())}, {to_ax}}; } std::vector LogAddExp::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { auto a = primals[0]; auto b = primals[1]; auto s = sigmoid(subtract(a, b, stream()), stream()); std::vector vjps; for (auto arg : argnums) { vjps.push_back(multiply( cotangents[0], arg == 0 ? s : subtract(array(1.0f, s.dtype()), s, stream()), stream())); } return vjps; } std::vector LogAddExp::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { auto a = primals[0]; auto b = primals[1]; auto s = sigmoid(subtract(a, b, stream()), stream()); auto jvp_fun = [&](int i) { int arg = argnums[i]; return multiply( tangents[i], arg == 0 ? s : subtract(array(1.0f, s.dtype()), s, stream()), stream()); }; auto out = jvp_fun(0); if (argnums.size() > 1) { out = add(out, jvp_fun(1), stream()); } return {out}; } std::pair, std::vector> LogAddExp::vmap( const std::vector& inputs, const std::vector& axes) { auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream()); return {{logaddexp(a, b, stream())}, {to_ax}}; } std::vector Matmul::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { std::vector vjps; auto& cotan = cotangents[0]; std::vector reorder(cotan.ndim()); std::iota(reorder.begin(), reorder.end(), 0); std::iter_swap(reorder.end() - 1, reorder.end() - 2); for (auto arg : argnums) { if (arg == 0) { // M X N * (K X N).T -> M X K vjps.push_back( matmul(cotan, transpose(primals[1], reorder, stream()), stream())); } else { // (M X K).T * M X N -> K X N vjps.push_back( matmul(transpose(primals[0], reorder, stream()), cotan, stream())); } } return vjps; } std::pair, std::vector> Matmul::vmap( const std::vector& inputs, const std::vector& axes) { auto maybe_move_ax = [this](auto& arr, auto ax) { return ax > 0 ? moveaxis(arr, ax, 0, stream()) : arr; }; auto a = maybe_move_ax(inputs[0], axes[0]); auto b = maybe_move_ax(inputs[1], axes[1]); return {{matmul(a, b, stream())}, {0}}; } std::vector Matmul::output_shapes(const std::vector& inputs) { auto out_shape = inputs[0].shape(); out_shape.back() = inputs[1].shape(-1); return {std::move(out_shape)}; } std::vector Maximum::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { auto& a = primals[0]; auto& b = primals[1]; std::vector vjps; for (auto arg : argnums) { auto mask = (arg == 0) ? 
greater(a, b, stream()) : less_equal(a, b, stream());
    vjps.push_back(multiply(cotangents[0], mask, stream()));
  }
  return vjps;
}

std::vector<array> Maximum::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto& a = primals[0];
  auto& b = primals[1];
  auto jvp_fun = [&](int i) {
    int arg = argnums[i];
    auto mask =
        (arg == 0) ? greater(a, b, stream()) : less_equal(a, b, stream());
    return multiply(tangents[i], mask, stream());
  };
  auto out = jvp_fun(0);
  if (argnums.size() > 1) {
    out = add(out, jvp_fun(1), stream());
  }
  return {out};
}

std::pair<std::vector<array>, std::vector<int>> Maximum::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{maximum(a, b, stream())}, {to_ax}};
}

std::vector<array> Minimum::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto& a = primals[0];
  auto& b = primals[1];
  std::vector<array> vjps;
  for (auto arg : argnums) {
    auto mask =
        (arg == 0) ? less(a, b, stream()) : greater_equal(a, b, stream());
    vjps.push_back(multiply(cotangents[0], mask, stream()));
  }
  return vjps;
}

std::vector<array> Minimum::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto& a = primals[0];
  auto& b = primals[1];
  auto jvp_fun = [&](int i) {
    int arg = argnums[i];
    auto mask =
        (arg == 0) ? less(a, b, stream()) : greater_equal(a, b, stream());
    return multiply(tangents[i], mask, stream());
  };
  auto out = jvp_fun(0);
  if (argnums.size() > 1) {
    out = add(out, jvp_fun(1), stream());
  }
  return {out};
}

std::pair<std::vector<array>, std::vector<int>> Minimum::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{minimum(a, b, stream())}, {to_ax}};
}

std::vector<array> Multiply::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto arg = argnums[0];
  auto jvp = multiply(tangents[0], primals[1 - arg], stream());
  if (argnums.size() > 1) {
    arg = argnums[1];
    jvp = add(jvp, multiply(tangents[1], primals[1 - arg], stream()), stream());
  }
  return {jvp};
}

std::vector<array> Multiply::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(multiply(primals[1 - arg], cotangents[0], stream()));
  }
  return vjps;
}

std::pair<std::vector<array>, std::vector<int>> Multiply::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{multiply(a, b, stream())}, {to_ax}};
}

std::vector<array> Select::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 3);
  assert(tangents.size() == 3);

  auto jvp_fun = [&](int i) {
    int arg = argnums[i];
    if (arg == 0) {
      return zeros_like(primals[0], stream());
    } else if (arg == 1) {
      return multiply(
          astype(primals[0], tangents[1].dtype(), stream()),
          tangents[1],
          stream());
    } else {
      return multiply(
          astype(
              logical_not(primals[0], stream()),
              tangents[2].dtype(),
              stream()),
          tangents[2],
          stream());
    }
  };

  // Index jvp_fun by position in argnums, not by the argument id itself,
  // otherwise a subset like argnums == {1, 2} reads out of bounds.
  array jvp = jvp_fun(0);
  for (int i = 1; i < argnums.size(); i++) {
    jvp = add(jvp, jvp_fun(i), stream());
  }
  return {jvp};
}

std::vector<array> Select::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 3);
  assert(cotangents.size() == 1);

  std::vector<array> vjps;
  for (auto arg : argnums) {
    if (arg == 0) {
      vjps.push_back(zeros_like(primals[0],
stream())); } else if (arg == 1) { vjps.push_back(multiply( astype(primals[0], cotangents[0].dtype(), stream()), cotangents[0], stream())); } else if (arg == 2) { vjps.push_back(multiply( astype( logical_not(primals[0], stream()), cotangents[0].dtype(), stream()), cotangents[0], stream())); } } return vjps; } std::pair, std::vector> Select::vmap( const std::vector& inputs, const std::vector& axes) { auto [a, b, c, to_ax] = vmap_ternary_op(inputs, axes, stream()); return {{where(a, b, c, stream())}, {to_ax}}; } std::vector Negative::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { return jvp(primals, cotangents, argnums); } std::vector Negative::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 1); assert(argnums.size() == 1); return {negative(tangents[0], stream())}; } std::pair, std::vector> Negative::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); return {{negative(inputs[0], stream())}, axes}; } std::pair, std::vector> NotEqual::vmap( const std::vector& inputs, const std::vector& axes) { auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream()); return {{not_equal(a, b, stream())}, axes}; } std::vector NotEqual::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { std::vector vjps; for (auto arg : argnums) { vjps.push_back(zeros_like(primals[arg], stream())); } return vjps; } std::vector NotEqual::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { auto shape = broadcast_shapes(primals[0].shape(), primals[1].shape()); return {zeros(shape, bool_, stream())}; } std::vector Pad::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { assert(argnums.size() == 1 && argnums[0] == 0); auto& cotan = cotangents[0]; std::vector start(cotan.ndim(), 0); std::vector stop = cotan.shape(); for (auto i : axes_) { start[i] = low_pad_size_[i]; stop[i] -= high_pad_size_[i]; } auto out = slice(cotan, start, stop, stream()); return {out}; } std::vector Pad::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(argnums.size() == 1 && argnums[0] == 0); return { pad(tangents[0], axes_, low_pad_size_, high_pad_size_, array(0, tangents[0].dtype()), "constant", stream())}; } std::pair, std::vector> Pad::vmap( const std::vector& inputs, const std::vector& axes) { throw std::runtime_error("Pad vmap is NYI."); } bool Pad::is_equivalent(const Primitive& other) const { const Pad& p_other = static_cast(other); return ( p_other.axes_ == axes_ && p_other.low_pad_size_ == low_pad_size_ && p_other.high_pad_size_ == high_pad_size_); } std::vector Partition::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { auto sort_idx = argpartition(primals[0], kth_, axis_, stream()); return {put_along_axis( zeros_like(primals[0], stream()), sort_idx, cotangents[0], axis_, stream())}; } std::vector Partition::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 1); assert(tangents.size() == 1); auto sort_idx = argpartition(primals[0], kth_, axis_, stream()); auto out = take_along_axis(tangents[0], sort_idx, axis_, stream()); return {out}; } std::pair, std::vector> Partition::vmap( const std::vector& inputs, const 
std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); int axis_left = axes[0] >= 0 && axes[0] <= axis_; return {{partition(inputs[0], axis_ + axis_left, stream())}, axes}; } bool Partition::is_equivalent(const Primitive& other) const { const Partition& r_other = static_cast(other); return axis_ == r_other.axis_ && kth_ == r_other.kth_; } std::vector Power::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector& outputs) { std::vector vjps; for (auto arg : argnums) { if (arg == 0) { vjps.push_back(multiply( power( primals[0], subtract(primals[1], array(1, primals[0].dtype()), stream()), stream()), primals[1], stream())); } else { auto& exp = outputs[0]; auto exp_vjp = multiply(log(primals[0], stream()), outputs[0], stream()); // 0 * log 0 -> 0 vjps.push_back(where(exp, exp_vjp, array(0.0f, exp.dtype()), stream())); } vjps.back() = multiply(cotangents[0], vjps.back(), stream()); } return vjps; } std::vector Power::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { auto output = power(primals[0], primals[1], stream()); auto grads = vjp(primals, tangents, argnums, {output}); if (argnums.size() > 1) { return {add(grads[0], grads[1], stream())}; } else { return grads; } } std::pair, std::vector> Power::vmap( const std::vector& inputs, const std::vector& axes) { auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream()); return {{power(a, b, stream())}, {to_ax}}; } std::pair, std::vector> QuantizedMatmul::vmap( const std::vector& inputs, const std::vector& axes) { throw std::runtime_error("QuantizedMatmul::vmap NYI"); } std::vector QuantizedMatmul::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { std::vector vjps; // We rely on the fact that w is always 2D so transpose is simple for (auto arg : argnums) { // gradient wrt to x if (arg == 0) { vjps.push_back(quantized_matmul( cotangents[0], primals[1], primals[2], primals[3], !transpose_, group_size_, bits_, stream())); } // gradient wrt to w_q, scales or biases else { throw std::runtime_error( "QuantizedMatmul::vjp no gradient wrt the quantized matrix yet."); } } return vjps; } std::vector QuantizedMatmul::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { throw std::runtime_error("QuantizedMatmul::jvp NYI"); } bool QuantizedMatmul::is_equivalent(const Primitive& other) const { const QuantizedMatmul& qm_other = static_cast(other); return group_size_ == qm_other.group_size_ && bits_ == qm_other.bits_ && transpose_ == qm_other.transpose_; } std::vector QuantizedMatmul::output_shapes( const std::vector& inputs) { auto& w = inputs[1]; int w_outer_dims = (transpose_) ? 
w.shape(-2) : w.shape(-1) * 32 / bits_; auto out_shape = inputs[0].shape(); out_shape.back() = w_outer_dims; return {std::move(out_shape)}; } std::pair, std::vector> GatherQMM::vmap( const std::vector& inputs, const std::vector& axes) { throw std::runtime_error("GatherQMM::vmap NYI"); } std::vector GatherQMM::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { std::vector vjps; auto& cotan = cotangents[0]; auto& x = primals[0]; auto& w = primals[1]; auto& scales = primals[2]; auto& biases = primals[3]; auto& lhs_indices = primals[4]; auto& rhs_indices = primals[5]; for (auto arg : argnums) { // gradient wrt to x if (arg == 0) { vjps.push_back(reshape( scatter_add( flatten(zeros_like(x, stream()), 0, -3, stream()), lhs_indices, expand_dims( gather_qmm( cotan, w, scales, biases, std::nullopt, rhs_indices, !transpose_, group_size_, bits_, stream()), -3, stream()), 0, stream()), x.shape(), stream())); } // gradient wrt to the indices is undefined else if (arg > 3) { throw std::runtime_error( "GatherQMM::vjp cannot compute the gradient wrt the indices."); } // gradient wrt to w_q, scales or biases else { throw std::runtime_error( "GatherQMM::vjp no gradient wrt the quantized matrix yet."); } } return vjps; } std::vector GatherQMM::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { throw std::runtime_error("GatherQMM::jvp NYI"); } bool GatherQMM::is_equivalent(const Primitive& other) const { const GatherQMM& qm_other = static_cast(other); return group_size_ == qm_other.group_size_ && bits_ == qm_other.bits_ && transpose_ == qm_other.transpose_; } std::pair, std::vector> RandomBits::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); // The last dimension of the key is always a key pair auto key = inputs[0]; auto kax = axes[0]; if (kax == key.ndim() - 1) { std::vector reorder(key.ndim()); std::iota(reorder.begin(), reorder.end(), 0); std::swap(reorder[kax], reorder[kax - 1]); key = transpose(key, reorder, stream()); kax--; } auto shape = shape_; if (kax >= 0) { shape.insert(shape.begin() + kax, key.shape()[kax]); } auto get_dtype = [width = width_]() { switch (width) { case 1: return uint8; case 2: return uint16; default: return uint32; } }; auto out = array( shape, get_dtype(), std::make_shared(stream(), shape, width_), {key}); return {{out}, {kax}}; } bool RandomBits::is_equivalent(const Primitive& other) const { const RandomBits& r_other = static_cast(other); return shape_ == r_other.shape_; } std::vector Real::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { assert(primals.size() == 1); assert(argnums.size() == 1); return {astype(cotangents[0], primals[0].dtype(), stream())}; } std::vector Real::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 1); assert(argnums.size() == 1); return {real(tangents[0], stream())}; } std::pair, std::vector> Real::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); return {{real(inputs[0], stream())}, axes}; } std::pair, std::vector> Reshape::vmap( const std::vector& inputs, const std::vector& axes) { // Transpose the input so that the vmap dim is first. 
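// As a concrete illustration (example values, not from the original source):
// vmapping a reshape to shape_ == {8} over axis 1 of a {2, 3, 4} input first
// transposes the input to {3, 2, 4}, then prepends the vmapped extent so
// shape_ becomes {3, 8} and each of the 3 batch slices is reshaped
// independently.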
auto& in = inputs[0]; auto ax = axes[0]; if (ax >= 0) { std::vector reorder(in.ndim()); std::iota(reorder.begin(), reorder.end(), 0); reorder.erase(reorder.begin() + ax); reorder.insert(reorder.begin(), ax); // Insert the vmap dim into the shape at the beginning. auto out = transpose(in, reorder, stream()); shape_.insert(shape_.begin(), in.shape()[ax]); // Reshape the transposed input to the new shape. return {{reshape(out, shape_, stream())}, {0}}; } else { return {{reshape(in, shape_, stream())}, {ax}}; } } std::vector Reshape::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { assert(primals.size() == 1); assert(argnums.size() == 1); assert(argnums[0] == 0); return {reshape(cotangents[0], primals[0].shape(), stream())}; } std::vector Reshape::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 1); assert(argnums.size() == 1); assert(argnums[0] == 0); return {reshape(tangents[0], shape_, stream())}; } bool Reshape::is_equivalent(const Primitive& other) const { const Reshape& r_other = static_cast(other); return shape_ == r_other.shape_; } std::vector Reduce::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector& outputs) { auto in = primals[0]; std::vector shape = in.shape(); for (auto ax : axes_) { shape[ax] = 1; } auto& cotan = cotangents[0]; if (reduce_type_ == Reduce::Sum) { return { broadcast_to(reshape(cotan, shape, stream()), in.shape(), stream())}; } else if (reduce_type_ == Reduce::Prod) { auto s = stream(); auto prod_grad_single_axis = [&s](const array& x, const array& cotan, int axis) { auto p1 = cumprod(x, axis, /*reverse=*/false, /*inclusive=*/false, s); auto p2 = cumprod(x, axis, /*reverse=*/true, /*inclusive=*/false, s); auto exclusive_prod = multiply(p1, p2, s); return multiply(exclusive_prod, cotan, s); }; // To compute a numerically stable gradient for prod we need an exclusive // product of all elements in axes_ . To achieve that we move axes_ to the // last dim and perform two exclusive cumprods. Afterwards we move // everything back to the original axes. if (axes_.size() > 1) { std::vector transpose_to; std::vector transpose_back; std::vector shape_flat; { // Find the transpose needed to move axes_ to the back and the shape // except the reduced over axes. 
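// As a worked illustration (example values, not from the original source):
// with in.shape() == {2, 3, 4, 5} and axes_ == {1, 3}, this yields
// transpose_to == {0, 2, 1, 3}, shape_flat == {2, 4, -1}, and
// transpose_back == {0, 2, 1, 3}, so both reduced axes end up flattened
// into a single trailing axis for the two exclusive cumprods.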
int j = 0; for (int i = 0; i < in.ndim(); i++) { if (j < axes_.size() && axes_[j] == i) { j++; } else { transpose_to.push_back(i); shape_flat.push_back(in.shape(i)); } } for (auto ax : axes_) { transpose_to.push_back(ax); } shape_flat.push_back(-1); transpose_back.resize(transpose_to.size()); for (int i = 0; i < transpose_to.size(); i++) { transpose_back[transpose_to[i]] = i; } } // Move axes to the back auto x = transpose(in, transpose_to, s); // Keep the shape in order to reshape back to the original auto shape_to = x.shape(); // Flatten and compute the gradient x = reshape(x, shape_flat, stream()); auto grad = prod_grad_single_axis(x, reshape(cotan, shape_flat, s), -1); // Reshape and transpose to the original shape grad = reshape(grad, shape_to, s); grad = transpose(grad, transpose_back, s); return {grad}; } else { return {prod_grad_single_axis(in, reshape(cotan, shape, s), axes_[0])}; } } else if (reduce_type_ == Reduce::Min || reduce_type_ == Reduce::Max) { auto out = outputs[0]; if (out.ndim() != in.ndim()) { out = expand_dims(out, axes_, stream()); } auto mask = equal(in, out, stream()); auto normalizer = sum(mask, axes_, true, stream()); auto cotan_reshape = reshape(cotan, shape, stream()); cotan_reshape = divide(cotan_reshape, normalizer, stream()); return {multiply(cotan_reshape, mask, stream())}; } else { throw std::runtime_error("Reduce type VJP not yet implemented."); } } std::pair, std::vector> Reduce::vmap( const std::vector& inputs, const std::vector& axes) { auto ax = axes[0]; auto reduce_axes = axes_; if (ax >= 0) { for (auto& rax : reduce_axes) { if (rax >= ax) { rax++; } } } auto& in = inputs[0]; std::vector out; switch (reduce_type_) { case Reduce::And: out.push_back(all(in, reduce_axes, true, stream())); break; case Reduce::Or: out.push_back(any(in, reduce_axes, true, stream())); break; case Reduce::Sum: out.push_back(sum(in, reduce_axes, true, stream())); break; case Reduce::Prod: out.push_back(prod(in, reduce_axes, true, stream())); break; case Reduce::Min: out.push_back(min(in, reduce_axes, true, stream())); break; case Reduce::Max: out.push_back(max(in, reduce_axes, true, stream())); break; } return {out, axes}; } bool Reduce::is_equivalent(const Primitive& other) const { const Reduce& r_other = static_cast(other); return reduce_type_ == r_other.reduce_type_ && axes_ == r_other.axes_; } std::vector Reduce::output_shapes(const std::vector& inputs) { auto out_shape = inputs[0].shape(); for (auto i : axes_) { out_shape[i] = 1; } return {std::move(out_shape)}; } std::vector Round::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { return jvp(primals, cotangents, argnums); } std::vector Round::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 1); assert(argnums.size() == 1); return {zeros_like(primals[0], stream())}; } std::pair, std::vector> Round::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); return {{round(inputs[0], stream())}, axes}; } std::pair, std::vector> Scan::vmap( const std::vector& inputs, const std::vector& axes) { auto& in = inputs[0]; auto out_dtype = (in.dtype() == bool_ && reduce_type_ == Scan::Sum) ? 
int32 : in.dtype();
  int axis_left = axes[0] >= 0 && axes[0] <= axis_;
  return {
      {array(
          in.shape(),
          out_dtype,
          std::make_shared<Scan>(
              stream(), reduce_type_, axis_ + axis_left, reverse_, inclusive_),
          {in})},
      axes};
}

std::vector<array> Scan::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  assert(primals.size() == 1);
  assert(argnums[0] == 0);

  if (reduce_type_ == Scan::Sum) {
    return {cumsum(cotangents[0], axis_, !reverse_, inclusive_, stream())};
  } else if (reduce_type_ == Scan::Prod) {
    auto in = primals[0];
    // Find the location of the first 0 and set it to 1:
    // - A: Exclusive cumprod
    // - B: Inclusive cumprod
    // - Find the location that is 0 in B and not zero in A
    // Compute the gradient by:
    // - Compute the regular gradient for everything before the first zero
    // - Set the first zero to 1 and redo the computation, use this for the
    //   gradient of the first zero
    // - Everything after the first zero has a gradient of 0

    // Get inclusive and exclusive cum prods
    auto cprod_exclusive = cumprod(in, axis_, reverse_, !inclusive_, stream());
    auto cprod_inclusive = outputs[0];
    if (!inclusive_) {
      std::swap(cprod_exclusive, cprod_inclusive);
    }

    // Make the mask for the first zero
    auto z = array(0, in.dtype());
    auto eq_zero = equal(cprod_inclusive, z, stream());
    auto first_zero =
        logical_and(eq_zero, not_equal(cprod_exclusive, z, stream()), stream());

    auto to_partial_grad = [this, &cotangents](const array& arr) {
      return cumsum(
          multiply(arr, cotangents[0], stream()),
          axis_,
          !reverse_,
          inclusive_,
          stream());
    };

    auto cprod_with_one = cumprod(
        where(first_zero, array(1, in.dtype()), in, stream()),
        axis_,
        reverse_,
        inclusive_,
        stream());
    auto grad_with_one = to_partial_grad(cprod_with_one);
    auto grad = divide(to_partial_grad(outputs[0]), in, stream());
    return {where(
        first_zero,
        grad_with_one,
        where(eq_zero, z, grad, stream()),
        stream())};
  } else {
    // Can probably be implemented by equals and then cummax to make the mask
    throw std::runtime_error("VJP is not implemented for cumulative min/max");
  }
}

std::vector<array> Scan::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(tangents.size() == 1);
  assert(argnums[0] == 0);

  if (reduce_type_ == Scan::Sum) {
    return {cumsum(tangents[0], axis_, reverse_, inclusive_, stream())};
  } else {
    throw std::runtime_error(
        "JVP is not implemented for cumulative prod/min/max");
  }
}

bool Scan::is_equivalent(const Primitive& other) const {
  const Scan& s_other = static_cast<const Scan&>(other);
  return (
      reduce_type_ == s_other.reduce_type_ && axis_ == s_other.axis_ &&
      reverse_ == s_other.reverse_ && inclusive_ == s_other.inclusive_);
}

bool Scatter::is_equivalent(const Primitive& other) const {
  const Scatter& s_other = static_cast<const Scatter&>(other);
  return reduce_type_ == s_other.reduce_type_ && axes_ == s_other.axes_;
}

std::vector<array> Scatter::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  switch (reduce_type_) {
    case Scatter::None:
    case Scatter::Sum:
    case Scatter::Max:
    case Scatter::Min:
      break;
    default:
      throw std::runtime_error(
          "[scatter] VJP not implemented for scatter_prod");
  }

  const array& result = outputs[0];
  const array& values = primals[0];
  const array& updates = primals.back();
  const std::vector<array> indices(primals.begin() + 1, primals.end() - 1);

  std::vector<array> vjps;
  for (auto num : argnums) {
    // Gradient wrt to the input array
    if (num == 0) {
      switch (reduce_type_) {
        case Scatter::None:
          // Scatter 0s to the locations that were
updated with the updates vjps.push_back(scatter( cotangents[0], indices, zeros_like(updates, stream()), axes_, stream())); break; case Scatter::Sum: // The input array values are kept so they all get gradients vjps.push_back(cotangents[0]); break; case Scatter::Max: case Scatter::Min: { vjps.push_back(where( equal(result, values, stream()), cotangents[0], array(0, cotangents[0].dtype()), stream())); break; } default: // Should never reach here throw std::invalid_argument(""); } } else if (num == primals.size() - 1) { switch (reduce_type_) { case Scatter::None: case Scatter::Sum: { // Gather the values from the cotangent auto slice_sizes = cotangents[0].shape(); for (auto ax : axes_) { slice_sizes[ax] = 1; } vjps.push_back( gather(cotangents[0], indices, axes_, slice_sizes, stream())); break; } case Scatter::Max: case Scatter::Min: { auto slice_sizes = cotangents[0].shape(); for (auto ax : axes_) { slice_sizes[ax] = 1; } auto gathered_cotan = gather(cotangents[0], indices, axes_, slice_sizes, stream()); auto gathered_result = gather(result, indices, axes_, slice_sizes, stream()); vjps.push_back( multiply(gathered_cotan, gathered_result == updates, stream())); break; } default: { // Should never reach here throw std::invalid_argument(""); } } } else { throw std::invalid_argument( "[scatter] Cannot calculate VJP with respect to indices."); } } return vjps; } std::vector Scatter::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { throw std::runtime_error("[scatter] JVP not yet implemented"); } std::pair, std::vector> Scatter::vmap( const std::vector& inputs_, const std::vector& vmap_axes) { assert(inputs_.size() >= 2); assert(inputs_.size() == vmap_axes.size()); auto inputs = inputs_; auto scatter_axes = axes_; int src_ax = vmap_axes[0]; auto vmap_ax_it = std::find_if( vmap_axes.begin(), vmap_axes.end(), [](int a) { return a >= 0; }); auto vmap_ax = *vmap_ax_it; if (vmap_ax >= 0) { auto vmap_size = inputs[vmap_ax_it - vmap_axes.begin()].shape(vmap_ax); if (src_ax < 0) { src_ax = 0; inputs[0] = repeat(expand_dims(inputs[0], 0, stream()), vmap_size, 0, stream()); } for (int i = 1; i < vmap_axes.size() - 1; ++i) { // vmap axis for indices goes to 0 if (vmap_axes[i] >= 0) { inputs[i] = moveaxis(inputs[i], vmap_axes[i], 0, stream()); } // insert a vmap axis and repeat if (vmap_axes[i] < 0) { auto idx_shape = inputs[i].shape(); inputs[i] = repeat(expand_dims(inputs[i], 0, stream()), vmap_size, 0, stream()); } // Adjust non-vmapped index axes to account for the extra vmap dimension. if (scatter_axes[i - 1] >= src_ax) { scatter_axes[i - 1]++; } } auto vmap_inds = arange(vmap_size, inputs[1].dtype(), stream()); auto vmap_inds_shape = std::vector(inputs[1].ndim(), 1); vmap_inds_shape[0] = vmap_inds.size(); vmap_inds = reshape(vmap_inds, std::move(vmap_inds_shape), stream()); inputs.insert( inputs.end() - 1, broadcast_to(vmap_inds, inputs[1].shape(), stream())); scatter_axes.push_back(src_ax); // Clone updates along the vmap dimension so they can be applied to each // source tensor in the vmap. 
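// Illustrative sketch (shapes are examples only): with vmap_size == 3 and
// updates that carry no vmap axis, the updates are expanded and repeated so
// each of the 3 vmapped sources receives its own copy; the vmap_inds row
// added above routes copy i to source i.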
auto& updates = inputs.back(); if (vmap_axes.back() < 0) { updates = expand_dims( updates, {0, static_cast(inputs[1].ndim())}, stream()); updates = repeat(updates, vmap_size, 0, stream()); } else { updates = expand_dims(updates, static_cast(inputs[1].ndim()), stream()); updates = moveaxis(updates, vmap_axes.back(), 0, stream()); } } auto& shape = inputs[0].shape(); auto dtype = inputs[0].dtype(); auto out = array( shape, dtype, std::make_shared(stream(), reduce_type_, scatter_axes), std::move(inputs)); return {{out}, {src_ax}}; } std::vector Sigmoid::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector& outputs) { auto& s = outputs[0]; auto sprime = multiply(s, subtract(array(1.0f, s.dtype()), s, stream()), stream()); return {multiply(cotangents[0], sprime, stream())}; } std::vector Sigmoid::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 1); assert(argnums.size() == 1); auto s = sigmoid(primals[0], stream()); auto sprime = multiply(s, subtract(array(1.0f, s.dtype()), s, stream()), stream()); return {multiply(tangents[0], sprime, stream())}; } std::pair, std::vector> Sigmoid::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); return {{sigmoid(inputs[0], stream())}, axes}; } std::vector Sign::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { return jvp(primals, cotangents, argnums); } std::vector Sign::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 1); assert(argnums.size() == 1); return {zeros(primals[0].shape(), primals[0].dtype(), stream())}; } std::pair, std::vector> Sign::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); return {{sign(inputs[0], stream())}, axes}; } std::vector Sin::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { return jvp(primals, cotangents, argnums); } std::vector Sin::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 1); assert(argnums.size() == 1); return {multiply(tangents[0], cos(primals[0], stream()), stream())}; } std::pair, std::vector> Sin::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); return {{sin(inputs[0], stream())}, axes}; } std::vector Sinh::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { return jvp(primals, cotangents, argnums); } std::vector Sinh::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 1); assert(argnums.size() == 1); return {multiply(tangents[0], cosh(primals[0], stream()), stream())}; } std::pair, std::vector> Sinh::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); return {{sinh(inputs[0], stream())}, axes}; } std::pair, std::vector> Slice::vmap( const std::vector& inputs, const std::vector& axes) { auto start = start_indices_; auto stop = end_indices_; auto strides = strides_; auto ax = axes[0]; auto& input = inputs[0]; if (ax >= 0) { start.insert(start.begin() + ax, 0); stop.insert(stop.begin() + ax, input.shape(ax)); strides.insert(strides.begin() + ax, 1); } 
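  // The vmapped axis is sliced over its full extent with stride 1, so the
  // batch dimension passes through unchanged. E.g. (illustrative values
  // only) vmapping a slice with start {1}, stop {3} over axis 0 of a
  // {5, 4} input becomes start {0, 1}, stop {5, 3}, strides {1, 1}.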
return {{slice(input, start, stop, strides, stream())}, {ax}}; } std::vector Slice::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { // Check inputs assert(primals.size() == 1); std::vector inds; std::vector ind_axes; std::vector single_inds; std::vector single_ind_axes; for (int i = 0; i < start_indices_.size(); ++i) { auto start = start_indices_[i]; auto end = end_indices_[i]; auto stride = strides_[i]; if (start == 0 && stride == 1) { continue; } if (stride == 1) { single_inds.push_back(array(start)); single_ind_axes.push_back(i); } else { inds.push_back(arange(start, end, stride, stream())); ind_axes.push_back(i); } } // Transpose and reshape cotangents auto cotan = cotangents[0]; if (!ind_axes.empty()) { std::vector cotan_shape; for (auto ax : ind_axes) { cotan_shape.push_back(cotan.shape(ax)); } std::vector cotan_axes(ind_axes); for (int j = 0, i = 0; i < cotan.ndim(); ++i) { if (j < ind_axes.size() && ind_axes[j] == i) { cotan_shape.push_back(1); j++; } else { cotan_shape.push_back(cotan.shape(i)); cotan_axes.push_back(i); } } cotan = reshape(transpose(cotan, cotan_axes, stream()), cotan_shape, stream()); } // Make indices broadcastable std::vector inds_shape(inds.size(), 1); for (int i = 0; i < inds.size(); ++i) { inds_shape[i] = inds[i].size(); inds[i] = reshape(inds[i], inds_shape, stream()); inds_shape[i] = 1; } // Concatenate all the indices and axes inds.insert(inds.end(), single_inds.begin(), single_inds.end()); ind_axes.insert( ind_axes.end(), single_ind_axes.begin(), single_ind_axes.end()); return {scatter_add( zeros_like(primals[0], stream()), inds, cotan, ind_axes, stream())}; } std::vector Slice::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { // Check inputs assert(primals.size() == 1); return {slice(tangents[0], start_indices_, end_indices_, strides_, stream())}; } bool Slice::is_equivalent(const Primitive& other) const { const Slice& s_other = static_cast(other); return ( start_indices_ == s_other.start_indices_ && end_indices_ == s_other.end_indices_ && strides_ == s_other.strides_); } std::pair, std::vector> SliceUpdate::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 2); assert(axes.size() == 2); auto start = start_indices_; auto stop = end_indices_; auto strides = strides_; auto src = inputs[0]; auto upd = inputs[1]; auto src_ax = axes[0]; auto upd_ax = axes[1]; // No vmapping needed if (src_ax == -1 && upd_ax == -1) { return {{slice_update(src, upd, start, stop, strides, stream())}, {-1}}; } // Broadcast src if (src_ax == -1) { src = expand_dims(src, upd_ax, stream()); auto shape = src.shape(); shape[upd_ax] = upd.shape(upd_ax); src = broadcast_to(src, shape, stream()); src_ax = upd_ax; } // Broadcast upd if (upd_ax == -1) { upd = expand_dims(upd, src_ax, stream()); upd_ax = src_ax; } if (src_ax != upd_ax) { upd = moveaxis(upd, upd_ax, src_ax, stream()); } start.insert(start.begin() + src_ax, 0); stop.insert(stop.begin() + src_ax, src.shape(src_ax)); strides.insert(strides.begin() + src_ax, 1); return {{slice_update(src, upd, start, stop, strides, stream())}, {src_ax}}; } std::vector SliceUpdate::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { // Check inputs assert(primals.size() == 2); auto& cotan = cotangents[0]; auto& src = primals[0]; auto& upd = primals[1]; std::vector vjps; for (int num : argnums) { // Vjp for source if (num == 0) { auto grad = 
slice_update(
          cotan,
          zeros_like(upd, stream()),
          start_indices_,
          end_indices_,
          strides_,
          stream());
      vjps.push_back(grad);
    }
    // Vjp for updates
    else {
      auto grad =
          slice(cotan, start_indices_, end_indices_, strides_, stream());
      vjps.push_back(grad);
    }
  }
  return vjps;
}

std::vector<array> SliceUpdate::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  // Check inputs
  assert(primals.size() == 2);
  return {slice_update(
      tangents[0],
      tangents[1],
      start_indices_,
      end_indices_,
      strides_,
      stream())};
}

bool SliceUpdate::is_equivalent(const Primitive& other) const {
  const SliceUpdate& s_other = static_cast<const SliceUpdate&>(other);
  return (
      start_indices_ == s_other.start_indices_ &&
      end_indices_ == s_other.end_indices_ && strides_ == s_other.strides_);
}

std::pair<std::vector<array>, std::vector<int>> Softmax::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  std::vector<int> softmax_axes;

  // We are vectorizing over an axis other than the last one so keep the
  // softmax axis unchanged
  if (axes[0] >= 0 && axes[0] < inputs[0].ndim() - 1) {
    softmax_axes.push_back(-1);
  } else {
    softmax_axes.push_back(-2);
  }
  return {{softmax(inputs[0], softmax_axes, precise_, stream())}, axes};
}

std::vector<array> Softmax::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  assert(primals.size() == 1);
  assert(cotangents.size() == 1);
  auto& s = outputs[0];
  auto sv = multiply(s, cotangents[0], stream());
  return {subtract(
      sv,
      multiply(s, sum(sv, std::vector<int>{-1}, true, stream()), stream()),
      stream())};
}

std::vector<array> Softmax::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(tangents.size() == 1);
  auto s = softmax(primals[0], std::vector<int>{-1}, precise_, stream());
  auto sv = multiply(s, tangents[0], stream());
  return {subtract(
      sv,
      multiply(s, sum(sv, std::vector<int>{-1}, true, stream()), stream()),
      stream())};
}

bool Softmax::is_equivalent(const Primitive& other) const {
  const Softmax& s_other = static_cast<const Softmax&>(other);
  return precise_ == s_other.precise_;
}

std::pair<std::vector<array>, std::vector<int>> Sort::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  int axis_left = axes[0] >= 0 && axes[0] <= axis_;
  return {{sort(inputs[0], axis_ + axis_left, stream())}, axes};
}

std::vector<array> Sort::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> Sort::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(tangents.size() == 1);
  auto sort_idx = argsort(primals[0], axis_, stream());
  auto out = take_along_axis(tangents[0], sort_idx, axis_, stream());
  return {out};
}

bool Sort::is_equivalent(const Primitive& other) const {
  const Sort& r_other = static_cast<const Sort&>(other);
  return axis_ == r_other.axis_;
}

std::pair<std::vector<array>, std::vector<int>> Split::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  int axis_left = axes[0] >= 0 && axes[0] <= axis_;
  return {{split(inputs[0], indices_, axis_ + axis_left, stream())}, axes};
}

std::vector<array> Split::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return {concatenate(cotangents, axis_, stream())};
}

std::vector<array> Split::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  return split(tangents[0], indices_, axis_,
stream()); } bool Split::is_equivalent(const Primitive& other) const { const Split& s_other = static_cast(other); return axis_ == s_other.axis_ && indices_ == s_other.indices_; } std::vector Square::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { return jvp(primals, cotangents, argnums); } std::vector Square::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 1); assert(tangents.size() == 1); return {multiply( primals[0], multiply(array(2, primals[0].dtype()), tangents[0], stream()), stream())}; } std::pair, std::vector> Square::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); return {{square(inputs[0], stream())}, axes}; } std::vector Sqrt::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector& outputs) { assert(primals.size() == 1); assert(cotangents.size() == 1); auto dtype = primals[0].dtype(); if (recip_) { auto one_over_x_root_x = divide(outputs[0], primals[0], stream()); return {multiply( multiply(array(-0.5, dtype), cotangents[0], stream()), one_over_x_root_x, stream())}; } else { return {divide( multiply(array(0.5, dtype), cotangents[0], stream()), outputs[0], stream())}; } } std::vector Sqrt::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { if (recip_) { return vjp(primals, tangents, argnums, {rsqrt(primals[0], stream())}); } else { return vjp(primals, tangents, argnums, {sqrt(primals[0], stream())}); } } std::pair, std::vector> Sqrt::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); if (recip_) { return {{rsqrt(inputs[0], stream())}, axes}; } return {{sqrt(inputs[0], stream())}, axes}; } bool Sqrt::is_equivalent(const Primitive& other) const { const Sqrt& s_other = static_cast(other); return recip_ == s_other.recip_; } std::pair, std::vector> StopGradient::vmap( const std::vector& inputs, const std::vector& axes) { return {{stop_gradient(inputs[0], stream())}, axes}; } std::vector Subtract::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { std::vector vjps; for (auto arg : argnums) { auto vjp = cotangents[0]; if (arg == 1) { vjp = negative(vjp, stream()); } vjps.push_back(vjp); } return vjps; } std::vector Subtract::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { auto jvp_fun = [&](int i) { int arg = argnums[i]; return arg == 1 ? 
negative(tangents[i], stream()) : tangents[i];
  };
  auto out = jvp_fun(0);
  if (argnums.size() > 1) {
    out = add(out, jvp_fun(1), stream());
  }
  return {out};
}

std::pair<std::vector<array>, std::vector<int>> Subtract::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{subtract(a, b, stream())}, {to_ax}};
}

std::vector<array> Squeeze::vjp(
    const std::vector<array>&,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  return {expand_dims(cotangents[0], axes_, stream())};
}

std::vector<array> Squeeze::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  return {squeeze(tangents[0], axes_, stream())};
}

std::pair<std::vector<array>, std::vector<int>> Squeeze::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto ax = axes[0];
  auto squeeze_axes = axes_;
  for (auto& s : squeeze_axes) {
    if (s >= axes[0]) {
      s++;
    } else {
      ax--;
    }
  }
  return {{squeeze(inputs[0], std::move(squeeze_axes), stream())}, {ax}};
}

bool Squeeze::is_equivalent(const Primitive& other) const {
  const Squeeze& a_other = static_cast<const Squeeze&>(other);
  return (axes_ == a_other.axes_);
}

Shape Squeeze::output_shape(const array& input, const std::vector<int>& axes) {
  Shape shape;
  for (int i = 0, j = 0; i < input.ndim(); ++i) {
    if (j < axes.size() && i == axes[j]) {
      j++;
    } else {
      shape.push_back(input.shape(i));
    }
  }
  return shape;
}

std::vector<Shape> Squeeze::output_shapes(const std::vector<array>& inputs) {
  return {Squeeze::output_shape(inputs[0], axes_)};
}

std::vector<array> Tan::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> Tan::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array cos_sq = square(cos(primals[0], stream()), stream());
  return {divide(tangents[0], cos_sq, stream())};
}

std::pair<std::vector<array>, std::vector<int>> Tan::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{tan(inputs[0], stream())}, axes};
}

std::vector<array> Tanh::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

std::vector<array> Tanh::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array cosh_sq = square(cosh(primals[0], stream()), stream());
  return {divide(tangents[0], cosh_sq, stream())};
}

std::pair<std::vector<array>, std::vector<int>> Tanh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{tanh(inputs[0], stream())}, axes};
}

std::vector<array> BlockMaskedMM::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  /////////////////////////////////////////////////////////////////////////////
  // The operation that is done w/o intermediates by the primitive is
  // - tm = (M + block_size - 1) // block_size; MP = tm * block_size;
  // - tn = (N + block_size - 1) // block_size; NP = tn * block_size;
  // - tk = (K + block_size - 1) // block_size; KP = tk * block_size;
  // - mask_b <- mask broadcasted to block sizes
  // - A_m = A [..., M, K] * mask_b_lhs [..., MP, KP]
  // - B_m = B [..., K, N] * mask_b_rhs [..., KP, NP]
  // - C = A_m [..., M, K] @ B_m [..., K, N]
  // - C_m = C [..., M, N] * mask_b_out [..., MP, NP]
  //
  // The grads are therefore
  // - dC_m = cotan [..., M, N]
  // - dmask_b_out = cotan [..., M, N] * C [..., M, N]
  // - dC = cotan [..., M, N] * mask_b_out [..., MP, NP]
  // - dA_m = dC [..., M, N] @ B_m.T [..., N, K]
  // - dB_m = A_m.T [..., K, M] @ dC [..., M, N]
  // - dA = dA_m * mask_b_lhs [..., MP, KP]
  // - dB = dB_m * mask_b_rhs [..., KP, NP]
  // - dmask_b_lhs = dA_m [..., M, K] * A [..., M, K] // need [..., MP, KP]
  // - dmask_b_rhs = dB_m [..., K, N] * B [..., K, N] // need [..., KP, NP]
  //
  // Observations:
  // * If dmask_b_lhs is not needed, then dA can be calculated in one go as a
  //   block_masked_mm with mask_b_lhs as the out_mask without needing to
  //   materialize the intermediate dA_m. Similar for dB.
  // * If dmask_b_lhs is needed, we need to materialize dA_m directly and then
  //   point-wise multiply with A. But the output needs to be padded

  std::vector<array> vjps;
  auto& cotan = cotangents[0];
  std::vector<int> reorder(cotan.ndim());
  std::iota(reorder.begin(), reorder.end(), 0);
  std::iter_swap(reorder.end() - 1, reorder.end() - 2);

  bool has_op_mask = primals.size() > 3;
  bool has_out_mask = primals.size() == 3 || primals.size() == 5;
  const int op_mask_idx = has_out_mask ? 3 : 2;

  bool needs_lhs_mask_vjp = false;
  bool needs_rhs_mask_vjp = false;
  bool needs_lhs_vjp = false;
  bool needs_rhs_vjp = false;
  for (auto arg : argnums) {
    // Accumulate with |= so an earlier entry in argnums is not overwritten
    // by a later one.
    needs_lhs_vjp |= (arg == 0);
    needs_rhs_vjp |= (arg == 1);
    needs_lhs_mask_vjp |= (arg == op_mask_idx);
    needs_rhs_mask_vjp |= (arg == op_mask_idx + 1);
  }

  if ((needs_lhs_mask_vjp && primals[op_mask_idx].dtype() == bool_) ||
      (needs_rhs_mask_vjp && primals[op_mask_idx + 1].dtype() == bool_)) {
    throw std::invalid_argument(
        "[BlockMaskedMM] Cannot calculate VJP with respect to boolean masks.");
  }

  auto expand_mask = [&](array mask, int Y, int X) {
    // Expand mask
    auto mask_reshape = mask.shape();
    mask = expand_dims(mask, {-3, -1}, stream());
    auto mask_shape = mask.shape();
    int mask_ndim = mask_shape.size();

    // Broadcast mask
    mask_shape[mask_ndim - 1] = block_size_;
    mask_shape[mask_ndim - 3] = block_size_;
    mask = broadcast_to(mask, mask_shape, stream());

    // Reshape mask to squeeze in broadcasted dims
    mask_ndim = mask_reshape.size();
    mask_reshape[mask_ndim - 2] *= block_size_;
    mask_reshape[mask_ndim - 1] *= block_size_;
    mask = reshape(mask, mask_reshape, stream());

    // Slice mask
    mask_reshape[mask_ndim - 2] = Y;
    mask_reshape[mask_ndim - 1] = X;
    mask = slice(mask, Shape(mask_ndim, 0), mask_reshape, stream());

    return mask;
  };

  array zero = array(0, cotan.dtype());
  auto multiply_pad_reduce = [&](array p, array q, int align_Y, int align_X) {
    // Multiply with cotan
    auto r = multiply(p, q, stream());

    // Pad if needed
    if ((align_Y != 0) || (align_X != 0)) {
      r = pad(
          r, {-2, -1}, {0, 0}, {align_Y, align_X}, zero, "constant", stream());
    }

    // Reshape
    Shape r_reshape(r.shape().begin(), r.shape().end() - 2);
    r_reshape.push_back(r.shape(-2) / block_size_);
    r_reshape.push_back(block_size_);
    r_reshape.push_back(r.shape(-1) / block_size_);
    r_reshape.push_back(block_size_);
    r = reshape(r, r_reshape, stream());

    // Reduce
    return sum(r, {-3, -1}, false, stream());
  };

  // Prepare for padding if needed
  const int M = cotan.shape(-2);
  const int N = cotan.shape(-1);
  const int K = primals[0].shape(-1);
  const int tm = (M + block_size_ - 1) / block_size_;
  const int tn = (N + block_size_ - 1) / block_size_;
  const int tk = (K + block_size_ - 1) / block_size_;
  const int align_M = tm * block_size_ - M;
  const int align_N = tn * block_size_ - N;
  const int align_K = tk * block_size_ - K;

  // Potential intermediates
  array unmasked_lhs_grad = primals[0];
  array unmasked_rhs_grad
= primals[1]; bool unmasked_lhs_grad_calculated = false; bool unmasked_rhs_grad_calculated = false; for (auto arg : argnums) { if (arg == 0) { // M X N * (K X N).T -> M X K auto b_t = transpose(primals[1], reorder, stream()); auto out_mask = has_out_mask ? std::make_optional(primals[2]) : std::nullopt; auto lhs_mask = has_op_mask && !needs_lhs_mask_vjp ? std::make_optional(primals[op_mask_idx]) : std::nullopt; auto rhs_mask_t = has_op_mask ? std::make_optional( transpose(primals[op_mask_idx + 1], reorder, stream())) : std::nullopt; auto grad = block_masked_mm( cotan, b_t, block_size_, lhs_mask, out_mask, rhs_mask_t, stream()); if (needs_lhs_mask_vjp) { unmasked_lhs_grad = grad; unmasked_lhs_grad_calculated = true; auto exp_mask = expand_mask(primals[op_mask_idx], M, K); grad = multiply(grad, exp_mask, stream()); } vjps.push_back(grad); } else if (arg == 1) { // (M X K).T * M X N -> K X N auto a_t = transpose(primals[0], reorder, stream()); auto out_mask = has_out_mask ? std::make_optional(primals[2]) : std::nullopt; auto lhs_mask_t = has_op_mask ? std::make_optional( transpose(primals[op_mask_idx], reorder, stream())) : std::nullopt; auto rhs_mask = has_op_mask && !needs_rhs_mask_vjp ? std::make_optional(primals[op_mask_idx + 1]) : std::nullopt; auto grad = block_masked_mm( a_t, cotan, block_size_, rhs_mask, lhs_mask_t, out_mask, stream()); if (needs_rhs_mask_vjp) { unmasked_rhs_grad = grad; unmasked_rhs_grad_calculated = true; auto exp_mask = expand_mask(primals[op_mask_idx + 1], K, N); grad = multiply(grad, exp_mask, stream()); } vjps.push_back(grad); } else if (arg == 2 && has_out_mask) { // Produce the forward result auto lhs_mask = has_op_mask ? std::make_optional(primals[op_mask_idx]) : std::nullopt; auto rhs_mask = has_op_mask ? std::make_optional(primals[op_mask_idx + 1]) : std::nullopt; auto C = block_masked_mm( primals[0], primals[1], block_size_, primals[2], lhs_mask, rhs_mask, stream()); // Multiply, Pad and Reduce if needed auto grad = multiply_pad_reduce(cotan, C, align_M, align_N); vjps.push_back(grad); } else if (arg == op_mask_idx && has_op_mask) { if (!unmasked_lhs_grad_calculated) { // (M X K).T * M X N -> K X N auto b_t = transpose(primals[1], reorder, stream()); auto out_mask = has_out_mask ? std::make_optional(primals[2]) : std::nullopt; auto rhs_mask_t = transpose(primals[op_mask_idx + 1], reorder, stream()); unmasked_lhs_grad = block_masked_mm( cotan, b_t, block_size_, std::nullopt, out_mask, rhs_mask_t, stream()); unmasked_lhs_grad_calculated = true; } // Multiply, Pad and Reduce if needed auto grad = multiply_pad_reduce(primals[0], unmasked_lhs_grad, align_M, align_K); vjps.push_back(grad); } else if (arg == op_mask_idx + 1 && has_op_mask) { if (!unmasked_rhs_grad_calculated) { // (M X K).T * M X N -> K X N auto a_t = transpose(primals[0], reorder, stream()); auto out_mask = has_out_mask ? 
std::make_optional(primals[2]) : std::nullopt; auto lhs_mask_t = transpose(primals[op_mask_idx], reorder, stream()); unmasked_rhs_grad = block_masked_mm( a_t, cotan, block_size_, std::nullopt, lhs_mask_t, out_mask, stream()); unmasked_rhs_grad_calculated = true; } // Multiply, Pad and Reduce if needed auto grad = multiply_pad_reduce(primals[1], unmasked_rhs_grad, align_K, align_N); vjps.push_back(grad); } else { throw std::invalid_argument( "[BlockMaskedMM] Cannot calculate VJP with respect to masks."); } } return vjps; } std::vector GatherMM::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { std::vector vjps; auto& cotan = cotangents[0]; auto& lhs_indices = primals[2]; auto& rhs_indices = primals[3]; int M = cotan.shape(-2); int N = cotan.shape(-1); int K = primals[0].shape(-1); for (auto arg : argnums) { if (arg == 0) { // M X N * (K X N).T -> M X K auto base = zeros_like(primals[0], stream()); auto bt = swapaxes(primals[1], -1, -2, stream()); auto base_shape = base.shape(); base = reshape(base, {-1, M, K}, stream()); // g : (out_batch_shape) + (M, K) auto g = gather_mm(cotan, bt, std::nullopt, rhs_indices, stream()); g = expand_dims(g, -3, stream()); auto gacc = scatter_add(base, lhs_indices, g, 0, stream()); vjps.push_back(reshape(gacc, base_shape, stream())); } else if (arg == 1) { // (M X K).T * M X N -> K X N auto base = zeros_like(primals[1], stream()); auto at = swapaxes(primals[0], -1, -2, stream()); auto base_shape = base.shape(); base = reshape(base, {-1, K, N}, stream()); // g : (out_batch_shape) + (K, N) auto g = gather_mm(at, cotan, lhs_indices, std::nullopt, stream()); g = expand_dims(g, -3, stream()); auto gacc = scatter_add(base, rhs_indices, g, 0, stream()); vjps.push_back(reshape(gacc, base_shape, stream())); } else { throw std::invalid_argument( "[GatherMM] Cannot calculate VJP with respect to indices."); } } return vjps; } bool BlockMaskedMM::is_equivalent(const Primitive& other) const { const BlockMaskedMM& a_other = static_cast(other); return (block_size_ == a_other.block_size_); } std::vector Transpose::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { assert(primals.size() == 1); assert(argnums.size() == 1); std::vector iaxes(axes_.size()); for (int i = 0; i < axes_.size(); ++i) { iaxes[axes_[i]] = i; } return {transpose(cotangents[0], iaxes, stream())}; } std::vector Transpose::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 1); assert(tangents.size() == 1); return {transpose(tangents[0], axes_, stream())}; } std::pair, std::vector> Transpose::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); auto vdim = axes[0]; if (vdim >= 0) { for (auto& dim : axes_) { if (dim >= vdim) { dim++; } } axes_.insert(axes_.begin() + vdim, vdim); } return {{transpose(inputs[0], axes_, stream())}, {vdim}}; } bool Transpose::is_equivalent(const Primitive& other) const { const Transpose& t_other = static_cast(other); return axes_ == t_other.axes_; } std::vector Transpose::output_shapes(const std::vector& inputs) { auto& in = inputs[0]; Shape shape(in.ndim(), 0); for (int i = 0; i < axes_.size(); ++i) { shape[i] = in.shape()[axes_[i]]; } return {std::move(shape)}; } std::pair, std::vector> NumberOfElements::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); std::vector 
new_axes = axes_; auto vdim = axes[0]; if (vdim >= 0) { for (auto& dim : new_axes) { if (dim >= vdim) { dim++; } } } array out = array( std::vector{}, dtype_, std::make_shared(stream(), new_axes, inverted_, dtype_), inputs); return {{out}, {-1}}; } bool NumberOfElements::is_equivalent(const Primitive& other) const { const NumberOfElements& n_other = static_cast(other); return axes_ == n_other.axes_ && inverted_ == n_other.inverted_ && dtype_ == n_other.dtype_; } std::pair, std::vector> SVD::vmap( const std::vector& inputs, const std::vector& axes) { auto ax = axes[0] >= 0 ? 0 : -1; auto a = axes[0] > 0 ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0]; return {{linalg::svd(a, stream())}, {ax, ax, ax}}; } std::pair, std::vector> Inverse::vmap( const std::vector& inputs, const std::vector& axes) { auto ax = axes[0] >= 0 ? 0 : -1; auto a = axes[0] > 0 ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0]; return {{linalg::inv(a, stream())}, {ax}}; } std::pair, std::vector> View::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); return {{view(inputs[0], dtype_, stream())}, axes}; } void View::print(std::ostream& os) { os << "View" << dtype_; } bool View::is_equivalent(const Primitive& other) const { const View& a_other = static_cast(other); return (dtype_ == a_other.dtype_); } std::pair, std::vector> Hadamard::vmap( const std::vector& inputs, const std::vector& axes) { assert(inputs.size() == 1); assert(axes.size() == 1); auto& s = stream(); if (axes[0] == inputs[0].ndim() - 1) { auto a = moveaxis(inputs[0], axes[0], 0, s); auto b = hadamard_transform(a, scale_, s); return {{b}, {0}}; } return {{hadamard_transform(inputs[0], scale_, s)}, axes}; } std::vector Hadamard::vjp( const std::vector& primals, const std::vector& cotangents, const std::vector& argnums, const std::vector&) { assert(primals.size() == 1); assert(argnums.size() == 1); return jvp(primals, cotangents, argnums); } std::vector Hadamard::jvp( const std::vector& primals, const std::vector& tangents, const std::vector& argnums) { assert(primals.size() == 1); assert(argnums.size() == 1); return {hadamard_transform(tangents[0], scale_, stream())}; } bool Hadamard::is_equivalent(const Primitive& other) const { const Hadamard& h_other = static_cast(other); return scale_ == h_other.scale_; } } // namespace mlx::core
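// Usage sketch (illustrative only, not part of this file): the vjp/jvp rules
// above are reached through the function transforms in mlx/transforms.h, e.g.
//
//   auto fun = [](const std::vector<array>& ins) {
//     return std::vector<array>{multiply(ins[0], ins[1])};
//   };
//   auto [outs, grads] = vjp(fun, {array(2.0f), array(3.0f)}, {array(1.0f)});
//   // grads[0] == 3.0f and grads[1] == 2.0f, computed via Multiply::vjp.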