// mlx/backend/no_gpu/primitives.cpp
// Copyright © 2023-2024 Apple Inc.
#include "mlx/primitives.h"
#include "mlx/distributed/primitives.h"
#include "mlx/fast_primitives.h"
// Defines a stub eval_gpu for a multi-output primitive: the body always
// throws, because this backend ships no GPU kernels.
#define NO_GPU_MULTI(prim)                                             \
  void prim::eval_gpu(                                                 \
      const std::vector<array>& inputs, std::vector<array>& outputs) { \
    throw std::runtime_error(#prim " has no GPU implementation.");     \
  }
// Like NO_GPU_MULTI, but additionally defines use_fallback to return true
// unconditionally, so callers always take the non-fused fallback path.
#define NO_GPU_USE_FALLBACK(prim)   \
  bool prim::use_fallback(Stream s) \
  {                                 \
    return true;                    \
  }                                 \
  NO_GPU_MULTI(prim)
// Defines a stub eval_gpu for a single-output primitive: the body always
// throws, because this backend ships no GPU kernels.
#define NO_GPU(prim)                                                  \
  void prim::eval_gpu(const std::vector<array>& inputs, array& out) { \
    throw std::runtime_error(#prim " has no GPU implementation.");    \
  }
namespace mlx::core {
// The fused SDPA kernel is never available on the no-GPU backend, so the
// generic fallback implementation is used regardless of the inputs.
// Parameter names are commented out since none of them are inspected here.
bool fast::ScaledDotProductAttention::use_fallback(
    const array& /*q*/,
    const array& /*k*/,
    const array& /*v*/,
    bool /*has_mask*/,
    bool /*has_arr_mask*/,
    bool /*do_causal*/,
    Stream /*s*/) {
  return true;
}
// GPU stubs for every core primitive: each invocation expands to an
// eval_gpu override that throws at runtime, since this backend has no GPU
// support. (Interleaved git-blame timestamp lines from the scraped page were
// removed here — they are not valid C++ and broke compilation.)
NO_GPU(Abs)
NO_GPU(Add)
NO_GPU(AddMM)
NO_GPU(Arange)
NO_GPU(ArcCos)
NO_GPU(ArcCosh)
NO_GPU(ArcSin)
NO_GPU(ArcSinh)
NO_GPU(ArcTan)
NO_GPU(ArcTan2)
NO_GPU(ArcTanh)
NO_GPU(ArgPartition)
NO_GPU(ArgReduce)
NO_GPU(ArgSort)
NO_GPU(AsType)
NO_GPU(AsStrided)
NO_GPU(BitwiseBinary)
NO_GPU(BitwiseInvert)
NO_GPU(BlockMaskedMM)
NO_GPU(Broadcast)
NO_GPU(BroadcastAxes)
NO_GPU(Ceil)
NO_GPU_MULTI(Compiled)
NO_GPU(Concatenate)
NO_GPU(Conjugate)
NO_GPU(Contiguous)
NO_GPU(Convolution)
NO_GPU(Copy)
NO_GPU(Cos)
NO_GPU(Cosh)
NO_GPU_MULTI(CustomTransforms)
NO_GPU_MULTI(Depends)
NO_GPU(Divide)
NO_GPU_MULTI(DivMod)
NO_GPU(DynamicSlice)
NO_GPU(DynamicSliceUpdate)
NO_GPU(NumberOfElements)
NO_GPU(Remainder)
NO_GPU(Equal)
NO_GPU(Erf)
NO_GPU(ErfInv)
NO_GPU(Exp)
NO_GPU(ExpandDims)
NO_GPU(Expm1)
NO_GPU(FFT)
NO_GPU(Flatten)
NO_GPU(Floor)
NO_GPU(Full)
NO_GPU(Gather)
NO_GPU(GatherAxis)
NO_GPU(GatherMM)
NO_GPU(GatherQMM)
NO_GPU(Greater)
NO_GPU(GreaterEqual)
NO_GPU(Hadamard)
NO_GPU(Imag)
NO_GPU(Less)
NO_GPU(LessEqual)
NO_GPU(Load)
NO_GPU(Log)
NO_GPU(Log1p)
NO_GPU(LogicalNot)
NO_GPU(LogicalAnd)
NO_GPU(LogicalOr)
NO_GPU(LogAddExp)
NO_GPU(LogSumExp)
NO_GPU_MULTI(LUF)
NO_GPU(Matmul)
NO_GPU(Maximum)
NO_GPU(Minimum)
NO_GPU(Multiply)
NO_GPU(Negative)
NO_GPU(NotEqual)
NO_GPU(Pad)
NO_GPU(Partition)
NO_GPU(Power)
NO_GPU_MULTI(QRF)
NO_GPU(QuantizedMatmul)
NO_GPU(RandomBits)
NO_GPU(Real)
NO_GPU(Reduce)
NO_GPU(Reshape)
NO_GPU(Round)
NO_GPU(Scan)
NO_GPU(Scatter)
NO_GPU(ScatterAxis)
NO_GPU(Select)
NO_GPU(Sigmoid)
NO_GPU(Sign)
NO_GPU(Sin)
NO_GPU(Sinh)
NO_GPU(Slice)
NO_GPU(SliceUpdate)
NO_GPU(Softmax)
NO_GPU(Sort)
NO_GPU_MULTI(Split)
NO_GPU(Square)
NO_GPU(Squeeze)
NO_GPU(Sqrt)
NO_GPU(StopGradient)
NO_GPU(Subtract)
NO_GPU_MULTI(SVD)
NO_GPU(Tan)
NO_GPU(Tanh)
NO_GPU(Transpose)
NO_GPU(Unflatten)
NO_GPU(Inverse)
NO_GPU(Cholesky)
NO_GPU_MULTI(Eigh)
NO_GPU_MULTI(Eig)
NO_GPU(View)
namespace fast {
NO_GPU_USE_FALLBACK(LayerNorm)
NO_GPU_MULTI(LayerNormVJP)
NO_GPU_USE_FALLBACK(RMSNorm)
NO_GPU_MULTI(RMSNormVJP)
NO_GPU_USE_FALLBACK(RoPE)
Fast Inference SDPA op (#735) * Fast Inference SDPA op Implements metal shaders for: o = mx.fast_inference_sdpa(queries, keys, values, scale, mask) Supports fp16, fp32 dtypes; assumes d_k = 128. Generic op support / prompt encoding supported via mlx primitives. Metal implementation is for the inference use case only. Majority of performance benefits appears to results from GQA & reduced bandwidth requirements; there is approximate performance parity for the MHA use case (from some measurements on M3 Max). * Flush shared memory to zero before unprotected reads for (scores @ values) * Move to fast:: namespace, address reviewer comments ... also attempt to revert formatter auto-change for files not relevant to this change * Shared memory flush to top of kernel * Resolve compiler warnings * Update python/src/fast.cpp Co-authored-by: Awni Hannun <awni.hannun@gmail.com> * Update python/src/fast.cpp Co-authored-by: Awni Hannun <awni.hannun@gmail.com> * Update python/src/fast.cpp Co-authored-by: Awni Hannun <awni.hannun@gmail.com> * Update python/src/fast.cpp Co-authored-by: Awni Hannun <awni.hannun@gmail.com> * Update docstring per PR feedback * Softmax in higher precision, ... * route to fallback for more use cases - batch size > 1, head_dim other than 128, etc. * Address linux build failure * Address other reviewer comments * Remove extraneous eval_cpu function per review --------- Co-authored-by: Atila Orhon <64497909+atiorh@users.noreply.github.com> Co-authored-by: Awni Hannun <awni.hannun@gmail.com> Co-authored-by: atila <atiorh@icloud.com>
2024-03-05 13:06:11 +08:00
NO_GPU(ScaledDotProductAttention)
NO_GPU_MULTI(AffineQuantize)
NO_GPU_MULTI(CustomKernel)
} // namespace fast
namespace distributed {
// Distributed collectives likewise have no GPU kernels on this backend.
// Listed alphabetically; definition order has no effect on behavior.
NO_GPU_MULTI(AllGather)
NO_GPU_MULTI(AllReduce)
NO_GPU_MULTI(Recv)
NO_GPU_MULTI(Send)
} // namespace distributed
} // namespace mlx::core