// mlx/mlx/backend/cpu/unary.h

// Copyright © 2023 Apple Inc.

#pragma once

#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/simd/simd.h"
#include "mlx/utils.h"

namespace mlx::core {

// Give `out` storage for the result of an element-wise op applied to `in`.
//
// When `is_donatable(in, out)` is true, `out` is pointed at `in`'s buffer via
// `copy_shared_buffer` (no allocation). Otherwise a fresh buffer sized for
// `in.data_size()` elements of `out.itemsize()` bytes is allocated, and `out`
// is set up with `in`'s data size, strides and flags.
//
// Declared `inline` because this is a non-template function defined in a
// header: without it every translation unit that includes this header emits
// its own external definition, which breaks the one-definition rule and fails
// at link time as soon as the header is included twice.
inline void set_unary_output_data(const array& in, array& out) {
  if (is_donatable(in, out)) {
    out.copy_shared_buffer(in);
  } else {
    // Size by the number of elements actually stored by `in`, measured in
    // units of the *output* element size.
    auto size = in.data_size();
    out.set_data(
        allocator::malloc_or_wait(size * out.itemsize()),
        size,
        in.strides(),
        in.flags());
  }
}

// Apply `op` to `shape` input elements and write the results densely.
//
// The input is read starting at `a` and advancing by `stride` elements after
// each read; the i-th result is written to `out[i]`.
template <typename T, typename U = T, typename Op>
void unary_op(const T* a, U* out, Op op, size_t shape, size_t stride) {
  const T* src = a;
  U* const last = out + shape;
  for (U* dst = out; dst != last; ++dst) {
    *dst = op(*src);
    src += stride;
  }
}

// Apply `op` element-wise to array `a`, reading elements as `T` and writing
// elements of type `U` into `out`.
//
// Two paths:
//  * `a` is flagged contiguous: `out` is prepared by `set_unary_output_data`
//    and the `a.data_size()` stored elements are processed linearly, first in
//    chunks of `simd::max_size<T>` through `simd::load` / `simd::store`, then
//    one element at a time for the remainder.
//  * otherwise: `out` gets a fresh buffer of `out.nbytes()` bytes and is
//    filled in order, one last-axis row at a time, using the strided pointer
//    overload of `unary_op`.
template <typename T, typename U = T, typename Op>
void unary_op(const array& a, array& out, Op op) {
  // Fetch the input pointer before `out` is given any storage.
  const T* src = a.data<T>();

  if (!a.flags().contiguous) {
    out.set_data(allocator::malloc_or_wait(out.nbytes()));
    U* dst = out.data<U>();

    // Extent and stride of the innermost axis; a 0-d input is handled as a
    // single element with unit stride.
    const bool has_dims = a.ndim() > 0;
    size_t shape = has_dims ? a.shape(-1) : 1;
    size_t stride = has_dims ? a.strides(-1) : 1;

    if (a.ndim() <= 1) {
      unary_op(src, dst, op, shape, stride);
      return;
    }

    // The iterator is built with `ndim() - 1` dims; `it.loc` is used as the
    // input offset of each row while the output advances by `shape` elements.
    ContiguousIterator it(a.shape(), a.strides(), a.ndim() - 1);
    for (size_t elem = 0; elem < a.size(); elem += shape) {
      unary_op(src + it.loc, dst + elem, op, shape, stride);
      it.step();
    }
    return;
  }

  set_unary_output_data(a, out);
  U* dst = out.data<U>();
  constexpr int N = simd::max_size<T>;
  size_t remaining = a.data_size();

  // Full vector-width chunks.
  for (; remaining >= N; remaining -= N) {
    simd::store(dst, op(simd::load<T, N>(src)));
    src += N;
    dst += N;
  }

  // Leftover elements that do not fill a whole chunk.
  for (; remaining > 0; --remaining) {
    *dst = op(*src);
    ++dst;
    ++src;
  }
}

// Run `op` element-wise on `a` into `out`, selecting the element type from
// `out.dtype()`. The same C++ type is used for reading `a` and writing `out`
// (the `U` parameter of `unary_op` is left at its default). A dtype that
// matches none of the cases below results in no work being done.
template <typename Op>
void unary(const array& a, array& out, Op op) {
  switch (out.dtype()) {
    // Boolean
    case bool_:
      unary_op<bool>(a, out, op);
      return;

    // Unsigned integers
    case uint8:
      unary_op<uint8_t>(a, out, op);
      return;
    case uint16:
      unary_op<uint16_t>(a, out, op);
      return;
    case uint32:
      unary_op<uint32_t>(a, out, op);
      return;
    case uint64:
      unary_op<uint64_t>(a, out, op);
      return;

    // Signed integers
    case int8:
      unary_op<int8_t>(a, out, op);
      return;
    case int16:
      unary_op<int16_t>(a, out, op);
      return;
    case int32:
      unary_op<int32_t>(a, out, op);
      return;
    case int64:
      unary_op<int64_t>(a, out, op);
      return;

    // Floating point and complex
    case float16:
      unary_op<float16_t>(a, out, op);
      return;
    case float32:
      unary_op<float>(a, out, op);
      return;
    case bfloat16:
      unary_op<bfloat16_t>(a, out, op);
      return;
    case complex64:
      unary_op<complex64_t>(a, out, op);
      return;
  }
}

template <typename Op>
void unary_fp(const array& a, array& out, Op op) {
  switch (out.dtype()) {
    case bfloat16:
      unary_op<bfloat16_t>(a, out, op);
      break;
    case float16:
      unary_op<float16_t>(a, out, op);
      break;
    case float32:
      unary_op<float>(a, out, op);
      break;
    case complex64:
      unary_op<complex64_t>(a, out, op);
      break;
    default:
      std::ostringstream err;
      err << "[unary_fp] Does not support " << out.dtype();
      throw std::runtime_error(err.str());
  }
}

} // namespace mlx::core
copyright + ack 2023-12-01 03:12:53 +08:00			`// Copyright © 2023 Apple Inc.`

awni's commit files 2023-11-30 02:30:41 +08:00			`#pragma once`

			`#include "mlx/allocator.h"`
			`#include "mlx/array.h"`
			`#include "mlx/backend/common/utils.h"`
Refactor common into cpu specific and truly common (#1817) * refactor * fix extension example * fix no-cpu 2025-02-04 07:58:02 +08:00			`#include "mlx/backend/cpu/simd/simd.h"`
awni's commit files 2023-11-30 02:30:41 +08:00			`#include "mlx/utils.h"`

			`namespace mlx::core {`

Buffer Donation (#519) * buffer donation * fix to move shared pointer * format * gpu in place for copy and binary * revert ops test * cpu in place * a little cleanup * remove useless bench 2024-01-27 08:30:33 +08:00			`void set_unary_output_data(const array& in, array& out) {`
Allow querying the allocator for the buffer size (#1404) 2024-09-12 12:02:16 +08:00			`if (is_donatable(in, out)) {`
Buffer Donation (#519) * buffer donation * fix to move shared pointer * format * gpu in place for copy and binary * revert ops test * cpu in place * a little cleanup * remove useless bench 2024-01-27 08:30:33 +08:00			`out.copy_shared_buffer(in);`
			`} else {`
			`auto size = in.data_size();`
			`out.set_data(`
			`allocator::malloc_or_wait(size * out.itemsize()),`
			`size,`
			`in.strides(),`
			`in.flags());`
			`}`
			`}`

Real and Imag (#1490) * real and imag * fix * fix 2024-10-16 07:23:15 +08:00			`template <typename T, typename U = T, typename Op>`
			`void unary_op(const T* a, U* out, Op op, size_t shape, size_t stride) {`
Faster cpu ops (#1434) * faster binary and cleaner copy * use recursive template for other ops * more cleanup * fix from cleanup * more clean * fix binary * use contiguous iterator * add 3d * nits * fix * fix? * fix * fix rebase 2024-09-27 00:19:13 +08:00			`for (size_t i = 0; i < shape; i += 1) {`
			`out[i] = op(*a);`
			`a += stride;`
			`}`
			`}`

Real and Imag (#1490) * real and imag * fix * fix 2024-10-16 07:23:15 +08:00			`template <typename T, typename U = T, typename Op>`
awni's commit files 2023-11-30 02:30:41 +08:00			`void unary_op(const array& a, array& out, Op op) {`
			`const T* a_ptr = a.data<T>();`
			`if (a.flags().contiguous) {`
Buffer Donation (#519) * buffer donation * fix to move shared pointer * format * gpu in place for copy and binary * revert ops test * cpu in place * a little cleanup * remove useless bench 2024-01-27 08:30:33 +08:00			`set_unary_output_data(a, out);`
Real and Imag (#1490) * real and imag * fix * fix 2024-10-16 07:23:15 +08:00			`U* dst = out.data<U>();`
Start to cleanup/unify accelerate and common back-ends (Part 1/N) (#1777) * start to cleanup/unify accelerate and common back-ends * more progress * simplify * add half type and allow infs in simd exp * unify softmax + quantized, more dispatches to simd quantized mm * add sin/cos, use simd in vector-scalar ops * faster CPU vectorize quant * faster erf/erfinv 2025-01-30 06:34:49 +08:00			`constexpr int N = simd::max_size<T>;`
			`size_t size = a.data_size();`
			`while (size >= N) {`
			`simd::store(dst, op(simd::load<T, N>(a_ptr)));`
			`size -= N;`
			`a_ptr += N;`
			`dst += N;`
			`}`
			`while (size > 0) {`
			`dst = op(a_ptr);`
			`size--;`
			`dst++;`
			`a_ptr++;`
awni's commit files 2023-11-30 02:30:41 +08:00			`}`
			`} else {`
			`out.set_data(allocator::malloc_or_wait(out.nbytes()));`
Real and Imag (#1490) * real and imag * fix * fix 2024-10-16 07:23:15 +08:00			`U* dst = out.data<U>();`
Faster cpu ops (#1434) * faster binary and cleaner copy * use recursive template for other ops * more cleanup * fix from cleanup * more clean * fix binary * use contiguous iterator * add 3d * nits * fix * fix? * fix * fix rebase 2024-09-27 00:19:13 +08:00			`size_t shape = a.ndim() > 0 ? a.shape(-1) : 1;`
			`size_t stride = a.ndim() > 0 ? a.strides(-1) : 1;`
			`if (a.ndim() <= 1) {`
			`unary_op(a_ptr, dst, op, shape, stride);`
			`return;`
			`}`
			`ContiguousIterator it(a.shape(), a.strides(), a.ndim() - 1);`
			`for (size_t elem = 0; elem < a.size(); elem += shape) {`
			`unary_op(a_ptr + it.loc, dst + elem, op, shape, stride);`
			`it.step();`
awni's commit files 2023-11-30 02:30:41 +08:00			`}`
			`}`
			`}`

			`template <typename Op>`
			`void unary(const array& a, array& out, Op op) {`
			`switch (out.dtype()) {`
			`case bool_:`
			`unary_op<bool>(a, out, op);`
			`break;`
			`case uint8:`
			`unary_op<uint8_t>(a, out, op);`
			`break;`
			`case uint16:`
			`unary_op<uint16_t>(a, out, op);`
			`break;`
			`case uint32:`
			`unary_op<uint32_t>(a, out, op);`
			`break;`
			`case uint64:`
			`unary_op<uint64_t>(a, out, op);`
			`break;`
			`case int8:`
			`unary_op<int8_t>(a, out, op);`
			`break;`
			`case int16:`
			`unary_op<int16_t>(a, out, op);`
			`break;`
			`case int32:`
			`unary_op<int32_t>(a, out, op);`
			`break;`
			`case int64:`
			`unary_op<int64_t>(a, out, op);`
			`break;`
			`case float16:`
			`unary_op<float16_t>(a, out, op);`
			`break;`
			`case float32:`
			`unary_op<float>(a, out, op);`
			`break;`
			`case bfloat16:`
			`unary_op<bfloat16_t>(a, out, op);`
			`break;`
			`case complex64:`
			`unary_op<complex64_t>(a, out, op);`
			`break;`
			`}`
			`}`

			`template <typename Op>`
			`void unary_fp(const array& a, array& out, Op op) {`
			`switch (out.dtype()) {`
			`case bfloat16:`
			`unary_op<bfloat16_t>(a, out, op);`
			`break;`
			`case float16:`
			`unary_op<float16_t>(a, out, op);`
			`break;`
			`case float32:`
			`unary_op<float>(a, out, op);`
			`break;`
			`case complex64:`
			`unary_op<complex64_t>(a, out, op);`
			`break;`
			`default:`
			`std::ostringstream err;`
			`err << "[unary_fp] Does not support " << out.dtype();`
			`throw std::runtime_error(err.str());`
			`}`
			`}`

			`} // namespace mlx::core`