mirror of
https://github.com/ml-explore/mlx.git
synced 2025-12-16 01:49:05 +08:00
Compare commits
5 Commits
jagrit06/c
...
bd9977acbb
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bd9977acbb | ||
|
|
102f3ba579 | ||
|
|
5e542d98e0 | ||
|
|
f403ea1764 | ||
|
|
1034009b82 |
@@ -8,7 +8,6 @@ target_sources(
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/arange.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/binary_two.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/copy.cu
|
||||
@@ -24,7 +23,6 @@ target_sources(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gemms/gemv.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gemms/steel_gemm.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/jit_module.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kernel_utils.cu
|
||||
@@ -46,12 +44,14 @@ target_sources(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/softmax.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/sort.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/unary.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
|
||||
|
||||
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/binary)
|
||||
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unary)
|
||||
|
||||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9.0)
|
||||
target_sources(
|
||||
mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm_batched_12_9.cu)
|
||||
|
||||
21
mlx/backend/cuda/binary/CMakeLists.txt
Normal file
21
mlx/backend/cuda/binary/CMakeLists.txt
Normal file
@@ -0,0 +1,21 @@
|
||||
target_sources(
|
||||
mlx
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/add.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arctan2.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bitwise_binary.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/divide.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/equal.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/greater.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/greater_equal.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/less.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/less_equal.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/logical_and.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/logical_or.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/log_add_exp.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/minimum.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/maximum.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/multiply.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/power.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/remainder.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/not_equal.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/subtract.cu)
|
||||
7
mlx/backend/cuda/binary/add.cu
Normal file
7
mlx/backend/cuda/binary/add.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Add)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/arctan2.cu
Normal file
7
mlx/backend/cuda/binary/arctan2.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(ArcTan2)
|
||||
} // namespace mlx::core
|
||||
@@ -99,39 +99,89 @@ __global__ void binary_vv(const In* a, const In* b, Out* out, IdxT size) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
|
||||
template <
|
||||
typename Op,
|
||||
typename In,
|
||||
typename Out,
|
||||
typename IdxT,
|
||||
int NDIM,
|
||||
int N_READS>
|
||||
__global__ void binary_g_nd(
|
||||
const In* a,
|
||||
const In* b,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>(
|
||||
index, shape.data(), a_strides.data(), b_strides.data());
|
||||
out[index] = Op{}(a[a_idx], b[b_idx]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[NDIM - 1];
|
||||
auto a_stride_x = a_strides[NDIM - 1];
|
||||
auto b_stride_x = b_strides[NDIM - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>(
|
||||
index_rest * shape_x, shape.data(), a_strides.data(), b_strides.data());
|
||||
auto a_vec =
|
||||
load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
|
||||
auto b_vec =
|
||||
load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));
|
||||
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(a_vec[i], b_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT>
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void binary_g(
|
||||
const In* a,
|
||||
const In* b,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides a_strides,
|
||||
const __grid_constant__ Strides b_strides,
|
||||
int ndim) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [a_idx, b_idx] = elem_to_loc(
|
||||
index, shape.data(), a_strides.data(), b_strides.data(), ndim);
|
||||
out[index] = Op{}(a[a_idx], b[b_idx]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto a_stride_x = a_strides[ndim - 1];
|
||||
auto b_stride_x = b_strides[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [a_idx, b_idx] = elem_to_loc(
|
||||
index_rest * shape_x,
|
||||
shape.data(),
|
||||
a_strides.data(),
|
||||
b_strides.data(),
|
||||
ndim);
|
||||
auto a_vec =
|
||||
load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
|
||||
auto b_vec =
|
||||
load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));
|
||||
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(a_vec[i], b_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out>
|
||||
@@ -209,39 +259,61 @@ void binary_op_gpu_inplace(
|
||||
auto& a_strides = strides[0];
|
||||
auto& b_strides = strides[1];
|
||||
int ndim = shape.size();
|
||||
int work_per_thread = 1;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = out.size() / dim0;
|
||||
if (dim0 >= 4) {
|
||||
work_per_thread = 4;
|
||||
}
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto dims_constant) {
|
||||
auto [num_blocks, block_dims] =
|
||||
get_launch_args(out, large());
|
||||
auto kernel = cu::binary_g_nd<
|
||||
Op,
|
||||
InType,
|
||||
OutType,
|
||||
IdxT,
|
||||
dims_constant(),
|
||||
1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::binary_g_nd<
|
||||
Op,
|
||||
InType,
|
||||
OutType,
|
||||
IdxT,
|
||||
dims_constant(),
|
||||
4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::binary_g_nd<
|
||||
Op,
|
||||
InType,
|
||||
OutType,
|
||||
IdxT,
|
||||
dims_constant()>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out.data<OutType>(),
|
||||
out.size(),
|
||||
rest,
|
||||
const_param<dims_constant()>(shape),
|
||||
const_param<dims_constant()>(a_strides),
|
||||
const_param<dims_constant()>(b_strides));
|
||||
});
|
||||
} else {
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large());
|
||||
auto kernel = cu::binary_g<Op, InType, OutType, IdxT, 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::binary_g<Op, InType, OutType, IdxT, 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::binary_g<Op, InType, OutType, IdxT>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out.data<OutType>(),
|
||||
out.size(),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(a_strides),
|
||||
const_param(b_strides),
|
||||
@@ -304,54 +376,4 @@ void binary_op_gpu(
|
||||
binary_op_gpu<cu::func>(inputs, out, name(), s); \
|
||||
}
|
||||
|
||||
BINARY_GPU(Add)
|
||||
BINARY_GPU(ArcTan2)
|
||||
BINARY_GPU(Divide)
|
||||
BINARY_GPU(Remainder)
|
||||
BINARY_GPU(Greater)
|
||||
BINARY_GPU(GreaterEqual)
|
||||
BINARY_GPU(Less)
|
||||
BINARY_GPU(LessEqual)
|
||||
BINARY_GPU(LogicalAnd)
|
||||
BINARY_GPU(LogicalOr)
|
||||
BINARY_GPU(LogAddExp)
|
||||
BINARY_GPU(Maximum)
|
||||
BINARY_GPU(Minimum)
|
||||
BINARY_GPU(Multiply)
|
||||
BINARY_GPU(NotEqual)
|
||||
BINARY_GPU(Power)
|
||||
BINARY_GPU(Subtract)
|
||||
|
||||
void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("Equal::eval_gpu");
|
||||
auto& s = out.primitive().stream();
|
||||
if (equal_nan_) {
|
||||
binary_op_gpu<cu::NaNEqual>(inputs, out, name(), s);
|
||||
} else {
|
||||
binary_op_gpu<cu::Equal>(inputs, out, name(), s);
|
||||
}
|
||||
}
|
||||
|
||||
void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("BitwiseBinary::eval_gpu");
|
||||
auto& s = out.primitive().stream();
|
||||
switch (op_) {
|
||||
case BitwiseBinary::And:
|
||||
binary_op_gpu<cu::BitwiseAnd>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::Or:
|
||||
binary_op_gpu<cu::BitwiseOr>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::Xor:
|
||||
binary_op_gpu<cu::BitwiseXor>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::LeftShift:
|
||||
binary_op_gpu<cu::LeftShift>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::RightShift:
|
||||
binary_op_gpu<cu::RightShift>(inputs, out, name(), s);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace mlx::core
|
||||
27
mlx/backend/cuda/binary/bitwise_binary.cu
Normal file
27
mlx/backend/cuda/binary/bitwise_binary.cu
Normal file
@@ -0,0 +1,27 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("BitwiseBinary::eval_gpu");
|
||||
auto& s = out.primitive().stream();
|
||||
switch (op_) {
|
||||
case BitwiseBinary::And:
|
||||
binary_op_gpu<cu::BitwiseAnd>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::Or:
|
||||
binary_op_gpu<cu::BitwiseOr>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::Xor:
|
||||
binary_op_gpu<cu::BitwiseXor>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::LeftShift:
|
||||
binary_op_gpu<cu::LeftShift>(inputs, out, name(), s);
|
||||
break;
|
||||
case BitwiseBinary::RightShift:
|
||||
binary_op_gpu<cu::RightShift>(inputs, out, name(), s);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/divide.cu
Normal file
7
mlx/backend/cuda/binary/divide.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Divide)
|
||||
} // namespace mlx::core
|
||||
15
mlx/backend/cuda/binary/equal.cu
Normal file
15
mlx/backend/cuda/binary/equal.cu
Normal file
@@ -0,0 +1,15 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("Equal::eval_gpu");
|
||||
auto& s = out.primitive().stream();
|
||||
if (equal_nan_) {
|
||||
binary_op_gpu<cu::NaNEqual>(inputs, out, name(), s);
|
||||
} else {
|
||||
binary_op_gpu<cu::Equal>(inputs, out, name(), s);
|
||||
}
|
||||
}
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/greater.cu
Normal file
7
mlx/backend/cuda/binary/greater.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Greater)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/greater_equal.cu
Normal file
7
mlx/backend/cuda/binary/greater_equal.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(GreaterEqual)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/less.cu
Normal file
7
mlx/backend/cuda/binary/less.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Less)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/less_equal.cu
Normal file
7
mlx/backend/cuda/binary/less_equal.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(LessEqual)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/log_add_exp.cu
Normal file
7
mlx/backend/cuda/binary/log_add_exp.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(LogAddExp)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/logical_and.cu
Normal file
7
mlx/backend/cuda/binary/logical_and.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(LogicalAnd)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/logical_or.cu
Normal file
7
mlx/backend/cuda/binary/logical_or.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(LogicalOr)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/maximum.cu
Normal file
7
mlx/backend/cuda/binary/maximum.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Maximum)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/minimum.cu
Normal file
7
mlx/backend/cuda/binary/minimum.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Minimum)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/multiply.cu
Normal file
7
mlx/backend/cuda/binary/multiply.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Multiply)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/not_equal.cu
Normal file
7
mlx/backend/cuda/binary/not_equal.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(NotEqual)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/power.cu
Normal file
7
mlx/backend/cuda/binary/power.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Power)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/remainder.cu
Normal file
7
mlx/backend/cuda/binary/remainder.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Remainder)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/binary/subtract.cu
Normal file
7
mlx/backend/cuda/binary/subtract.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/binary/binary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
BINARY_GPU(Subtract)
|
||||
} // namespace mlx::core
|
||||
@@ -127,45 +127,99 @@ binary_two_vv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int NDIM>
|
||||
template <
|
||||
typename Op,
|
||||
typename In,
|
||||
typename Out,
|
||||
typename IdxT,
|
||||
int NDIM,
|
||||
int N_READS>
|
||||
__global__ void binary_two_g_nd(
|
||||
const In* a,
|
||||
const In* b,
|
||||
Out* out_a,
|
||||
Out* out_b,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>(
|
||||
index, shape.data(), a_strides.data(), b_strides.data());
|
||||
auto out = Op{}(a[a_idx], b[b_idx]);
|
||||
out_a[index] = out[0];
|
||||
out_b[index] = out[1];
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[NDIM - 1];
|
||||
auto a_stride_x = a_strides[NDIM - 1];
|
||||
auto b_stride_x = b_strides[NDIM - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>(
|
||||
index_rest * shape_x, shape.data(), a_strides.data(), b_strides.data());
|
||||
auto a_vec =
|
||||
load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
|
||||
auto b_vec =
|
||||
load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));
|
||||
|
||||
AlignedVector<Out, N_READS> out_vec_a;
|
||||
AlignedVector<Out, N_READS> out_vec_b;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
auto out = Op{}(a_vec[i], b_vec[i]);
|
||||
out_vec_a[i] = out[0];
|
||||
out_vec_b[i] = out[1];
|
||||
}
|
||||
store_vector(out_a + shape_x * index_rest, index_x, out_vec_a, shape_x);
|
||||
store_vector(out_b + shape_x * index_rest, index_x, out_vec_b, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT>
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void binary_two_g(
|
||||
const In* a,
|
||||
const In* b,
|
||||
Out* out_a,
|
||||
Out* out_b,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides a_strides,
|
||||
const __grid_constant__ Strides b_strides,
|
||||
int ndim) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [a_idx, b_idx] = elem_to_loc(
|
||||
index, shape.data(), a_strides.data(), b_strides.data(), ndim);
|
||||
auto out = Op{}(a[a_idx], b[b_idx]);
|
||||
out_a[index] = out[0];
|
||||
out_b[index] = out[1];
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto a_stride_x = a_strides[ndim - 1];
|
||||
auto b_stride_x = b_strides[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [a_idx, b_idx] = elem_to_loc(
|
||||
index_rest * shape_x,
|
||||
shape.data(),
|
||||
a_strides.data(),
|
||||
b_strides.data(),
|
||||
ndim);
|
||||
auto a_vec =
|
||||
load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
|
||||
auto b_vec =
|
||||
load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));
|
||||
|
||||
AlignedVector<Out, N_READS> out_vec_a;
|
||||
AlignedVector<Out, N_READS> out_vec_b;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
auto out = Op{}(a_vec[i], b_vec[i]);
|
||||
out_vec_a[i] = out[0];
|
||||
out_vec_b[i] = out[1];
|
||||
}
|
||||
store_vector(out_a + shape_x * index_rest, index_x, out_vec_a, shape_x);
|
||||
store_vector(out_b + shape_x * index_rest, index_x, out_vec_b, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out>
|
||||
@@ -225,42 +279,64 @@ void binary_two_op_gpu_inplace(
|
||||
auto& a_strides = strides[0];
|
||||
auto& b_strides = strides[1];
|
||||
int ndim = shape.size();
|
||||
int work_per_thread = 1;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = out_a.size() / dim0;
|
||||
if (dim0 >= 4) {
|
||||
work_per_thread = 4;
|
||||
}
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto dims_constant) {
|
||||
auto [num_blocks, block_dims] =
|
||||
get_launch_args(out_a, large());
|
||||
auto kernel = cu::binary_two_g_nd<
|
||||
Op,
|
||||
InType,
|
||||
OutType,
|
||||
IdxT,
|
||||
dims_constant(),
|
||||
1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::binary_two_g_nd<
|
||||
Op,
|
||||
InType,
|
||||
OutType,
|
||||
IdxT,
|
||||
dims_constant(),
|
||||
4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::binary_two_g_nd<
|
||||
Op,
|
||||
InType,
|
||||
OutType,
|
||||
IdxT,
|
||||
dims_constant()>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out_a.data<OutType>(),
|
||||
out_b.data<OutType>(),
|
||||
out_a.size(),
|
||||
rest,
|
||||
const_param<dims_constant()>(shape),
|
||||
const_param<dims_constant()>(a_strides),
|
||||
const_param<dims_constant()>(b_strides));
|
||||
});
|
||||
} else {
|
||||
auto [num_blocks, block_dims] =
|
||||
get_launch_args(out_a, large());
|
||||
auto kernel = cu::binary_two_g<Op, InType, OutType, IdxT, 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::binary_two_g<Op, InType, OutType, IdxT, 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::binary_two_g<Op, InType, OutType, IdxT>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out_a.data<OutType>(),
|
||||
out_b.data<OutType>(),
|
||||
out_a.size(),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(a_strides),
|
||||
const_param(b_strides),
|
||||
|
||||
@@ -10,37 +10,80 @@ namespace cu {
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename In, typename Out, typename IdxT, int NDIM>
|
||||
template <typename In, typename Out, typename IdxT, int NDIM, int N_READS>
|
||||
__global__ void copy_gg_nd(
|
||||
const In* in,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [idx_in, idx_out] = elem_to_loc_nd<NDIM>(
|
||||
index, shape.data(), strides_in.data(), strides_out.data());
|
||||
out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[NDIM - 1];
|
||||
auto in_stride_x = strides_in[NDIM - 1];
|
||||
auto out_stride_x = strides_out[NDIM - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [idx_in, idx_out] = elem_to_loc_nd<NDIM>(
|
||||
index_rest * shape_x,
|
||||
shape.data(),
|
||||
strides_in.data(),
|
||||
strides_out.data());
|
||||
|
||||
auto in_vec =
|
||||
load_vector<N_READS>(in + idx_in, index_x, shape_x, in_stride_x, In(0));
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = CastOp<In, Out>{}(in_vec[i]);
|
||||
}
|
||||
store_vector(out + idx_out, index_x, out_vec, shape_x, out_stride_x);
|
||||
}
|
||||
|
||||
template <typename In, typename Out, typename IdxT>
|
||||
template <typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void copy_gg(
|
||||
const In* in,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides strides_in,
|
||||
const __grid_constant__ Strides strides_out,
|
||||
int ndim) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [idx_in, idx_out] = elem_to_loc(
|
||||
index, shape.data(), strides_in.data(), strides_out.data(), ndim);
|
||||
out[idx_out] = CastOp<In, Out>{}(in[idx_in]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto in_stride_x = strides_in[ndim - 1];
|
||||
auto out_stride_x = strides_out[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [idx_in, idx_out] = elem_to_loc(
|
||||
index_rest * shape_x,
|
||||
shape.data(),
|
||||
strides_in.data(),
|
||||
strides_out.data(),
|
||||
ndim);
|
||||
|
||||
auto in_vec =
|
||||
load_vector<N_READS>(in + idx_in, index_x, shape_x, in_stride_x, In(0));
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = CastOp<In, Out>{}(in_vec[i]);
|
||||
}
|
||||
store_vector(out + idx_out, index_x, out_vec, shape_x, out_stride_x);
|
||||
}
|
||||
|
||||
} // namespace cu
|
||||
@@ -69,33 +112,52 @@ void copy_general(
|
||||
size_t data_size = 1;
|
||||
for (auto& s : shape)
|
||||
data_size *= s;
|
||||
|
||||
int work_per_thread = 1;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = data_size / dim0;
|
||||
if (dim0 >= 4) {
|
||||
work_per_thread = 4;
|
||||
}
|
||||
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto ndim_constant) {
|
||||
auto [num_blocks, block_dims] =
|
||||
get_launch_args(data_size, shape, out.strides(), large());
|
||||
auto kernel =
|
||||
cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant(), 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel =
|
||||
cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant(), 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant()>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
in_ptr,
|
||||
out_ptr,
|
||||
data_size,
|
||||
rest,
|
||||
const_param<ndim_constant()>(shape),
|
||||
const_param<ndim_constant()>(strides_in),
|
||||
const_param<ndim_constant()>(strides_out));
|
||||
});
|
||||
} else { // ndim >= 4
|
||||
auto [num_blocks, block_dims] =
|
||||
get_launch_args(data_size, shape, out.strides(), large());
|
||||
auto kernel = cu::copy_gg<InType, OutType, IdxT, 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::copy_gg<InType, OutType, IdxT, 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::copy_gg<InType, OutType, IdxT>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
in_ptr,
|
||||
out_ptr,
|
||||
data_size,
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(strides_in),
|
||||
const_param(strides_out),
|
||||
|
||||
@@ -10,33 +10,67 @@ namespace cu {
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename In, typename Out, typename IdxT, int NDIM>
|
||||
template <typename In, typename Out, typename IdxT, int NDIM, int N_READS>
|
||||
__global__ void copy_g_nd(
|
||||
const In* in,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
IdxT idx_in = elem_to_loc_nd<NDIM>(index, shape.data(), strides_in.data());
|
||||
out[index] = CastOp<In, Out>{}(in[idx_in]);
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> strides) {
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[NDIM - 1];
|
||||
auto stride_x = strides[NDIM - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto idx =
|
||||
elem_to_loc_nd<NDIM>(index_rest * shape_x, shape.data(), strides.data());
|
||||
auto in_vec =
|
||||
load_vector<N_READS>(in + idx, index_x, shape_x, stride_x, In(0));
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = CastOp<In, Out>{}(in_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
template <typename In, typename Out, typename IdxT>
|
||||
template <typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void copy_g(
|
||||
const In* in,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides strides_in,
|
||||
const __grid_constant__ Strides strides,
|
||||
int ndim) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
IdxT idx_in = elem_to_loc(index, shape.data(), strides_in.data(), ndim);
|
||||
out[index] = CastOp<In, Out>{}(in[idx_in]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto stride_x = strides[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto idx =
|
||||
elem_to_loc(index_rest * shape_x, shape.data(), strides.data(), ndim);
|
||||
auto in_vec =
|
||||
load_vector<N_READS>(in + idx, index_x, shape_x, stride_x, In(0));
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = CastOp<In, Out>{}(in_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
} // namespace cu
|
||||
@@ -61,30 +95,49 @@ void copy_general_input(
|
||||
const InType* in_ptr = in.data<InType>() + offset_in;
|
||||
OutType* out_ptr = out.data<OutType>() + offset_out;
|
||||
int ndim = shape.size();
|
||||
int work_per_thread = 1;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = out.size() / dim0;
|
||||
if (dim0 >= 4) {
|
||||
work_per_thread = 4;
|
||||
}
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto dims_constant) {
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large());
|
||||
auto kernel =
|
||||
cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel =
|
||||
cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::copy_g_nd<InType, OutType, IdxT, dims_constant()>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
in_ptr,
|
||||
out_ptr,
|
||||
out.size(),
|
||||
rest,
|
||||
const_param<dims_constant()>(shape),
|
||||
const_param<dims_constant()>(strides_in));
|
||||
});
|
||||
} else { // ndim >= 4
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large());
|
||||
auto kernel = cu::copy_g<InType, OutType, IdxT, 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::copy_g<InType, OutType, IdxT, 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::copy_g<InType, OutType, IdxT>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
in_ptr,
|
||||
out_ptr,
|
||||
out.size(),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(strides_in),
|
||||
ndim);
|
||||
|
||||
@@ -146,6 +146,23 @@ inline __device__ void store_vector(
|
||||
}
|
||||
}
|
||||
|
||||
template <int N, typename T, typename SizeT>
|
||||
inline __device__ void store_vector(
|
||||
T* ptr,
|
||||
uint32_t offset,
|
||||
const AlignedVector<T, N>& vec,
|
||||
SizeT size,
|
||||
int64_t stride) {
|
||||
if (is_aligned<N>(ptr) && (offset + 1) * N <= size && stride == 1) {
|
||||
auto* to = reinterpret_cast<AlignedVector<T, N>*>(ptr);
|
||||
to[offset] = vec;
|
||||
} else {
|
||||
for (int i = 0; (offset * N + i) < size && i < N; ++i) {
|
||||
ptr[stride * (offset * N + i)] = vec[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Type limits utils
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -1,301 +0,0 @@
|
||||
#include "mlx/backend/common/matmul.h"
|
||||
#include "mlx/backend/cuda/device.h"
|
||||
#include "mlx/backend/cuda/device/utils.cuh"
|
||||
#include "mlx/backend/cuda/gemms/steel_gemm.h"
|
||||
#include "mlx/backend/cuda/kernel_utils.cuh"
|
||||
#include "mlx/primitives.h"
|
||||
|
||||
#include <nvtx3/nvtx3.hpp>
|
||||
#include <numeric>
|
||||
|
||||
#include <cooperative_groups.h>
|
||||
|
||||
#include "mlx/backend/cuda/steel/gemm.cuh"
|
||||
#include "mlx/backend/cuda/steel/mma.cuh"
|
||||
#include "mlx/backend/cuda/steel/tiles.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
namespace cu {
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
struct GemmParams {
|
||||
int M;
|
||||
int N;
|
||||
int K;
|
||||
int lda;
|
||||
int ldb;
|
||||
int ldd;
|
||||
|
||||
int NblockM;
|
||||
int NblockN;
|
||||
int NblockK;
|
||||
};
|
||||
|
||||
template <
|
||||
typename T,
|
||||
int BM,
|
||||
int BN,
|
||||
int BK,
|
||||
int WM,
|
||||
int WN,
|
||||
bool transpose_a,
|
||||
bool transpose_b,
|
||||
int SL,
|
||||
int Nstages>
|
||||
__global__ void kernel_steel_gemm(
|
||||
const T* a,
|
||||
const T* b,
|
||||
T* d,
|
||||
__grid_constant__ const GemmParams params) {
|
||||
const int bM_idx = (blockIdx.y << SL) + (blockIdx.x & ((1 << SL) - 1));
|
||||
const int bN_idx = blockIdx.x >> SL;
|
||||
|
||||
if (params.NblockN <= bN_idx || params.NblockM <= bM_idx) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int d_row = bM_idx * BM;
|
||||
const int d_col = bN_idx * BN;
|
||||
const size_t d_row_long = size_t(d_row);
|
||||
const size_t d_col_long = size_t(d_col);
|
||||
|
||||
a += transpose_a ? d_row_long : d_row_long * params.K;
|
||||
b += transpose_b ? d_col_long * params.K : d_col_long;
|
||||
d += d_row_long * params.ldd + d_col_long;
|
||||
|
||||
auto block = cg::this_thread_block();
|
||||
auto warp = cg::tiled_partition<32>(block);
|
||||
|
||||
const int lane_idx = warp.thread_rank();
|
||||
const int warp_idx = warp.meta_group_rank();
|
||||
|
||||
const int wm = warp_idx / WN;
|
||||
const int wn = warp_idx % WN;
|
||||
|
||||
constexpr int SM = BM / WM;
|
||||
constexpr int SN = BN / WN;
|
||||
constexpr int SK = BK;
|
||||
constexpr int TK = SK / 16;
|
||||
|
||||
constexpr int NUM_WARPS = WM * WN;
|
||||
|
||||
// Allocate shared memory
|
||||
extern __shared__ char shmem[];
|
||||
SharedTile<T, BM, BK>(&as)[Nstages] =
|
||||
*(SharedTile<T, BM, BK>(*)[Nstages])(&shmem[0]);
|
||||
SharedTile<T, BN, BK>(&bs)[Nstages] = *(SharedTile<T, BN, BK>(*)[Nstages])(
|
||||
&shmem[sizeof(T) * Nstages * BM * BK]);
|
||||
|
||||
// Allocate registers for the MMA
|
||||
RegisterTile<float, SM, SN> C;
|
||||
RegisterTile<T, SM, 16> A[TK];
|
||||
RegisterTile<T, SN, 16> B[TK];
|
||||
|
||||
// Zero the accumulators
|
||||
C.fill(0);
|
||||
|
||||
// Start gmem -> smem copies
|
||||
int k_block_read = 0;
|
||||
|
||||
MLX_UNROLL
|
||||
for (int bk = 0; bk < (Nstages - 1); bk++) {
|
||||
load_async<NUM_WARPS>(
|
||||
as[bk], as[bk].base_addr(), a + k_block_read, params.K);
|
||||
load_async<NUM_WARPS>(
|
||||
bs[bk], bs[bk].base_addr(), b + k_block_read, params.K);
|
||||
k_block_read += BK;
|
||||
cp_async_commit();
|
||||
}
|
||||
|
||||
int smem_pipe_read = 0;
|
||||
int smem_pipe_write = Nstages - 1;
|
||||
|
||||
// Wait till only 1 remains laoding
|
||||
cp_async_wait<1>();
|
||||
block.sync();
|
||||
|
||||
const int offset_m = wm * SM;
|
||||
const int offset_n = wn * SN;
|
||||
|
||||
// Start smem -> register copy
|
||||
A[0].load(
|
||||
as[smem_pipe_read],
|
||||
as[smem_pipe_read].base_addr(),
|
||||
offset_m + lane_idx % 16,
|
||||
lane_idx / 16 * 8);
|
||||
B[0].load(
|
||||
bs[smem_pipe_read],
|
||||
bs[smem_pipe_read].base_addr(),
|
||||
offset_n + lane_idx % 16,
|
||||
lane_idx / 16 * 8);
|
||||
|
||||
// Main loop
|
||||
for (int kb = 0; kb < params.NblockK; kb++) {
|
||||
// Prepare next registers
|
||||
{
|
||||
A[1].load(
|
||||
as[smem_pipe_read],
|
||||
as[smem_pipe_read].base_addr(),
|
||||
offset_m + lane_idx % 16,
|
||||
16 + lane_idx / 16 * 8);
|
||||
B[1].load(
|
||||
bs[smem_pipe_read],
|
||||
bs[smem_pipe_read].base_addr(),
|
||||
offset_n + lane_idx % 16,
|
||||
16 + lane_idx / 16 * 8);
|
||||
}
|
||||
|
||||
// Prepare next smem
|
||||
if ((kb + Nstages - 1) < params.NblockK) {
|
||||
load_async<NUM_WARPS>(
|
||||
as[smem_pipe_write],
|
||||
as[smem_pipe_write].base_addr(),
|
||||
a + k_block_read,
|
||||
params.K);
|
||||
load_async<NUM_WARPS>(
|
||||
bs[smem_pipe_write],
|
||||
bs[smem_pipe_write].base_addr(),
|
||||
b + k_block_read,
|
||||
params.K);
|
||||
}
|
||||
k_block_read += BK;
|
||||
|
||||
cp_async_commit();
|
||||
|
||||
smem_pipe_write = smem_pipe_read;
|
||||
smem_pipe_read = smem_pipe_read + 1;
|
||||
smem_pipe_read = (smem_pipe_read == Nstages) ? 0 : smem_pipe_read;
|
||||
|
||||
// Do current gemm
|
||||
mma_t(C, A[0], B[0]);
|
||||
|
||||
// Do wait for next register
|
||||
cp_async_wait<1>();
|
||||
block.sync();
|
||||
|
||||
// Prepare next register (smem_pipe_read has moved to the next)
|
||||
{
|
||||
A[0].load(
|
||||
as[smem_pipe_read],
|
||||
as[smem_pipe_read].base_addr(),
|
||||
offset_m + lane_idx % 16,
|
||||
lane_idx / 16 * 8);
|
||||
B[0].load(
|
||||
bs[smem_pipe_read],
|
||||
bs[smem_pipe_read].base_addr(),
|
||||
offset_n + lane_idx % 16,
|
||||
lane_idx / 16 * 8);
|
||||
}
|
||||
|
||||
// Do current gemm
|
||||
mma_t(C, A[1], B[1]);
|
||||
}
|
||||
|
||||
// Wait and clear
|
||||
cp_async_wait_all();
|
||||
block.sync();
|
||||
|
||||
C.store_global(d, params.ldd, offset_m, offset_n);
|
||||
}
|
||||
|
||||
} // namespace cu
|
||||
|
||||
void dispatch_steel_gemm(
|
||||
const Stream& s,
|
||||
cu::CommandEncoder& encoder,
|
||||
const array& a,
|
||||
const array& b,
|
||||
array& d,
|
||||
int M,
|
||||
int N,
|
||||
int K,
|
||||
int lda,
|
||||
int ldb,
|
||||
int ldd,
|
||||
bool a_transposed,
|
||||
bool b_transposed) {
|
||||
using DataType = cuda_type_t<float16_t>;
|
||||
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(d);
|
||||
|
||||
constexpr int BM = 128;
|
||||
constexpr int BN = 128;
|
||||
constexpr int BK = 32;
|
||||
|
||||
constexpr int WM = 2;
|
||||
constexpr int WN = 2;
|
||||
|
||||
constexpr int SL = 0;
|
||||
constexpr int Nstages = 3;
|
||||
|
||||
constexpr uint32_t smem_bytes = BK * (BM + BN) * Nstages * sizeof(DataType);
|
||||
|
||||
const int NblockM = (M + BM - 1) / BM;
|
||||
const int NblockN = (N + BN - 1) / BN;
|
||||
const int NblockK = (K + BK - 1) / BK;
|
||||
|
||||
cu::GemmParams params{
|
||||
/* int M = */ M,
|
||||
/* int N = */ N,
|
||||
/* int K = */ K,
|
||||
/* int lda = */ lda,
|
||||
/* int ldb = */ ldb,
|
||||
/* int ldd = */ ldd,
|
||||
|
||||
/* int NblockM = */ NblockM,
|
||||
/* int NblockN = */ NblockN,
|
||||
/* int NblockK = */ NblockK,
|
||||
};
|
||||
|
||||
// Prepare launch grid params
|
||||
int tile = 1 << SL;
|
||||
int tm = (NblockM + tile - 1) / tile;
|
||||
int tn = NblockN * tile;
|
||||
|
||||
dim3 grid_dim(tn, tm, 1);
|
||||
dim3 block_dim(32 * WM * WN, 1, 1);
|
||||
|
||||
dispatch_bool(a_transposed, [&](auto ta_) {
|
||||
dispatch_bool(b_transposed, [&](auto tb_) {
|
||||
constexpr bool ta = ta_.value;
|
||||
constexpr bool tb = tb_.value;
|
||||
|
||||
auto kernel = cu::ab_t_aligned<DataType, BM, BN, BK>;
|
||||
cudaFuncSetAttribute(
|
||||
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
|
||||
|
||||
encoder.add_kernel_node(
|
||||
kernel,
|
||||
grid_dim,
|
||||
block_dim,
|
||||
smem_bytes,
|
||||
a.data<DataType>(),
|
||||
b.data<DataType>(),
|
||||
d.data<DataType>(),
|
||||
N,
|
||||
K);
|
||||
|
||||
// auto kernel = cu::kernel_steel_gemm<DataType, BM, BN, BK, WM, WN, ta,
|
||||
// tb, SL, Nstages>;
|
||||
|
||||
// cudaFuncSetAttribute(kernel,
|
||||
// cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
|
||||
|
||||
// encoder.add_kernel_node(
|
||||
// kernel,
|
||||
// grid_dim,
|
||||
// block_dim,
|
||||
// smem_bytes,
|
||||
// a.data<DataType>(),
|
||||
// b.data<DataType>(),
|
||||
// d.data<DataType>(),
|
||||
// params);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace mlx::core
|
||||
@@ -1,27 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "mlx/backend/common/matmul.h"
|
||||
#include "mlx/backend/cuda/device.h"
|
||||
#include "mlx/primitives.h"
|
||||
|
||||
#include <nvtx3/nvtx3.hpp>
|
||||
#include <numeric>
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
void dispatch_steel_gemm(
|
||||
const Stream& s,
|
||||
cu::CommandEncoder& encoder,
|
||||
const array& a,
|
||||
const array& b,
|
||||
array& d,
|
||||
int M,
|
||||
int N,
|
||||
int K,
|
||||
int lda,
|
||||
int ldb,
|
||||
int ldd,
|
||||
bool a_transposed,
|
||||
bool b_transposed);
|
||||
|
||||
} // namespace mlx::core
|
||||
@@ -7,8 +7,6 @@
|
||||
#include "mlx/backend/gpu/copy.h"
|
||||
#include "mlx/primitives.h"
|
||||
|
||||
#include "mlx/backend/cuda/gemms/steel_gemm.h"
|
||||
|
||||
#include <nvtx3/nvtx3.hpp>
|
||||
#include <numeric>
|
||||
|
||||
@@ -97,24 +95,6 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (out.dtype() == float16 && batch_count == 1 && !a_transposed &&
|
||||
b_transposed) {
|
||||
return dispatch_steel_gemm(
|
||||
/* const Stream& s = */ s,
|
||||
/* cu::CommandEncoder& encoder = */ encoder,
|
||||
/* const array& a = */ a,
|
||||
/* const array& b = */ b,
|
||||
/* array& d = */ out,
|
||||
/* int M = */ M,
|
||||
/* int N = */ N,
|
||||
/* int K = */ K,
|
||||
/* int lda = */ lda,
|
||||
/* int ldb = */ ldb,
|
||||
/* int ldd = */ N,
|
||||
/* bool a_transposed = */ a_transposed,
|
||||
/* bool b_transposed = */ b_transposed);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Invoke cublasLt
|
||||
CublasGemm gemm(
|
||||
|
||||
@@ -143,87 +143,85 @@ struct Tile16x16 {
|
||||
}
|
||||
};
|
||||
|
||||
// /**
|
||||
// * A simple container of multiple Tile16x16.
|
||||
// *
|
||||
// * Provides utility functions for loading and manipulating collections of
|
||||
// basic
|
||||
// * tiles.
|
||||
// */
|
||||
// template <typename T, int ROWS_, int COLS_>
|
||||
// struct RegisterTile {
|
||||
// static constexpr int ROWS = ROWS_;
|
||||
// static constexpr int COLS = COLS_;
|
||||
// static constexpr int TILES_X = COLS / 16;
|
||||
// static constexpr int TILES_Y = ROWS / 16;
|
||||
/**
|
||||
* A simple container of multiple Tile16x16.
|
||||
*
|
||||
* Provides utility functions for loading and manipulating collections of basic
|
||||
* tiles.
|
||||
*/
|
||||
template <typename T, int ROWS_, int COLS_>
|
||||
struct RegisterTile {
|
||||
static constexpr int ROWS = ROWS_;
|
||||
static constexpr int COLS = COLS_;
|
||||
static constexpr int TILES_X = COLS / 16;
|
||||
static constexpr int TILES_Y = ROWS / 16;
|
||||
|
||||
// Tile16x16<T> data[TILES_X * TILES_Y];
|
||||
Tile16x16<T> data[TILES_X * TILES_Y];
|
||||
|
||||
// __device__ inline void fill(T v) {
|
||||
// MLX_UNROLL
|
||||
// for (int i = 0; i < TILES_Y; i++) {
|
||||
// MLX_UNROLL
|
||||
// for (int j = 0; j < TILES_X; j++) {
|
||||
// data[i * TILES_X + j].fill(v);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
__device__ inline void fill(T v) {
|
||||
MLX_UNROLL
|
||||
for (int i = 0; i < TILES_Y; i++) {
|
||||
MLX_UNROLL
|
||||
for (int j = 0; j < TILES_X; j++) {
|
||||
data[i * TILES_X + j].fill(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// template <typename Tile>
|
||||
// __device__ __forceinline__ void
|
||||
// load(Tile& tile, uint32_t base_address, int row, int col) {
|
||||
// MLX_UNROLL
|
||||
// for (int i = 0; i < TILES_Y; i++) {
|
||||
// MLX_UNROLL
|
||||
// for (int j = 0; j < TILES_X; j++) {
|
||||
// data[i * TILES_X + j].load(
|
||||
// tile.loc(base_address, row + i * 16, col + j * 16));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
template <typename Tile>
|
||||
__device__ __forceinline__ void
|
||||
load(Tile& tile, uint32_t base_address, int row, int col) {
|
||||
MLX_UNROLL
|
||||
for (int i = 0; i < TILES_Y; i++) {
|
||||
MLX_UNROLL
|
||||
for (int j = 0; j < TILES_X; j++) {
|
||||
data[i * TILES_X + j].load(
|
||||
tile.loc(base_address, row + i * 16, col + j * 16));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// template <typename Tile, typename F>
|
||||
// __device__ __forceinline__ void
|
||||
// load(Tile& tile, F f, uint32_t base_address, int row, int col) {
|
||||
// MLX_UNROLL
|
||||
// for (int i = 0; i < TILES_Y; i++) {
|
||||
// MLX_UNROLL
|
||||
// for (int j = 0; j < TILES_X; j++) {
|
||||
// f(data[i * TILES_X + j],
|
||||
// tile,
|
||||
// base_address,
|
||||
// row + i * 16,
|
||||
// col + j * 16);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
template <typename Tile, typename F>
|
||||
__device__ __forceinline__ void
|
||||
load(Tile& tile, F f, uint32_t base_address, int row, int col) {
|
||||
MLX_UNROLL
|
||||
for (int i = 0; i < TILES_Y; i++) {
|
||||
MLX_UNROLL
|
||||
for (int j = 0; j < TILES_X; j++) {
|
||||
f(data[i * TILES_X + j],
|
||||
tile,
|
||||
base_address,
|
||||
row + i * 16,
|
||||
col + j * 16);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// template <typename U>
|
||||
// __device__ inline void store_global(U* x, int N, int row, int col) {
|
||||
// MLX_UNROLL
|
||||
// for (int i = 0; i < TILES_Y; i++) {
|
||||
// MLX_UNROLL
|
||||
// for (int j = 0; j < TILES_X; j++) {
|
||||
// data[i * TILES_X + j].store_global(
|
||||
// x + (row + i * 16) * N + col + j * 16, N);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
template <typename U>
|
||||
__device__ inline void store_global(U* x, int N, int row, int col) {
|
||||
MLX_UNROLL
|
||||
for (int i = 0; i < TILES_Y; i++) {
|
||||
MLX_UNROLL
|
||||
for (int j = 0; j < TILES_X; j++) {
|
||||
data[i * TILES_X + j].store_global(
|
||||
x + (row + i * 16) * N + col + j * 16, N);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// template <typename U>
|
||||
// __device__ inline void
|
||||
// store_global_safe(U* x, int N, int row, int col, int max_rows) {
|
||||
// MLX_UNROLL
|
||||
// for (int i = 0; i < TILES_Y; i++) {
|
||||
// MLX_UNROLL
|
||||
// for (int j = 0; j < TILES_X; j++) {
|
||||
// data[i * TILES_X + j].store_global_safe(
|
||||
// x + (row + i * 16) * N + col + j * 16, N, max_rows - row - i *
|
||||
// 16);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// };
|
||||
template <typename U>
|
||||
__device__ inline void
|
||||
store_global_safe(U* x, int N, int row, int col, int max_rows) {
|
||||
MLX_UNROLL
|
||||
for (int i = 0; i < TILES_Y; i++) {
|
||||
MLX_UNROLL
|
||||
for (int j = 0; j < TILES_X; j++) {
|
||||
data[i * TILES_X + j].store_global_safe(
|
||||
x + (row + i * 16) * N + col + j * 16, N, max_rows - row - i * 16);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* A simple container of multiple Tile16x16.
|
||||
|
||||
@@ -39,52 +39,98 @@ ternary_v(const bool* a, const T* b, const T* c, T* out, IdxT size) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Op, typename T, typename IdxT, int NDIM>
|
||||
template <typename Op, typename T, typename IdxT, int NDIM, int N_READS>
|
||||
__global__ void ternary_g_nd(
|
||||
const bool* a,
|
||||
const T* b,
|
||||
const T* c,
|
||||
T* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides,
|
||||
const __grid_constant__ cuda::std::array<int64_t, NDIM> c_strides) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [a_idx, b_idx, c_idx] = elem_to_loc_nd<NDIM>(
|
||||
index,
|
||||
shape.data(),
|
||||
a_strides.data(),
|
||||
b_strides.data(),
|
||||
c_strides.data());
|
||||
out[index] = Op{}(a[a_idx], b[b_idx], c[c_idx]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[NDIM - 1];
|
||||
auto a_stride_x = a_strides[NDIM - 1];
|
||||
auto b_stride_x = b_strides[NDIM - 1];
|
||||
auto c_stride_x = c_strides[NDIM - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [a_idx, b_idx, c_idx] = elem_to_loc_nd<NDIM>(
|
||||
index_rest * shape_x,
|
||||
shape.data(),
|
||||
a_strides.data(),
|
||||
b_strides.data(),
|
||||
c_strides.data());
|
||||
auto a_vec =
|
||||
load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, false);
|
||||
auto b_vec =
|
||||
load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, T(0));
|
||||
auto c_vec =
|
||||
load_vector<N_READS>(c + c_idx, index_x, shape_x, c_stride_x, T(0));
|
||||
|
||||
AlignedVector<T, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(a_vec[i], b_vec[i], c_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename T, typename IdxT>
|
||||
template <typename Op, typename T, typename IdxT, int N_READS>
|
||||
__global__ void ternary_g(
|
||||
const bool* a,
|
||||
const T* b,
|
||||
const T* c,
|
||||
T* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides a_strides,
|
||||
const __grid_constant__ Strides b_strides,
|
||||
const __grid_constant__ Strides c_strides,
|
||||
int ndim) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto [a_idx, b_idx, c_idx] = elem_to_loc(
|
||||
index,
|
||||
shape.data(),
|
||||
a_strides.data(),
|
||||
b_strides.data(),
|
||||
c_strides.data(),
|
||||
ndim);
|
||||
out[index] = Op{}(a[a_idx], b[b_idx], c[c_idx]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto a_stride_x = a_strides[ndim - 1];
|
||||
auto b_stride_x = b_strides[ndim - 1];
|
||||
auto c_stride_x = c_strides[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto [a_idx, b_idx, c_idx] = elem_to_loc(
|
||||
index_rest * shape_x,
|
||||
shape.data(),
|
||||
a_strides.data(),
|
||||
b_strides.data(),
|
||||
c_strides.data(),
|
||||
ndim);
|
||||
auto a_vec =
|
||||
load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, false);
|
||||
auto b_vec =
|
||||
load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, T(0));
|
||||
auto c_vec =
|
||||
load_vector<N_READS>(c + c_idx, index_x, shape_x, c_stride_x, T(0));
|
||||
|
||||
AlignedVector<T, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(a_vec[i], b_vec[i], c_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
} // namespace cu
|
||||
@@ -123,36 +169,55 @@ void ternary_op_gpu_inplace(
|
||||
auto& b_strides = strides[1];
|
||||
auto& c_strides = strides[2];
|
||||
int ndim = shape.size();
|
||||
int work_per_thread = 1;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = out.size() / dim0;
|
||||
if (dim0 >= 4) {
|
||||
work_per_thread = 4;
|
||||
}
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto dims_constant) {
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large());
|
||||
auto kernel =
|
||||
cu::ternary_g_nd<Op, DType, IdxT, dims_constant(), 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel =
|
||||
cu::ternary_g_nd<Op, DType, IdxT, dims_constant(), 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::ternary_g_nd<Op, DType, IdxT, dims_constant()>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<bool>(),
|
||||
b.data<DType>(),
|
||||
c.data<DType>(),
|
||||
out.data<DType>(),
|
||||
out.size(),
|
||||
rest,
|
||||
const_param<dims_constant()>(shape),
|
||||
const_param<dims_constant()>(a_strides),
|
||||
const_param<dims_constant()>(b_strides),
|
||||
const_param<dims_constant()>(c_strides));
|
||||
});
|
||||
} else {
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large());
|
||||
auto kernel = cu::ternary_g<Op, DType, IdxT, 1>;
|
||||
if (work_per_thread == 4) {
|
||||
kernel = cu::ternary_g<Op, DType, IdxT, 4>;
|
||||
}
|
||||
encoder.add_kernel_node(
|
||||
cu::ternary_g<Op, DType, IdxT>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<bool>(),
|
||||
b.data<DType>(),
|
||||
c.data<DType>(),
|
||||
out.data<DType>(),
|
||||
out.data_size(),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(a_strides),
|
||||
const_param(b_strides),
|
||||
|
||||
@@ -37,19 +37,36 @@ __global__ void unary_v(const In* in, Out* out, IdxT size) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT>
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void unary_g(
|
||||
const In* in,
|
||||
Out* out,
|
||||
IdxT size,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides strides,
|
||||
int ndim) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
if (index < size) {
|
||||
auto idx = elem_to_loc(index, shape.data(), strides.data(), ndim);
|
||||
out[index] = Op{}(in[idx]);
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto stride_x = strides[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto idx =
|
||||
elem_to_loc(index_rest * shape_x, shape.data(), strides.data(), ndim);
|
||||
auto in_vec =
|
||||
load_vector<N_READS>(in + idx, index_x, shape_x, stride_x, In(0));
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(in_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out>
|
||||
@@ -127,8 +144,7 @@ void unary_op_gpu_inplace(
|
||||
using OutType = cuda_type_t<CTYPE_OUT>;
|
||||
if (contig) {
|
||||
using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
|
||||
// TODO: Choose optimized value based on type size.
|
||||
constexpr int N_READS = 4;
|
||||
constexpr int N_READS = 16 / sizeof(OutType);
|
||||
auto [num_blocks, block_dims] = get_launch_args(
|
||||
out.data_size(), out.shape(), out.strides(), large, N_READS);
|
||||
encoder.add_kernel_node(
|
||||
@@ -142,18 +158,30 @@ void unary_op_gpu_inplace(
|
||||
} else {
|
||||
using IdxT = std::conditional_t<large(), int64_t, int32_t>;
|
||||
auto [shape, strides] = collapse_contiguous_dims(in);
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large);
|
||||
auto ndim = shape.size();
|
||||
int work_per_thread = 1;
|
||||
auto kernel = cu::unary_g<Op, InType, OutType, IdxT, 1>;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = out.size() / dim0;
|
||||
if (dim0 >= 4) {
|
||||
kernel = cu::unary_g<Op, InType, OutType, IdxT, 4>;
|
||||
work_per_thread = 4;
|
||||
}
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
encoder.add_kernel_node(
|
||||
cu::unary_g<Op, InType, OutType, IdxT>,
|
||||
num_blocks,
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
in.data<InType>(),
|
||||
out.data<OutType>(),
|
||||
out.data_size(),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(strides),
|
||||
shape.size());
|
||||
ndim);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
|
||||
34
mlx/backend/cuda/unary/CMakeLists.txt
Normal file
34
mlx/backend/cuda/unary/CMakeLists.txt
Normal file
@@ -0,0 +1,34 @@
|
||||
target_sources(
|
||||
mlx
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/abs.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arccos.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arccosh.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arcsin.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arcsinh.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arctan.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arctanh.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bitwise_invert.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ceil.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/conjugate.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cos.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cosh.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/erf.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/erf_inv.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/exp.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/expm1.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/floor.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/imag.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/log.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/log1p.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/logical_not.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/negative.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/real.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/round.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sigmoid.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sign.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sin.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sinh.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sqrt.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/square.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tan.cu
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tanh.cu)
|
||||
7
mlx/backend/cuda/unary/abs.cu
Normal file
7
mlx/backend/cuda/unary/abs.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Abs)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/arccos.cu
Normal file
7
mlx/backend/cuda/unary/arccos.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ArcCos)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/arccosh.cu
Normal file
7
mlx/backend/cuda/unary/arccosh.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ArcCosh)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/arcsin.cu
Normal file
7
mlx/backend/cuda/unary/arcsin.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ArcSin)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/arcsinh.cu
Normal file
7
mlx/backend/cuda/unary/arcsinh.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ArcSinh)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/arctan.cu
Normal file
7
mlx/backend/cuda/unary/arctan.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ArcTan)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/arctanh.cu
Normal file
7
mlx/backend/cuda/unary/arctanh.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ArcTanh)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/bitwise_invert.cu
Normal file
7
mlx/backend/cuda/unary/bitwise_invert.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(BitwiseInvert)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/ceil.cu
Normal file
7
mlx/backend/cuda/unary/ceil.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Ceil)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/conjugate.cu
Normal file
7
mlx/backend/cuda/unary/conjugate.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Conjugate)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/cos.cu
Normal file
7
mlx/backend/cuda/unary/cos.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Cos)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/cosh.cu
Normal file
7
mlx/backend/cuda/unary/cosh.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Cosh)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/erf.cu
Normal file
7
mlx/backend/cuda/unary/erf.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Erf)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/erf_inv.cu
Normal file
7
mlx/backend/cuda/unary/erf_inv.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(ErfInv)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/exp.cu
Normal file
7
mlx/backend/cuda/unary/exp.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Exp)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/expm1.cu
Normal file
7
mlx/backend/cuda/unary/expm1.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Expm1)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/floor.cu
Normal file
7
mlx/backend/cuda/unary/floor.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Floor)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/imag.cu
Normal file
7
mlx/backend/cuda/unary/imag.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Imag)
|
||||
} // namespace mlx::core
|
||||
21
mlx/backend/cuda/unary/log.cu
Normal file
21
mlx/backend/cuda/unary/log.cu
Normal file
@@ -0,0 +1,21 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
void Log::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("Log::eval_gpu");
|
||||
auto& s = out.primitive().stream();
|
||||
switch (base_) {
|
||||
case Base::e:
|
||||
unary_op_gpu<cu::Log>(inputs, out, name(), s);
|
||||
break;
|
||||
case Base::two:
|
||||
unary_op_gpu<cu::Log2>(inputs, out, name(), s);
|
||||
break;
|
||||
case Base::ten:
|
||||
unary_op_gpu<cu::Log10>(inputs, out, name(), s);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/log1p.cu
Normal file
7
mlx/backend/cuda/unary/log1p.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Log1p)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/logical_not.cu
Normal file
7
mlx/backend/cuda/unary/logical_not.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(LogicalNot)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/negative.cu
Normal file
7
mlx/backend/cuda/unary/negative.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Negative)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/real.cu
Normal file
7
mlx/backend/cuda/unary/real.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Real)
|
||||
} // namespace mlx::core
|
||||
18
mlx/backend/cuda/unary/round.cu
Normal file
18
mlx/backend/cuda/unary/round.cu
Normal file
@@ -0,0 +1,18 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
void Round::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("Round::eval_gpu");
|
||||
assert(inputs.size() == 1);
|
||||
const auto& in = inputs[0];
|
||||
auto& s = out.primitive().stream();
|
||||
if (issubdtype(in.dtype(), inexact)) {
|
||||
unary_op_gpu<cu::Round>(inputs, out, name(), s);
|
||||
} else {
|
||||
// No-op integer types
|
||||
out.copy_shared_buffer(in);
|
||||
}
|
||||
}
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/sigmoid.cu
Normal file
7
mlx/backend/cuda/unary/sigmoid.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Sigmoid)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/sign.cu
Normal file
7
mlx/backend/cuda/unary/sign.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Sign)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/sin.cu
Normal file
7
mlx/backend/cuda/unary/sin.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Sin)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/sinh.cu
Normal file
7
mlx/backend/cuda/unary/sinh.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Sinh)
|
||||
} // namespace mlx::core
|
||||
15
mlx/backend/cuda/unary/sqrt.cu
Normal file
15
mlx/backend/cuda/unary/sqrt.cu
Normal file
@@ -0,0 +1,15 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
void Sqrt::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("Sqrt::eval_gpu");
|
||||
auto& s = out.primitive().stream();
|
||||
if (recip_) {
|
||||
unary_op_gpu<cu::Rsqrt>(inputs, out, "Rsqrt", s);
|
||||
} else {
|
||||
unary_op_gpu<cu::Sqrt>(inputs, out, "Sqrt", s);
|
||||
}
|
||||
}
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/square.cu
Normal file
7
mlx/backend/cuda/unary/square.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Square)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/tan.cu
Normal file
7
mlx/backend/cuda/unary/tan.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Tan)
|
||||
} // namespace mlx::core
|
||||
7
mlx/backend/cuda/unary/tanh.cu
Normal file
7
mlx/backend/cuda/unary/tanh.cu
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
|
||||
namespace mlx::core {
|
||||
UNARY_GPU(Tanh)
|
||||
} // namespace mlx::core
|
||||
215
mlx/backend/cuda/unary/unary.cuh
Normal file
215
mlx/backend/cuda/unary/unary.cuh
Normal file
@@ -0,0 +1,215 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/common/unary.h"
|
||||
#include "mlx/backend/cuda/device.h"
|
||||
#include "mlx/backend/cuda/device/unary_ops.cuh"
|
||||
#include "mlx/backend/cuda/kernel_utils.cuh"
|
||||
#include "mlx/dtype_utils.h"
|
||||
#include "mlx/primitives.h"
|
||||
|
||||
#include <cooperative_groups.h>
|
||||
#include <nvtx3/nvtx3.hpp>
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
namespace cu {
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void unary_v(const In* in, Out* out, IdxT size) {
|
||||
IdxT index = cg::this_grid().thread_rank();
|
||||
|
||||
if ((index + 1) * N_READS > size) {
|
||||
for (IdxT i = index * N_READS; i < size; ++i) {
|
||||
out[i] = Op{}(in[i]);
|
||||
}
|
||||
} else {
|
||||
auto in_vec = load_vector<N_READS>(in, index);
|
||||
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(in_vec[i]);
|
||||
}
|
||||
|
||||
store_vector<N_READS>(out, index, out_vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
|
||||
__global__ void unary_g(
|
||||
const In* in,
|
||||
Out* out,
|
||||
IdxT size_rest,
|
||||
const __grid_constant__ Shape shape,
|
||||
const __grid_constant__ Strides strides,
|
||||
int ndim) {
|
||||
auto block = cg::this_thread_block();
|
||||
auto grid = cg::this_grid();
|
||||
IdxT index_rest =
|
||||
grid.block_index().y * block.dim_threads().y + block.thread_index().y;
|
||||
if (index_rest >= size_rest) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto shape_x = shape[ndim - 1];
|
||||
auto stride_x = strides[ndim - 1];
|
||||
IdxT index_x =
|
||||
grid.block_index().x * block.dim_threads().x + block.thread_index().x;
|
||||
auto idx =
|
||||
elem_to_loc(index_rest * shape_x, shape.data(), strides.data(), ndim);
|
||||
auto in_vec =
|
||||
load_vector<N_READS>(in + idx, index_x, shape_x, stride_x, In(0));
|
||||
AlignedVector<Out, N_READS> out_vec;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
out_vec[i] = Op{}(in_vec[i]);
|
||||
}
|
||||
store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
|
||||
}
|
||||
|
||||
template <typename Op, typename In, typename Out>
|
||||
constexpr bool supports_unary_op() {
|
||||
if (std::is_same_v<Op, Abs> || std::is_same_v<Op, Negative> ||
|
||||
std::is_same_v<Op, Sign> || std::is_same_v<Op, Square>) {
|
||||
return std::is_same_v<In, Out>;
|
||||
}
|
||||
if (std::is_same_v<Op, ArcCosh> || std::is_same_v<Op, ArcSinh> ||
|
||||
std::is_same_v<Op, ArcTanh> || std::is_same_v<Op, Erf> ||
|
||||
std::is_same_v<Op, ErfInv> || std::is_same_v<Op, Expm1> ||
|
||||
std::is_same_v<Op, Sigmoid>) {
|
||||
return std::is_same_v<In, Out> && is_floating_v<In>;
|
||||
}
|
||||
if (std::is_same_v<Op, BitwiseInvert>) {
|
||||
return std::is_same_v<In, Out> && std::is_integral_v<In> &&
|
||||
!std::is_same_v<In, bool>;
|
||||
}
|
||||
if (std::is_same_v<Op, Ceil> || std::is_same_v<Op, Floor>) {
|
||||
return std::is_same_v<In, Out> && !mlx::core::is_complex_v<In>;
|
||||
}
|
||||
if (std::is_same_v<Op, Conjugate>) {
|
||||
return std::is_same_v<In, Out> && mlx::core::is_complex_v<In>;
|
||||
}
|
||||
if (std::is_same_v<Op, ArcCos> || std::is_same_v<Op, ArcSin> ||
|
||||
std::is_same_v<Op, ArcTan> || std::is_same_v<Op, Cos> ||
|
||||
std::is_same_v<Op, Cosh> || std::is_same_v<Op, Exp> ||
|
||||
std::is_same_v<Op, Log> || std::is_same_v<Op, Log2> ||
|
||||
std::is_same_v<Op, Log10> || std::is_same_v<Op, Log1p> ||
|
||||
std::is_same_v<Op, Round> || std::is_same_v<Op, Rsqrt> ||
|
||||
std::is_same_v<Op, Sqrt> || std::is_same_v<Op, Sin> ||
|
||||
std::is_same_v<Op, Sinh> || std::is_same_v<Op, Tan> ||
|
||||
std::is_same_v<Op, Tanh>) {
|
||||
return std::is_same_v<In, Out> && is_inexact_v<In>;
|
||||
}
|
||||
if (std::is_same_v<Op, Imag> || std::is_same_v<Op, Real>) {
|
||||
return mlx::core::is_complex_v<In> && std::is_same_v<Out, float>;
|
||||
}
|
||||
if (std::is_same_v<Op, LogicalNot>) {
|
||||
return std::is_same_v<In, Out> && std::is_same_v<In, bool>;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace cu
|
||||
|
||||
template <typename Op>
|
||||
void unary_op_gpu_inplace(
|
||||
const std::vector<array>& inputs,
|
||||
array& out,
|
||||
const char* op,
|
||||
const Stream& s) {
|
||||
auto& in = inputs[0];
|
||||
if (in.size() == 0) {
|
||||
return;
|
||||
}
|
||||
bool contig = in.flags().contiguous;
|
||||
bool large;
|
||||
if (!contig) {
|
||||
large = in.data_size() > INT32_MAX || out.size() > INT32_MAX;
|
||||
} else {
|
||||
large = in.data_size() > UINT32_MAX;
|
||||
}
|
||||
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
encoder.set_input_array(in);
|
||||
encoder.set_output_array(out);
|
||||
dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
|
||||
dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
|
||||
using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
|
||||
using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
|
||||
if constexpr (cu::supports_unary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
|
||||
dispatch_bool(large, [&](auto large) {
|
||||
using InType = cuda_type_t<CTYPE_IN>;
|
||||
using OutType = cuda_type_t<CTYPE_OUT>;
|
||||
if (contig) {
|
||||
using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
|
||||
constexpr int N_READS = 16 / sizeof(OutType);
|
||||
auto [num_blocks, block_dims] = get_launch_args(
|
||||
out.data_size(), out.shape(), out.strides(), large, N_READS);
|
||||
encoder.add_kernel_node(
|
||||
cu::unary_v<Op, InType, OutType, IdxT, N_READS>,
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
in.data<InType>(),
|
||||
out.data<OutType>(),
|
||||
out.data_size());
|
||||
} else {
|
||||
using IdxT = std::conditional_t<large(), int64_t, int32_t>;
|
||||
auto [shape, strides] = collapse_contiguous_dims(in);
|
||||
auto ndim = shape.size();
|
||||
int work_per_thread = 1;
|
||||
auto kernel = cu::unary_g<Op, InType, OutType, IdxT, 1>;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
auto rest = out.size() / dim0;
|
||||
if (dim0 >= 4) {
|
||||
kernel = cu::unary_g<Op, InType, OutType, IdxT, 4>;
|
||||
work_per_thread = 4;
|
||||
}
|
||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||
uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
|
||||
uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
|
||||
encoder.add_kernel_node(
|
||||
kernel,
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
in.data<InType>(),
|
||||
out.data<OutType>(),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(strides),
|
||||
ndim);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
throw std::runtime_error(fmt::format(
|
||||
"Can not do unary op {} on input of {} with output of {}.",
|
||||
op,
|
||||
dtype_to_string(in.dtype()),
|
||||
dtype_to_string(out.dtype())));
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
void unary_op_gpu(
|
||||
const std::vector<array>& inputs,
|
||||
array& out,
|
||||
const char* op,
|
||||
const Stream& s) {
|
||||
set_unary_output_data(inputs[0], out);
|
||||
unary_op_gpu_inplace<Op>(inputs, out, op, s);
|
||||
}
|
||||
|
||||
#define UNARY_GPU(func) \
|
||||
void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
|
||||
nvtx3::scoped_range r(#func "::eval_gpu"); \
|
||||
auto& s = out.primitive().stream(); \
|
||||
unary_op_gpu<cu::func>(inputs, out, name(), s); \
|
||||
}
|
||||
|
||||
} // namespace mlx::core
|
||||
Reference in New Issue
Block a user