mirror of
https://github.com/ml-explore/mlx.git
synced 2025-12-16 01:49:05 +08:00
Some checks failed
Build and Test / check_lint (push) Has been cancelled
Build and Test / linux_build_and_test (ubuntu-22.04) (push) Has been cancelled
Build and Test / linux_build_and_test (ubuntu-22.04-arm) (push) Has been cancelled
Build and Test / mac_build_and_test (14.0) (push) Has been cancelled
Build and Test / mac_build_and_test (15.0) (push) Has been cancelled
Build and Test / cuda_build_and_test (cuda-12.6) (push) Has been cancelled
Build and Test / cuda_build_and_test (cuda-12.9) (push) Has been cancelled
Build and Test / build_documentation (push) Has been cancelled
Build and Test / Linux Fedora CPP Build (aarch64) (push) Has been cancelled
Build and Test / Linux Fedora CPP Build (x86_64) (push) Has been cancelled
Nightly Build / build_linux_release (3.10) (push) Has been cancelled
Nightly Build / build_linux_release (3.14) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.11, ubuntu-22.04) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.11, ubuntu-22.04-arm) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.12, ubuntu-22.04) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.12, ubuntu-22.04-arm) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.13, ubuntu-22.04) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.13, ubuntu-22.04-arm) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.14, ubuntu-22.04) (push) Has been cancelled
Nightly Build / build_linux_with_tests (3.14, ubuntu-22.04-arm) (push) Has been cancelled
Nightly Build / build_mac_release (3.10) (push) Has been cancelled
Nightly Build / build_mac_release (3.13) (push) Has been cancelled
Nightly Build / build_cuda_release (push) Has been cancelled
69 lines
1.9 KiB
CUDA C++
// Copyright © 2025 Apple Inc.
|
|
|
|
#include "mlx/backend/cuda/device.h"
|
|
#include "mlx/backend/cuda/device/fp16_math.cuh"
|
|
#include "mlx/backend/cuda/kernel_utils.cuh"
|
|
#include "mlx/dtype_utils.h"
|
|
#include "mlx/primitives.h"
|
|
|
|
#include <cooperative_groups.h>
|
|
#include <nvtx3/nvtx3.hpp>
|
|
|
|
namespace mlx::core {
|
|
|
|
namespace cu {
|
|
|
|
namespace cg = cooperative_groups;
|
|
|
|
// Fill `out` with the arithmetic sequence start, start + step, ... .
// Each thread is responsible for N_WRITES consecutive elements: threads
// whose full group fits inside `size` emit one aligned vector store,
// while the last (ragged) thread falls back to scalar writes.
// Assumes the grid covers at least ceil(size / N_WRITES) threads.
// NOTE(review): `i * step` mixes an integer index with T — presumably
// fp16_math.cuh supplies the mixed operators for half types; confirm.
template <typename T, typename IdxT, int N_WRITES>
__global__ void arange(T* out, IdxT size, T start, T step) {
  IdxT tid = cg::this_grid().thread_rank();
  IdxT base = tid * N_WRITES;

  if ((tid + 1) * N_WRITES <= size) {
    // Fast path: all N_WRITES elements of this thread are in range.
    AlignedVector<T, N_WRITES> vals;
#pragma unroll
    for (int j = 0; j < N_WRITES; ++j) {
      vals[j] = start + (base + j) * step;
    }
    store_vector<N_WRITES>(out, tid, vals);
  } else {
    // Tail: write element by element, stopping at `size`.
    for (IdxT j = base; j < size; ++j) {
      out[j] = start + j * step;
    }
  }
}
|
|
|
|
} // namespace cu
|
|
|
|
// Evaluate an Arange primitive on the GPU: allocate the output buffer and
// record a launch of cu::arange into the stream's command encoder.
// `inputs` is unused — arange has no input arrays.
void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range profile_range("Arange::eval_gpu");
  // Empty output: nothing to allocate or launch.
  if (out.size() == 0) {
    return;
  }

  // Stream-ordered allocation of the output, then mark it as written
  // by this encoder so later work is ordered after the kernel.
  auto& enc = cu::get_command_encoder(stream());
  out.set_data(cu::malloc_async(out.nbytes(), enc));
  enc.set_output_array(out);

  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
    using CTYPE = MLX_GET_TYPE(type_tag);
    using OutType = cuda_type_t<CTYPE>;
    // Each thread writes 16 bytes worth of elements per iteration.
    constexpr int N_WRITES = 16 / sizeof(OutType);
    dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
      // 32-bit indexing unless the output is too large for it.
      using IdxT = std::conditional_t<large(), int64_t, int32_t>;
      auto [grid_dims, block_dims] = get_launch_args(out, large(), N_WRITES);
      // Round start/step through CTYPE so the kernel's increments match
      // what the output precision can actually represent.
      auto first = static_cast<CTYPE>(start_);
      auto second = static_cast<CTYPE>(start_ + step_);
      enc.add_kernel_node(
          cu::arange<OutType, IdxT, N_WRITES>,
          grid_dims,
          block_dims,
          0,
          gpu_ptr<OutType>(out),
          out.data_size(),
          first,
          second - first);
    });
  });
}
|
|
|
|
} // namespace mlx::core
|