mirror of
https://github.com/ml-explore/mlx.git
synced 2025-08-02 00:36:49 +08:00
Cuda faster softmax (#2435)
* faster softmax and logsumexp * faster softmax and logsumexp * format
This commit is contained in:
parent
ef631d63af
commit
3adba92ebe
@ -43,20 +43,19 @@ __global__ void logsumexp(const T* in, T* out, int axis_size) {
|
||||
AccT maxval = Limits<AccT>::finite_min();
|
||||
AccT normalizer = 0;
|
||||
for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
|
||||
AccT vals[N_READS];
|
||||
cub::LoadDirectBlocked(
|
||||
r * BLOCK_DIM + block.thread_rank(),
|
||||
make_cast_iterator<AccT>(in),
|
||||
vals,
|
||||
axis_size,
|
||||
Limits<AccT>::min());
|
||||
auto index = r * BLOCK_DIM + block.thread_rank();
|
||||
auto vals = load_vector<N_READS>(in, index, axis_size, Limits<T>::min());
|
||||
prevmax = maxval;
|
||||
maxval = max_op(maxval, cub::ThreadReduce(vals, max_op));
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
maxval = max_op(maxval, static_cast<AccT>(vals[i]));
|
||||
}
|
||||
// Online normalizer calculation for softmax:
|
||||
// https://github.com/NVIDIA/online-softmax
|
||||
normalizer = normalizer * softmax_exp(prevmax - maxval);
|
||||
for (int i = 0; i < N_READS; i++) {
|
||||
normalizer = normalizer + softmax_exp(vals[i] - maxval);
|
||||
normalizer =
|
||||
normalizer + softmax_exp(static_cast<AccT>(vals[i]) - maxval);
|
||||
}
|
||||
}
|
||||
|
||||
@ -143,9 +142,9 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
encoder.set_input_array(in);
|
||||
encoder.set_output_array(out);
|
||||
dispatch_float_types(out.dtype(), "logsumexp", [&](auto type_tag) {
|
||||
constexpr int N_READS = 4;
|
||||
using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
|
||||
constexpr int N_READS = 16 / sizeof(DataType);
|
||||
dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
|
||||
using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
|
||||
auto kernel = cu::logsumexp<DataType, float, block_dim(), N_READS>;
|
||||
encoder.add_kernel_node(
|
||||
kernel,
|
||||
|
@ -11,7 +11,6 @@
|
||||
#include <cooperative_groups.h>
|
||||
#include <cooperative_groups/reduce.h>
|
||||
#include <nvtx3/nvtx3.hpp>
|
||||
#include <cub/block/block_load.cuh>
|
||||
|
||||
#include <cassert>
|
||||
|
||||
@ -45,20 +44,21 @@ __global__ void softmax(const T* in, T* out, int axis_size) {
|
||||
AccT maxval = Limits<AccT>::finite_min();
|
||||
AccT normalizer = cast_to<AccT>(0);
|
||||
for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
|
||||
AccT vals[N_READS];
|
||||
cub::LoadDirectBlocked(
|
||||
r * BLOCK_DIM + block.thread_rank(),
|
||||
make_cast_iterator<AccT>(in),
|
||||
vals,
|
||||
axis_size,
|
||||
Limits<AccT>::min());
|
||||
auto index = r * BLOCK_DIM + block.thread_rank();
|
||||
auto vals = load_vector<N_READS>(in, index, axis_size, Limits<T>::min());
|
||||
prevmax = maxval;
|
||||
maxval = max_op(maxval, cub::ThreadReduce(vals, max_op));
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; ++i) {
|
||||
maxval = max_op(maxval, static_cast<AccT>(vals[i]));
|
||||
}
|
||||
|
||||
// Online normalizer calculation for softmax:
|
||||
// https://github.com/NVIDIA/online-softmax
|
||||
normalizer = normalizer * softmax_exp(prevmax - maxval);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N_READS; i++) {
|
||||
normalizer = normalizer + softmax_exp(vals[i] - maxval);
|
||||
normalizer =
|
||||
normalizer + softmax_exp(static_cast<AccT>(vals[i]) - maxval);
|
||||
}
|
||||
}
|
||||
|
||||
@ -95,12 +95,11 @@ __global__ void softmax(const T* in, T* out, int axis_size) {
|
||||
// Write output.
|
||||
for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
|
||||
auto index = r * BLOCK_DIM + block.thread_rank();
|
||||
T vals[N_READS];
|
||||
cub::LoadDirectBlocked(index, in, vals, axis_size);
|
||||
auto vals = load_vector<N_READS>(in, index, axis_size, T(0));
|
||||
for (int i = 0; i < N_READS; i++) {
|
||||
vals[i] = softmax_exp(static_cast<AccT>(vals[i]) - maxval) * normalizer;
|
||||
}
|
||||
cub::StoreDirectBlocked(index, out, vals, axis_size);
|
||||
store_vector<N_READS>(out, index, vals, axis_size);
|
||||
}
|
||||
}
|
||||
|
||||
@ -141,9 +140,9 @@ void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
encoder.set_input_array(in);
|
||||
encoder.set_output_array(out);
|
||||
dispatch_float_types(out.dtype(), "softmax", [&](auto type_tag) {
|
||||
constexpr int N_READS = 4;
|
||||
using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
|
||||
constexpr int N_READS = 16 / sizeof(DataType);
|
||||
dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
|
||||
using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
|
||||
auto kernel = cu::softmax<DataType, DataType, block_dim(), N_READS>;
|
||||
if (precise) {
|
||||
kernel = cu::softmax<DataType, float, block_dim(), N_READS>;
|
||||
|
Loading…
Reference in New Issue
Block a user