// Copyright © 2025 Apple Inc.
// This file includes host-only utilities for writing CUDA kernels, the
// difference from backend/cuda/device/utils.cuh is that the latter file only
// include device-only code.
#pragma once
#include <type_traits>
#include "mlx/array.h"
#include "mlx/backend/cuda/allocator.h"
#include "mlx/backend/cuda/device/utils.cuh"
#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <fmt/format.h>
#include <cuda/cmath>
namespace mlx::core {
// Invoke |f| with a std::integral_constant<int, N> matching the runtime
// value |n|, turning a runtime dimension count into a compile-time one.
// |n| must be 1, 2 or 3; any other value is a silent no-op (|f| is never
// called) — callers are expected to validate n beforehand.
template <typename F>
void dispatch_1_2_3(int n, F&& f) {
  switch (n) {
    case 1:
      f(std::integral_constant<int, 1>{});
      break;
    case 2:
      f(std::integral_constant<int, 2>{});
      break;
    case 3:
      f(std::integral_constant<int, 3>{});
      break;
  }
}
// Invoke |f| with std::true_type or std::false_type depending on the
// runtime value |v|, turning a runtime bool into a compile-time constant
// that the callee can branch on with if constexpr.
template <typename F>
void dispatch_bool(bool v, F&& f) {
  if (v) {
    f(std::true_type{});
  } else {
    f(std::false_type{});
  }
}
// Invoke |f| with the smallest block dim (as a compile-time
// std::integral_constant<int, N>, a power-of-two multiple of WARP_SIZE)
// that can hold |threads| threads, capped at WARP_SIZE * 32.
template <typename F>
void dispatch_block_dim(int threads, F&& f) {
  if (threads <= WARP_SIZE) {
    f(std::integral_constant<int, WARP_SIZE>{});
  } else if (threads <= WARP_SIZE * 2) {
    f(std::integral_constant<int, WARP_SIZE * 2>{});
  } else if (threads <= WARP_SIZE * 4) {
    f(std::integral_constant<int, WARP_SIZE * 4>{});
  } else if (threads <= WARP_SIZE * 8) {
    f(std::integral_constant<int, WARP_SIZE * 8>{});
  } else if (threads <= WARP_SIZE * 16) {
    f(std::integral_constant<int, WARP_SIZE * 16>{});
  } else {
    // Anything larger than 16 warps gets the maximum block dim.
    f(std::integral_constant<int, WARP_SIZE * 32>{});
  }
}
2025-06-12 03:22:25 +09:00
2025-05-07 13:26:46 +09:00
// Maps CPU types to CUDA types: by default a type maps to itself, while
// the MLX half-precision and complex wrappers map to the corresponding
// native CUDA types used inside kernels.
template <typename T>
struct CTypeToCudaType {
  using type = T;
};

template <>
struct CTypeToCudaType<float16_t> {
  using type = __half;
};

template <>
struct CTypeToCudaType<bfloat16_t> {
  using type = __nv_bfloat16;
};

template <>
struct CTypeToCudaType<complex64_t> {
  using type = cu::complex64_t;
};

// Convenience alias: the CUDA-side type corresponding to CPU type T.
template <typename T>
using cuda_type_t = typename CTypeToCudaType<T>::type;
// Type trait: true for the real floating-point types supported here
// (float, double and the MLX half-precision wrappers).
template <typename T>
inline constexpr bool is_floating_v =
cuda::std::is_same_v<T, float> || cuda::std::is_same_v<T, double> ||
cuda::std::is_same_v<T, float16_t> || cuda::std::is_same_v<T, bfloat16_t>;
// Type trait: true for the MLX complex number types.
template <typename T>
inline constexpr bool is_complex_v = cuda::std::is_same_v<T, complex64_t> ||
cuda::std::is_same_v<T, complex128_t>;
// Type trait: true for any inexact type — real floating point or complex.
template <typename T>
inline constexpr bool is_inexact_v = is_floating_v<T> || is_complex_v<T>;
// Utility to copy data from vector to array in host, so it can be passed
// to a kernel as a fixed-size constant parameter.
//
// Throws std::runtime_error if |vec| has more than NDIM entries. Entries
// past vec.size() are value-initialized so no uninitialized bytes are
// handed to the kernel launch.
template <int NDIM = MAX_NDIM, typename T = int32_t>
inline cuda::std::array<T, NDIM> const_param(const SmallVector<T>& vec) {
  if (vec.size() > NDIM) {
    throw std::runtime_error(
        fmt::format("ndim can not be larger than {}.", NDIM));
  }
  // Value-initialize so the tail beyond vec.size() is zeroed rather than
  // left as indeterminate stack garbage.
  cuda::std::array<T, NDIM> result = {};
  std::copy_n(vec.begin(), vec.size(), result.begin());
  return result;
}
// Compute the grid and block dimensions, check backend/common/utils.h for docs.
// Block dims covering (dim0, dim1, dim2), with at most 2^pow2 threads total.
dim3 get_block_dims(int dim0, int dim1, int dim2, int pow2 = 10);
// 2D grid dims covering the element count implied by shape/strides.
dim3 get_2d_grid_dims(const Shape& shape, const Strides& strides);
// As above, but the element count is divided by |divisor| first.
dim3 get_2d_grid_dims(
    const Shape& shape,
    const Strides& strides,
    size_t divisor);
// Grid and block dims covering (dim0, dim1, dim2) as a pair.
std::pair<dim3, dim3> get_grid_and_block(int dim0, int dim1, int dim2);
// Get the num_blocks and block_dims assuming each thread handles
// |work_per_thread| elements of the array described by
// (size, shape, strides). |large| selects 64-bit indexing; block size
// is capped at |max_block_dim| threads.
std::tuple<dim3, uint> get_launch_args(
    size_t size,
    const Shape& shape,
    const Strides& strides,
    bool large,
    int work_per_thread = 1,
    uint max_block_dim = 1024);
// Convenience overload: reads size/shape/strides from |arr| and forwards
// to the declaration above.
inline std::tuple<dim3, uint> get_launch_args(
    const array& arr,
    bool large,
    int work_per_thread = 1,
    uint max_block_dim = 1024) {
  return get_launch_args(
      arr.size(),
      arr.shape(),
      arr.strides(),
      large,
      work_per_thread,
      max_block_dim);
}
} // namespace mlx::core