mlx/backend/cuda/jit_module.h

// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/array.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/config.h"

#include <deque>
#include <unordered_map>
#include <utility>
#include <variant>

#include <cuda.h>
#include <fmt/format.h>

namespace mlx::core::cu {

class Device;

// Value produced by a KernelBuilder. The three fields are, in order:
//   - a flag labelled "precompiled",
//   - a string labelled "source code",
//   - a list of strings labelled "kernel names".
// NOTE(review): how the string is interpreted when the "precompiled" flag is
// set is not visible in this header -- confirm in the implementation file.
using KernelBuilderResult = std::tuple<
    /* precompiled */ bool,
    /* source code */ std::string,
    /* kernel names */ std::vector<std::string>>;
// Nullary callable that yields a KernelBuilderResult. It is passed by const
// reference to JitModule's constructor and to get_jit_module().
using KernelBuilder = std::function<KernelBuilderResult()>;

// Accumulates the argument list for a kernel launch.
//
// args() exposes one void* per appended argument, in the order the arguments
// were appended. Arguments appended by value are copied into storage_ so the
// recorded pointers stay valid for the lifetime of this object; pointers given
// to append_ptr() are recorded as-is and must be kept alive by the caller.
struct KernelArgs {
  // Returns the array of argument pointers (one entry per appended argument).
  void** args() {
    return args_.data();
  }

  // Appends the result of gpu_ptr<void>(a), reinterpreted as a CUdeviceptr,
  // as a by-value argument.
  void append(const array& a) {
    append(reinterpret_cast<CUdeviceptr>(gpu_ptr<void>(a)));
  }

  // Appends a scalar argument by value. `val` is stored as whichever Arg
  // alternative std::variant's converting constructor selects for T.
  template <typename T>
  void append(T val) {
    storage_.emplace_back(val);
    // Record the address of the stored value itself, not of the enclosing
    // std::variant: the standard does not promise that the active alternative
    // lives at offset 0 of the variant object.
    append_ptr(std::visit(
        [](const auto& held) -> const void* { return &held; },
        storage_.back()));
  }

  // Appends an array argument: takes ownership of `vec` and records a pointer
  // to its element buffer. SmallVector<T> must be one of the Arg alternatives.
  template <typename T>
  void append(SmallVector<T> vec) {
    storage_.emplace_back(std::move(vec));
    append_ptr(std::get<SmallVector<T>>(storage_.back()).data());
  }

  // Convenience overload: copies `vec` into a SmallVector<T> and appends it.
  template <typename T>
  void append(const std::vector<T>& vec) {
    append(SmallVector<T>(vec.begin(), vec.end()));
  }

  // Make sure the arg is copied to an array with size of NDIM.
  //
  // Throws std::runtime_error if `vec` has more than NDIM elements; otherwise
  // `vec` is resized to exactly NDIM elements before being appended.
  // NOTE(review): the value of the padding elements is whatever
  // SmallVector::resize produces (presumably value-initialized) -- confirm.
  template <size_t NDIM = MAX_NDIM, typename T>
  void append_ndim(SmallVector<T> vec) {
    if (vec.size() > NDIM) {
      throw std::runtime_error(
          fmt::format("ndim can not be larger than {}.", NDIM));
    }
    vec.resize(NDIM);
    append(std::move(vec));
  }

  // Records `v` directly as the next argument pointer. Nothing is copied, so
  // the pointee must outlive every use of args().
  void append_ptr(const void* v) {
    // The const_cast only adapts to the void** shape returned by args().
    args_.push_back(const_cast<void*>(v));
  }

 private:
  // One pointer per argument, in append order.
  std::vector<void*> args_;

  // The cuGraphAddKernelNode API requires passing pointers to arguments so
  // store temporary values until the node is created.
  using Arg = std::variant<
      std::monostate,
      CUdeviceptr,
      bool,
      int32_t,
      uint32_t,
      int64_t,
      float,
      SmallVector<const void*>,
      SmallVector<int32_t>,
      SmallVector<int64_t>>;
  // std::deque keeps references to existing elements valid when new elements
  // are added at the end, so pointers already recorded in args_ are not
  // invalidated by later appends.
  std::deque<Arg> storage_;
};

// Owns a CUmodule handle together with a per-kernel lookup table keyed by
// kernel name. Instances are non-copyable.
// NOTE(review): the constructor and destructor are defined out of line; they
// presumably build/load and unload the module -- confirm in the .cpp file.
class JitModule {
 public:
  // `builder` supplies the KernelBuilderResult for `module_name`.
  // NOTE(review): `cache` presumably toggles an on-disk cache (compare
  // `use_disk_cache` on get_jit_module) -- confirm in the implementation.
  JitModule(
      Device& device,
      const std::string& module_name,
      const KernelBuilder& builder,
      bool cache);
  ~JitModule();

  JitModule(const JitModule&) = delete;
  JitModule& operator=(const JitModule&) = delete;

  // Returns the CUfunction registered under `kernel_name`.
  // `configure_kernel` is an optional callback taking the CUfunction; it
  // defaults to an empty std::function.
  // NOTE(review): when and how often the callback runs, and what happens for
  // an unknown name, are not visible here -- confirm in the implementation.
  CUfunction get_kernel(
      const std::string& kernel_name,
      std::function<void(CUfunction)> configure_kernel = nullptr);

  // Same lookup as get_kernel, additionally returning an unsigned integer.
  // NOTE(review): the integer is presumably a block-dimension limit for the
  // kernel -- confirm in the implementation.
  // `unsigned int` is spelled out instead of the non-standard `uint` typedef,
  // which not every toolchain provides; it names the same type.
  std::pair<CUfunction, unsigned int> get_kernel_and_dims(
      const std::string& kernel_name,
      std::function<void(CUfunction)> configure_kernel = nullptr);

 private:
  CUmodule module_{nullptr};
  // Kernel name -> (function handle, flag, unsigned value).
  // NOTE(review): the flag presumably records whether configure_kernel has
  // already been applied, and the unsigned value is the one returned by
  // get_kernel_and_dims -- confirm in the implementation.
  std::unordered_map<std::string, std::tuple<CUfunction, bool, unsigned int>>
      kernels_;
};

// Returns a mutable reference to a map from string keys to JitModule objects.
// NOTE(review): presumably the process-wide cache consulted by
// get_jit_module, keyed by module name -- confirm in the implementation.
std::unordered_map<std::string, JitModule>& get_jit_module_cache();

// Returns a reference to the JitModule associated with `name` for `device`.
// `builder` supplies the KernelBuilderResult for the module; `use_disk_cache`
// defaults to true.
// NOTE(review): presumably the module is built on first request and then
// reused from get_jit_module_cache() -- confirm in the implementation.
JitModule& get_jit_module(
    const mlx::core::Device& device,
    const std::string& name,
    const KernelBuilder& builder,
    bool use_disk_cache = true);

} // namespace mlx::core::cu
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`// Copyright © 2025 Apple Inc.`

			`#pragma once`

			`#include "mlx/array.h"`
			`#include "mlx/backend/common/utils.h"`
[CUDA] Switch to CUDA graphs (#2317) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment 2025-07-02 15:59:13 -07:00			`#include "mlx/backend/cuda/device.h"`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`#include "mlx/backend/cuda/device/config.h"`

			`#include <deque>`
			`#include <unordered_map>`
			`#include <utility>`
			`#include <variant>`

			`#include <cuda.h>`
			`#include <fmt/format.h>`

			`namespace mlx::core::cu {`

			`class Device;`

Custom cuda kernel (#2517) 2025-08-20 17:20:22 -07:00			`using KernelBuilderResult = std::tuple<`
			`/* precompiled */ bool,`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`/* source code */ std::string,`
			`/* kernel names */ std::vector<std::string>>;`
			`using KernelBuilder = std::function<KernelBuilderResult()>;`

[CUDA] Switch to CUDA graphs (#2317) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment 2025-07-02 15:59:13 -07:00			`struct KernelArgs {`
			`void** args() {`
			`return args_.data();`
			`}`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00
[CUDA] Switch to CUDA graphs (#2317) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment 2025-07-02 15:59:13 -07:00			`void append(const array& a) {`
[CUDA] Reduce use of managed memory (#2725) * Use async cuda malloc managed with cuda 13 * add pool threshold * refactor for regular cuda malloc * load eval gpu for cuda * remove use of cuda pool, use cuda free async * fix * fix * fix * fix * fix + comment 2025-11-05 16:05:23 -08:00			`append(reinterpret_cast<CUdeviceptr>(gpu_ptr<void>(a)));`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`}`

			`template <typename T>`
[CUDA] Switch to CUDA graphs (#2317) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment 2025-07-02 15:59:13 -07:00			`void append(T val) {`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`storage_.emplace_back(val);`
[CUDA] Switch to CUDA graphs (#2317) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment 2025-07-02 15:59:13 -07:00			`append_ptr(&storage_.back());`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`}`

			`template <typename T>`
Use SmallVector for shapes and strides (#2454) * Use SmallVector for shapes and strides * Convert SmallVector to tuple 2025-08-05 09:41:03 +09:00			`void append(SmallVector<T> vec) {`
			`storage_.emplace_back(std::move(vec));`
			`append_ptr(std::get<SmallVector<T>>(storage_.back()).data());`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`}`

[CUDA] Implement DynamicSlice/DynamicSliceUpdate (#2533) * Move DynamicSlice to gpu/primitives * Implement compute_dynamic_offset in CUDA 2025-08-26 07:31:39 +09:00			`template <typename T>`
			`void append(const std::vector<T>& vec) {`
			`append(SmallVector<T>(vec.begin(), vec.end()));`
			`}`

CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`// Make sure the arg is copied to an array with size of NDIM.`
			`template <size_t NDIM = MAX_NDIM, typename T>`
Use SmallVector for shapes and strides (#2454) * Use SmallVector for shapes and strides * Convert SmallVector to tuple 2025-08-05 09:41:03 +09:00			`void append_ndim(SmallVector<T> vec) {`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`if (vec.size() > NDIM) {`
			`throw std::runtime_error(`
			`fmt::format("ndim can not be larger than {}.", NDIM));`
			`}`
[CUDA] Switch to CUDA graphs (#2317) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment 2025-07-02 15:59:13 -07:00			`vec.resize(NDIM);`
			`append(std::move(vec));`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`}`

[CUDA] Switch to CUDA graphs (#2317) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment 2025-07-02 15:59:13 -07:00			`void append_ptr(const void* v) {`
			`args_.push_back(const_cast<void*>(v));`
			`}`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00
			`private:`
			`std::vector<void*> args_;`

Custom cuda kernel (#2517) 2025-08-20 17:20:22 -07:00			`// The cuGraphAddKernelNode API requires passing pointers to arguments so`
			`// store temporary values until the node is created.`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`using Arg = std::variant<`
			`std::monostate,`
			`CUdeviceptr,`
Custom cuda kernel (#2517) 2025-08-20 17:20:22 -07:00			`bool,`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`int32_t,`
			`uint32_t,`
			`int64_t,`
Custom cuda kernel (#2517) 2025-08-20 17:20:22 -07:00			`float,`
Use SmallVector for shapes and strides (#2454) * Use SmallVector for shapes and strides * Convert SmallVector to tuple 2025-08-05 09:41:03 +09:00			`SmallVector<const void*>,`
			`SmallVector<int32_t>,`
			`SmallVector<int64_t>>;`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`std::deque<Arg> storage_;`
			`};`

[CUDA] Switch to CUDA graphs (#2317) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment 2025-07-02 15:59:13 -07:00			`class JitModule {`
			`public:`
			`JitModule(`
			`Device& device,`
			`const std::string& module_name,`
Custom cuda kernel (#2517) 2025-08-20 17:20:22 -07:00			`const KernelBuilder& builder,`
			`bool cache);`
[CUDA] Switch to CUDA graphs (#2317) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment 2025-07-02 15:59:13 -07:00			`~JitModule();`

			`JitModule(const JitModule&) = delete;`
			`JitModule& operator=(const JitModule&) = delete;`
Custom cuda kernel (#2517) 2025-08-20 17:20:22 -07:00			`CUfunction get_kernel(`
			`const std::string& kernel_name,`
			`std::function<void(CUfunction)> configure_kernel = nullptr);`
fix for max block dim (#2631) 2025-09-29 08:59:25 -07:00			`std::pair<CUfunction, uint> get_kernel_and_dims(`
			`const std::string& kernel_name,`
			`std::function<void(CUfunction)> configure_kernel = nullptr);`
[CUDA] Switch to CUDA graphs (#2317) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment 2025-07-02 15:59:13 -07:00
			`private:`
			`CUmodule module_{nullptr};`
fix for max block dim (#2631) 2025-09-29 08:59:25 -07:00			`std::unordered_map<std::string, std::tuple<CUfunction, bool, uint>> kernels_;`
[CUDA] Switch to CUDA graphs (#2317) * cuda graph prototype fix signal bug + start to add dependencies capture more capture more ops remaining ops fix reduce and rope deps add concurrent context try update, but not working cosistent topology order use node api use node api directly to reduce overhead fix bug use kernels in unary cache graph format fix synchronization format * comment 2025-07-02 15:59:13 -07:00			`};`

[CUDA] Fix segfault on exit (#2424) * fix cuda segfault on exit * comment 2025-07-27 08:08:13 -07:00			`std::unordered_map<std::string, JitModule>& get_jit_module_cache();`

CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00			`JitModule& get_jit_module(`
			`const mlx::core::Device& device,`
			`const std::string& name,`
Custom cuda kernel (#2517) 2025-08-20 17:20:22 -07:00			`const KernelBuilder& builder,`
			`bool use_disk_cache = true);`
CUDA backend: compile (#2276) * CUDA backend: compile * Rename kernels/ to device/ 2025-06-13 09:08:39 +09:00
			`} // namespace mlx::core::cu`