rebase + nit (#2260)

Co-authored-by: Awni Hannun <awni@apple.com>
2025-12-16 01:49:05 +08:00 · 2025-06-11 02:51:51 +09:00
parent 62fecf3e13
commit 99c33d011d
10 changed files with 604 additions and 28 deletions
--- a/mlx/backend/cuda/CMakeLists.txt
+++ b/mlx/backend/cuda/CMakeLists.txt
@@ -7,7 +7,11 @@ target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_contiguous.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_dynamic.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_input.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/event.cu
@@ -28,6 +32,15 @@ target_compile_definitions(mlx PRIVATE MLX_USE_CUDA)
 target_compile_options(mlx
                       PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>")

+# CUDA 12.8 emits warning #20280-D for the copy kernels, which is a false
+# positive. Passing --static-global-template-stub=false suppresses it; setting
+# the flag to true is also safe, but the warning would not be suppressed.
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
+  target_compile_options(
+    mlx
+    PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>")
+endif()
+
 # Compute capability 7 is required for synchronization between CPU/GPU with
 # managed memory. TODO: Add more architectures for potential performance gain.
 set(MLX_CUDA_ARCHITECTURES