diff --git a/mlx/backend/cuda/CMakeLists.txt b/mlx/backend/cuda/CMakeLists.txt index 1a394e580..bb539c9ec 100644 --- a/mlx/backend/cuda/CMakeLists.txt +++ b/mlx/backend/cuda/CMakeLists.txt @@ -108,7 +108,7 @@ endif() # Compute capability 7 is required for synchronization between CPU/GPU with # managed memory. TODO: Add more architectures for potential performance gain. set(MLX_CUDA_ARCHITECTURES - "70;80" + "70;80;90" CACHE STRING "CUDA architectures") message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}") set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES diff --git a/mlx/backend/cuda/device/atomic_ops.cuh b/mlx/backend/cuda/device/atomic_ops.cuh index 5df246c0e..cfb185c43 100644 --- a/mlx/backend/cuda/device/atomic_ops.cuh +++ b/mlx/backend/cuda/device/atomic_ops.cuh @@ -49,11 +49,7 @@ inline __device__ void atomic_add(__half* out, __half val) { } inline __device__ void atomic_add(complex64_t* out, complex64_t val) { -#if __CUDA_ARCH__ < 900 atomic_add_general(out, val); -#else - atomicAdd(out, val); -#endif } inline __device__ void atomic_add(__nv_bfloat16* out, __nv_bfloat16 val) {