use arch specific targets when possible (#2771)

2025-12-16 01:49:05 +08:00 · 2025-11-14 20:04:18 -08:00
parent 3c622ddd1d
commit 1bf605d56d
4 changed files with 34 additions and 5 deletions
--- a/mlx/backend/cuda/jit_module.cpp
+++ b/mlx/backend/cuda/jit_module.cpp
@@ -279,11 +279,14 @@ void compile(
  // Compile program.
  std::vector<const char*> args;
  bool use_sass = compiler_supports_device_sass(device);
+  auto cc = device.compute_capability_major() * 10 +
+      device.compute_capability_minor(); // combined, e.g. 9.0 -> 90
+  std::string arch_tag = (cc == 90 || cc == 100 || cc == 121) ? "a" : "";
  std::string compute = fmt::format(
      "--gpu-architecture={}_{}{}",
      use_sass ? "sm" : "compute",
-      device.compute_capability_major(),
-      device.compute_capability_minor());
+      cc, // already major*10+minor, so one field yields e.g. "sm_90"
+      arch_tag); // "a" selects the arch-specific target, e.g. "sm_90a"
  args.push_back(compute.c_str());
  std::string cccl_include = cccl_dir();
  if (!cccl_include.empty()) {