divmod, partition, sort fixes (#2302)

2025-12-16 01:49:05 +08:00 · 2025-06-16 18:49:32 -07:00
parent bc53f8293f
commit b8022c578a
8 changed files with 271 additions and 49 deletions
--- a/mlx/backend/cuda/device/binary_ops.cuh
+++ b/mlx/backend/cuda/device/binary_ops.cuh
@@ -22,7 +22,7 @@ struct FloorDivide {
    if constexpr (cuda::std::is_integral_v<T>) {
      return x / y;
    } else {
-      return trunc(x / y);
+      return truncf(x / y);
    }
  }
 };
@@ -132,7 +132,7 @@ struct LogAddExp {
          cuda::std::numeric_limits<float>::quiet_NaN(),
          cuda::std::numeric_limits<float>::quiet_NaN()};
    }
-    constexpr float inf = cuda::std::numeric_limits<float>::infinity();
+    float inf = cuda::std::numeric_limits<float>::infinity();
    auto maxval = x > y ? x : y;
    auto minval = x < y ? x : y;
    if (cuCrealf(minval) == -inf || cuCrealf(maxval) == inf)
--- a/mlx/backend/cuda/device/config.h
+++ b/mlx/backend/cuda/device/config.h
@@ -5,7 +5,7 @@
 #pragma once

 // The maximum dimensions of shape/strides passed as kernel parameters.
-#define MAX_NDIM 8
+#define MAX_NDIM 10

 // All existing NVIDIA hardware has a fixed 32 warp size. Though a built-in
 // warpSize variable exists, using it would prevent compile-time optimizations.