rebase

2025-11-05 11:28:12 +08:00 · 2025-08-29 17:13:50 +00:00
parent 9a878bae64
commit ec3f487eed
547 changed files with 10067 additions and 2222 deletions
--- a/docs/build/html/_sources/dev/custom_metal_kernels.rst
+++ b/docs/build/html/_sources/dev/custom_metal_kernels.rst
@@ -127,7 +127,8 @@ relying on a copy from ``ensure_row_contiguous``:
      name="myexp_strided",
      input_names=["inp"],
      output_names=["out"],
-      source=source
+      source=source,
+      ensure_row_contiguous=False,
  )

  def exp_elementwise(a: mx.array):
@@ -138,7 +139,6 @@ relying on a copy from ``ensure_row_contiguous``:
          threadgroup=(256, 1, 1),
          output_shapes=[a.shape],
          output_dtypes=[a.dtype],
-          ensure_row_contiguous=False,
      )
      return outputs[0]

--- a/docs/build/html/_sources/index.rst
+++ b/docs/build/html/_sources/index.rst
@@ -70,6 +70,7 @@ are the CPU and GPU.
   python/fft
   python/linalg
   python/metal
+   python/cuda
   python/memory_management
   python/nn
   python/optimizers
--- a/docs/build/html/_sources/install.rst
+++ b/docs/build/html/_sources/install.rst
@@ -271,7 +271,7 @@ and the CUDA toolkit. For example on Ubuntu, run the following:
   dpkg -i cuda-keyring_1.1-1_all.deb
   apt-get update -y
   apt-get -y install cuda-toolkit-12-9
-   apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+   apt-get install libblas-dev liblapack-dev liblapacke-dev libcudnn9-dev-cuda-12 -y


 When building either the Python or C++ APIs make sure to pass the cmake flag
--- a/docs/build/html/_sources/python/_autosummary/mlx.core.cuda.is_available.rst
+++ b/docs/build/html/_sources/python/_autosummary/mlx.core.cuda.is_available.rst
@@ -0,0 +1,6 @@
+mlx.core.cuda.is\_available
+===========================
+
+.. currentmodule:: mlx.core.cuda
+
+.. autofunction:: is_available
--- a/docs/build/html/_sources/python/_autosummary/mlx.core.fast.cuda_kernel.rst
+++ b/docs/build/html/_sources/python/_autosummary/mlx.core.fast.cuda_kernel.rst
@@ -0,0 +1,6 @@
+mlx.core.fast.cuda\_kernel
+==========================
+
+.. currentmodule:: mlx.core.fast
+
+.. autofunction:: cuda_kernel
--- a/docs/build/html/_sources/python/cuda.rst
+++ b/docs/build/html/_sources/python/cuda.rst
@@ -0,0 +1,9 @@
+CUDA
+=====
+
+.. currentmodule:: mlx.core.cuda
+
+.. autosummary::
+  :toctree: _autosummary
+
+  is_available
--- a/docs/build/html/_sources/python/fast.rst
+++ b/docs/build/html/_sources/python/fast.rst
@@ -13,3 +13,4 @@ Fast
  rope
  scaled_dot_product_attention
  metal_kernel
+  cuda_kernel
--- a/docs/build/html/_sources/usage/compile.rst
+++ b/docs/build/html/_sources/usage/compile.rst
@@ -225,7 +225,7 @@ In some cases returning updated state can be pretty inconvenient. Hence,
  def fun(x, y):
      z = x + y
      state.append(z)
-      return mx.exp(z), state
+      return mx.exp(z)

  fun(mx.array(1.0), mx.array(2.0))
  # Prints [array(3, dtype=float32)]