rebase

2025-12-16 01:49:05 +08:00 · 2025-01-09 21:56:20 +00:00
parent 04b749a588
commit d8d647015b
2642 changed files with 137687 additions and 70861 deletions
--- a/docs/build/html/_sources/dev/extensions.rst
+++ b/docs/build/html/_sources/dev/extensions.rst
@@ -420,8 +420,8 @@ element in the output.
            constant const float& alpha [[buffer(3)]],
            constant const float& beta [[buffer(4)]],
            constant const int* shape [[buffer(5)]],
-            constant const size_t* x_strides [[buffer(6)]],
-            constant const size_t* y_strides [[buffer(7)]],
+            constant const int64_t* x_strides [[buffer(6)]],
+            constant const int64_t* y_strides [[buffer(7)]],
            constant const int& ndim [[buffer(8)]],
            uint index [[thread_position_in_grid]]) {
        // Convert linear indices to offsets in array
@@ -438,24 +438,10 @@ each instantiation a unique host name so we can identify it.

 .. code-block:: C++

-    #define instantiate_axpby(type_name, type)              \
-        template [[host_name("axpby_general_" #type_name)]] \
-        [[kernel]] void axpby_general<type>(                \
-            device const type* x [[buffer(0)]],             \
-            device const type* y [[buffer(1)]],             \
-            device type* out [[buffer(2)]],                 \
-            constant const float& alpha [[buffer(3)]],      \
-            constant const float& beta [[buffer(4)]],       \
-            constant const int* shape [[buffer(5)]],        \
-            constant const size_t* x_strides [[buffer(6)]], \
-            constant const size_t* y_strides [[buffer(7)]], \
-            constant const int& ndim [[buffer(8)]],         \
-            uint index [[thread_position_in_grid]]);
-
-    instantiate_axpby(float32, float);
-    instantiate_axpby(float16, half);
-    instantiate_axpby(bfloat16, bfloat16_t);
-    instantiate_axpby(complex64, complex64_t);
+    instantiate_kernel("axpby_general_float32", axpby_general, float)
+    instantiate_kernel("axpby_general_float16", axpby_general, float16_t)
+    instantiate_kernel("axpby_general_bfloat16", axpby_general, bfloat16_t)
+    instantiate_kernel("axpby_general_complex64", axpby_general, complex64_t)

 The logic to determine the kernel, set the inputs, resolve the grid dimensions,
 and dispatch to the GPU are contained in :meth:`Axpby::eval_gpu` as shown
--- a/docs/build/html/_sources/dev/mlx_in_cpp.rst
+++ b/docs/build/html/_sources/dev/mlx_in_cpp.rst
@@ -0,0 +1,121 @@
+.. _mlx_in_cpp:
+
+Using MLX in C++
+================
+
+You can use MLX in a C++ project with CMake.
+
+.. note::
+
+  This guide is based on the following `example using MLX in C++
+  <https://github.com/ml-explore/mlx/tree/main/examples/cmake_project>`_
+
+First install MLX:
+
+.. code-block:: bash
+
+  pip install -U mlx
+
+You can also install the MLX Python package from source or just the C++
+library. For more information see the :ref:`documentation on installing MLX
+<build_and_install>`.
+
+Next make an example program in ``example.cpp``: 
+
+.. code-block:: C++
+
+  #include <iostream>
+
+  #include "mlx/mlx.h"
+
+  namespace mx = mlx::core;
+
+  int main() {
+    auto x = mx::array({1, 2, 3});
+    auto y = mx::array({1, 2, 3});
+    std::cout << x + y << std::endl;
+    return 0;
+  }
+
+The next step is to set up a CMake file in ``CMakeLists.txt``:
+
+.. code-block:: cmake
+
+  cmake_minimum_required(VERSION 3.27)
+
+  project(example LANGUAGES CXX)
+
+  set(CMAKE_CXX_STANDARD 17)
+  set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+
+Depending on how you installed MLX, you may need to tell CMake where to
+find it. 
+
+If you installed MLX with Python, then add the following to the CMake file:
+
+.. code-block:: cmake
+
+  find_package(
+    Python 3.9
+    COMPONENTS Interpreter Development.Module
+    REQUIRED)
+  execute_process(
+    COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE MLX_ROOT)
+
+If you installed the MLX C++ package to a system path, then CMake should be
+able to find it. If you installed it to a non-standard location or CMake can't
+find MLX then set ``MLX_ROOT`` to the location where MLX is installed:
+
+.. code-block:: cmake
+
+  set(MLX_ROOT "/path/to/mlx/")
+
+Next, instruct CMake to find MLX:
+
+.. code-block:: cmake
+
+  find_package(MLX CONFIG REQUIRED)
+
+Finally, add the ``example.cpp`` program as an executable and link MLX.
+
+.. code-block:: cmake
+
+  add_executable(example example.cpp)
+  target_link_libraries(example PRIVATE mlx)
+
+You can build the example with:
+
+.. code-block:: bash
+
+  cmake -B build -DCMAKE_BUILD_TYPE=Release
+  cmake --build build
+
+And run it with:
+
+.. code-block:: bash
+
+  ./build/example
+
+Note ``find_package(MLX CONFIG REQUIRED)`` sets the following variables:
+
+.. list-table:: Package Variables
+   :widths: 20 20 
+   :header-rows: 1
+
+   * - Variable 
+     - Description 
+   * - MLX_FOUND
+     - ``True`` if MLX is found
+   * - MLX_INCLUDE_DIRS
+     - Include directory
+   * - MLX_LIBRARIES
+     - Libraries to link against
+   * - MLX_CXX_FLAGS
+     - Additional compiler flags
+   * - MLX_BUILD_ACCELERATE
+     - ``True`` if MLX was built with Accelerate 
+   * - MLX_BUILD_METAL
+     - ``True`` if MLX was built with Metal
--- a/docs/build/html/_sources/index.rst
+++ b/docs/build/html/_sources/index.rst
@@ -45,6 +45,7 @@ are the CPU and GPU.
   usage/numpy
   usage/distributed
   usage/using_streams
+   usage/export

 .. toctree::
   :caption: Examples
@@ -61,6 +62,7 @@ are the CPU and GPU.
   python/array
   python/data_types
   python/devices_and_streams
+   python/export
   python/ops
   python/random
   python/transforms
@@ -86,3 +88,4 @@ are the CPU and GPU.
   dev/extensions
   dev/metal_debugger
   dev/custom_metal_kernels
+   dev/mlx_in_cpp
--- a/docs/build/html/_sources/install.rst
+++ b/docs/build/html/_sources/install.rst
@@ -1,3 +1,5 @@
+.. _build_and_install:
+
 Build and Install
 =================

@@ -53,7 +55,7 @@ Build Requirements
 ^^^^^^^^^^^^^^^^^^

 - A C++ compiler with C++17 support (e.g. Clang >= 5.0)
- `cmake <https://cmake.org/>`_ -- version 3.24 or later, and ``make``
+- `cmake <https://cmake.org/>`_ -- version 3.25 or later, and ``make``
 - Xcode >= 15.0 and macOS SDK >= 14.0

 .. note::
--- a/docs/build/html/_sources/python/_autosummary/mlx.core.export_function.rst
+++ b/docs/build/html/_sources/python/_autosummary/mlx.core.export_function.rst
@@ -0,0 +1,6 @@
+mlx.core.export\_function
+=========================
+
+.. currentmodule:: mlx.core
+
+.. autofunction:: export_function
--- a/docs/build/html/_sources/python/_autosummary/mlx.core.export_to_dot.rst
+++ b/docs/build/html/_sources/python/_autosummary/mlx.core.export_to_dot.rst
@@ -0,0 +1,6 @@
+mlx.core.export\_to\_dot
+========================
+
+.. currentmodule:: mlx.core
+
+.. autofunction:: export_to_dot
--- a/docs/build/html/_sources/python/_autosummary/mlx.core.exporter.rst
+++ b/docs/build/html/_sources/python/_autosummary/mlx.core.exporter.rst
@@ -0,0 +1,6 @@
+mlx.core.exporter
+=================
+
+.. currentmodule:: mlx.core
+
+.. autofunction:: exporter
--- a/docs/build/html/_sources/python/_autosummary/mlx.core.finfo.rst
+++ b/docs/build/html/_sources/python/_autosummary/mlx.core.finfo.rst
@@ -0,0 +1,30 @@
+mlx.core.finfo
+==============
+
+.. currentmodule:: mlx.core
+
+.. autoclass:: finfo
+
+   
+   .. automethod:: __init__
+
+   
+   .. rubric:: Methods
+
+   .. autosummary::
+   
+      ~finfo.__init__
+   
+   
+
+   
+   
+   .. rubric:: Attributes
+
+   .. autosummary::
+   
+      ~finfo.dtype
+      ~finfo.max
+      ~finfo.min
+   
+   
--- a/docs/build/html/_sources/python/_autosummary/mlx.core.import_function.rst
+++ b/docs/build/html/_sources/python/_autosummary/mlx.core.import_function.rst
@@ -0,0 +1,6 @@
+mlx.core.import\_function
+=========================
+
+.. currentmodule:: mlx.core
+
+.. autofunction:: import_function
--- a/docs/build/html/_sources/python/_autosummary/mlx.core.kron.rst
+++ b/docs/build/html/_sources/python/_autosummary/mlx.core.kron.rst
@@ -0,0 +1,6 @@
+mlx.core.kron
+=============
+
+.. currentmodule:: mlx.core
+
+.. autofunction:: kron
--- a/docs/build/html/_sources/python/_autosummary/mlx.core.slice.rst
+++ b/docs/build/html/_sources/python/_autosummary/mlx.core.slice.rst
@@ -0,0 +1,6 @@
+mlx.core.slice
+==============
+
+.. currentmodule:: mlx.core
+
+.. autofunction:: slice
--- a/docs/build/html/_sources/python/_autosummary/mlx.core.slice_update.rst
+++ b/docs/build/html/_sources/python/_autosummary/mlx.core.slice_update.rst
@@ -0,0 +1,6 @@
+mlx.core.slice\_update
+======================
+
+.. currentmodule:: mlx.core
+
+.. autofunction:: slice_update
--- a/docs/build/html/_sources/python/_autosummary/mlx.core.unflatten.rst
+++ b/docs/build/html/_sources/python/_autosummary/mlx.core.unflatten.rst
@@ -0,0 +1,6 @@
+mlx.core.unflatten
+==================
+
+.. currentmodule:: mlx.core
+
+.. autofunction:: unflatten
--- a/docs/build/html/_sources/python/data_types.rst
+++ b/docs/build/html/_sources/python/data_types.rst
@@ -66,3 +66,4 @@ documentation for more information. Use :func:`issubdtype` to determine if one
   Dtype
   DtypeCategory
   issubdtype
+   finfo
--- a/docs/build/html/_sources/python/export.rst
+++ b/docs/build/html/_sources/python/export.rst
@@ -0,0 +1,14 @@
+.. _export:
+
+Export Functions
+================
+
+.. currentmodule:: mlx.core
+
+.. autosummary::
+  :toctree: _autosummary
+
+   export_function
+   import_function
+   exporter
+   export_to_dot
--- a/docs/build/html/_sources/python/ops.rst
+++ b/docs/build/html/_sources/python/ops.rst
@@ -89,6 +89,7 @@ Operations
   isneginf
   isposinf
   issubdtype
+   kron
   left_shift
   less
   less_equal
@@ -144,6 +145,8 @@ Operations
   sign
   sin
   sinh
+   slice
+   slice_update
   softmax
   sort
   split
@@ -168,6 +171,7 @@ Operations
   tri
   tril
   triu
+   unflatten
   var
   view
   where
--- a/docs/build/html/_sources/usage/compile.rst
+++ b/docs/build/html/_sources/usage/compile.rst
@@ -421,3 +421,77 @@ the most opportunity to optimize the computation graph:
  # Compiling the outer function is good to do as it will likely
  # be faster even though the inner functions are compiled
  fun = mx.compile(outer)
+
+
+
+.. _shapeless_compile:
+
+Shapeless Compilation
+---------------------
+
+When the shape of an input to a compiled function changes, the function is
+recompiled. You can compile a function once and run it on inputs with
+variable shapes by specifying ``shapeless=True`` to :func:`compile`. In this
+case changes to the shapes of the inputs do not cause the function to be
+recompiled.
+
+.. code-block:: python
+
+  def fun(x, y):
+      return mx.abs(x + y)
+
+  compiled_fun = mx.compile(fun, shapeless=True)
+
+  x = mx.array(1.0)
+  y = mx.array(-2.0)
+
+  # First call compiles the function
+  print(compiled_fun(x, y))
+
+  # Second call with different shapes
+  # does not recompile the function
+  x = mx.array([1.0, -6.0])
+  y = mx.array([-2.0, 3.0])
+  print(compiled_fun(x, y))
+
+
+Use shapeless compilations carefully. Since compilation is not triggered when
+shapes change, any graphs which are conditional on the input shapes will not
+work as expected. Shape-dependent computations are common and sometimes subtle
+to detect. For example:
+
+.. code-block:: python
+
+  def fun(x):
+      return x.reshape(x.shape[0] * x.shape[1], -1)
+
+  compiled_fun = mx.compile(fun, shapeless=True)
+
+  x = mx.random.uniform(shape=(2, 3, 4))
+
+  out = compiled_fun(x)
+
+  x = mx.random.uniform(shape=(5, 5, 3))
+
+  # Error, can't reshape (5, 5, 3) to (6, -1)
+  out = compiled_fun(x)
+
+The second call to the ``compiled_fun`` fails because of the call to
+:func:`reshape` which uses the static shape of ``x`` in the first call. We can
+fix this by using :func:`flatten` to avoid hardcoding the shape of ``x``:
+
+.. code-block:: python
+
+  def fun(x):
+      return x.flatten(0, 1)
+
+  compiled_fun = mx.compile(fun, shapeless=True)
+
+  x = mx.random.uniform(shape=(2, 3, 4))
+
+  out = compiled_fun(x)
+
+  x = mx.random.uniform(shape=(5, 5, 3))
+
+  # Ok
+  out = compiled_fun(x)
--- a/docs/build/html/_sources/usage/distributed.rst
+++ b/docs/build/html/_sources/usage/distributed.rst
@@ -141,12 +141,13 @@ everything else remaining the same.
    from mlx.utils import tree_map

    def all_reduce_grads(grads):
-        N = mx.distributed.init()
+        N = mx.distributed.init().size()
        if N == 1:
            return grads
        return tree_map(
-                lambda x: mx.distributed.all_sum(x) / N,
-                grads)
+            lambda x: mx.distributed.all_sum(x) / N,
+            grads
+        )

    def step(model, x, y):
        loss, grads = loss_grad_fn(model, x, y)
--- a/docs/build/html/_sources/usage/export.rst
+++ b/docs/build/html/_sources/usage/export.rst
@@ -0,0 +1,288 @@
+.. _export_usage:
+
+Exporting Functions
+===================
+
+.. currentmodule:: mlx.core
+
+MLX has an API to export and import functions to and from a file. This lets you
+run computations written in one MLX front-end (e.g. Python) in another MLX
+front-end (e.g. C++). 
+
+This guide walks through the basics of the MLX export API with some examples.
+To see the full list of functions check out the :ref:`API documentation
+<export>`.
+
+Basics of Exporting 
+-------------------
+
+Let's start with a simple example:
+ 
+.. code-block:: python
+
+  def fun(x, y):
+    return x + y
+
+  x = mx.array(1.0)
+  y = mx.array(1.0)
+  mx.export_function("add.mlxfn", fun, x, y)
+
+To export a function, provide sample input arrays that the function
+can be called with. The data doesn't matter, but the shapes and types of the
+arrays do. In the above example we exported ``fun`` with two ``float32``
+scalar arrays. We can then import the function and run it:
+
+.. code-block:: python
+
+  add_fun = mx.import_function("add.mlxfn")
+
+  out, = add_fun(mx.array(1.0), mx.array(2.0))
+  # Prints: array(3, dtype=float32)
+  print(out)
+
+  out, = add_fun(mx.array(1.0), mx.array(3.0))
+  # Prints: array(4, dtype=float32)
+  print(out)
+
+  # Raises an exception
+  add_fun(mx.array(1), mx.array(3.0))
+
+  # Raises an exception
+  add_fun(mx.array([1.0, 2.0]), mx.array(3.0))
+
+Notice the third and fourth calls to ``add_fun`` raise exceptions because the
+shapes and types of the inputs are different than the shapes and types of the
+example inputs we exported the function with.
+
+Also notice that even though the original ``fun`` returns a single output
+array, the imported function always returns a tuple of one or more arrays.
+
+The inputs to :func:`export_function` and to an imported function can be
+specified as variable positional arguments or as a tuple of arrays:
+
+.. code-block:: python
+
+  def fun(x, y):
+    return x + y
+
+  x = mx.array(1.0)
+  y = mx.array(1.0)
+   
+  # Both arguments to fun are positional
+  mx.export_function("add.mlxfn", fun, x, y)
+
+  # Same as above
+  mx.export_function("add.mlxfn", fun, (x, y))
+
+  imported_fun = mx.import_function("add.mlxfn")
+
+  # Ok
+  out, = imported_fun(x, y)
+
+  # Also ok
+  out, = imported_fun((x, y))
+
+You can pass example inputs to functions as positional or keyword arguments. If
+you use keyword arguments to export the function, then you have to use the same
+keyword arguments when calling the imported function.
+
+.. code-block:: python
+
+  def fun(x, y):
+    return x + y
+
+  # One argument to fun is positional, the other is a kwarg
+  mx.export_function("add.mlxfn", fun, x, y=y)
+
+  imported_fun = mx.import_function("add.mlxfn")
+
+  # Ok
+  out, = imported_fun(x, y=y)
+
+  # Also ok
+  out, = imported_fun((x,), {"y": y})
+
+  # Raises since the keyword argument is missing
+  out, = imported_fun(x, y)
+
+  # Raises since the keyword argument has the wrong key
+  out, = imported_fun(x, z=y)
+
+
+Exporting Modules
+-----------------
+
+An :obj:`mlx.nn.Module` can be exported with or without the parameters included
+in the exported function. Here's an example:
+
+.. code-block:: python
+
+   model = nn.Linear(4, 4)
+   mx.eval(model.parameters())
+
+   def call(x):
+      return model(x)
+
+   mx.export_function("model.mlxfn", call, mx.zeros(4))
+
+In the above example, the :obj:`mlx.nn.Linear` module is exported. Its
+parameters are also saved to the ``model.mlxfn`` file.
+
+.. note::
+
+   For enclosed arrays inside an exported function, be extra careful to ensure
+   they are evaluated. The computation graph that gets exported will include
+   the computation that produces enclosed inputs.
+  
+   If the above example was missing ``mx.eval(model.parameters())``, the
+   exported function would include the random initialization of the
+   :obj:`mlx.nn.Module` parameters.
+
+If you only want to export the ``Module.__call__`` function without the
+parameters, pass them as inputs to the ``call`` wrapper:
+
+.. code-block:: python
+
+   model = nn.Linear(4, 4)
+   mx.eval(model.parameters())
+
+   def call(x, **params):
+     # Set the model's parameters to the input parameters
+     model.update(tree_unflatten(list(params.items())))
+     return model(x)
+ 
+   params = dict(tree_flatten(model.parameters()))
+   mx.export_function("model.mlxfn", call, (mx.zeros(4),), params)
+
+
+Shapeless Exports
+-----------------
+
+Just like :func:`compile`, functions can also be exported for dynamically shaped
+inputs. Pass ``shapeless=True`` to :func:`export_function` or :func:`exporter`
+to export a function which can be used for inputs with variable shapes:
+
+.. code-block:: python
+
+  mx.export_function("fun.mlxfn", mx.abs, mx.array(0.0), shapeless=True)
+  imported_abs = mx.import_function("fun.mlxfn")
+
+  # Ok
+  out, = imported_abs(mx.array(-1.0))
+  
+  # Also ok 
+  out, = imported_abs(mx.array([-1.0, -2.0]))
+
+With ``shapeless=False`` (which is the default), the second call to
+``imported_abs`` would raise an exception with a shape mismatch.
+
+Shapeless exporting works the same as shapeless compilation and should be
+used carefully. See the :ref:`documentation on shapeless compilation
+<shapeless_compile>` for more information.
+
+Exporting Multiple Traces
+-------------------------
+
+In some cases, functions build different computation graphs for different
+input arguments. A simple way to manage this is to export to a new file with
+each set of inputs. This is a fine option in many cases. But it can be
+suboptimal if the exported functions have a large amount of duplicate constant
+data (for example the parameters of a :obj:`mlx.nn.Module`).
+
+The export API in MLX lets you export multiple traces of the same function to
+a single file by creating an exporting context manager with :func:`exporter`:
+
+.. code-block:: python
+
+  def fun(x, y=None):
+      constant = mx.array(3.0)
+      if y is not None:
+        x += y 
+      return x + constant
+
+  with mx.exporter("fun.mlxfn", fun) as exporter:
+      exporter(mx.array(1.0))
+      exporter(mx.array(1.0), y=mx.array(0.0))
+
+  imported_function = mx.import_function("fun.mlxfn")
+
+  # Call the function with y=None
+  out, = imported_function(mx.array(1.0))
+  print(out)
+
+  # Call the function with y specified
+  out, = imported_function(mx.array(1.0), y=mx.array(1.0))
+  print(out)
+
+In the above example the function's constant data (i.e. ``constant``) is only
+saved once.
+
+Transformations with Imported Functions
+---------------------------------------
+
+Function transformations like :func:`grad`, :func:`vmap`, and :func:`compile` work
+on imported functions just like regular Python functions:
+
+.. code-block:: python
+
+  def fun(x):
+      return mx.sin(x)
+
+  x = mx.array(0.0)
+  mx.export_function("sine.mlxfn", fun, x)
+
+  imported_fun = mx.import_function("sine.mlxfn")
+
+  # Take the derivative of the imported function
+  dfdx = mx.grad(lambda x: imported_fun(x)[0])
+  # Prints: array(1, dtype=float32)
+  print(dfdx(x))
+
+  # Compile the imported function 
+  compiled_fun = mx.compile(imported_fun)
+  # Prints: array(0, dtype=float32)
+  print(compiled_fun(x)[0])
+
+
+Importing Functions in C++
+--------------------------
+
+Importing and running functions in C++ is basically the same as importing and
+running them in Python. First, follow the :ref:`instructions <mlx_in_cpp>` to
+set up a simple C++ project that uses MLX as a library.
+
+Next, export a simple function from Python:
+
+.. code-block:: python
+
+  def fun(x, y):
+      return mx.exp(x + y)
+
+  x = mx.array(1.0)
+  y = mx.array(1.0)
+  mx.export_function("fun.mlxfn", fun, x, y)
+
+
+Import and run the function in C++ with only a few lines of code:
+
+.. code-block:: c++
+
+  auto fun = mx::import_function("fun.mlxfn");
+
+  auto inputs = {mx::array(1.0), mx::array(1.0)};
+  auto outputs = fun(inputs);
+
+  // Prints: array(7.38906, dtype=float32)
+  std::cout << outputs[0] << std::endl;
+
+Imported functions can be transformed in C++ just like in Python. Use 
+``std::vector<mx::array>`` for positional arguments and ``std::map<std::string,
+mx::array>`` for keyword arguments when calling imported functions in C++.
+
+More Examples
+-------------
+
+Here are a few more complete examples exporting more complex functions from
+Python and importing and running them in C++:
+
+* `Inference and training a multi-layer perceptron <https://github.com/ml-explore/mlx/tree/main/examples/export>`_