diff --git a/.circleci/config.yml b/.circleci/config.yml
index 0ea9303db..205a930af 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -16,6 +16,9 @@ parameters:
   linux_release:
     type: boolean
     default: false
+  cuda_release:
+    type: boolean
+    default: false
 
 jobs:
   build_documentation:
@@ -104,7 +107,7 @@ jobs:
           command: |
             echo "stubs"
             pip install typing_extensions
-            python setup.py generate_stubs
+            python setup.py generate_stubs
       - run:
           name: Run Python tests
           command: |
@@ -162,7 +165,7 @@ jobs:
          command: |
            source env/bin/activate
            pip install typing_extensions
-           python setup.py generate_stubs
+           python setup.py generate_stubs
      - run:
          name: Run Python tests
          command: |
@@ -223,7 +226,6 @@ jobs:
          command: |
            sudo apt-get update
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
-           sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev
            python -m venv env
            source env/bin/activate
            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
@@ -283,7 +285,7 @@ jobs:
          command: |
            source env/bin/activate
            pip install typing_extensions
-           python setup.py generate_stubs
+           python setup.py generate_stubs
      - run:
          name: Build Python package
          command: |
@@ -342,7 +344,7 @@ jobs:
            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
            pip install . -v
            pip install typing_extensions
-           python setup.py generate_stubs
+           python setup.py generate_stubs
            << parameters.extra_env >> \
            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
            python -m build --wheel
@@ -356,6 +358,48 @@ jobs:
      - store_artifacts:
          path: wheelhouse/
 
+  build_cuda_release:
+    parameters:
+      python_version:
+        type: string
+        default: "3.9"
+      extra_env:
+        type: string
+        default: "DEV_RELEASE=1"
+    machine:
+      image: linux-cuda-12:default
+    resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - run:
+          name: Build wheel
+          command: |
+            sudo apt-get update
+            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            python -m venv env
+            source env/bin/activate
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            << parameters.extra_env >> \
+            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+            pip install ".[dev]" -v
+            python setup.py generate_stubs
+            << parameters.extra_env >> \
+            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+            python -m build --wheel
+            bash python/scripts/repair_cuda.sh
+      - run:
+          name: Upload package
+          command: |
+            source env/bin/activate
+            twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+
 workflows:
   build_and_test:
     when:
@@ -625,3 +669,14 @@ workflows:
           parameters:
             python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
             extra_env: ["PYPI_RELEASE=1"]
+  cuda_test_release:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.cuda_release >>
+    jobs:
+      - build_cuda_release:
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              extra_env: ["PYPI_RELEASE=1"]
diff --git a/docs/src/install.rst b/docs/src/install.rst
index 059b2cba4..22de94f90 100644
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -30,6 +30,16 @@ MLX is also available on conda-forge. To install MLX with conda do:
 
     conda install conda-forge::mlx
 
+CUDA
+^^^^
+
+MLX has a CUDA backend which you can use on any Linux platform with CUDA 12
+and SM 7.0 (Volta) and up. To install MLX with CUDA support, run:
+
+.. code-block:: shell
+
+    pip install mlx-cuda
+
 Troubleshooting
 ^^^^^^^^^^^^^^^
 
@@ -65,6 +75,8 @@ Build Requirements
 Python API
 ^^^^^^^^^^
 
+.. _python install:
+
 To build and install the MLX python library from source, first, clone MLX
 from `its GitHub repo <https://github.com/ml-explore/mlx>`_:
@@ -107,6 +119,8 @@ IDE:
 C++ API
 ^^^^^^^
 
+.. _cpp install:
+
 Currently, MLX must be built and installed from source.
 
 Similarly to the python library, to build and install the MLX C++ library start
@@ -185,6 +199,7 @@ should point to the path to the built metal library.
 
     xcrun -sdk macosx --show-sdk-version
 
+
 Binary Size Minimization
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -213,6 +228,50 @@ be anywhere from a few hundred milliseconds to a few seconds depending on the
 application. Once a kernel is compiled, it will be cached by the system. The
 Metal kernel cache persists across reboots.
 
+Linux
+^^^^^
+
+To build from source on Linux (CPU only), install the BLAS and LAPACK headers.
+For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+    apt-get update -y
+    apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+From here, follow the instructions to install either the
+:ref:`Python <python install>` or :ref:`C++ <cpp install>` APIs.
+
+CUDA
+^^^^
+
+To build from source on Linux with CUDA, install the BLAS and LAPACK headers
+and the CUDA toolkit. For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+    dpkg -i cuda-keyring_1.1-1_all.deb
+    apt-get update -y
+    apt-get -y install cuda-toolkit-12-9
+    apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+When building either the Python or C++ APIs make sure to pass the cmake flag
+``MLX_BUILD_CUDA=ON``. For example, to build the Python API run:
+
+.. code-block:: shell
+
+    CMAKE_BUILD_PARALLEL_LEVEL=8 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"
+
+To build the C++ package run:
+
+.. code-block:: shell
+
+    mkdir -p build && cd build
+    cmake .. -DMLX_BUILD_CUDA=ON && make -j
+
+
 Troubleshooting
 ^^^^^^^^^^^^^^^
 
diff --git a/mlx/backend/cuda/device.cpp b/mlx/backend/cuda/device.cpp
index fcf7fdf5e..ba31c0e45 100644
--- a/mlx/backend/cuda/device.cpp
+++ b/mlx/backend/cuda/device.cpp
@@ -114,7 +114,7 @@ void CommandEncoder::synchronize() {
   std::future<void> f = p->get_future();
   add_completed_handler([p = std::move(p)]() { p->set_value(); });
   worker_.end_batch();
-  worker_.commit();
+  commit();
   f.wait();
 }
 
diff --git a/python/scripts/repair_cuda.sh b/python/scripts/repair_cuda.sh
new file mode 100644
index 000000000..21e6a977a
--- /dev/null
+++ b/python/scripts/repair_cuda.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+auditwheel repair dist/* \
+  --plat manylinux_2_35_x86_64 \
+  --exclude libcublas* \
+  --exclude libnvrtc*
+
+cd wheelhouse
+repaired_wheel=$(find . -name "*.whl" -print -quit)
+unzip -q "${repaired_wheel}"
+core_so=$(find mlx -name "core*.so" -print -quit)
+rpath=$(patchelf --print-rpath "${core_so}")
+rpath=$rpath:\$ORIGIN/../nvidia/cublas/lib:\$ORIGIN/../nvidia/cuda_nvrtc/lib
+patchelf --force-rpath --set-rpath "$rpath" "$core_so"
+
+# Re-zip the repaired wheel
+zip -r -q "${repaired_wheel}" .
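In the ``build_cuda_release`` job above, ``repair_cuda.sh`` runs immediately
after ``python -m build --wheel``: ``auditwheel`` grafts external shared
libraries into the wheel while excluding cuBLAS and NVRTC, and ``patchelf``
then appends ``$ORIGIN``-relative entries to the extension module's rpath so
those two libraries are resolved at runtime from the NVIDIA pip packages
(declared in ``setup.py`` below) instead of being bundled. A quick spot check
of the result might look like the following; this is a hypothetical sketch,
run from inside the unpacked wheel in ``wheelhouse/``, and the exact ``.so``
filename depends on the Python version:

.. code-block:: shell

    # Print the patched rpath of the extension module (the script locates it
    # with `find mlx -name "core*.so"`, since the name varies by Python version).
    patchelf --print-rpath mlx/core*.so
    # The output should end with the entries appended by the script:
    #   $ORIGIN/../nvidia/cublas/lib:$ORIGIN/../nvidia/cuda_nvrtc/lib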
diff --git a/setup.py b/setup.py
index d742e6595..35f2e68ef 100644
--- a/setup.py
+++ b/setup.py
@@ -174,20 +174,26 @@ if __name__ == "__main__":
     )
     package_dir = {"": "python"}
     package_data = {"mlx": ["lib/*", "include/*", "share/*"], "mlx.core": ["*.pyi"]}
+    install_requires = []
+    build_cuda = "MLX_BUILD_CUDA=ON" in os.environ.get("CMAKE_ARGS", "")
+    if build_cuda:
+        install_requires = ["nvidia-cublas-cu12", "nvidia-cuda-nvrtc-cu12"]
 
     setup(
-        name="mlx",
+        name="mlx-cuda" if build_cuda else "mlx",
         version=get_version(),
         author="MLX Contributors",
         author_email="mlx@group.apple.com",
         description="A framework for machine learning on Apple silicon.",
         long_description=long_description,
         long_description_content_type="text/markdown",
+        license="MIT",
         url="https://github.com/ml-explore/mlx",
         packages=packages,
         package_dir=package_dir,
         package_data=package_data,
         include_package_data=True,
+        install_requires=install_requires,
         extras_require={
             "dev": [
                 "nanobind==2.4.0",
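Together, these ``setup.py`` changes mean a CUDA build is published under the
name ``mlx-cuda`` and declares ``nvidia-cublas-cu12`` and
``nvidia-cuda-nvrtc-cu12`` as install requirements, so the libraries excluded
by the repair script are supplied from PyPI at install time. A minimal smoke
test of the finished wheel might look like this; a hypothetical sketch
assuming a Linux machine with CUDA 12 and an SM 7.0+ GPU:

.. code-block:: shell

    pip install mlx-cuda
    # Import the package and run a tiny computation on the default device.
    python -c "import mlx.core as mx; print(mx.default_device()); print(mx.sum(mx.ones((4, 4))))"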