Compare commits

..

4 Commits

Author        SHA1        Message                                      Date
Awni Hannun   38c9085938  update circle                                2025-07-11 09:36:38 -07:00
Awni Hannun   8f93ca9e52  cleanup circle, fix cuda repair              2025-07-11 08:17:58 -07:00
Awni Hannun   c623cc7683  temp for testing                             2025-07-10 22:04:28 -07:00
Awni Hannun   9381163788  install linux with mlx[cuda] and mlx[cpu]    2025-07-10 22:04:28 -07:00

7 changed files with 209 additions and 194 deletions

View File

@@ -7,18 +7,6 @@ parameters:
nightly_build:
type: boolean
default: false
weekly_build:
type: boolean
default: false
test_release:
type: boolean
default: false
linux_release:
type: boolean
default: false
cuda_release:
type: boolean
default: false
jobs:
build_documentation:
@@ -301,9 +289,9 @@ jobs:
python_version:
type: string
default: "3.9"
extra_env:
build_env:
type: string
default: "DEV_RELEASE=1"
default: ""
docker:
- image: ubuntu:20.04
steps:
@@ -332,17 +320,22 @@ jobs:
pip install patchelf
pip install build
pip install twine
<< parameters.extra_env >> pip install . -v
<< parameters.build_env >> pip install . -v
pip install typing_extensions
python setup.py generate_stubs
<< parameters.extra_env >> python -m build --wheel
<< parameters.build_env >> python -m build --wheel
auditwheel show dist/*
auditwheel repair dist/* --plat manylinux_2_31_x86_64
- run:
name: Upload package
command: |
source env/bin/activate
twine upload wheelhouse/*
<< parameters.build_env >> MLX_BUILD_COMMON=1 \
python -m build --wheel --outdir wheelhouse
- when:
condition: << parameters.build_env >>
steps:
- run:
name: Upload packages
command: |
source env/bin/activate
twine upload wheelhouse/*
- store_artifacts:
path: wheelhouse/
@@ -351,9 +344,9 @@ jobs:
python_version:
type: string
default: "3.9"
extra_env:
build_env:
type: string
default: "DEV_RELEASE=1"
default: ""
machine:
image: linux-cuda-12:default
resource_class: gpu.nvidia.small.gen2
@@ -364,25 +357,29 @@ jobs:
command: |
sudo apt-get update
sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
sudo apt-get install zip
python -m venv env
source env/bin/activate
pip install auditwheel
pip install patchelf
pip install build
pip install twine
<< parameters.extra_env >> \
<< parameters.build_env >> \
CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
pip install ".[dev]" -v
python setup.py generate_stubs
<< parameters.extra_env >> \
<< parameters.build_env >> \
CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
python -m build --wheel
bash python/scripts/repair_cuda.sh
- run:
name: Upload package
command: |
source env/bin/activate
twine upload wheelhouse/*.whl
- when:
condition: << parameters.build_env >>
steps:
- run:
name: Upload package
command: |
source env/bin/activate
twine upload wheelhouse/*.whl
- store_artifacts:
path: wheelhouse/
@@ -394,8 +391,6 @@ workflows:
pattern: "^(?!pull/)[-\\w]+$"
value: << pipeline.git.branch >>
- not: << pipeline.parameters.nightly_build >>
- not: << pipeline.parameters.weekly_build >>
- not: << pipeline.parameters.test_release >>
jobs:
- mac_build_and_test:
matrix:
@@ -409,8 +404,6 @@ workflows:
when:
and:
- not: << pipeline.parameters.nightly_build >>
- not: << pipeline.parameters.weekly_build >>
- not: << pipeline.parameters.test_release >>
jobs:
- build_release:
filters:
@@ -501,7 +494,17 @@ workflows:
matrix:
parameters:
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
extra_env: ["PYPI_RELEASE=1"]
build_env: ["PYPI_RELEASE=1"]
- build_cuda_release:
filters:
tags:
only: /^v.*/
branches:
ignore: /.*/
matrix:
parameters:
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
build_env: ["PYPI_RELEASE=1"]
prb:
when:
@@ -580,99 +583,21 @@ workflows:
- macosx_deployment_target: "15.0"
xcode_version: "15.0.0"
python_version: "3.13"
weekly_build:
when:
and:
- equal: [ main, << pipeline.git.branch >> ]
- << pipeline.parameters.weekly_build >>
jobs:
- build_release:
matrix:
parameters:
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
macosx_deployment_target: ["13.5", "14.0", "15.0"]
build_env: ["DEV_RELEASE=1"]
xcode_version: ["16.2.0", "15.0.0"]
exclude:
- macosx_deployment_target: "13.5"
xcode_version: "16.2.0"
python_version: "3.9"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "13.5"
xcode_version: "16.2.0"
python_version: "3.10"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "13.5"
xcode_version: "16.2.0"
python_version: "3.11"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "13.5"
xcode_version: "16.2.0"
python_version: "3.12"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "13.5"
xcode_version: "16.2.0"
python_version: "3.13"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "14.0"
xcode_version: "15.0.0"
python_version: "3.9"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "14.0"
xcode_version: "15.0.0"
python_version: "3.10"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "14.0"
xcode_version: "15.0.0"
python_version: "3.11"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "14.0"
xcode_version: "15.0.0"
python_version: "3.12"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "14.0"
xcode_version: "15.0.0"
python_version: "3.13"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "15.0"
xcode_version: "15.0.0"
python_version: "3.9"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "15.0"
xcode_version: "15.0.0"
python_version: "3.10"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "15.0"
xcode_version: "15.0.0"
python_version: "3.11"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "15.0"
xcode_version: "15.0.0"
python_version: "3.12"
build_env: "DEV_RELEASE=1"
- macosx_deployment_target: "15.0"
xcode_version: "15.0.0"
python_version: "3.13"
build_env: "DEV_RELEASE=1"
linux_test_release:
when:
and:
- equal: [ main, << pipeline.git.branch >> ]
- << pipeline.parameters.linux_release >>
jobs:
- build_linux_release:
filters:
tags:
only: /^v.*/
branches:
ignore: /.*/
matrix:
parameters:
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
extra_env: ["PYPI_RELEASE=1"]
cuda_test_release:
when:
and:
- equal: [ main, << pipeline.git.branch >> ]
- << pipeline.parameters.cuda_release >>
jobs:
- build_cuda_release:
filters:
tags:
only: /^v.*/
branches:
ignore: /.*/
matrix:
parameters:
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
extra_env: ["PYPI_RELEASE=1"]

View File

@@ -38,8 +38,16 @@ and SM 7.0 (Volta) and up. To install MLX with CUDA support, run:
.. code-block:: shell
pip install mlx-cuda
pip install "mlx[cuda]"
CPU only (Linux)
^^^^^^^^^^^^^^^^
For a CPU-only version of MLX that runs on Linux use:
.. code-block:: shell
pip install "mlx[cpu]"
Troubleshooting
^^^^^^^^^^^^^^^
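
Editor's note: the two install commands above map onto the packaging split in setup.py further down in this compare: on Linux, `mlx` becomes a thin metapackage whose `cuda` and `cpu` extras pin `mlx-cuda` and `mlx-cpu` to the same version. A minimal post-install sanity check, standard library only; the snippet and its sample output are illustrative assumptions, not part of this diff:

# List which MLX distributions satisfied the install (standard library only).
from importlib.metadata import distributions

mlx_dists = sorted(
    dist.metadata["Name"]
    for dist in distributions()
    if dist.metadata["Name"].lower().startswith("mlx")
)
print(mlx_dists)  # e.g. ['mlx', 'mlx-cuda'] after: pip install "mlx[cuda]"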

View File

@@ -22,20 +22,78 @@
#include "mlx/backend/cpu/encoder.h"
#include "mlx/distributed/distributed.h"
#include "mlx/distributed/distributed_impl.h"
#include "mlx/dtype_utils.h"
#include "mlx/threadpool.h"
#ifndef SOL_TCP
#define SOL_TCP IPPROTO_TCP
#endif
#define SWITCH_TYPE(x, ...) \
switch ((x).dtype()) { \
case bool_: { \
using T = bool; \
__VA_ARGS__; \
} break; \
case int8: { \
using T = int8_t; \
__VA_ARGS__; \
} break; \
case int16: { \
using T = int16_t; \
__VA_ARGS__; \
} break; \
case int32: { \
using T = int32_t; \
__VA_ARGS__; \
} break; \
case int64: { \
using T = int64_t; \
__VA_ARGS__; \
} break; \
case uint8: { \
using T = uint8_t; \
__VA_ARGS__; \
} break; \
case uint16: { \
using T = uint16_t; \
__VA_ARGS__; \
} break; \
case uint32: { \
using T = uint32_t; \
__VA_ARGS__; \
} break; \
case uint64: { \
using T = uint64_t; \
__VA_ARGS__; \
} break; \
case bfloat16: { \
using T = bfloat16_t; \
__VA_ARGS__; \
} break; \
case float16: { \
using T = float16_t; \
__VA_ARGS__; \
} break; \
case float32: { \
using T = float; \
__VA_ARGS__; \
} break; \
case float64: { \
using T = double; \
__VA_ARGS__; \
} break; \
case complex64: { \
using T = complex64_t; \
__VA_ARGS__; \
} break; \
}
namespace mlx::core::distributed::ring {
constexpr const size_t ALL_SUM_SIZE = 8 * 1024 * 1024;
constexpr const size_t ALL_SUM_BUFFERS = 2;
constexpr const int CONN_ATTEMPTS = 5;
constexpr const int CONN_WAIT = 1000;
constexpr const int INIT_TIMEOUT = 20000;
using GroupImpl = mlx::core::distributed::detail::GroupImpl;
using json = nlohmann::json;
@@ -445,7 +503,6 @@ std::vector<int> make_connections(
return sockets;
}
template <typename T>
struct SumOp {
void operator()(const T* input, T* output, size_t N) {
@@ -493,27 +550,19 @@ class RingGroup : public GroupImpl {
size_ = nodes.size();
int connect_to = (rank_ + 1) % size_;
// Initialize the ring by making all the connections
log_info(verbose_, "Rank", rank_, "accepting");
log_info(verbose_, "Rank", rank_, "connecting to", connect_to);
auto sl = std::async(std::launch::async, accept_connections, nodes[rank_]);
auto sr = std::async(
std::launch::async, make_connections, nodes[connect_to], verbose);
std::future_status status_sl, status_sr;
for (int i = 0; i < 10; i++) {
status_sl = sl.wait_for(std::chrono::milliseconds(INIT_TIMEOUT / 10));
status_sr = sl.wait_for(std::chrono::milliseconds(INIT_TIMEOUT / 10));
if (status_sl == std::future_status::ready &&
status_sr == std::future_status::ready) {
break;
}
// We define the connection order by having the rank_ == size_ - 1 connect
// first and accept after.
if (rank_ < connect_to) {
log_info(verbose_, "Rank", rank_, "accepting");
sockets_left_ = std::move(accept_connections(nodes[rank_]));
log_info(verbose_, "Rank", rank_, "connecting to", connect_to);
sockets_right_ = std::move(make_connections(nodes[connect_to], verbose));
} else {
log_info(verbose_, "Rank", rank_, "connecting to", connect_to);
sockets_right_ = std::move(make_connections(nodes[connect_to], verbose));
log_info(verbose_, "Rank", rank_, "accepting");
sockets_left_ = std::move(accept_connections(nodes[rank_]));
}
if (status_sl != std::future_status::ready ||
status_sr != std::future_status::ready) {
throw std::runtime_error("[ring] Ring initialization timed out");
}
sockets_left_ = std::move(sl.get());
sockets_right_ = std::move(sr.get());
// Failure if we couldn't make right or left sockets
if (sockets_right_.empty()) {
@@ -579,24 +628,18 @@ class RingGroup : public GroupImpl {
}
void all_sum(const array& input, array& output, Stream stream) override {
dispatch_all_types(output.dtype(), [&](auto type_tag) {
using T = MLX_GET_TYPE(type_tag);
all_reduce<T, SumOp<T>>(input, output, stream, SumOp<T>());
});
SWITCH_TYPE(
output, all_reduce<T, SumOp<T>>(input, output, stream, SumOp<T>()));
}
void all_max(const array& input, array& output, Stream stream) override {
dispatch_all_types(output.dtype(), [&](auto type_tag) {
using T = MLX_GET_TYPE(type_tag);
all_reduce<T, MaxOp<T>>(input, output, stream, MaxOp<T>());
});
SWITCH_TYPE(
output, all_reduce<T, MaxOp<T>>(input, output, stream, MaxOp<T>()));
}
void all_min(const array& input, array& output, Stream stream) override {
dispatch_all_types(output.dtype(), [&](auto type_tag) {
using T = MLX_GET_TYPE(type_tag);
all_reduce<T, MinOp<T>>(input, output, stream, MinOp<T>());
});
SWITCH_TYPE(
output, all_reduce<T, MinOp<T>>(input, output, stream, MinOp<T>()));
}
std::shared_ptr<GroupImpl> split(int color, int key = -1) override {
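
Editor's note: the initialization hunk above trades one wiring strategy for another: overlapping the blocking accept and connect with std::async plus a timeout, versus fixing a per-rank order in which the last rank connects before accepting so the blocking calls cannot deadlock. A minimal Python sketch of the ordered variant; `accept_from_left` and `connect_to_right` are hypothetical stand-ins for accept_connections() and make_connections():

# Rank-ordered ring handshake, transliterated from the hunk above.
# accept_from_left / connect_to_right are placeholder callables.
def init_ring_rank(rank, size, accept_from_left, connect_to_right):
    connect_to = (rank + 1) % size
    if rank < connect_to:
        # All ranks except the last: wait for the left neighbour first,
        # then dial the right neighbour.
        sockets_left = accept_from_left()
        sockets_right = connect_to_right(connect_to)
    else:
        # rank == size - 1 wraps around to 0: dial first, then accept,
        # which breaks the cycle of mutually blocking accepts.
        sockets_right = connect_to_right(connect_to)
        sockets_left = accept_from_left()
    return sockets_left, sockets_right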

View File

@@ -3,15 +3,19 @@
auditwheel repair dist/* \
--plat manylinux_2_35_x86_64 \
--exclude libcublas* \
--exclude libnvrtc*
--exclude libnvrtc* \
-w wheel_tmp
cd wheelhouse
mkdir wheelhouse
cd wheel_tmp
repaired_wheel=$(find . -name "*.whl" -print -quit)
unzip -q "${repaired_wheel}"
rm "${repaired_wheel}"
core_so=$(find mlx -name "core*.so" -print -quit)
rpath=$(patchelf --print-rpath "${core_so}")
rpath=$rpath:\$ORIGIN/../nvidia/cublas/lib:\$ORIGIN/../nvidia/cuda_nvrtc/lib
patchelf --force-rpath --set-rpath "$rpath" "$core_so"
# Re-zip the repaired wheel
zip -r -q "${repaired_wheel}" .
zip -r -q "../wheelhouse/${repaired_wheel}" .

View File

@@ -4,7 +4,6 @@ import unittest
import mlx.core as mx
import mlx_distributed_tests
import mlx_tests
class TestMPIDistributed(mlx_distributed_tests.MLXDistributedCommonTestCase):
@@ -151,4 +150,4 @@ class TestMPIDistributed(mlx_distributed_tests.MLXDistributedCommonTestCase):
if __name__ == "__main__":
mlx_tests.MLXTestRunner()
unittest.main()

View File

@@ -4,7 +4,6 @@ import unittest
import mlx.core as mx
import mlx_distributed_tests
import mlx_tests
class TestRingDistributed(mlx_distributed_tests.MLXDistributedCommonTestCase):

View File

@@ -5,10 +5,11 @@ import os
import platform
import re
import subprocess
from functools import partial
from pathlib import Path
from subprocess import run
from setuptools import Command, Extension, find_namespace_packages, setup
from setuptools import Command, Extension, setup
from setuptools.command.build_ext import build_ext
@@ -165,19 +166,28 @@ with open(Path(__file__).parent / "README.md", encoding="utf-8") as f:
# The information here can also be placed in setup.cfg - better separation of
# logic and declaration, and simpler if you include description/version in a file.
if __name__ == "__main__":
packages = find_namespace_packages(
where="python", exclude=["src", "tests", "tests.*"]
)
package_dir = {"": "python"}
package_data = {"mlx": ["lib/*", "include/*", "share/*"], "mlx.core": ["*.pyi"]}
install_requires = []
packages = [
"mlx",
"mlx.nn",
"mlx.nn.layers",
"mlx.optimizers",
]
is_release = "PYPI_RELEASE" in os.environ
build_macos = platform.system() == "Darwin"
build_cuda = "MLX_BUILD_CUDA=ON" in os.environ.get("CMAKE_ARGS", "")
build_common = "MLX_BUILD_COMMON" in os.environ
install_requires = []
if build_cuda:
install_requires = ["nvidia-cublas-cu12", "nvidia-cuda-nvrtc-cu12"]
version = get_version()
setup(
name="mlx-cuda" if build_cuda else "mlx",
version=get_version(),
_setup = partial(
setup,
version=version,
author="MLX Contributors",
author_email="mlx@group.apple.com",
description="A framework for machine learning on Apple silicon.",
@@ -185,29 +195,56 @@ if __name__ == "__main__":
long_description_content_type="text/markdown",
license="MIT",
url="https://github.com/ml-explore/mlx",
packages=packages,
package_dir=package_dir,
package_data=package_data,
include_package_data=True,
install_requires=install_requires,
extras_require={
"dev": [
"nanobind==2.4.0",
"numpy",
"pre-commit",
"setuptools>=42",
"torch",
"typing_extensions",
],
},
entry_points={
"console_scripts": [
"mlx.launch = mlx.distributed_run:main",
"mlx.distributed_config = mlx.distributed_run:distributed_config",
]
},
ext_modules=[CMakeExtension("mlx.core")],
cmdclass={"build_ext": CMakeBuild, "generate_stubs": GenerateStubs},
zip_safe=False,
python_requires=">=3.9",
install_requires=install_requires,
)
extras = {
"dev": [
"nanobind==2.4.0",
"numpy",
"pre-commit",
"setuptools>=42",
"torch",
"typing_extensions",
],
}
entry_points = {
"console_scripts": [
"mlx.launch = mlx.distributed_run:main",
"mlx.distributed_config = mlx.distributed_run:distributed_config",
]
}
if not is_release or build_macos:
_setup(
name="mlx",
include_package_data=True,
packages=packages,
extras_require=extras,
entry_points=entry_points,
ext_modules=[CMakeExtension("mlx.core")],
cmdclass={"build_ext": CMakeBuild, "generate_stubs": GenerateStubs},
)
elif build_common:
extras["cpu"] = [f"mlx-cpu=={version}"]
extras["cuda"] = [f"mlx-cuda=={version}"]
_setup(
name="mlx",
packages=["mlx"],
extras_require=extras,
entry_points=entry_points,
exclude_package_data=package_data,
)
else:
_setup(
name="mlx-cuda" if build_cuda else "mlx-cpu",
include_package_data=True,
packages=packages,
extras_require=extras,
ext_modules=[CMakeExtension("mlx.core")],
cmdclass={"build_ext": CMakeBuild, "generate_stubs": GenerateStubs},
)
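
Editor's note: taken together, the new setup.py emits one of three distributions per invocation, keyed off PYPI_RELEASE, the platform, MLX_BUILD_COMMON, and whether CMAKE_ARGS enables CUDA. A condensed sketch of that branching; the version string is a placeholder for what get_version() returns:

# Condensed view of the packaging split above (version is a placeholder).
import os
import platform

version = "0.0.0"  # stand-in for get_version()
is_release = "PYPI_RELEASE" in os.environ
build_macos = platform.system() == "Darwin"
build_cuda = "MLX_BUILD_CUDA=ON" in os.environ.get("CMAKE_ARGS", "")
build_common = "MLX_BUILD_COMMON" in os.environ

if not is_release or build_macos:
    # Dev builds and every macOS build: the single `mlx` package with the
    # compiled mlx.core extension.
    name, extras = "mlx", {}
elif build_common:
    # Linux release metapackage: pure-Python `mlx` whose extras pin the
    # platform wheels, so `pip install "mlx[cuda]"` pulls in mlx-cuda==version.
    name = "mlx"
    extras = {"cpu": [f"mlx-cpu=={version}"], "cuda": [f"mlx-cuda=={version}"]}
else:
    # Linux release platform wheels that carry the compiled extension.
    name = "mlx-cuda" if build_cuda else "mlx-cpu"
    extras = {}

print(name, extras)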