mirror of
https://github.com/ml-explore/mlx.git
synced 2025-12-16 01:49:05 +08:00
Compare commits
2 Commits
a0ae49d397
...
2204182bba
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2204182bba | ||
|
|
3628e5d497 |
@@ -81,23 +81,24 @@ jobs:
|
|||||||
export DEBIAN_FRONTEND=noninteractive
|
export DEBIAN_FRONTEND=noninteractive
|
||||||
export NEEDRESTART_MODE=a
|
export NEEDRESTART_MODE=a
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get upgrade -y
|
|
||||||
pip install --upgrade cmake
|
|
||||||
sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
|
sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
|
||||||
sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev
|
sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev
|
||||||
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
- run:
|
- run:
|
||||||
name: Install Python package
|
name: Install Python package
|
||||||
command: |
|
command: |
|
||||||
pip install -e ".[dev]"
|
uv venv
|
||||||
|
uv pip install cmake
|
||||||
|
uv pip install -e ".[dev]" -v
|
||||||
- run:
|
- run:
|
||||||
name: Generate package stubs
|
name: Generate package stubs
|
||||||
command: |
|
command: |
|
||||||
echo "stubs"
|
uv pip install typing_extensions
|
||||||
pip install typing_extensions
|
uv run --no-project setup.py generate_stubs
|
||||||
python setup.py generate_stubs
|
|
||||||
- run:
|
- run:
|
||||||
name: Run Python tests
|
name: Run Python tests
|
||||||
command: |
|
command: |
|
||||||
|
source .venv/bin/activate
|
||||||
python -m unittest discover python/tests -v
|
python -m unittest discover python/tests -v
|
||||||
mpirun --bind-to none -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
|
mpirun --bind-to none -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
|
||||||
mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
|
mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
|
||||||
@@ -105,6 +106,7 @@ jobs:
|
|||||||
- run:
|
- run:
|
||||||
name: Build CPP only
|
name: Build CPP only
|
||||||
command: |
|
command: |
|
||||||
|
source .venv/bin/activate
|
||||||
mkdir -p build && cd build
|
mkdir -p build && cd build
|
||||||
cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
|
cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
|
||||||
make -j `nproc`
|
make -j `nproc`
|
||||||
@@ -130,33 +132,30 @@ jobs:
|
|||||||
- run:
|
- run:
|
||||||
name: Install dependencies
|
name: Install dependencies
|
||||||
command: |
|
command: |
|
||||||
brew install python@3.9
|
HOMEBREW_NO_AUTO_UPDATE=1 HOMEBREW_NO_INSTALL_CLEANUP=1 \
|
||||||
brew install openmpi
|
brew install openmpi uv
|
||||||
python3.9 -m venv env
|
|
||||||
source env/bin/activate
|
|
||||||
pip install --upgrade pip
|
|
||||||
pip install --upgrade cmake
|
|
||||||
pip install nanobind==2.4.0
|
|
||||||
pip install numpy
|
|
||||||
pip install torch
|
|
||||||
pip install tensorflow
|
|
||||||
pip install unittest-xml-reporting
|
|
||||||
- run:
|
- run:
|
||||||
name: Install Python package
|
name: Install Python package
|
||||||
command: |
|
command: |
|
||||||
source env/bin/activate
|
uv venv --python 3.9
|
||||||
|
uv pip install \
|
||||||
|
nanobind==2.4.0 \
|
||||||
|
cmake \
|
||||||
|
numpy \
|
||||||
|
torch \
|
||||||
|
tensorflow \
|
||||||
|
unittest-xml-reporting
|
||||||
DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
|
DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
|
||||||
pip install -e . -v
|
uv pip install -e . -v
|
||||||
- run:
|
- run:
|
||||||
name: Generate package stubs
|
name: Generate package stubs
|
||||||
command: |
|
command: |
|
||||||
source env/bin/activate
|
uv pip install typing_extensions
|
||||||
pip install typing_extensions
|
uv run --no-project setup.py generate_stubs
|
||||||
python setup.py generate_stubs
|
|
||||||
- run:
|
- run:
|
||||||
name: Run Python tests
|
name: Run Python tests
|
||||||
command: |
|
command: |
|
||||||
source env/bin/activate
|
source .venv/bin/activate
|
||||||
LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
|
LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
|
||||||
LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
|
LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
|
||||||
mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
|
mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
|
||||||
@@ -165,16 +164,15 @@ jobs:
|
|||||||
- run:
|
- run:
|
||||||
name: Build example extension
|
name: Build example extension
|
||||||
command: |
|
command: |
|
||||||
source env/bin/activate
|
|
||||||
cd examples/extensions
|
cd examples/extensions
|
||||||
pip install -r requirements.txt
|
uv pip install -r requirements.txt
|
||||||
python setup.py build_ext -j8
|
uv run --no-project setup.py build_ext -j8
|
||||||
- store_test_results:
|
- store_test_results:
|
||||||
path: test-results
|
path: test-results
|
||||||
- run:
|
- run:
|
||||||
name: Build CPP only
|
name: Build CPP only
|
||||||
command: |
|
command: |
|
||||||
source env/bin/activate
|
source .venv/bin/activate
|
||||||
mkdir -p build && cd build && cmake .. && make -j `sysctl -n hw.ncpu`
|
mkdir -p build && cd build && cmake .. && make -j `sysctl -n hw.ncpu`
|
||||||
- run:
|
- run:
|
||||||
name: Run CPP tests
|
name: Run CPP tests
|
||||||
@@ -183,7 +181,7 @@ jobs:
|
|||||||
- run:
|
- run:
|
||||||
name: Build small binary
|
name: Build small binary
|
||||||
command: |
|
command: |
|
||||||
source env/bin/activate
|
source .venv/bin/activate
|
||||||
cd build/
|
cd build/
|
||||||
cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
|
cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
|
||||||
-DBUILD_SHARED_LIBS=ON \
|
-DBUILD_SHARED_LIBS=ON \
|
||||||
@@ -195,12 +193,13 @@ jobs:
|
|||||||
- run:
|
- run:
|
||||||
name: Run Python tests with JIT
|
name: Run Python tests with JIT
|
||||||
command: |
|
command: |
|
||||||
source env/bin/activate
|
|
||||||
CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
|
CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
|
||||||
pip install -e . -v
|
uv pip install -e .
|
||||||
LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
|
LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
|
||||||
METAL_DEBUG_ERROR_MODE=0 \
|
METAL_DEBUG_ERROR_MODE=0 \
|
||||||
python -m xmlrunner discover -v python/tests -o test-results/gpu_jit
|
uv run --no-project python -m xmlrunner discover \
|
||||||
|
-v python/tests \
|
||||||
|
-o test-results/gpu_jit
|
||||||
|
|
||||||
cuda_build_and_test:
|
cuda_build_and_test:
|
||||||
parameters:
|
parameters:
|
||||||
@@ -224,17 +223,17 @@ jobs:
|
|||||||
curl -sL https://github.com/ccache/ccache/releases/download/v4.11.3/ccache-4.11.3-linux-x86_64.tar.xz | tar xJf -
|
curl -sL https://github.com/ccache/ccache/releases/download/v4.11.3/ccache-4.11.3-linux-x86_64.tar.xz | tar xJf -
|
||||||
sudo mv ccache-4.11.3-linux-x86_64/ccache /usr/bin/ccache
|
sudo mv ccache-4.11.3-linux-x86_64/ccache /usr/bin/ccache
|
||||||
rm -rf ccache-4.11.3-linux-x86_64
|
rm -rf ccache-4.11.3-linux-x86_64
|
||||||
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
- run:
|
- run:
|
||||||
name: Install Python package
|
name: Install Python package
|
||||||
command: |
|
command: |
|
||||||
python3 -m venv env
|
uv venv
|
||||||
source env/bin/activate
|
|
||||||
CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
|
CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
|
||||||
pip install -e ".[dev]" -v
|
uv pip install -e ".[dev]" -v
|
||||||
- run:
|
- run:
|
||||||
name: Run Python tests
|
name: Run Python tests
|
||||||
command: |
|
command: |
|
||||||
source env/bin/activate
|
source .venv/bin/activate
|
||||||
LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
|
LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
|
||||||
LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
|
LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
|
||||||
- run:
|
- run:
|
||||||
@@ -343,14 +342,10 @@ jobs:
|
|||||||
export DEBIAN_FRONTEND=noninteractive
|
export DEBIAN_FRONTEND=noninteractive
|
||||||
export NEEDRESTART_MODE=a
|
export NEEDRESTART_MODE=a
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get upgrade -y
|
|
||||||
TZ=Etc/UTC sudo apt-get -y install tzdata
|
TZ=Etc/UTC sudo apt-get -y install tzdata
|
||||||
sudo apt-get install -y apt-utils
|
|
||||||
sudo apt-get install -y software-properties-common
|
|
||||||
sudo add-apt-repository -y ppa:deadsnakes/ppa
|
sudo add-apt-repository -y ppa:deadsnakes/ppa
|
||||||
sudo apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
|
sudo apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
|
||||||
sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
|
sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
|
||||||
sudo apt-get install -y build-essential git
|
|
||||||
$PYTHON -m venv env
|
$PYTHON -m venv env
|
||||||
source env/bin/activate
|
source env/bin/activate
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
|
|||||||
@@ -44,8 +44,11 @@ struct ArgMin {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int N>
|
template <int N>
|
||||||
__device__ IndexValPair<T>
|
__device__ IndexValPair<T> reduce_many(
|
||||||
reduce_many(IndexValPair<T> best, T (&vals)[N], uint32_t offset) {
|
IndexValPair<T> best,
|
||||||
|
const AlignedVector<T, N>& vals,
|
||||||
|
uint32_t offset) {
|
||||||
|
#pragma unroll
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
if (vals[i] < best.val) {
|
if (vals[i] < best.val) {
|
||||||
best.val = vals[i];
|
best.val = vals[i];
|
||||||
@@ -74,8 +77,11 @@ struct ArgMax {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int N>
|
template <int N>
|
||||||
__device__ IndexValPair<T>
|
__device__ IndexValPair<T> reduce_many(
|
||||||
reduce_many(IndexValPair<T> best, T (&vals)[N], uint32_t offset) {
|
IndexValPair<T> best,
|
||||||
|
const AlignedVector<T, N>& vals,
|
||||||
|
uint32_t offset) {
|
||||||
|
#pragma unroll
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
if (vals[i] > best.val) {
|
if (vals[i] > best.val) {
|
||||||
best.val = vals[i];
|
best.val = vals[i];
|
||||||
@@ -106,16 +112,15 @@ __global__ void arg_reduce_general(
|
|||||||
|
|
||||||
int64_t in_idx = elem_to_loc(index, shape.data(), in_strides.data(), ndim);
|
int64_t in_idx = elem_to_loc(index, shape.data(), in_strides.data(), ndim);
|
||||||
int64_t out_idx = elem_to_loc(index, shape.data(), out_strides.data(), ndim);
|
int64_t out_idx = elem_to_loc(index, shape.data(), out_strides.data(), ndim);
|
||||||
|
in += in_idx;
|
||||||
|
|
||||||
Op op;
|
Op op;
|
||||||
T init = op.init();
|
T init = op.init();
|
||||||
IndexValPair<T> best{0, init};
|
IndexValPair<T> best{0, init};
|
||||||
|
|
||||||
for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
|
for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
|
||||||
T vals[N_READS];
|
|
||||||
auto tid = r * BLOCK_DIM + block.thread_index().x;
|
auto tid = r * BLOCK_DIM + block.thread_index().x;
|
||||||
cub::LoadDirectBlocked(
|
auto vals = load_vector<N_READS>(in, tid, axis_size, axis_stride, init);
|
||||||
tid, StridedIterator(in + in_idx, axis_stride), vals, axis_size, init);
|
|
||||||
best = op.reduce_many(best, vals, tid * N_READS);
|
best = op.reduce_many(best, vals, tid * N_READS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -131,20 +131,6 @@ inline __device__ void store_vector(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper for accessing strided data.
|
|
||||||
template <typename T>
|
|
||||||
struct StridedIterator {
|
|
||||||
T it;
|
|
||||||
int64_t stride;
|
|
||||||
|
|
||||||
__host__ __device__ StridedIterator(T it, int64_t stride)
|
|
||||||
: it(it), stride(stride) {}
|
|
||||||
|
|
||||||
__host__ __device__ auto operator[](int i) const {
|
|
||||||
return it[i * stride];
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
// Type limits utils
|
// Type limits utils
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|||||||
Reference in New Issue
Block a user