mirror of
https://github.com/ml-explore/mlx.git
synced 2025-12-15 17:39:05 +08:00
Compare commits
117 Commits
v0.29.3
...
ibv-backen
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dd91ee9534 | ||
|
|
8fab4f0929 | ||
|
|
47af2c8cb0 | ||
|
|
f40152ebc1 | ||
|
|
5d7e6a0642 | ||
|
|
b9b78b1059 | ||
|
|
45727b0c02 | ||
|
|
2444fbdfe9 | ||
|
|
f3b605e53c | ||
|
|
0388ae3aaf | ||
|
|
d4c1de4a8b | ||
|
|
4dbffb3954 | ||
|
|
b1a60b2d2d | ||
|
|
a6d6717181 | ||
|
|
941cfe23d7 | ||
|
|
9abb0b8123 | ||
|
|
50d3914c67 | ||
|
|
cacbdbf995 | ||
|
|
193cdcd81a | ||
|
|
d8ceae7b77 | ||
|
|
eff0e31f00 | ||
|
|
6c5785bc2f | ||
|
|
8879ee00eb | ||
|
|
6e762fe2e2 | ||
|
|
2b95d0c270 | ||
|
|
b054838780 | ||
|
|
dd79d3c465 | ||
|
|
704fd1ae28 | ||
|
|
c9f4dc851f | ||
|
|
f8bd675655 | ||
|
|
23a9168d34 | ||
|
|
bca205e287 | ||
|
|
1d4eacb737 | ||
|
|
8abd37ad05 | ||
|
|
3e05cea9f8 | ||
|
|
5b0f047226 | ||
|
|
618c87af8c | ||
|
|
d5f61a93fa | ||
|
|
4a09264236 | ||
|
|
0dbc7e5bee | ||
|
|
0d68efd461 | ||
|
|
f9e1a14135 | ||
|
|
d8e9ded928 | ||
|
|
60939d010c | ||
|
|
fdcd2923fd | ||
|
|
54f1cc6e3e | ||
|
|
b3825ac149 | ||
|
|
7f4b7e553c | ||
|
|
ad16f41a7f | ||
|
|
f46877bc08 | ||
|
|
6f35017d1b | ||
|
|
b167f0df1c | ||
|
|
a9f0d6b160 | ||
|
|
940f4c7818 | ||
|
|
35f81728f1 | ||
|
|
4442ed86c1 | ||
|
|
698559c231 | ||
|
|
ecc4879b07 | ||
|
|
32b18d8b66 | ||
|
|
472c43a0c8 | ||
|
|
b7214ff01e | ||
|
|
76414c8971 | ||
|
|
49e4566df3 | ||
|
|
aad49f932f | ||
|
|
86765cce34 | ||
|
|
1bedcbd556 | ||
|
|
9ac7dbe877 | ||
|
|
1bf605d56d | ||
|
|
3c622ddd1d | ||
|
|
27ff069175 | ||
|
|
3b2ffcefc3 | ||
|
|
b65f882df3 | ||
|
|
b704e9e77a | ||
|
|
66519fb348 | ||
|
|
8973550ff3 | ||
|
|
3f866be665 | ||
|
|
23f81ed1c1 | ||
|
|
3fe2250c00 | ||
|
|
047114b988 | ||
|
|
9320eb89a8 | ||
|
|
75819d70ea | ||
|
|
60d80a3728 | ||
|
|
eba6a9d163 | ||
|
|
be9e2aebd6 | ||
|
|
df58b4133a | ||
|
|
27778156dc | ||
|
|
761f901a41 | ||
|
|
6ece97f69b | ||
|
|
d3bc6a9bff | ||
|
|
26ceb507eb | ||
|
|
910b3e3299 | ||
|
|
50fa315d18 | ||
|
|
1ff2b713b6 | ||
|
|
50514a6146 | ||
|
|
93d76b0f30 | ||
|
|
78678de0cd | ||
|
|
ed9c6b1117 | ||
|
|
39b04ce638 | ||
|
|
d9e6349657 | ||
|
|
b901a9f311 | ||
|
|
68c5fa1c95 | ||
|
|
793a31eeb6 | ||
|
|
74c1ed25bb | ||
|
|
ec72b44417 | ||
|
|
460691a0e8 | ||
|
|
969924cc69 | ||
|
|
d1e06117e8 | ||
|
|
539d8322d1 | ||
|
|
c4767d110f | ||
|
|
895217f25b | ||
|
|
0cfeeb60ca | ||
|
|
8f8af61a37 | ||
|
|
233384161e | ||
|
|
5bcf3a6794 | ||
|
|
7707196297 | ||
|
|
7e3471c987 | ||
|
|
9f0ba3ddf1 |
@@ -1,579 +0,0 @@
|
||||
version: 2.1
|
||||
|
||||
orbs:
|
||||
apple: ml-explore/pr-approval@0.1.0
|
||||
|
||||
parameters:
|
||||
nightly_build:
|
||||
type: boolean
|
||||
default: false
|
||||
test_release:
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
jobs:
|
||||
build_documentation:
|
||||
parameters:
|
||||
upload-docs:
|
||||
type: boolean
|
||||
default: false
|
||||
macos:
|
||||
xcode: "26.0.0"
|
||||
resource_class: m4pro.medium
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Install
|
||||
command: |
|
||||
xcodebuild -downloadComponent MetalToolchain
|
||||
brew install python@3.9
|
||||
brew install doxygen
|
||||
python3.9 -m venv env
|
||||
source env/bin/activate
|
||||
pip install --upgrade pip
|
||||
pip install --upgrade cmake
|
||||
pip install -r docs/requirements.txt
|
||||
pip install . -v
|
||||
- when:
|
||||
condition:
|
||||
not: << parameters.upload-docs >>
|
||||
steps:
|
||||
- run:
|
||||
name: Build documentation
|
||||
command: |
|
||||
source env/bin/activate
|
||||
cd docs && doxygen && make html O=-W
|
||||
- when:
|
||||
condition: << parameters.upload-docs >>
|
||||
steps:
|
||||
- add_ssh_keys:
|
||||
fingerprints:
|
||||
- "SHA256:OhcVVMovbT0pkgMeiVRyxMnjV9R2t+hKBsNcuxq9h+0"
|
||||
- run:
|
||||
name: Upload documentation
|
||||
command: |
|
||||
source env/bin/activate
|
||||
git config user.email "mlx@group.apple.com"
|
||||
git config user.name "CircleCI Docs"
|
||||
git checkout gh-pages
|
||||
git rebase main
|
||||
cd docs
|
||||
git rm -rf build/html
|
||||
doxygen && make html O=-W
|
||||
git add -f build/html
|
||||
git commit -m "rebase"
|
||||
git push -f origin gh-pages
|
||||
|
||||
linux_build_and_test:
|
||||
machine:
|
||||
image: ubuntu-2204:current
|
||||
resource_class: large
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Run style checks
|
||||
command: |
|
||||
pip install pre-commit
|
||||
pre-commit run --all
|
||||
if ! git diff --quiet; then echo 'Style checks failed, please install pre-commit and run pre-commit run --all and push the change'; exit 1; fi
|
||||
- run:
|
||||
name: Install dependencies
|
||||
command: |
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
export NEEDRESTART_MODE=a
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
|
||||
sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
- run:
|
||||
name: Install Python package
|
||||
command: |
|
||||
uv venv
|
||||
uv pip install cmake
|
||||
DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
|
||||
uv pip install -e ".[dev]" -v
|
||||
- run:
|
||||
name: Generate package stubs
|
||||
command: |
|
||||
uv pip install typing_extensions
|
||||
uv run --no-project setup.py generate_stubs
|
||||
- run:
|
||||
name: Run Python tests
|
||||
command: |
|
||||
source .venv/bin/activate
|
||||
python -m unittest discover python/tests -v
|
||||
mpirun --bind-to none -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
|
||||
mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
|
||||
if $(grep "\[WARN\]" stderr.log); then echo "Distributed ring test failed"; exit 1; fi
|
||||
- run:
|
||||
name: Build CPP only
|
||||
command: |
|
||||
source .venv/bin/activate
|
||||
mkdir -p build && cd build
|
||||
cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
|
||||
make -j `nproc`
|
||||
- run:
|
||||
name: Run CPP tests
|
||||
command: ./build/tests/tests
|
||||
|
||||
mac_build_and_test:
|
||||
parameters:
|
||||
xcode_version:
|
||||
type: string
|
||||
default: "26.0.0"
|
||||
macosx_deployment_target:
|
||||
type: string
|
||||
default: ""
|
||||
macos:
|
||||
xcode: << parameters.xcode_version >>
|
||||
environment:
|
||||
MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
|
||||
resource_class: m4pro.medium
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Install dependencies
|
||||
command: |
|
||||
xcodebuild -downloadComponent MetalToolchain
|
||||
HOMEBREW_NO_AUTO_UPDATE=1 HOMEBREW_NO_INSTALL_CLEANUP=1 \
|
||||
brew install openmpi uv
|
||||
- run:
|
||||
name: Install Python package
|
||||
command: |
|
||||
uv venv --python 3.9
|
||||
uv pip install \
|
||||
nanobind==2.4.0 \
|
||||
cmake \
|
||||
numpy \
|
||||
torch \
|
||||
tensorflow \
|
||||
unittest-xml-reporting
|
||||
DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
|
||||
uv pip install -e . -v
|
||||
- run:
|
||||
name: Generate package stubs
|
||||
command: |
|
||||
uv pip install typing_extensions
|
||||
uv run --no-project setup.py generate_stubs
|
||||
- run:
|
||||
name: Run Python tests
|
||||
command: |
|
||||
source .venv/bin/activate
|
||||
LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
|
||||
LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
|
||||
mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
|
||||
mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
|
||||
if $(grep "\[WARN\]" stderr.log); then echo "Distributed ring test failed"; exit 1; fi
|
||||
- run:
|
||||
name: Build example extension
|
||||
command: |
|
||||
source .venv/bin/activate
|
||||
cd examples/extensions
|
||||
uv pip install -r requirements.txt
|
||||
uv run --no-project setup.py build_ext --inplace
|
||||
uv run --no-project python test.py
|
||||
- store_test_results:
|
||||
path: test-results
|
||||
- run:
|
||||
name: Build CPP only
|
||||
command: |
|
||||
source .venv/bin/activate
|
||||
mkdir -p build && cd build && cmake .. && make -j `sysctl -n hw.ncpu`
|
||||
- run:
|
||||
name: Run CPP tests
|
||||
command: |
|
||||
DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./build/tests/tests
|
||||
- run:
|
||||
name: Build small binary
|
||||
command: |
|
||||
source .venv/bin/activate
|
||||
cd build/
|
||||
cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
|
||||
-DBUILD_SHARED_LIBS=ON \
|
||||
-DMLX_BUILD_CPU=OFF \
|
||||
-DMLX_BUILD_SAFETENSORS=OFF \
|
||||
-DMLX_BUILD_GGUF=OFF \
|
||||
-DMLX_METAL_JIT=ON
|
||||
make -j `sysctl -n hw.ncpu`
|
||||
- run:
|
||||
name: Run Python tests with JIT
|
||||
command: |
|
||||
CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
|
||||
uv pip install -e . -v
|
||||
LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
|
||||
METAL_DEBUG_ERROR_MODE=0 \
|
||||
uv run --no-project python -m xmlrunner discover \
|
||||
-v python/tests \
|
||||
-o test-results/gpu_jit
|
||||
|
||||
cuda_build_and_test:
|
||||
parameters:
|
||||
image_date:
|
||||
type: string
|
||||
default: "2023.11.1"
|
||||
machine:
|
||||
image: "linux-cuda-12:<< parameters.image_date >>"
|
||||
resource_class: gpu.nvidia.small.gen2
|
||||
steps:
|
||||
- checkout
|
||||
- restore_cache:
|
||||
keys:
|
||||
- cuda-<< parameters.image_date >>-{{ arch }}-
|
||||
- run:
|
||||
name: Install dependencies
|
||||
command: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install libcudnn9-dev-cuda-12
|
||||
sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
|
||||
sudo apt-get install libnccl2 libnccl-dev
|
||||
curl -sL https://github.com/ccache/ccache/releases/download/v4.11.3/ccache-4.11.3-linux-x86_64.tar.xz | tar xJf -
|
||||
sudo mv ccache-4.11.3-linux-x86_64/ccache /usr/bin/ccache
|
||||
rm -rf ccache-4.11.3-linux-x86_64
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
- run:
|
||||
name: Set CCache size
|
||||
command: ccache --max-size 1G
|
||||
- run:
|
||||
name: Install Python package
|
||||
command: |
|
||||
uv venv
|
||||
uv pip install cmake
|
||||
DEBUG=1 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
|
||||
uv pip install -e ".[dev]" -v
|
||||
- run:
|
||||
name: Run Python tests
|
||||
command: |
|
||||
source .venv/bin/activate
|
||||
LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
|
||||
LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
|
||||
- run:
|
||||
name: Build CPP only
|
||||
command: |
|
||||
source .venv/bin/activate
|
||||
cmake . -B build \
|
||||
-DMLX_BUILD_CUDA=ON \
|
||||
-DCMAKE_CUDA_COMPILER=`which nvcc` \
|
||||
-DCMAKE_BUILD_TYPE=DEBUG
|
||||
cmake --build build -j `nproc`
|
||||
- run:
|
||||
name: Run CPP tests
|
||||
command: ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
|
||||
- run:
|
||||
name: CCache report
|
||||
command: |
|
||||
ccache --show-stats
|
||||
ccache --zero-stats
|
||||
ccache --cleanup
|
||||
- save_cache:
|
||||
key: cuda-<< parameters.image_date >>-{{ arch }}-{{ epoch }}
|
||||
paths:
|
||||
- /home/circleci/.cache/ccache
|
||||
|
||||
build_release:
|
||||
parameters:
|
||||
python_version:
|
||||
type: string
|
||||
default: "3.9"
|
||||
xcode_version:
|
||||
type: string
|
||||
default: "26.0.0"
|
||||
build_env:
|
||||
type: string
|
||||
default: ""
|
||||
macosx_deployment_target:
|
||||
type: string
|
||||
default: ""
|
||||
macos:
|
||||
xcode: << parameters.xcode_version >>
|
||||
resource_class: m4pro.medium
|
||||
environment:
|
||||
MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Install dependencies
|
||||
command: |
|
||||
xcodebuild -downloadComponent MetalToolchain
|
||||
mkdir -p ~/miniconda3
|
||||
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o ~/miniconda3/miniconda.sh
|
||||
bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
|
||||
rm ~/miniconda3/miniconda.sh
|
||||
source ~/miniconda3/bin/activate
|
||||
conda init --all
|
||||
conda create -n env python=<< parameters.python_version >> -y
|
||||
conda activate env
|
||||
pip install --upgrade cmake
|
||||
pip install nanobind==2.4.0
|
||||
pip install --upgrade setuptools
|
||||
pip install numpy
|
||||
pip install twine
|
||||
pip install build
|
||||
- run:
|
||||
name: Install Python package
|
||||
command: |
|
||||
conda activate env
|
||||
env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
|
||||
pip install . -v
|
||||
- run:
|
||||
name: Generate package stubs
|
||||
command: |
|
||||
conda activate env
|
||||
pip install typing_extensions
|
||||
python setup.py generate_stubs
|
||||
- run:
|
||||
name: Build Python package
|
||||
command: |
|
||||
conda activate env
|
||||
python setup.py clean --all
|
||||
<< parameters.build_env >> MLX_BUILD_STAGE=1 python -m build -w
|
||||
- when:
|
||||
condition:
|
||||
equal: ["3.9", << parameters.python_version >>]
|
||||
steps:
|
||||
- run:
|
||||
name: Build common package
|
||||
command: |
|
||||
conda activate env
|
||||
python setup.py clean --all
|
||||
<< parameters.build_env >> MLX_BUILD_STAGE=2 python -m build -w
|
||||
- when:
|
||||
condition: << parameters.build_env >>
|
||||
steps:
|
||||
- run:
|
||||
name: Upload package
|
||||
command: |
|
||||
conda activate env
|
||||
twine upload dist/*
|
||||
- store_artifacts:
|
||||
path: dist/
|
||||
|
||||
build_linux_release:
|
||||
parameters:
|
||||
python_version:
|
||||
type: string
|
||||
default: "3.9"
|
||||
build_env:
|
||||
type: string
|
||||
default: ""
|
||||
machine:
|
||||
image: ubuntu-2204:current
|
||||
resource_class: large
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Build wheel
|
||||
command: |
|
||||
PYTHON=python<< parameters.python_version >>
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
export NEEDRESTART_MODE=a
|
||||
sudo apt-get update
|
||||
TZ=Etc/UTC sudo apt-get -y install tzdata
|
||||
sudo add-apt-repository -y ppa:deadsnakes/ppa
|
||||
sudo apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
|
||||
sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
|
||||
$PYTHON -m venv env
|
||||
source env/bin/activate
|
||||
pip install --upgrade pip
|
||||
pip install --upgrade cmake
|
||||
pip install auditwheel
|
||||
pip install patchelf
|
||||
pip install build
|
||||
pip install twine
|
||||
<< parameters.build_env >> pip install ".[dev]" -v
|
||||
pip install typing_extensions
|
||||
python setup.py generate_stubs
|
||||
python setup.py clean --all
|
||||
MLX_BUILD_STAGE=1 << parameters.build_env >> python -m build -w
|
||||
bash python/scripts/repair_linux.sh
|
||||
- when:
|
||||
condition:
|
||||
equal: ["3.9", << parameters.python_version >>]
|
||||
steps:
|
||||
- run:
|
||||
name: Build common package
|
||||
command: |
|
||||
source env/bin/activate
|
||||
python setup.py clean --all
|
||||
<< parameters.build_env >> MLX_BUILD_STAGE=2 \
|
||||
python -m build -w
|
||||
auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_x86_64
|
||||
- when:
|
||||
condition: << parameters.build_env >>
|
||||
steps:
|
||||
- run:
|
||||
name: Upload packages
|
||||
command: |
|
||||
source env/bin/activate
|
||||
twine upload wheelhouse/*.whl
|
||||
- store_artifacts:
|
||||
path: wheelhouse/
|
||||
|
||||
build_cuda_release:
|
||||
parameters:
|
||||
build_env:
|
||||
type: string
|
||||
default: ""
|
||||
machine:
|
||||
image: ubuntu-2204:current
|
||||
resource_class: xlarge
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Build wheel
|
||||
command: |
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
export NEEDRESTART_MODE=a
|
||||
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
|
||||
sudo dpkg -i cuda-keyring_1.1-1_all.deb
|
||||
sudo apt-get update
|
||||
sudo apt-get install cuda-toolkit-12-9 libcudnn9-dev-cuda-12
|
||||
sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
|
||||
sudo apt-get install zip
|
||||
pip install auditwheel
|
||||
pip install patchelf
|
||||
pip install build
|
||||
pip install twine
|
||||
export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
|
||||
export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
|
||||
<< parameters.build_env >> MLX_BUILD_STAGE=2 \
|
||||
CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
|
||||
python -m build -w
|
||||
bash python/scripts/repair_cuda.sh
|
||||
- when:
|
||||
condition: << parameters.build_env >>
|
||||
steps:
|
||||
- run:
|
||||
name: Upload package
|
||||
command: |
|
||||
twine upload wheelhouse/*.whl
|
||||
- store_artifacts:
|
||||
path: wheelhouse/
|
||||
|
||||
workflows:
|
||||
build_and_test:
|
||||
when:
|
||||
and:
|
||||
- matches:
|
||||
pattern: "^(?!pull/)[-\\w]+$"
|
||||
value: << pipeline.git.branch >>
|
||||
- not: << pipeline.parameters.nightly_build >>
|
||||
- not: << pipeline.parameters.test_release >>
|
||||
jobs:
|
||||
- mac_build_and_test:
|
||||
matrix:
|
||||
parameters:
|
||||
macosx_deployment_target: ["13.5", "15.0"]
|
||||
- linux_build_and_test
|
||||
- cuda_build_and_test:
|
||||
matrix:
|
||||
parameters:
|
||||
image_date: ["2023.11.1", "2025.05.1"]
|
||||
- build_documentation
|
||||
|
||||
build_pypi_release:
|
||||
when:
|
||||
and:
|
||||
- not: << pipeline.parameters.nightly_build >>
|
||||
- not: << pipeline.parameters.test_release >>
|
||||
jobs:
|
||||
- build_release:
|
||||
filters:
|
||||
tags:
|
||||
only: /^v.*/
|
||||
branches:
|
||||
ignore: /.*/
|
||||
matrix:
|
||||
parameters:
|
||||
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
||||
macosx_deployment_target: ["13.5", "14.0", "15.0"]
|
||||
build_env: ["PYPI_RELEASE=1"]
|
||||
xcode_version: ["26.0.0"]
|
||||
- build_documentation:
|
||||
filters:
|
||||
tags:
|
||||
only: /^v.*/
|
||||
branches:
|
||||
ignore: /.*/
|
||||
upload-docs: true
|
||||
- build_linux_release:
|
||||
filters:
|
||||
tags:
|
||||
only: /^v.*/
|
||||
branches:
|
||||
ignore: /.*/
|
||||
matrix:
|
||||
parameters:
|
||||
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
||||
build_env: ["PYPI_RELEASE=1"]
|
||||
- build_cuda_release:
|
||||
filters:
|
||||
tags:
|
||||
only: /^v.*/
|
||||
branches:
|
||||
ignore: /.*/
|
||||
matrix:
|
||||
parameters:
|
||||
build_env: ["PYPI_RELEASE=1"]
|
||||
|
||||
prb:
|
||||
when:
|
||||
matches:
|
||||
pattern: "^pull/\\d+(/head)?$"
|
||||
value: << pipeline.git.branch >>
|
||||
jobs:
|
||||
- hold:
|
||||
type: approval
|
||||
- apple/authenticate:
|
||||
context: pr-approval
|
||||
- mac_build_and_test:
|
||||
requires: [ hold ]
|
||||
matrix:
|
||||
parameters:
|
||||
macosx_deployment_target: ["13.5", "15.0"]
|
||||
- linux_build_and_test:
|
||||
requires: [ hold ]
|
||||
- cuda_build_and_test:
|
||||
requires: [ hold ]
|
||||
matrix:
|
||||
parameters:
|
||||
image_date: ["2023.11.1", "2025.05.1"]
|
||||
nightly_build:
|
||||
when:
|
||||
and:
|
||||
- equal: [ main, << pipeline.git.branch >> ]
|
||||
- << pipeline.parameters.nightly_build >>
|
||||
jobs:
|
||||
- build_release:
|
||||
matrix:
|
||||
parameters:
|
||||
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
||||
macosx_deployment_target: ["13.5", "14.0", "15.0"]
|
||||
xcode_version: ["26.0.0"]
|
||||
- build_linux_release:
|
||||
matrix:
|
||||
parameters:
|
||||
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
||||
- build_cuda_release
|
||||
|
||||
build_dev_release:
|
||||
when:
|
||||
and:
|
||||
- equal: [ main, << pipeline.git.branch >> ]
|
||||
- << pipeline.parameters.test_release >>
|
||||
jobs:
|
||||
- build_release:
|
||||
matrix:
|
||||
parameters:
|
||||
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
||||
macosx_deployment_target: ["13.5", "14.0", "15.0"]
|
||||
build_env: ["DEV_RELEASE=1"]
|
||||
xcode_version: ["26.0.0"]
|
||||
- build_linux_release:
|
||||
matrix:
|
||||
parameters:
|
||||
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
||||
build_env: ["DEV_RELEASE=1"]
|
||||
- build_cuda_release:
|
||||
matrix:
|
||||
parameters:
|
||||
build_env: ["DEV_RELEASE=1"]
|
||||
15
.github/actions/build-cuda-release/action.yml
vendored
Normal file
15
.github/actions/build-cuda-release/action.yml
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
name: 'Build CUDA wheel'
|
||||
description: 'Build CUDA wheel'
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Build package
|
||||
shell: bash
|
||||
env:
|
||||
CMAKE_ARGS: -DMLX_BUILD_CUDA=ON
|
||||
run: |
|
||||
pip install auditwheel build patchelf setuptools
|
||||
python setup.py clean --all
|
||||
MLX_BUILD_STAGE=2 python -m build -w
|
||||
bash python/scripts/repair_cuda.sh
|
||||
38
.github/actions/build-docs/action.yml
vendored
Normal file
38
.github/actions/build-docs/action.yml
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
name: 'Build Documentation'
|
||||
description: 'Build documentation'
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Setup machine
|
||||
uses: ./.github/actions/setup-linux
|
||||
|
||||
- name: Install dependencies
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt-get install -y doxygen
|
||||
source .venv/bin/activate
|
||||
pip install -r docs/requirements.txt
|
||||
pip install . -v
|
||||
|
||||
- name: Build documentation
|
||||
shell: bash
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
cd docs
|
||||
doxygen
|
||||
make html O=-W
|
||||
|
||||
- name: Create artifact tar
|
||||
shell: bash
|
||||
run: tar -cf artifact.tar -C docs --dereference build/html index.html
|
||||
|
||||
# Do it manually because upload-pages-artifact requires gtar
|
||||
- name: Upload artifact
|
||||
id: upload-artifact
|
||||
uses: actions/upload-artifact@v5
|
||||
with:
|
||||
name: github-pages
|
||||
path: artifact.tar
|
||||
retention-days: 1
|
||||
if-no-files-found: error
|
||||
40
.github/actions/build-linux-release/action.yml
vendored
Normal file
40
.github/actions/build-linux-release/action.yml
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
name: 'Build Linux wheel'
|
||||
description: 'Build Linux wheel'
|
||||
|
||||
inputs:
|
||||
build-backend:
|
||||
description: 'Build the backend mlx-cpu package'
|
||||
type: boolean
|
||||
required: false
|
||||
default: false
|
||||
arch:
|
||||
description: 'Platform architecture tag'
|
||||
required: true
|
||||
type: choice
|
||||
options:
|
||||
- x86_64
|
||||
- aarch64
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Generate package stubs
|
||||
shell: bash
|
||||
run: |
|
||||
pip install -e ".[dev]" -v
|
||||
pip install typing_extensions
|
||||
python setup.py generate_stubs
|
||||
- name: Build Python package
|
||||
shell: bash
|
||||
run: |
|
||||
pip install auditwheel patchelf build
|
||||
python setup.py clean --all
|
||||
MLX_BUILD_STAGE=1 python -m build -w
|
||||
bash python/scripts/repair_linux.sh ${{ inputs.arch }}
|
||||
- name: Build backend package
|
||||
if: ${{ inputs.build-backend }}
|
||||
shell: bash
|
||||
run: |
|
||||
python setup.py clean --all
|
||||
MLX_BUILD_STAGE=2 python -m build -w
|
||||
auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_${{ inputs.arch }}
|
||||
41
.github/actions/build-linux/action.yml
vendored
Normal file
41
.github/actions/build-linux/action.yml
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
name: 'Build and Test on Linux'
|
||||
|
||||
inputs:
|
||||
toolkit:
|
||||
description: 'The toolkit to build with'
|
||||
required: false
|
||||
default: 'cpu'
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Install Python package
|
||||
id: python_build
|
||||
shell: sh
|
||||
env:
|
||||
DEBUG: 1
|
||||
CMAKE_ARGS: >-
|
||||
-DCMAKE_COMPILE_WARNING_AS_ERROR=ON
|
||||
-DMLX_BUILD_CUDA=${{ startsWith(inputs.toolkit, 'cuda') && 'ON' || 'OFF' }}
|
||||
run: |
|
||||
if ${{ startsWith(inputs.toolkit, 'cuda') && runner.arch == 'arm64' }} ; then
|
||||
# There is no GPU in arm64 runner, use a common arch.
|
||||
CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=90a"
|
||||
# Can not build tests when the built executables can not run.
|
||||
CMAKE_ARGS="$CMAKE_ARGS -DMLX_BUILD_TESTS=OFF"
|
||||
fi
|
||||
pip install --no-build-isolation -e ".[dev]" -v
|
||||
# Pass the CMAKE_ARGS to following steps.
|
||||
echo CMAKE_ARGS="$CMAKE_ARGS" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Generate package stubs
|
||||
shell: sh
|
||||
run: |
|
||||
pip install typing_extensions
|
||||
python setup.py generate_stubs
|
||||
|
||||
- name: Build CPP only
|
||||
shell: bash
|
||||
run: |
|
||||
cmake . -B build -DCMAKE_BUILD_TYPE=Debug ${{ steps.python_build.outputs.CMAKE_ARGS }}
|
||||
cmake --build build -j $(nproc)
|
||||
34
.github/actions/build-macos-release/action.yml
vendored
Normal file
34
.github/actions/build-macos-release/action.yml
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
name: 'Build macOS release'
|
||||
description: 'Build MLX releases macOS'
|
||||
|
||||
inputs:
|
||||
macos-target:
|
||||
description: 'macOS build target'
|
||||
required: false
|
||||
default: '15.0'
|
||||
build-backend:
|
||||
description: 'Build the backend mlx-metal package'
|
||||
type: boolean
|
||||
required: false
|
||||
default: false
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Build Python package
|
||||
shell: bash -l {0}
|
||||
env:
|
||||
MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
|
||||
run: |
|
||||
pip install build
|
||||
python setup.py clean --all
|
||||
MLX_BUILD_STAGE=1 python -m build -w
|
||||
|
||||
- name: Build backend package
|
||||
if: ${{ inputs.build-backend }}
|
||||
shell: bash -l {0}
|
||||
env:
|
||||
MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
|
||||
run: |
|
||||
python setup.py clean --all
|
||||
MLX_BUILD_STAGE=2 python -m build -w
|
||||
88
.github/actions/build-macos/action.yml
vendored
Normal file
88
.github/actions/build-macos/action.yml
vendored
Normal file
@@ -0,0 +1,88 @@
|
||||
name: 'Build and Test on macOS'
|
||||
description: 'Build and test MLX on macOS'
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
env:
|
||||
DEBUG: 1
|
||||
CMAKE_ARGS: "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
pip install --upgrade pip
|
||||
pip install cmake setuptools nanobind==2.4.0
|
||||
pip install -e . -v
|
||||
|
||||
- name: Generate package stubs
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
pip install typing_extensions
|
||||
python setup.py generate_stubs
|
||||
|
||||
- name: Install tests dependencies
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
pip install numpy torch tensorflow unittest-xml-reporting
|
||||
|
||||
- name: Run Python tests
|
||||
shell: bash -l {0}
|
||||
env:
|
||||
LOW_MEMORY: 1
|
||||
run: |
|
||||
DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
|
||||
DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
|
||||
mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
|
||||
mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
|
||||
if $(grep "\[WARN\]" stderr.log); then echo "Distributed ring test failed"; exit 1; fi
|
||||
|
||||
- name: Build example extension
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
cd examples/extensions
|
||||
pip install -r requirements.txt
|
||||
python setup.py build_ext --inplace
|
||||
python test.py
|
||||
|
||||
- name: Build CPP only
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
mkdir -p build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j $(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Run CPP tests
|
||||
shell: bash -l {0}
|
||||
env:
|
||||
DEVICE: gpu
|
||||
METAL_DEVICE_WRAPPER_TYPE: 1
|
||||
METAL_DEBUG_ERROR_MODE: 0
|
||||
run: ./build/tests/tests
|
||||
|
||||
- name: Build small binary with JIT
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
mkdir -p build
|
||||
cd build
|
||||
cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
|
||||
-DBUILD_SHARED_LIBS=ON \
|
||||
-DMLX_BUILD_CPU=OFF \
|
||||
-DMLX_BUILD_SAFETENSORS=OFF \
|
||||
-DMLX_BUILD_GGUF=OFF \
|
||||
-DMLX_METAL_JIT=ON
|
||||
make -j $(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Run Python tests with JIT
|
||||
shell: bash -l {0}
|
||||
env:
|
||||
LOW_MEMORY: 1
|
||||
DEVICE: gpu
|
||||
METAL_DEVICE_WRAPPER_TYPE: 1
|
||||
METAL_DEBUG_ERROR_MODE: 0
|
||||
run: |
|
||||
CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
|
||||
pip install -e . -v
|
||||
python -m xmlrunner discover \
|
||||
-v python/tests \
|
||||
-o test-results/gpu_jit
|
||||
86
.github/actions/setup-linux/action.yml
vendored
Normal file
86
.github/actions/setup-linux/action.yml
vendored
Normal file
@@ -0,0 +1,86 @@
|
||||
name: 'Setup Linux Environment'
|
||||
description: 'Install dependencies for Linux builds'
|
||||
|
||||
inputs:
|
||||
toolkit:
|
||||
description: 'Which toolkit to install'
|
||||
required: false
|
||||
default: 'cpu'
|
||||
python-version:
|
||||
description: 'Version of python to set up'
|
||||
required: false
|
||||
default: '3.10'
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Use ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2
|
||||
with:
|
||||
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}-py${{ inputs.python-version }}
|
||||
max-size: 1GB
|
||||
|
||||
- name: Install common dependencies
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev zip
|
||||
|
||||
- uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: ${{ inputs.python-version }}
|
||||
|
||||
- name: Setup Python venv
|
||||
shell: bash
|
||||
run: |
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install setuptools cmake nanobind==2.4.0
|
||||
echo PATH=$PATH >> $GITHUB_ENV
|
||||
# Make cmake search .venv for nanobind
|
||||
echo PYTHONPATH=`python -c 'import sys; print(sys.path[-1])'` >> $GITHUB_ENV
|
||||
|
||||
- name: Install MPI
|
||||
shell: bash
|
||||
run: sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev
|
||||
|
||||
- name: Install CUDA toolkit
|
||||
if: ${{ startsWith(inputs.toolkit, 'cuda') }}
|
||||
shell: bash
|
||||
env:
|
||||
# Note: the CI machine does not meet CUDA 13's driver requirement.
|
||||
# Compatibility matrix:
|
||||
# https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
|
||||
PACKAGES: |
|
||||
{
|
||||
"cuda-12.6": "libcudnn9-dev-cuda-12 cuda-toolkit-12-6",
|
||||
"cuda-12.9": "libcudnn9-dev-cuda-12 cuda-toolkit-12-9",
|
||||
"cuda-13.0": "libcudnn9-dev-cuda-13 cuda-toolkit-13-0"
|
||||
}
|
||||
run: |
|
||||
# The CUDA binaries are hosted in the "sbsa" repo, the "arm64" repo is
|
||||
# Jetson specific. SBSA means Arm Server Base System Architecture.
|
||||
ARCH=${{ runner.arch == 'arm64' && 'sbsa' || 'x86_64' }}
|
||||
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb
|
||||
sudo dpkg -i cuda-keyring_1.1-1_all.deb
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y \
|
||||
libnccl2 libnccl-dev \
|
||||
${{ fromJson(env.PACKAGES)[inputs.toolkit] }}
|
||||
echo "/usr/local/${{ inputs.toolkit }}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: CUDA packages and driver report
|
||||
if: ${{ startsWith(inputs.toolkit, 'cuda') }}
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt-get install -y ubuntu-drivers-common dkms
|
||||
echo "NVIDIA Driver Packages Available:"
|
||||
sudo ubuntu-drivers list --gpgpu
|
||||
echo "NVIDIA Driver Version:"
|
||||
cat /proc/driver/nvidia/version || echo "nvidia driver not found"
|
||||
echo "Installed NVIDIA and CUDA packages:"
|
||||
dpkg -l | egrep "cuda|nvidia" -i
|
||||
echo "DKMS Status:"
|
||||
dkms status || echo "dkms not found"
|
||||
echo "NVIDIA-SMI Status:"
|
||||
nvidia-smi || echo "nvidia-smi not found"
|
||||
24
.github/actions/setup-macos/action.yml
vendored
Normal file
24
.github/actions/setup-macos/action.yml
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
name: 'Setup macOS Environment'
|
||||
description: 'Install dependencies for macOS builds'
|
||||
|
||||
inputs:
|
||||
python-version:
|
||||
description: 'Python version to use'
|
||||
required: false
|
||||
default: '3.10'
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Install Homebrew packages
|
||||
shell: sh
|
||||
run: /opt/homebrew/bin/brew install openmpi
|
||||
|
||||
- name: Verify MetalToolchain installed
|
||||
shell: bash
|
||||
run: xcodebuild -showComponent MetalToolchain
|
||||
|
||||
- uses: conda-incubator/setup-miniconda@v3
|
||||
with:
|
||||
miniconda-version: "latest"
|
||||
python-version: ${{ inputs.python-version }}
|
||||
69
.github/actions/test-linux/action.yml
vendored
Normal file
69
.github/actions/test-linux/action.yml
vendored
Normal file
@@ -0,0 +1,69 @@
|
||||
name: 'Run Linux tests'
|
||||
|
||||
inputs:
|
||||
has-gpu:
|
||||
description: 'Run GPU tests'
|
||||
required: false
|
||||
default: false
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Run MPI tests
|
||||
shell: bash
|
||||
run: |
|
||||
echo "::group::MPI tests"
|
||||
mpirun --bind-to none --allow-run-as-root -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
|
||||
echo "::endgroup::"
|
||||
|
||||
- name: Run distributed tests
|
||||
if: ${{ inputs.has-gpu == 'false' }}
|
||||
shell: bash
|
||||
run: |
|
||||
echo "::group::Distributed tests"
|
||||
mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
|
||||
if grep -Fq '[WARN]' stderr.log ; then
|
||||
grep -F '[WARN]' stderr.log
|
||||
echo "Distributed ring test failed";
|
||||
exit 1;
|
||||
fi
|
||||
echo "::endgroup::"
|
||||
|
||||
- name: Run Python tests - CPU
|
||||
if: ${{ inputs.has-gpu == 'false' }}
|
||||
shell: bash
|
||||
env:
|
||||
DEVICE: cpu
|
||||
run: |
|
||||
echo "::group::Python tests - CPU"
|
||||
python -m unittest discover python/tests -v
|
||||
echo "::endgroup::"
|
||||
|
||||
- name: Run Python tests - GPU
|
||||
if: ${{ inputs.has-gpu == 'true' }}
|
||||
shell: bash
|
||||
env:
|
||||
DEVICE: gpu
|
||||
run: |
|
||||
echo "::group::Python tests - GPU"
|
||||
python -m tests discover python/tests -v
|
||||
echo "::endgroup::"
|
||||
|
||||
- name: Run CPP tests - CPU
|
||||
shell: bash
|
||||
env:
|
||||
DEVICE: cpu
|
||||
run: |
|
||||
echo "::group::CPP tests - CPU"
|
||||
./build/tests/tests
|
||||
echo "::endgroup::"
|
||||
|
||||
- name: Run CPP tests - GPU
|
||||
if: ${{ inputs.has-gpu == 'true' }}
|
||||
shell: bash
|
||||
env:
|
||||
DEVICE: gpu
|
||||
run: |
|
||||
echo "::group::CPP tests - GPU"
|
||||
./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
|
||||
echo "::endgroup::"
|
||||
6
.github/dependabot.yml
vendored
Normal file
6
.github/dependabot.yml
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
27
.github/scripts/setup+build-cpp-linux-fedora-container.sh
vendored
Executable file
27
.github/scripts/setup+build-cpp-linux-fedora-container.sh
vendored
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
|
||||
# [Setup] Install dependencies inside the container.
|
||||
dnf update -y
|
||||
dnf install -y \
|
||||
blas-devel \
|
||||
lapack-devel \
|
||||
openblas-devel \
|
||||
make \
|
||||
cmake \
|
||||
clang \
|
||||
git
|
||||
dnf clean all
|
||||
|
||||
# [C++] CI Build Sanity Check: Verifies code compilation, not for release.
|
||||
export CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
|
||||
export DEBUG=1
|
||||
export CMAKE_C_COMPILER=/usr/bin/clang
|
||||
export CMAKE_CXX_COMPILER=/usr/bin/clang++
|
||||
|
||||
mkdir -p build
|
||||
pushd build
|
||||
cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
|
||||
make -j $(nproc)
|
||||
./tests/tests
|
||||
popd
|
||||
108
.github/workflows/build_and_test.yml
vendored
Normal file
108
.github/workflows/build_and_test.yml
vendored
Normal file
@@ -0,0 +1,108 @@
|
||||
name: Build and Test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
# For testing CI without starting a pull request:
|
||||
- test/*
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
|
||||
jobs:
|
||||
check_lint:
|
||||
name: Check Lint
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
|
||||
linux_build_and_test:
|
||||
name: Linux (cpu, ${{ matrix.arch }})
|
||||
needs: check_lint
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: ['x86_64', 'aarch64']
|
||||
runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/setup-linux
|
||||
- uses: ./.github/actions/build-linux
|
||||
- uses: ./.github/actions/test-linux
|
||||
|
||||
cuda_build_and_test:
|
||||
name: Linux (${{ matrix.toolkit }}, ${{ matrix.arch }})
|
||||
if: github.repository == 'ml-explore/mlx'
|
||||
needs: check_lint
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: ['x86_64', 'aarch64']
|
||||
toolkit: ['cuda-12.6', 'cuda-12.9']
|
||||
runs-on: ${{ matrix.arch == 'x86_64' && 'gpu-t4-4-core' || 'ubuntu-22.04-arm' }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/setup-linux
|
||||
with:
|
||||
toolkit: ${{ matrix.toolkit }}
|
||||
- uses: ./.github/actions/build-linux
|
||||
with:
|
||||
toolkit: ${{ matrix.toolkit }}
|
||||
- uses: ./.github/actions/test-linux
|
||||
if: matrix.arch == 'x86_64'
|
||||
with:
|
||||
has-gpu: true
|
||||
|
||||
mac_build_and_test:
|
||||
name: macOS (${{ matrix.macos-target }})
|
||||
if: github.repository == 'ml-explore/mlx'
|
||||
strategy:
|
||||
matrix:
|
||||
macos-target: ["14.0", "15.0"]
|
||||
runs-on: [self-hosted, macos]
|
||||
env:
|
||||
MACOSX_DEPLOYMENT_TARGET: ${{ matrix.macos-target }}
|
||||
needs: check_lint
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/setup-macos
|
||||
- uses: ./.github/actions/build-macos
|
||||
|
||||
build_documentation:
|
||||
name: Build Documentation
|
||||
if: github.repository == 'ml-explore/mlx'
|
||||
runs-on: ubuntu-22.04
|
||||
needs: check_lint
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/build-docs
|
||||
|
||||
linux_fedora_build_cpp:
|
||||
name: Linux Fedora (${{ matrix.arch }})
|
||||
needs: check_lint
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- host: ubuntu-22.04
|
||||
arch: x86_64
|
||||
- host: ubuntu-22.04-arm
|
||||
arch: aarch64
|
||||
|
||||
runs-on: ${{ matrix.host }}
|
||||
container:
|
||||
image: fedora:42
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: CPP Build Test - No Release
|
||||
run: |
|
||||
bash ./.github/scripts/setup+build-cpp-linux-fedora-container.sh
|
||||
28
.github/workflows/documentation.yml
vendored
Normal file
28
.github/workflows/documentation.yml
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
name: Documentation
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/build-docs
|
||||
|
||||
deploy:
|
||||
needs: build
|
||||
permissions:
|
||||
pages: write
|
||||
id-token: write
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
steps:
|
||||
- name: Deploy to GitHub Pages
|
||||
id: deployment
|
||||
uses: actions/deploy-pages@v4
|
||||
96
.github/workflows/nightly.yml
vendored
Normal file
96
.github/workflows/nightly.yml
vendored
Normal file
@@ -0,0 +1,96 @@
|
||||
name: Nightly Build
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: 33 6 * * 1-5
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build_linux_release:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.10", "3.14"]
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/setup-linux
|
||||
- uses: ./.github/actions/build-linux-release
|
||||
with:
|
||||
build-backend: ${{ matrix.python-version == '3.10' }}
|
||||
arch: "x86_64"
|
||||
- name: Upload mlx artifacts
|
||||
uses: actions/upload-artifact@v5
|
||||
with:
|
||||
name: linux-wheels-${{ matrix.python_version }}
|
||||
path: wheelhouse/mlx-*.whl
|
||||
retention-days: 7
|
||||
- name: Upload mlx-cpu artifacts
|
||||
if: matrix.python_version == '3.10'
|
||||
uses: actions/upload-artifact@v5
|
||||
with:
|
||||
name: mlx-cpu
|
||||
path: wheelhouse/mlx_cpu-*.whl
|
||||
retention-days: 7
|
||||
|
||||
build_linux_with_tests:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.11", "3.12", "3.13", "3.14"]
|
||||
runner:
|
||||
- ubuntu-22.04
|
||||
- ubuntu-22.04-arm
|
||||
runs-on: ${{ matrix.runner }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/setup-linux
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
- uses: ./.github/actions/build-linux
|
||||
- uses: ./.github/actions/test-linux
|
||||
|
||||
build_mac_release:
|
||||
if: github.repository == 'ml-explore/mlx'
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.10", "3.13"]
|
||||
runs-on: [self-hosted, macos]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/setup-macos
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- uses: ./.github/actions/build-macos
|
||||
- name: Build macOS 15 package
|
||||
uses: ./.github/actions/build-macos-release
|
||||
with:
|
||||
macos-target: 15.0
|
||||
build-backend: ${{ matrix.python-version == '3.10' }}
|
||||
- name: Build macOS 14 package
|
||||
uses: ./.github/actions/build-macos-release
|
||||
with:
|
||||
macos-target: 14.0
|
||||
build-backend: ${{ matrix.python-version == '3.10' }}
|
||||
|
||||
build_cuda_release:
|
||||
if: github.repository == 'ml-explore/mlx'
|
||||
runs-on: ubuntu-22-large
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/setup-linux
|
||||
with:
|
||||
toolkit: 'cuda-12.9'
|
||||
- name: Build Python package
|
||||
uses: ./.github/actions/build-cuda-release
|
||||
with:
|
||||
toolkit: 'cuda-12.9'
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v5
|
||||
with:
|
||||
name: mlx-cuda
|
||||
path: wheelhouse/mlx_cuda-*.whl
|
||||
retention-days: 7
|
||||
20
.github/workflows/pull_request.yml
vendored
20
.github/workflows/pull_request.yml
vendored
@@ -1,20 +0,0 @@
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
check_lint:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.8
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install pre-commit black isort clang-format
|
||||
- name: Run lint
|
||||
run: |
|
||||
pre-commit run --all-files
|
||||
238
.github/workflows/release.yml
vendored
Normal file
238
.github/workflows/release.yml
vendored
Normal file
@@ -0,0 +1,238 @@
|
||||
name: PyPI Release
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dev_release:
|
||||
description: "Do a dev release or regular release"
|
||||
required: true
|
||||
default: "false"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Set publishing variables
|
||||
run: echo "Publishing setup complete"
|
||||
|
||||
build_documentation:
|
||||
if: github.repository == 'ml-explore/mlx'
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/build-docs
|
||||
|
||||
deploy_documentation:
|
||||
needs: build_documentation
|
||||
permissions:
|
||||
pages: write
|
||||
id-token: write
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
steps:
|
||||
- name: Deploy to GitHub Pages
|
||||
id: deployment
|
||||
uses: actions/deploy-pages@v4
|
||||
|
||||
build_linux_release:
|
||||
if: github.repository == 'ml-explore/mlx'
|
||||
strategy:
|
||||
matrix:
|
||||
python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
|
||||
arch: ['x86_64', 'aarch64']
|
||||
runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
|
||||
env:
|
||||
PYPI_RELEASE: 1
|
||||
DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/setup-linux
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
- uses: ./.github/actions/build-linux-release
|
||||
with:
|
||||
build-backend: ${{ matrix.python-version == '3.10' }}
|
||||
arch: ${{ matrix.arch }}
|
||||
- name: Upload MLX artifacts
|
||||
uses: actions/upload-artifact@v5
|
||||
with:
|
||||
overwrite: true
|
||||
name: linux-wheels-${{ matrix.python_version }}-${{ matrix.arch }}
|
||||
path: wheelhouse/mlx-*.whl
|
||||
- name: Upload CPU artifacts
|
||||
if: matrix.python_version == '3.10'
|
||||
uses: actions/upload-artifact@v5
|
||||
with:
|
||||
overwrite: true
|
||||
name: mlx-cpu-${{ matrix.arch }}
|
||||
path: wheelhouse/mlx_cpu-*.whl
|
||||
|
||||
build_mac_release:
|
||||
if: github.repository == 'ml-explore/mlx'
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
|
||||
runs-on: [self-hosted, macos]
|
||||
env:
|
||||
PYPI_RELEASE: 1
|
||||
DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/setup-macos
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Install dependencies
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
pip install --upgrade pip
|
||||
pip install cmake setuptools nanobind==2.4.0
|
||||
pip install -e . -v
|
||||
- name: Generate package stubs
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
pip install typing_extensions
|
||||
python setup.py generate_stubs
|
||||
- name: Build macOS 14 package
|
||||
uses: ./.github/actions/build-macos-release
|
||||
with:
|
||||
macos-target: 14.0
|
||||
build-backend: ${{ matrix.python-version == '3.10' }}
|
||||
- name: Build macOS 15 package
|
||||
uses: ./.github/actions/build-macos-release
|
||||
with:
|
||||
macos-target: 15.0
|
||||
build-backend: ${{ matrix.python-version == '3.10' }}
|
||||
- name: Upload MLX artifacts
|
||||
uses: actions/upload-artifact@v5
|
||||
with:
|
||||
overwrite: true
|
||||
name: mac-wheels-${{ matrix.python-version }}
|
||||
path: dist/mlx-*.whl
|
||||
- name: Upload Metal artifacts
|
||||
if: matrix.python-version == '3.10'
|
||||
uses: actions/upload-artifact@v5
|
||||
with:
|
||||
overwrite: true
|
||||
name: mlx-metal
|
||||
path: dist/mlx_metal-*.whl
|
||||
|
||||
build_cuda_release:
|
||||
if: github.repository == 'ml-explore/mlx'
|
||||
runs-on: ubuntu-22-large
|
||||
env:
|
||||
PYPI_RELEASE: 1
|
||||
DEV_RELEASE: ${{ github.event.inputs.dev_release == 'true' && 1 || 0 }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: ./.github/actions/setup-linux
|
||||
with:
|
||||
toolkit: 'cuda-12.9'
|
||||
- name: Build Python package
|
||||
uses: ./.github/actions/build-cuda-release
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v5
|
||||
with:
|
||||
overwrite: true
|
||||
name: mlx-cuda
|
||||
path: wheelhouse/mlx_cuda-*.whl
|
||||
|
||||
pypi-publish:
|
||||
name: Upload release to PyPI
|
||||
runs-on: ubuntu-latest
|
||||
needs: [setup, build_linux_release, build_mac_release]
|
||||
permissions:
|
||||
id-token: write
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/mlx
|
||||
steps:
|
||||
- uses: actions/download-artifact@v6
|
||||
with:
|
||||
pattern: linux-wheels-*
|
||||
merge-multiple: true
|
||||
path: dist
|
||||
- uses: actions/download-artifact@v6
|
||||
with:
|
||||
pattern: mac-wheels-*
|
||||
merge-multiple: true
|
||||
path: dist
|
||||
- name: Display structure of downloaded files
|
||||
run: ls -R dist
|
||||
- name: Publish package distributions to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
repository-url: https://upload.pypi.org/legacy/
|
||||
|
||||
pypi-publish-cuda:
|
||||
name: Upload CUDA release to PyPI
|
||||
runs-on: ubuntu-latest
|
||||
needs: [setup, build_cuda_release]
|
||||
permissions:
|
||||
id-token: write
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/mlx-cuda
|
||||
steps:
|
||||
- uses: actions/download-artifact@v6
|
||||
with:
|
||||
name: mlx-cuda
|
||||
path: dist
|
||||
- name: Display structure of downloaded files
|
||||
run: ls -R dist
|
||||
- name: Publish package distributions to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
repository-url: https://upload.pypi.org/legacy/
|
||||
|
||||
pypi-publish-cpu:
|
||||
name: Upload CPU release to PyPI
|
||||
runs-on: ubuntu-latest
|
||||
needs: [setup, build_linux_release]
|
||||
permissions:
|
||||
id-token: write
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/mlx-cpu
|
||||
steps:
|
||||
- uses: actions/download-artifact@v6
|
||||
with:
|
||||
pattern: mlx-cpu-*
|
||||
merge-multiple: true
|
||||
path: dist
|
||||
- name: Display structure of downloaded files
|
||||
run: ls -R dist
|
||||
- name: Publish package distributions to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
repository-url: https://upload.pypi.org/legacy/
|
||||
|
||||
pypi-publish-metal:
|
||||
name: Upload Metal release to PyPI
|
||||
runs-on: ubuntu-latest
|
||||
needs: [setup, build_mac_release]
|
||||
permissions:
|
||||
id-token: write
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/mlx-metal
|
||||
steps:
|
||||
- uses: actions/download-artifact@v6
|
||||
with:
|
||||
name: mlx-metal
|
||||
path: dist
|
||||
- name: Display structure of downloaded files
|
||||
run: ls -R dist
|
||||
- name: Publish package distributions to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
repository-url: https://upload.pypi.org/legacy/
|
||||
@@ -1,4 +1,10 @@
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v6.0.0
|
||||
hooks:
|
||||
- id: check-yaml
|
||||
# - id: end-of-file-fixer
|
||||
# - id: trailing-whitespace
|
||||
- repo: https://github.com/pre-commit/mirrors-clang-format
|
||||
rev: v19.1.7
|
||||
hooks:
|
||||
|
||||
@@ -74,6 +74,7 @@ endif()
|
||||
if(MLX_USE_CCACHE)
|
||||
find_program(CCACHE_PROGRAM ccache)
|
||||
if(CCACHE_PROGRAM)
|
||||
message(STATUS "Found CCache: ${CCACHE_PROGRAM}")
|
||||
set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
|
||||
set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
|
||||
set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
|
||||
@@ -88,6 +89,11 @@ cmake_policy(SET CMP0135 NEW)
|
||||
|
||||
add_library(mlx)
|
||||
|
||||
# Supress warnings: note: parameter passing for argument of type
|
||||
# ‘std::pair<float, float>’ when C++17 is enabled changed to match C++14 in GCC
|
||||
# 10.1
|
||||
target_compile_options(mlx PRIVATE -Wno-psabi)
|
||||
|
||||
if(MLX_BUILD_CUDA)
|
||||
enable_language(CUDA)
|
||||
endif()
|
||||
@@ -113,6 +119,10 @@ if(MLX_BUILD_METAL)
|
||||
COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
|
||||
OUTPUT_VARIABLE MACOS_SDK_VERSION
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ERROR_IS_FATAL ANY)
|
||||
execute_process(
|
||||
COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-path"
|
||||
OUTPUT_VARIABLE CMAKE_OSX_SYSROOT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ERROR_IS_FATAL ANY)
|
||||
|
||||
if(${MACOS_SDK_VERSION} LESS 14.0)
|
||||
message(
|
||||
@@ -122,9 +132,12 @@ if(MLX_BUILD_METAL)
|
||||
message(STATUS "Building with macOS SDK version ${MACOS_SDK_VERSION}")
|
||||
|
||||
set(METAL_CPP_URL
|
||||
https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18.zip)
|
||||
https://developer.apple.com/metal/cpp/files/metal-cpp_26.zip)
|
||||
|
||||
if(NOT CMAKE_OSX_DEPLOYMENT_TARGET STREQUAL "")
|
||||
if(${CMAKE_OSX_DEPLOYMENT_TARGET} LESS 14.0)
|
||||
message(FATAL_ERROR "MLX requires macOS >= 14.0")
|
||||
endif()
|
||||
set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
|
||||
endif()
|
||||
execute_process(
|
||||
@@ -133,7 +146,6 @@ if(MLX_BUILD_METAL)
|
||||
"echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal ${XCRUN_FLAGS} -E -x metal -P - | tail -1 | tr -d '\n'"
|
||||
OUTPUT_VARIABLE MLX_METAL_VERSION COMMAND_ERROR_IS_FATAL ANY)
|
||||
FetchContent_Declare(metal_cpp URL ${METAL_CPP_URL})
|
||||
|
||||
FetchContent_MakeAvailable(metal_cpp)
|
||||
target_include_directories(
|
||||
mlx PUBLIC $<BUILD_INTERFACE:${metal_cpp_SOURCE_DIR}>
|
||||
|
||||
@@ -75,7 +75,7 @@ void time_irregular_binary_ops_3D() {
|
||||
|
||||
void time_irregular_binary_ops_4D() {
|
||||
auto device = mx::default_device();
|
||||
std::vector<int> shape = {8, 8, 512, 512};
|
||||
mx::Shape shape = {8, 8, 512, 512};
|
||||
auto a = mx::random::uniform(shape);
|
||||
auto b = mx::random::uniform(shape);
|
||||
|
||||
@@ -115,7 +115,7 @@ void time_irregular_binary_ops_4D() {
|
||||
|
||||
void time_irregular_reshape() {
|
||||
auto device = mx::default_device();
|
||||
std::vector<int> shape;
|
||||
mx::Shape shape;
|
||||
auto reshape_fn = [&shape, device](const mx::array& a) {
|
||||
return mx::reshape(a, shape, device);
|
||||
};
|
||||
@@ -170,7 +170,7 @@ void time_irregular_astype_1D() {
|
||||
void time_irregular_astype_2D() {
|
||||
auto device = mx::default_device();
|
||||
int size = 2048;
|
||||
std::vector<int> shape = {size, size};
|
||||
mx::Shape shape = {size, size};
|
||||
|
||||
auto a = mx::random::uniform(shape);
|
||||
TIMEM("2D regular", mx::astype, a, mx::int32, device);
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
# Copyright © 2023 Apple Inc.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
212
benchmarks/python/masked_scatter.py
Normal file
212
benchmarks/python/masked_scatter.py
Normal file
@@ -0,0 +1,212 @@
|
||||
import math
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from copy import copy
|
||||
from functools import partial
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import mlx.core as mx
|
||||
import numpy as np
|
||||
import torch
|
||||
from matplotlib.ticker import FuncFormatter
|
||||
|
||||
RESULTS_DIR = "./results"
|
||||
|
||||
|
||||
if not os.path.isdir(RESULTS_DIR):
|
||||
os.mkdir(RESULTS_DIR)
|
||||
|
||||
DEVICE_NAME = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
|
||||
DEVICE_NAME = DEVICE_NAME.decode("utf-8").strip("\n")
|
||||
|
||||
TORCH_DEVICE = torch.device(
|
||||
"mps"
|
||||
if torch.backends.mps.is_available()
|
||||
else ("cuda" if torch.cuda.is_available() else "cpu")
|
||||
)
|
||||
|
||||
|
||||
N_WARMUP = 5
|
||||
N_ITER_BENCH = 50
|
||||
N_ITER_FUNC = 20
|
||||
|
||||
VECTOR_LENGTHS = [4096 * (2**i) for i in range(10)]
|
||||
MASK_DENSITIES = [0.01, 0.1, 0.25, 0.5]
|
||||
D_TYPES = ("float32", "float16")
|
||||
|
||||
|
||||
def _power_of_two_formatter(value, _position):
|
||||
if value <= 0:
|
||||
return ""
|
||||
exponent = int(round(math.log2(value)))
|
||||
if abs(value - (1 << exponent)) / value > 1e-6:
|
||||
return f"{value:g}"
|
||||
return f"$2^{{{exponent}}}$"
|
||||
|
||||
|
||||
def torch_sync():
|
||||
if TORCH_DEVICE.type == "cuda":
|
||||
torch.cuda.synchronize()
|
||||
elif TORCH_DEVICE.type == "mps":
|
||||
torch.mps.synchronize()
|
||||
|
||||
|
||||
def masked_scatter_mlx(self_arr, mask_arr, src_arr):
|
||||
outs = []
|
||||
for _ in range(N_ITER_FUNC):
|
||||
out = copy(self_arr)
|
||||
out[mask_arr] = src_arr
|
||||
outs.append(out)
|
||||
mx.eval(outs)
|
||||
return outs
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def masked_scatter_torch(self_tensor, mask_tensor, src_tensor):
|
||||
outs = []
|
||||
for _ in range(N_ITER_FUNC):
|
||||
out = self_tensor.clone()
|
||||
out.masked_scatter_(mask_tensor, src_tensor)
|
||||
outs.append(out)
|
||||
torch_sync()
|
||||
return outs
|
||||
|
||||
|
||||
def measure(fn):
|
||||
for _ in range(N_WARMUP):
|
||||
fn()
|
||||
start = time.perf_counter_ns()
|
||||
for _ in range(N_ITER_BENCH):
|
||||
fn()
|
||||
end = time.perf_counter_ns()
|
||||
return (end - start) * 1e-9
|
||||
|
||||
|
||||
def bytes_touched(length, true_count, item_size):
|
||||
mask_bytes = length
|
||||
self_bytes = length * item_size * 2 # read + write
|
||||
src_bytes = true_count * item_size
|
||||
return (mask_bytes + self_bytes + src_bytes) * N_ITER_FUNC * N_ITER_BENCH
|
||||
|
||||
|
||||
def build_case(length, density, np_dtype, torch_dtype):
|
||||
true_count = max(1, int(round(length * density)))
|
||||
|
||||
rng = np.random.default_rng()
|
||||
self_np = rng.normal(0.0, 1.0, length).astype(np_dtype)
|
||||
mask_np = np.zeros(length, dtype=bool)
|
||||
mask_np[:true_count] = True
|
||||
rng.shuffle(mask_np)
|
||||
src_np = rng.normal(0.0, 1.0, true_count).astype(np_dtype)
|
||||
|
||||
self_mlx = mx.array(self_np)
|
||||
mask_mlx = mx.array(mask_np)
|
||||
src_mlx = mx.array(src_np)
|
||||
|
||||
self_torch = torch.from_numpy(self_np).to(device=TORCH_DEVICE, dtype=torch_dtype)
|
||||
mask_torch = torch.from_numpy(mask_np).to(device=TORCH_DEVICE)
|
||||
src_torch = torch.from_numpy(src_np).to(device=TORCH_DEVICE, dtype=torch_dtype)
|
||||
|
||||
# Correctness check once per configuration
|
||||
mx_out = mx.array(self_np)
|
||||
mx_out[mask_mlx] = src_mlx
|
||||
mx.eval(mx_out)
|
||||
torch_out = self_torch.clone()
|
||||
torch_out.masked_scatter_(mask_torch, src_torch)
|
||||
|
||||
atol = 5e-3 if np_dtype == np.float16 else 1e-5
|
||||
if not np.allclose(np.array(mx_out), torch_out.cpu().numpy(), atol=atol):
|
||||
raise AssertionError("masked_scatter results diverged between MLX and Torch")
|
||||
|
||||
return (self_mlx, mask_mlx, src_mlx, self_torch, mask_torch, src_torch, true_count)
|
||||
|
||||
|
||||
def bench_case(length, density, dtype):
|
||||
np_dtype = getattr(np, dtype)
|
||||
torch_dtype = getattr(torch, dtype)
|
||||
(
|
||||
self_mlx,
|
||||
mask_mlx,
|
||||
src_mlx,
|
||||
self_torch,
|
||||
mask_torch,
|
||||
src_torch,
|
||||
true_count,
|
||||
) = build_case(length, density, np_dtype, torch_dtype)
|
||||
|
||||
time_mlx = measure(partial(masked_scatter_mlx, self_mlx, mask_mlx, src_mlx))
|
||||
time_torch = measure(
|
||||
partial(masked_scatter_torch, self_torch, mask_torch, src_torch)
|
||||
)
|
||||
|
||||
total_bytes = bytes_touched(length, true_count, np_dtype().itemsize)
|
||||
bytes_per_gb = float(1024**3)
|
||||
mlx_gbps = (total_bytes / bytes_per_gb) / time_mlx
|
||||
torch_gbps = (total_bytes / bytes_per_gb) / time_torch
|
||||
|
||||
return time_mlx, time_torch, mlx_gbps, torch_gbps
|
||||
|
||||
|
||||
def plot_density(ax_perf, ax_speedup, density, dtype):
|
||||
mlx_gbps = []
|
||||
torch_gbps = []
|
||||
mlx_times = []
|
||||
torch_times = []
|
||||
|
||||
for length in VECTOR_LENGTHS:
|
||||
t_mlx, t_torch, gbps_mlx, gbps_torch = bench_case(length, density, dtype)
|
||||
mlx_gbps.append(gbps_mlx)
|
||||
torch_gbps.append(gbps_torch)
|
||||
mlx_times.append(t_mlx)
|
||||
torch_times.append(t_torch)
|
||||
|
||||
ax_perf.plot(VECTOR_LENGTHS, mlx_gbps, "tab:blue", label="MLX")
|
||||
ax_perf.plot(VECTOR_LENGTHS, torch_gbps, "tab:red", label="Torch")
|
||||
ax_perf.set_xscale("log", base=2)
|
||||
ax_perf.set_xticks(VECTOR_LENGTHS)
|
||||
formatter = FuncFormatter(_power_of_two_formatter)
|
||||
ax_perf.xaxis.set_major_formatter(formatter)
|
||||
ax_perf.set_title(f"density={density:.2f}")
|
||||
ax_perf.set_ylabel("GB/s")
|
||||
ax_perf.grid(True, which="both", linestyle=":", alpha=0.4)
|
||||
ax_perf.legend()
|
||||
|
||||
speedup = np.array(torch_times) / np.array(mlx_times)
|
||||
ax_speedup.plot(VECTOR_LENGTHS, speedup, "tab:green")
|
||||
ax_speedup.axhline(1.0, color="tab:gray", linestyle="--")
|
||||
ax_speedup.set_xscale("log", base=2)
|
||||
ax_speedup.set_xticks(VECTOR_LENGTHS)
|
||||
ax_speedup.xaxis.set_major_formatter(formatter)
|
||||
ax_speedup.set_ylabel("Speedup (Torch_t / MLX_t)")
|
||||
ax_speedup.grid(True, which="both", linestyle=":", alpha=0.4)
|
||||
|
||||
|
||||
def main():
|
||||
for dtype in D_TYPES:
|
||||
fig, axs = plt.subplots(
|
||||
len(MASK_DENSITIES),
|
||||
2,
|
||||
figsize=(10, 12),
|
||||
layout="constrained",
|
||||
sharex=True,
|
||||
)
|
||||
|
||||
for i, density in enumerate(MASK_DENSITIES):
|
||||
plot_density(axs[i][0], axs[i][1], density, dtype)
|
||||
axs[i][0].set_xlabel("vector length")
|
||||
axs[i][1].set_xlabel("vector length")
|
||||
|
||||
fig.suptitle(
|
||||
f"{DEVICE_NAME.replace('Apple ', '')} ({TORCH_DEVICE.type}) | dtype={dtype}"
|
||||
)
|
||||
output_path = os.path.join(
|
||||
RESULTS_DIR,
|
||||
f"{DEVICE_NAME.replace(' ', '_')}_masked_scatter_{dtype}.pdf",
|
||||
)
|
||||
fig.savefig(output_path)
|
||||
plt.close(fig)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
3
cmake/Findnvpl.cmake
Normal file
3
cmake/Findnvpl.cmake
Normal file
@@ -0,0 +1,3 @@
|
||||
# This file does nothing but to suppress the cmake warning: "By not providing
|
||||
# Findnvpl.cmake in CMAKE_MODULE_PATH...", which is caused by the
|
||||
# find_package(nvpl) from cmake's builtin FindLAPACK.cmake module.
|
||||
@@ -16,12 +16,11 @@ silicon computer is
|
||||
To install from PyPI your system must meet the following requirements:
|
||||
|
||||
- Using an M series chip (Apple silicon)
|
||||
- Using a native Python >= 3.9
|
||||
- macOS >= 13.5
|
||||
- Using a native Python >= 3.10
|
||||
- macOS >= 14.0
|
||||
|
||||
.. note::
|
||||
MLX is only available on devices running macOS >= 13.5
|
||||
It is highly recommended to use macOS 14 (Sonoma)
|
||||
MLX is only available on devices running macOS >= 14.0 and higher.
|
||||
|
||||
CUDA
|
||||
^^^^
|
||||
@@ -39,7 +38,7 @@ requirements:
|
||||
- Nvidia driver >= 550.54.14
|
||||
- CUDA toolkit >= 12.0
|
||||
- Linux distribution with glibc >= 2.35
|
||||
- Python >= 3.9
|
||||
- Python >= 3.10
|
||||
|
||||
|
||||
CPU-only (Linux)
|
||||
@@ -55,7 +54,7 @@ To install the CPU-only package from PyPi your system must meet the following
|
||||
requirements:
|
||||
|
||||
- Linux distribution with glibc >= 2.35
|
||||
- Python >= 3.9
|
||||
- Python >= 3.10
|
||||
|
||||
|
||||
Troubleshooting
|
||||
|
||||
@@ -112,6 +112,7 @@ Operations
|
||||
max
|
||||
maximum
|
||||
mean
|
||||
median
|
||||
meshgrid
|
||||
min
|
||||
minimum
|
||||
|
||||
@@ -7,12 +7,13 @@ Distributed Communication
|
||||
|
||||
MLX supports distributed communication operations that allow the computational cost
|
||||
of training or inference to be shared across many physical machines. At the
|
||||
moment we support two different communication backends:
|
||||
moment we support three different communication backends:
|
||||
|
||||
* `MPI <https://en.wikipedia.org/wiki/Message_Passing_Interface>`_ a
|
||||
full-featured and mature distributed communications library
|
||||
* A **ring** backend of our own that uses native TCP sockets and should be
|
||||
faster for thunderbolt connections.
|
||||
* A **ring** backend of our own that uses native TCP sockets. It should be
|
||||
faster for thunderbolt connections, but it also works over Ethernet.
|
||||
* `nccl <https://developer.nvidia.com/nccl>`_, for use in CUDA environments.
|
||||
|
||||
The list of all currently supported operations and their documentation can be
|
||||
seen in the :ref:`API docs<distributed>`.
|
||||
@@ -84,9 +85,8 @@ Selecting Backend
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
You can select the backend you want to use when calling :func:`init` by passing
|
||||
one of ``{'any', 'ring', 'mpi'}``. When passing ``any``, MLX will try to
|
||||
initialize the ``ring`` backend and if it fails the ``mpi`` backend. If they
|
||||
both fail then a singleton group is created.
|
||||
one of ``{'any', 'ring', 'mpi', 'nccl'}``. When passing ``any``, MLX will try all
|
||||
available backends. If they all fail then a singleton group is created.
|
||||
|
||||
.. note::
|
||||
After a distributed backend is successfully initialized :func:`init` will
|
||||
@@ -220,7 +220,7 @@ print 4 etc.
|
||||
Installing MPI
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
MPI can be installed with Homebrew, using the Anaconda package manager or
|
||||
MPI can be installed with Homebrew, pip, using the Anaconda package manager, or
|
||||
compiled from source. Most of our testing is done using ``openmpi`` installed
|
||||
with the Anaconda package manager as follows:
|
||||
|
||||
@@ -228,14 +228,16 @@ with the Anaconda package manager as follows:
|
||||
|
||||
$ conda install conda-forge::openmpi
|
||||
|
||||
Installing with Homebrew may require specifying the location of ``libmpi.dyld``
|
||||
Installing with Homebrew or pip requires specifying the location of ``libmpi.dyld``
|
||||
so that MLX can find it and load it at runtime. This can simply be achieved by
|
||||
passing the ``DYLD_LIBRARY_PATH`` environment variable to ``mpirun`` and it is
|
||||
done automatically by ``mlx.launch``.
|
||||
done automatically by ``mlx.launch``. Some environments use a non-standard
|
||||
library filename that can be specified using the ``MPI_LIBNAME`` environment
|
||||
variable. This is automatically taken care of by ``mlx.launch`` as well.
|
||||
|
||||
.. code:: shell
|
||||
|
||||
$ mpirun -np 2 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python test.py
|
||||
$ mpirun -np 2 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ -x MPI_LIBNAME=libmpi.40.dylib python test.py
|
||||
$ # or simply
|
||||
$ mlx.launch -n 2 test.py
|
||||
|
||||
|
||||
@@ -70,7 +70,8 @@ Differences from NumPy
|
||||
|
||||
* Indexing does not perform bounds checking. Indexing out of bounds is
|
||||
undefined behavior.
|
||||
* Boolean mask based indexing is not yet supported.
|
||||
* Boolean mask based indexing is supported for assignment only (see
|
||||
:ref:`boolean-mask-assignment`).
|
||||
|
||||
The reason for the lack of bounds checking is that exceptions cannot propagate
|
||||
from the GPU. Performing bounds checking for array indices before launching the
|
||||
@@ -143,3 +144,51 @@ expected. For example:
|
||||
|
||||
In the above ``dfdx`` will have the correct gradient, namely zeros at ``idx``
|
||||
and ones elsewhere.
|
||||
|
||||
.. _boolean-mask-assignment:
|
||||
|
||||
Boolean Mask Assignment
|
||||
-----------------------
|
||||
|
||||
MLX supports boolean indices using NumPy syntax. A mask must already be
|
||||
a :class:`bool_` MLX :class:`array` or a NumPy ``ndarray`` with ``dtype=bool``.
|
||||
Other index types are routed through the standard scatter code.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
>>> a = mx.array([1.0, 2.0, 3.0])
|
||||
>>> mask = mx.array([True, False, True])
|
||||
>>> updates = mx.array([5.0, 6.0])
|
||||
>>> a[mask] = updates
|
||||
>>> a
|
||||
array([5.0, 2.0, 6.0], dtype=float32)
|
||||
|
||||
Scalar assignments broadcast to every ``True`` entry in ``mask``. For non-scalar
|
||||
assignments, ``updates`` must provide at least as many elements as there are
|
||||
``True`` entries in ``mask``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
>>> a = mx.zeros((2, 3))
|
||||
>>> mask = mx.array([[True, False, True],
|
||||
[False, False, True]])
|
||||
>>> a[mask] = 1.0
|
||||
>>> a
|
||||
array([[1.0, 0.0, 1.0],
|
||||
[0.0, 0.0, 1.0]], dtype=float32)
|
||||
|
||||
Boolean masks follow NumPy semantics:
|
||||
|
||||
- The mask shape must match the shape of the axes it indexes exactly. The only
|
||||
exception is a scalar boolean mask, which broadcasts to the full array.
|
||||
- Any axes not covered by the mask are taken in full.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
>>> a = mx.arange(1000).reshape(10, 10, 10)
|
||||
>>> a[mx.random.randn(10, 10) > 0.0] = 0 # valid: mask covers axes 0 and 1
|
||||
|
||||
The mask of shape ``(10, 10)`` applies to the first two axes, so ``a[mask]``
|
||||
selects the 1-D slices ``a[i, j, :]`` where ``mask[i, j]`` is ``True``.
|
||||
Shapes such as ``(1, 10, 10)`` or ``(10, 10, 1)`` do not match the indexed
|
||||
axes and therefore raise errors.
|
||||
|
||||
@@ -14,7 +14,7 @@ class Buffer {
|
||||
void* ptr_;
|
||||
|
||||
public:
|
||||
Buffer(void* ptr) : ptr_(ptr) {};
|
||||
explicit Buffer(void* ptr) : ptr_(ptr) {};
|
||||
|
||||
// Get the raw data pointer from the buffer
|
||||
void* raw_ptr();
|
||||
|
||||
@@ -64,7 +64,7 @@ array array::unsafe_weak_copy(const array& other) {
|
||||
other.strides(),
|
||||
other.flags(),
|
||||
[](auto) {});
|
||||
cpy.array_desc_->data_ptr = other.array_desc_->data_ptr;
|
||||
cpy.array_desc_->offset = other.array_desc_->offset;
|
||||
return cpy;
|
||||
}
|
||||
|
||||
@@ -141,7 +141,7 @@ bool array::is_tracer() const {
|
||||
|
||||
void array::set_data(allocator::Buffer buffer, Deleter d) {
|
||||
array_desc_->data = std::make_shared<Data>(buffer, d);
|
||||
array_desc_->data_ptr = buffer.raw_ptr();
|
||||
array_desc_->offset = 0;
|
||||
array_desc_->data_size = size();
|
||||
array_desc_->flags.contiguous = true;
|
||||
array_desc_->flags.row_contiguous = true;
|
||||
@@ -156,7 +156,7 @@ void array::set_data(
|
||||
Flags flags,
|
||||
Deleter d) {
|
||||
array_desc_->data = std::make_shared<Data>(buffer, d);
|
||||
array_desc_->data_ptr = buffer.raw_ptr();
|
||||
array_desc_->offset = 0;
|
||||
array_desc_->data_size = data_size;
|
||||
array_desc_->strides = std::move(strides);
|
||||
array_desc_->flags = flags;
|
||||
@@ -167,14 +167,13 @@ void array::copy_shared_buffer(
|
||||
const Strides& strides,
|
||||
Flags flags,
|
||||
size_t data_size,
|
||||
size_t offset /* = 0 */) {
|
||||
int64_t offset /* = 0 */) {
|
||||
array_desc_->data = other.array_desc_->data;
|
||||
array_desc_->strides = strides;
|
||||
array_desc_->flags = flags;
|
||||
array_desc_->data_size = data_size;
|
||||
auto char_offset = sizeof(char) * itemsize() * offset;
|
||||
array_desc_->data_ptr = static_cast<void*>(
|
||||
static_cast<char*>(other.array_desc_->data_ptr) + char_offset);
|
||||
array_desc_->offset =
|
||||
sizeof(char) * itemsize() * offset + other.array_desc_->offset;
|
||||
}
|
||||
|
||||
void array::copy_shared_buffer(const array& other) {
|
||||
@@ -241,8 +240,8 @@ array::ArrayDesc::ArrayDesc(
|
||||
std::vector<array> inputs)
|
||||
: shape(std::move(shape)),
|
||||
dtype(dtype),
|
||||
status(Status::unscheduled),
|
||||
primitive(std::move(primitive)),
|
||||
status(Status::unscheduled),
|
||||
inputs(std::move(inputs)) {
|
||||
init();
|
||||
}
|
||||
|
||||
25
mlx/array.h
25
mlx/array.h
@@ -294,6 +294,11 @@ class array {
|
||||
return array_desc_->siblings;
|
||||
}
|
||||
|
||||
/** The array's position in the sibling list. */
|
||||
int sibling_position() const {
|
||||
return array_desc_->position;
|
||||
}
|
||||
|
||||
void set_siblings(std::vector<array> siblings, uint16_t position) {
|
||||
array_desc_->siblings = std::move(siblings);
|
||||
array_desc_->position = position;
|
||||
@@ -349,15 +354,23 @@ class array {
|
||||
return array_desc_->data;
|
||||
}
|
||||
|
||||
// Return a raw pointer to the arrays data
|
||||
// Return a raw pointer to the arrays data. This function may do a copy if
|
||||
// the underlying buffer is not accessible on the CPU. When accessing the
|
||||
// data for GPU kernels, be sure to use the correct method / function for the
|
||||
// given backend to access the GPU pointer.
|
||||
template <typename T>
|
||||
T* data() {
|
||||
return static_cast<T*>(array_desc_->data_ptr);
|
||||
return reinterpret_cast<T*>(
|
||||
(static_cast<char*>(buffer().raw_ptr()) + array_desc_->offset));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const T* data() const {
|
||||
return static_cast<T*>(array_desc_->data_ptr);
|
||||
return const_cast<array&>(*this).data<T>();
|
||||
}
|
||||
|
||||
int64_t offset() const {
|
||||
return array_desc_->offset;
|
||||
}
|
||||
|
||||
enum Status {
|
||||
@@ -426,7 +439,7 @@ class array {
|
||||
const Strides& strides,
|
||||
Flags flags,
|
||||
size_t data_size,
|
||||
size_t offset = 0);
|
||||
int64_t offset = 0);
|
||||
|
||||
void copy_shared_buffer(const array& other);
|
||||
|
||||
@@ -461,8 +474,8 @@ class array {
|
||||
// can share the underlying data buffer.
|
||||
std::shared_ptr<Data> data;
|
||||
|
||||
// Properly offset data pointer
|
||||
void* data_ptr{nullptr};
|
||||
// Offset from beginning of data pointer
|
||||
int64_t offset{0};
|
||||
|
||||
// The size in elements of the data buffer the array accesses
|
||||
size_t data_size;
|
||||
|
||||
@@ -38,20 +38,20 @@ inline void set_binary_op_output_data(
|
||||
const array& a,
|
||||
const array& b,
|
||||
array& out,
|
||||
BinaryOpType bopt) {
|
||||
BinaryOpType bopt,
|
||||
std::function<allocator::Buffer(size_t)> mallocfn = allocator::malloc) {
|
||||
bool b_donatable = is_donatable(b, out);
|
||||
bool a_donatable = is_donatable(a, out);
|
||||
switch (bopt) {
|
||||
case BinaryOpType::ScalarScalar:
|
||||
out.set_data(
|
||||
allocator::malloc(out.itemsize()), 1, a.strides(), a.flags());
|
||||
out.set_data(mallocfn(out.itemsize()), 1, a.strides(), a.flags());
|
||||
break;
|
||||
case BinaryOpType::ScalarVector:
|
||||
if (b_donatable) {
|
||||
out.copy_shared_buffer(b);
|
||||
} else {
|
||||
out.set_data(
|
||||
allocator::malloc(b.data_size() * out.itemsize()),
|
||||
mallocfn(b.data_size() * out.itemsize()),
|
||||
b.data_size(),
|
||||
b.strides(),
|
||||
b.flags());
|
||||
@@ -62,7 +62,7 @@ inline void set_binary_op_output_data(
|
||||
out.copy_shared_buffer(a);
|
||||
} else {
|
||||
out.set_data(
|
||||
allocator::malloc(a.data_size() * out.itemsize()),
|
||||
mallocfn(a.data_size() * out.itemsize()),
|
||||
a.data_size(),
|
||||
a.strides(),
|
||||
a.flags());
|
||||
@@ -75,7 +75,7 @@ inline void set_binary_op_output_data(
|
||||
out.copy_shared_buffer(b);
|
||||
} else {
|
||||
out.set_data(
|
||||
allocator::malloc(a.data_size() * out.itemsize()),
|
||||
mallocfn(a.data_size() * out.itemsize()),
|
||||
a.data_size(),
|
||||
a.strides(),
|
||||
a.flags());
|
||||
@@ -88,7 +88,7 @@ inline void set_binary_op_output_data(
|
||||
b_donatable && b.flags().row_contiguous && b.size() == out.size()) {
|
||||
out.copy_shared_buffer(b);
|
||||
} else {
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
out.set_data(mallocfn(out.nbytes()));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ namespace mlx::core {
|
||||
|
||||
void broadcast(const array& in, array& out) {
|
||||
if (out.size() == 0) {
|
||||
out.set_data(nullptr);
|
||||
out.set_data(allocator::malloc(0));
|
||||
return;
|
||||
}
|
||||
Strides strides(out.ndim(), 0);
|
||||
|
||||
@@ -114,7 +114,9 @@ void compiled_allocate_outputs(
|
||||
const std::vector<array>& inputs,
|
||||
std::vector<array>& outputs,
|
||||
const std::function<bool(size_t)>& is_constant,
|
||||
bool contiguous) {
|
||||
bool contiguous,
|
||||
const std::function<allocator::Buffer(size_t)>&
|
||||
mallocfn /* = allocator::malloc */) {
|
||||
if (contiguous) {
|
||||
int o = 0;
|
||||
Strides strides;
|
||||
@@ -140,7 +142,7 @@ void compiled_allocate_outputs(
|
||||
}
|
||||
for (; o < outputs.size(); ++o) {
|
||||
outputs[o].set_data(
|
||||
allocator::malloc(data_size * outputs[o].itemsize()),
|
||||
mallocfn(data_size * outputs[o].itemsize()),
|
||||
data_size,
|
||||
strides,
|
||||
flags);
|
||||
@@ -163,7 +165,7 @@ void compiled_allocate_outputs(
|
||||
}
|
||||
}
|
||||
for (; o < outputs.size(); ++o) {
|
||||
outputs[o].set_data(allocator::malloc(outputs[o].nbytes()));
|
||||
outputs[o].set_data(mallocfn(outputs[o].nbytes()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,7 +58,9 @@ void compiled_allocate_outputs(
|
||||
const std::vector<array>& inputs,
|
||||
std::vector<array>& outputs,
|
||||
const std::function<bool(size_t)>& is_constant,
|
||||
bool contiguous);
|
||||
bool contiguous,
|
||||
const std::function<allocator::Buffer(size_t)>& mallocfn =
|
||||
allocator::malloc);
|
||||
|
||||
// Collapse contiguous dims ignoring scalars and constants.
|
||||
std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
|
||||
|
||||
@@ -22,7 +22,11 @@ enum class CopyType {
|
||||
GeneralGeneral
|
||||
};
|
||||
|
||||
inline bool set_copy_output_data(const array& in, array& out, CopyType ctype) {
|
||||
inline bool set_copy_output_data(
|
||||
const array& in,
|
||||
array& out,
|
||||
CopyType ctype,
|
||||
std::function<allocator::Buffer(size_t)> mallocfn = allocator::malloc) {
|
||||
if (ctype == CopyType::Vector) {
|
||||
// If the input is donateable, we are doing a vector copy and the types
|
||||
// have the same size, then the input buffer can hold the output.
|
||||
@@ -31,14 +35,14 @@ inline bool set_copy_output_data(const array& in, array& out, CopyType ctype) {
|
||||
return true;
|
||||
} else {
|
||||
out.set_data(
|
||||
allocator::malloc(in.data_size() * out.itemsize()),
|
||||
mallocfn(in.data_size() * out.itemsize()),
|
||||
in.data_size(),
|
||||
in.strides(),
|
||||
in.flags());
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
out.set_data(mallocfn(out.nbytes()));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,17 +14,13 @@ std::tuple<int64_t, Strides> prepare_slice(
|
||||
data_offset += start_indices[i] * in.strides()[i];
|
||||
inp_strides[i] = in.strides()[i] * strides[i];
|
||||
}
|
||||
// Normalize the offset
|
||||
if (data_offset < 0) {
|
||||
data_offset += in.data_size();
|
||||
}
|
||||
return std::make_tuple(data_offset, inp_strides);
|
||||
}
|
||||
|
||||
void shared_buffer_slice(
|
||||
const array& in,
|
||||
const Strides& out_strides,
|
||||
size_t data_offset,
|
||||
int64_t data_offset,
|
||||
size_t data_size,
|
||||
array& out) {
|
||||
// Compute row/col contiguity
|
||||
@@ -45,23 +41,30 @@ void slice(
|
||||
const Shape& start_indices,
|
||||
const Shape& strides) {
|
||||
if (out.size() == 0) {
|
||||
out.set_data(nullptr);
|
||||
out.set_data(allocator::malloc(0));
|
||||
return;
|
||||
}
|
||||
|
||||
// Calculate out strides, initial offset
|
||||
auto [data_offset, inp_strides] = prepare_slice(in, start_indices, strides);
|
||||
int64_t data_end = 1;
|
||||
for (int i = 0; i < start_indices.size(); ++i) {
|
||||
if (in.shape()[i] > 1) {
|
||||
auto end_idx = start_indices[i] + out.shape()[i] * strides[i] - 1;
|
||||
data_end += end_idx * in.strides()[i];
|
||||
|
||||
// Get the location of the end based on the inp strides and out.shape()
|
||||
int64_t low_idx = 0;
|
||||
int64_t high_idx = 0;
|
||||
for (int i = 0; i < inp_strides.size(); ++i) {
|
||||
auto delta = inp_strides[i] * (out.shape()[i] - 1);
|
||||
if (inp_strides[i] > 0) {
|
||||
high_idx += delta;
|
||||
} else {
|
||||
low_idx += delta;
|
||||
}
|
||||
}
|
||||
if (data_end < 0) {
|
||||
data_end += in.data_size();
|
||||
int64_t data_size = (high_idx - low_idx) + 1;
|
||||
if (data_size < 0) {
|
||||
std::ostringstream msg;
|
||||
msg << "[slice] Computed invalid data size: " << data_size << ".";
|
||||
throw std::runtime_error(msg.str());
|
||||
}
|
||||
size_t data_size = (data_end - data_offset);
|
||||
shared_buffer_slice(in, inp_strides, data_offset, data_size, out);
|
||||
}
|
||||
|
||||
|
||||
@@ -46,7 +46,8 @@ inline void set_ternary_op_output_data(
|
||||
const array& b,
|
||||
const array& c,
|
||||
array& out,
|
||||
TernaryOpType topt) {
|
||||
TernaryOpType topt,
|
||||
std::function<allocator::Buffer(size_t)> mallocfn = allocator::malloc) {
|
||||
auto maybe_donate = [&out](const array& x) {
|
||||
if (is_donatable(x, out)) {
|
||||
out.copy_shared_buffer(x);
|
||||
@@ -57,13 +58,12 @@ inline void set_ternary_op_output_data(
|
||||
|
||||
switch (topt) {
|
||||
case TernaryOpType::ScalarScalarScalar:
|
||||
out.set_data(
|
||||
allocator::malloc(out.itemsize()), 1, b.strides(), b.flags());
|
||||
out.set_data(mallocfn(out.itemsize()), 1, b.strides(), b.flags());
|
||||
break;
|
||||
case TernaryOpType::VectorVectorVector:
|
||||
if (!(maybe_donate(a) || maybe_donate(b) || maybe_donate(c))) {
|
||||
out.set_data(
|
||||
allocator::malloc(out.itemsize() * b.data_size()),
|
||||
mallocfn(out.itemsize() * b.data_size()),
|
||||
b.data_size(),
|
||||
b.strides(),
|
||||
b.flags());
|
||||
@@ -76,7 +76,7 @@ inline void set_ternary_op_output_data(
|
||||
if (!((a.flags().row_contiguous && maybe_donate(a)) ||
|
||||
(b.flags().row_contiguous && maybe_donate(b)) ||
|
||||
(c.flags().row_contiguous && maybe_donate(c)))) {
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
out.set_data(mallocfn(out.nbytes()));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -7,19 +7,22 @@
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
inline void set_unary_output_data(const array& in, array& out) {
|
||||
inline void set_unary_output_data(
|
||||
const array& in,
|
||||
array& out,
|
||||
std::function<allocator::Buffer(size_t)> mallocfn = allocator::malloc) {
|
||||
if (in.flags().contiguous) {
|
||||
if (is_donatable(in, out)) {
|
||||
out.copy_shared_buffer(in);
|
||||
} else {
|
||||
out.set_data(
|
||||
allocator::malloc(in.data_size() * out.itemsize()),
|
||||
mallocfn(in.data_size() * out.itemsize()),
|
||||
in.data_size(),
|
||||
in.strides(),
|
||||
in.flags());
|
||||
}
|
||||
} else {
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
out.set_data(mallocfn(out.nbytes()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -14,233 +14,11 @@
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename Op>
|
||||
void binary(const array& a, const array& b, array& out, Op op, Stream stream) {
|
||||
auto bopt = get_binary_op_type(a, b);
|
||||
set_binary_op_output_data(a, b, out, bopt);
|
||||
|
||||
auto& encoder = cpu::get_command_encoder(stream);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
encoder.dispatch([a = array::unsafe_weak_copy(a),
|
||||
b = array::unsafe_weak_copy(b),
|
||||
out = array::unsafe_weak_copy(out),
|
||||
bopt]() mutable {
|
||||
switch (out.dtype()) {
|
||||
case bool_:
|
||||
binary_op<bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint8:
|
||||
binary_op<uint8_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint16:
|
||||
binary_op<uint16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint32:
|
||||
binary_op<uint32_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint64:
|
||||
binary_op<uint64_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int8:
|
||||
binary_op<int8_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int16:
|
||||
binary_op<int16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int32:
|
||||
binary_op<int32_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int64:
|
||||
binary_op<int64_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float16:
|
||||
binary_op<float16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float32:
|
||||
binary_op<float, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float64:
|
||||
binary_op<double, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case bfloat16:
|
||||
binary_op<bfloat16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case complex64:
|
||||
binary_op<complex64_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
void comparison_op(
|
||||
const array& a,
|
||||
const array& b,
|
||||
array& out,
|
||||
Op op,
|
||||
Stream stream) {
|
||||
auto bopt = get_binary_op_type(a, b);
|
||||
set_binary_op_output_data(a, b, out, bopt);
|
||||
|
||||
auto& encoder = cpu::get_command_encoder(stream);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
encoder.dispatch([a = array::unsafe_weak_copy(a),
|
||||
b = array::unsafe_weak_copy(b),
|
||||
out = array::unsafe_weak_copy(out),
|
||||
bopt]() mutable {
|
||||
switch (a.dtype()) {
|
||||
case bool_:
|
||||
binary_op<bool, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint8:
|
||||
binary_op<uint8_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint16:
|
||||
binary_op<uint16_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint32:
|
||||
binary_op<uint32_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint64:
|
||||
binary_op<uint64_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int8:
|
||||
binary_op<int8_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int16:
|
||||
binary_op<int16_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int32:
|
||||
binary_op<int32_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int64:
|
||||
binary_op<int64_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float16:
|
||||
binary_op<float16_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float32:
|
||||
binary_op<float, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float64:
|
||||
binary_op<double, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case bfloat16:
|
||||
binary_op<bfloat16_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case complex64:
|
||||
binary_op<complex64_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
void binary_float(
|
||||
const array& a,
|
||||
const array& b,
|
||||
array& out,
|
||||
Op op,
|
||||
Stream stream) {
|
||||
auto bopt = get_binary_op_type(a, b);
|
||||
set_binary_op_output_data(a, b, out, bopt);
|
||||
|
||||
auto& encoder = cpu::get_command_encoder(stream);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
encoder.dispatch([a = array::unsafe_weak_copy(a),
|
||||
b = array::unsafe_weak_copy(b),
|
||||
out = array::unsafe_weak_copy(out),
|
||||
bopt]() mutable {
|
||||
switch (out.dtype()) {
|
||||
case float16:
|
||||
binary_op<float16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float32:
|
||||
binary_op<float, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float64:
|
||||
binary_op<double, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case bfloat16:
|
||||
binary_op<bfloat16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case complex64:
|
||||
binary_op<complex64_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
"[binary_float] Only supports floating point types.");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
void binary_int(
|
||||
const array& a,
|
||||
const array& b,
|
||||
array& out,
|
||||
Op op,
|
||||
Stream stream) {
|
||||
auto bopt = get_binary_op_type(a, b);
|
||||
set_binary_op_output_data(a, b, out, bopt);
|
||||
|
||||
auto& encoder = cpu::get_command_encoder(stream);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
encoder.dispatch([a = array::unsafe_weak_copy(a),
|
||||
b = array::unsafe_weak_copy(b),
|
||||
out = array::unsafe_weak_copy(out),
|
||||
bopt]() mutable {
|
||||
switch (out.dtype()) {
|
||||
case bool_:
|
||||
binary_op<bool, Op>(a, b, out, bopt);
|
||||
case uint8:
|
||||
binary_op<uint8_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint16:
|
||||
binary_op<uint16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint32:
|
||||
binary_op<uint32_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint64:
|
||||
binary_op<uint64_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int8:
|
||||
binary_op<int8_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int16:
|
||||
binary_op<int16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int32:
|
||||
binary_op<int32_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int64:
|
||||
binary_op<int64_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("[binary_int] Type not supported");
|
||||
break;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
auto& a = inputs[0];
|
||||
auto& b = inputs[1];
|
||||
binary(a, b, out, detail::Add(), stream());
|
||||
binary_op_cpu(a, b, out, detail::Add(), stream());
|
||||
}
|
||||
|
||||
void DivMod::eval_cpu(
|
||||
@@ -324,14 +102,14 @@ void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
auto& a = inputs[0];
|
||||
auto& b = inputs[1];
|
||||
binary(a, b, out, detail::Divide(), stream());
|
||||
binary_op_cpu(a, b, out, detail::Divide(), stream());
|
||||
}
|
||||
|
||||
void Remainder::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
auto& a = inputs[0];
|
||||
auto& b = inputs[1];
|
||||
binary(a, b, out, detail::Remainder(), stream());
|
||||
binary_op_cpu(a, b, out, detail::Remainder(), stream());
|
||||
}
|
||||
|
||||
void Equal::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
@@ -372,89 +150,90 @@ void Equal::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
}
|
||||
});
|
||||
} else {
|
||||
comparison_op(a, b, out, detail::Equal(), stream());
|
||||
comparison_op_cpu(a, b, out, detail::Equal(), stream());
|
||||
}
|
||||
}
|
||||
|
||||
void Greater::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
comparison_op(inputs[0], inputs[1], out, detail::Greater(), stream());
|
||||
comparison_op_cpu(inputs[0], inputs[1], out, detail::Greater(), stream());
|
||||
}
|
||||
|
||||
void GreaterEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
comparison_op(inputs[0], inputs[1], out, detail::GreaterEqual(), stream());
|
||||
comparison_op_cpu(
|
||||
inputs[0], inputs[1], out, detail::GreaterEqual(), stream());
|
||||
}
|
||||
|
||||
void Less::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
comparison_op(inputs[0], inputs[1], out, detail::Less(), stream());
|
||||
comparison_op_cpu(inputs[0], inputs[1], out, detail::Less(), stream());
|
||||
}
|
||||
|
||||
void LessEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
comparison_op(inputs[0], inputs[1], out, detail::LessEqual(), stream());
|
||||
comparison_op_cpu(inputs[0], inputs[1], out, detail::LessEqual(), stream());
|
||||
}
|
||||
|
||||
void LogAddExp::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
auto& a = inputs[0];
|
||||
auto& b = inputs[1];
|
||||
binary_float(a, b, out, detail::LogAddExp(), stream());
|
||||
binary_float_op_cpu(a, b, out, detail::LogAddExp(), stream());
|
||||
}
|
||||
|
||||
void LogicalAnd::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2); // LogicalAnd requires two input arrays
|
||||
auto& in1 = inputs[0];
|
||||
auto& in2 = inputs[1];
|
||||
binary(in1, in2, out, detail::LogicalAnd(), stream());
|
||||
binary_op_cpu(in1, in2, out, detail::LogicalAnd(), stream());
|
||||
}
|
||||
|
||||
void LogicalOr::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2); // LogicalOr requires two input arrays
|
||||
auto& in1 = inputs[0];
|
||||
auto& in2 = inputs[1];
|
||||
binary(in1, in2, out, detail::LogicalOr(), stream());
|
||||
binary_op_cpu(in1, in2, out, detail::LogicalOr(), stream());
|
||||
}
|
||||
|
||||
void Maximum::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
auto& a = inputs[0];
|
||||
auto& b = inputs[1];
|
||||
binary(a, b, out, detail::Maximum(), stream());
|
||||
binary_op_cpu(a, b, out, detail::Maximum(), stream());
|
||||
}
|
||||
|
||||
void Minimum::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
auto& a = inputs[0];
|
||||
auto& b = inputs[1];
|
||||
binary(a, b, out, detail::Minimum(), stream());
|
||||
binary_op_cpu(a, b, out, detail::Minimum(), stream());
|
||||
}
|
||||
|
||||
void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
auto& a = inputs[0];
|
||||
auto& b = inputs[1];
|
||||
binary(a, b, out, detail::Multiply(), stream());
|
||||
binary_op_cpu(a, b, out, detail::Multiply(), stream());
|
||||
}
|
||||
|
||||
void NotEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
comparison_op(inputs[0], inputs[1], out, detail::NotEqual(), stream());
|
||||
comparison_op_cpu(inputs[0], inputs[1], out, detail::NotEqual(), stream());
|
||||
}
|
||||
|
||||
void Power::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
auto& a = inputs[0];
|
||||
auto& b = inputs[1];
|
||||
binary(a, b, out, detail::Power(), stream());
|
||||
binary_op_cpu(a, b, out, detail::Power(), stream());
|
||||
}
|
||||
|
||||
void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
auto& a = inputs[0];
|
||||
auto& b = inputs[1];
|
||||
binary(a, b, out, detail::Subtract(), stream());
|
||||
binary_op_cpu(a, b, out, detail::Subtract(), stream());
|
||||
}
|
||||
|
||||
void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
@@ -463,19 +242,19 @@ void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
auto& b = inputs[1];
|
||||
switch (op_) {
|
||||
case BitwiseBinary::And:
|
||||
binary_int(a, b, out, detail::BitwiseAnd(), stream());
|
||||
binary_int_op_cpu(a, b, out, detail::BitwiseAnd(), stream());
|
||||
break;
|
||||
case BitwiseBinary::Or:
|
||||
binary_int(a, b, out, detail::BitwiseOr(), stream());
|
||||
binary_int_op_cpu(a, b, out, detail::BitwiseOr(), stream());
|
||||
break;
|
||||
case BitwiseBinary::Xor:
|
||||
binary_int(a, b, out, detail::BitwiseXor(), stream());
|
||||
binary_int_op_cpu(a, b, out, detail::BitwiseXor(), stream());
|
||||
break;
|
||||
case BitwiseBinary::LeftShift:
|
||||
binary_int(a, b, out, detail::LeftShift(), stream());
|
||||
binary_int_op_cpu(a, b, out, detail::LeftShift(), stream());
|
||||
break;
|
||||
case BitwiseBinary::RightShift:
|
||||
binary_int(a, b, out, detail::RightShift(), stream());
|
||||
binary_int_op_cpu(a, b, out, detail::RightShift(), stream());
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -484,7 +263,7 @@ void ArcTan2::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
const auto& a = inputs[0];
|
||||
const auto& b = inputs[1];
|
||||
binary_float(a, b, out, detail::ArcTan2(), stream());
|
||||
binary_float_op_cpu(a, b, out, detail::ArcTan2(), stream());
|
||||
}
|
||||
|
||||
} // namespace mlx::core
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "mlx/backend/common/binary.h"
|
||||
#include "mlx/backend/common/utils.h"
|
||||
|
||||
#include "mlx/backend/cpu/encoder.h"
|
||||
#include "mlx/backend/cpu/simd/simd.h"
|
||||
|
||||
namespace mlx::core {
|
||||
@@ -290,4 +291,227 @@ void binary_op(const array& a, const array& b, array& out, BinaryOpType bopt) {
|
||||
binary_op<T, T, Op>(a, b, out, bopt);
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
void binary_op_cpu(
|
||||
const array& a,
|
||||
const array& b,
|
||||
array& out,
|
||||
Op op,
|
||||
Stream stream) {
|
||||
auto bopt = get_binary_op_type(a, b);
|
||||
set_binary_op_output_data(a, b, out, bopt);
|
||||
|
||||
auto& encoder = cpu::get_command_encoder(stream);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
encoder.dispatch([a = array::unsafe_weak_copy(a),
|
||||
b = array::unsafe_weak_copy(b),
|
||||
out = array::unsafe_weak_copy(out),
|
||||
bopt]() mutable {
|
||||
switch (out.dtype()) {
|
||||
case bool_:
|
||||
binary_op<bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint8:
|
||||
binary_op<uint8_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint16:
|
||||
binary_op<uint16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint32:
|
||||
binary_op<uint32_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint64:
|
||||
binary_op<uint64_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int8:
|
||||
binary_op<int8_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int16:
|
||||
binary_op<int16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int32:
|
||||
binary_op<int32_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int64:
|
||||
binary_op<int64_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float16:
|
||||
binary_op<float16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float32:
|
||||
binary_op<float, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float64:
|
||||
binary_op<double, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case bfloat16:
|
||||
binary_op<bfloat16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case complex64:
|
||||
binary_op<complex64_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
void comparison_op_cpu(
|
||||
const array& a,
|
||||
const array& b,
|
||||
array& out,
|
||||
Op op,
|
||||
Stream stream) {
|
||||
auto bopt = get_binary_op_type(a, b);
|
||||
set_binary_op_output_data(a, b, out, bopt);
|
||||
|
||||
auto& encoder = cpu::get_command_encoder(stream);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
encoder.dispatch([a = array::unsafe_weak_copy(a),
|
||||
b = array::unsafe_weak_copy(b),
|
||||
out = array::unsafe_weak_copy(out),
|
||||
bopt]() mutable {
|
||||
switch (a.dtype()) {
|
||||
case bool_:
|
||||
binary_op<bool, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint8:
|
||||
binary_op<uint8_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint16:
|
||||
binary_op<uint16_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint32:
|
||||
binary_op<uint32_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint64:
|
||||
binary_op<uint64_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int8:
|
||||
binary_op<int8_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int16:
|
||||
binary_op<int16_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int32:
|
||||
binary_op<int32_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int64:
|
||||
binary_op<int64_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float16:
|
||||
binary_op<float16_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float32:
|
||||
binary_op<float, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float64:
|
||||
binary_op<double, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case bfloat16:
|
||||
binary_op<bfloat16_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case complex64:
|
||||
binary_op<complex64_t, bool, Op>(a, b, out, bopt);
|
||||
break;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
void binary_float_op_cpu(
|
||||
const array& a,
|
||||
const array& b,
|
||||
array& out,
|
||||
Op op,
|
||||
Stream stream) {
|
||||
auto bopt = get_binary_op_type(a, b);
|
||||
set_binary_op_output_data(a, b, out, bopt);
|
||||
|
||||
auto& encoder = cpu::get_command_encoder(stream);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
encoder.dispatch([a = array::unsafe_weak_copy(a),
|
||||
b = array::unsafe_weak_copy(b),
|
||||
out = array::unsafe_weak_copy(out),
|
||||
bopt]() mutable {
|
||||
switch (out.dtype()) {
|
||||
case float16:
|
||||
binary_op<float16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float32:
|
||||
binary_op<float, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case float64:
|
||||
binary_op<double, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case bfloat16:
|
||||
binary_op<bfloat16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case complex64:
|
||||
binary_op<complex64_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
"[binary_float] Only supports floating point types.");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
void binary_int_op_cpu(
|
||||
const array& a,
|
||||
const array& b,
|
||||
array& out,
|
||||
Op op,
|
||||
Stream stream) {
|
||||
auto bopt = get_binary_op_type(a, b);
|
||||
set_binary_op_output_data(a, b, out, bopt);
|
||||
|
||||
auto& encoder = cpu::get_command_encoder(stream);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
encoder.dispatch([a = array::unsafe_weak_copy(a),
|
||||
b = array::unsafe_weak_copy(b),
|
||||
out = array::unsafe_weak_copy(out),
|
||||
bopt]() mutable {
|
||||
switch (out.dtype()) {
|
||||
case bool_:
|
||||
binary_op<bool, Op>(a, b, out, bopt);
|
||||
case uint8:
|
||||
binary_op<uint8_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint16:
|
||||
binary_op<uint16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint32:
|
||||
binary_op<uint32_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case uint64:
|
||||
binary_op<uint64_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int8:
|
||||
binary_op<int8_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int16:
|
||||
binary_op<int16_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int32:
|
||||
binary_op<int32_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
case int64:
|
||||
binary_op<int64_t, Op>(a, b, out, bopt);
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("[binary_int] Type not supported");
|
||||
break;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace mlx::core
|
||||
|
||||
@@ -996,131 +996,6 @@ void explicit_gemm_conv_1D_cpu(
|
||||
encoder.add_temporaries(std::move(temps));
|
||||
}
|
||||
|
||||
void explicit_gemm_conv_2D_cpu(
|
||||
const array& in,
|
||||
const array& wt,
|
||||
array out,
|
||||
const std::vector<int>& padding_lo,
|
||||
const std::vector<int>& padding_hi,
|
||||
const std::vector<int>& wt_strides,
|
||||
const std::vector<int>& wt_dilation,
|
||||
Stream stream) {
|
||||
const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
|
||||
const int iH = in.shape(1); // Input spatial dim
|
||||
const int iW = in.shape(2); // Input spatial dim
|
||||
const int oH = out.shape(1); // Output spatial dim
|
||||
const int oW = out.shape(2); // Output spatial dim
|
||||
const int O = wt.shape(0); // Out channels
|
||||
const int C = wt.shape(3); // In channels
|
||||
const int wH = wt.shape(1); // Weight spatial dim
|
||||
const int wW = wt.shape(2); // Weight spatial dim
|
||||
|
||||
auto conv_dtype = out.dtype();
|
||||
auto& encoder = cpu::get_command_encoder(stream);
|
||||
|
||||
// Pad input
|
||||
Shape padded_shape = {
|
||||
N,
|
||||
iH + padding_lo[0] + padding_hi[0],
|
||||
iW + padding_lo[1] + padding_hi[1],
|
||||
C};
|
||||
array in_padded(padded_shape, conv_dtype, nullptr, {});
|
||||
|
||||
// Fill with zeros
|
||||
std::vector<array> temps;
|
||||
temps.push_back(array(0, conv_dtype));
|
||||
copy_cpu(temps.back(), in_padded, CopyType::Scalar, stream);
|
||||
|
||||
// Pick input slice from padded
|
||||
size_t data_offset = padding_lo[0] * in_padded.strides()[1] +
|
||||
padding_lo[1] * in_padded.strides()[2];
|
||||
array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
|
||||
in_padded_slice.copy_shared_buffer(
|
||||
in_padded,
|
||||
in_padded.strides(),
|
||||
in_padded.flags(),
|
||||
in_padded_slice.size(),
|
||||
data_offset);
|
||||
temps.push_back(in_padded_slice);
|
||||
|
||||
// Copy input values into the slice
|
||||
copy_cpu_inplace(in, in_padded_slice, CopyType::GeneralGeneral, stream);
|
||||
|
||||
// Make strided view
|
||||
Shape strided_shape = {N, oH, oW, wH, wW, C};
|
||||
|
||||
Strides strided_strides = {
|
||||
in_padded.strides()[0],
|
||||
in_padded.strides()[1] * wt_strides[0],
|
||||
in_padded.strides()[2] * wt_strides[1],
|
||||
in_padded.strides()[1],
|
||||
in_padded.strides()[2],
|
||||
in_padded.strides()[3]};
|
||||
auto flags = in_padded.flags();
|
||||
|
||||
array in_strided_view(strided_shape, in_padded.dtype(), nullptr, {});
|
||||
in_strided_view.copy_shared_buffer(
|
||||
in_padded, strided_strides, flags, in_strided_view.size(), 0);
|
||||
|
||||
// Materialize strided view
|
||||
Shape strided_reshape = {N * oH * oW, wH * wW * C};
|
||||
array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
|
||||
copy_cpu(in_strided_view, in_strided, CopyType::General, stream);
|
||||
temps.push_back(in_strided);
|
||||
|
||||
// Check wt dtype and prepare
|
||||
auto gemm_wt = wt;
|
||||
auto gemm_out = out;
|
||||
|
||||
if (wt.dtype() != float32 || !wt.flags().row_contiguous) {
|
||||
auto ctype =
|
||||
wt.flags().row_contiguous ? CopyType::Vector : CopyType::General;
|
||||
gemm_wt = array(wt.shape(), float32, nullptr, {});
|
||||
copy_cpu(wt, gemm_wt, ctype, stream);
|
||||
temps.push_back(gemm_wt);
|
||||
}
|
||||
|
||||
if (out.dtype() != float32) {
|
||||
gemm_out = array(out.shape(), float32, nullptr, {});
|
||||
gemm_out.set_data(allocator::malloc(gemm_out.nbytes()));
|
||||
temps.push_back(gemm_out);
|
||||
}
|
||||
|
||||
encoder.set_input_array(in_strided);
|
||||
encoder.set_input_array(gemm_wt);
|
||||
encoder.set_output_array(gemm_out);
|
||||
|
||||
encoder.dispatch([in_strided_ptr = in_strided.data<float>(),
|
||||
gemm_wt_ptr = gemm_wt.data<float>(),
|
||||
gemm_out_ptr = gemm_out.data<float>(),
|
||||
strided_reshape = std::move(strided_reshape),
|
||||
O]() {
|
||||
// Perform gemm
|
||||
cblas_sgemm(
|
||||
CblasRowMajor,
|
||||
CblasNoTrans, // no trans A
|
||||
CblasTrans, // transB
|
||||
strided_reshape[0], // M
|
||||
O, // N
|
||||
strided_reshape[1], // K
|
||||
1.0f, // alpha
|
||||
in_strided_ptr,
|
||||
strided_reshape[1], // lda
|
||||
gemm_wt_ptr,
|
||||
strided_reshape[1], // ldb
|
||||
0.0f, // beta
|
||||
gemm_out_ptr,
|
||||
O // ldc
|
||||
);
|
||||
});
|
||||
|
||||
// Copy results if needed
|
||||
if (out.dtype() != float32) {
|
||||
copy_cpu_inplace(gemm_out, out, CopyType::Vector, stream);
|
||||
}
|
||||
encoder.add_temporaries(std::move(temps));
|
||||
}
|
||||
|
||||
void explicit_gemm_conv_ND_cpu(
|
||||
const array& in,
|
||||
const array& wt,
|
||||
|
||||
@@ -95,4 +95,9 @@ void Recv::eval_cpu(
|
||||
distributed::detail::recv(group(), outputs[0], src_, stream());
|
||||
}
|
||||
|
||||
void ReduceScatter::eval_cpu(
|
||||
const std::vector<array>& inputs,
|
||||
std::vector<array>& outputs) {
|
||||
throw std::runtime_error("[ReduceScatter] Not implemented yet.");
|
||||
}
|
||||
} // namespace mlx::core::distributed
|
||||
|
||||
@@ -12,6 +12,167 @@ namespace mlx::core {
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
complex64_t to_complex(T r, T i) {
|
||||
return {static_cast<float>(r), static_cast<float>(i)};
|
||||
}
|
||||
|
||||
template <typename T, class Enable = void>
|
||||
struct EigWork {};
|
||||
|
||||
template <typename T>
|
||||
struct EigWork<
|
||||
T,
|
||||
typename std::enable_if<std::is_floating_point<T>::value>::type> {
|
||||
using O = complex64_t;
|
||||
|
||||
char jobl;
|
||||
char jobr;
|
||||
int N;
|
||||
int lwork;
|
||||
int info;
|
||||
std::vector<array::Data> buffers;
|
||||
|
||||
EigWork(char jobl_, char jobr_, int N_, bool compute_eigenvectors)
|
||||
: jobl(jobl_), jobr(jobr_), N(N_), lwork(-1) {
|
||||
T work;
|
||||
int n_vecs_l = compute_eigenvectors ? N_ : 1;
|
||||
int n_vecs_r = 1;
|
||||
geev<T>(
|
||||
&jobl,
|
||||
&jobr,
|
||||
&N,
|
||||
nullptr,
|
||||
&N,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
&n_vecs_l,
|
||||
nullptr,
|
||||
&n_vecs_r,
|
||||
&work,
|
||||
&lwork,
|
||||
&info);
|
||||
lwork = static_cast<int>(work);
|
||||
|
||||
buffers.emplace_back(allocator::malloc(sizeof(T) * N * 2));
|
||||
if (compute_eigenvectors) {
|
||||
buffers.emplace_back(allocator::malloc(sizeof(T) * N * N * 2));
|
||||
}
|
||||
buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
|
||||
}
|
||||
|
||||
void run(T* a, O* values, O* vectors) {
|
||||
auto eig_tmp = static_cast<T*>(buffers[0].buffer.raw_ptr());
|
||||
T* vec_tmp = nullptr;
|
||||
if (vectors) {
|
||||
vec_tmp = static_cast<T*>(buffers[1].buffer.raw_ptr());
|
||||
}
|
||||
auto work = static_cast<T*>(buffers.back().buffer.raw_ptr());
|
||||
|
||||
int n_vecs_l = vectors ? N : 1;
|
||||
int n_vecs_r = 1;
|
||||
geev<T>(
|
||||
&jobl,
|
||||
&jobr,
|
||||
&N,
|
||||
a,
|
||||
&N,
|
||||
eig_tmp,
|
||||
eig_tmp + N,
|
||||
vectors ? vec_tmp : nullptr,
|
||||
&n_vecs_l,
|
||||
nullptr,
|
||||
&n_vecs_r,
|
||||
work,
|
||||
&lwork,
|
||||
&info);
|
||||
|
||||
for (int i = 0; i < N; ++i) {
|
||||
values[i] = to_complex(eig_tmp[i], eig_tmp[N + i]);
|
||||
}
|
||||
|
||||
if (vectors) {
|
||||
for (int i = 0; i < N; ++i) {
|
||||
if (values[i].imag() != 0) {
|
||||
for (int j = 0; j < N; ++j) {
|
||||
vectors[i * N + j] =
|
||||
to_complex(vec_tmp[i * N + j], -vec_tmp[(i + 1) * N + j]);
|
||||
vectors[(i + 1) * N + j] =
|
||||
to_complex(vec_tmp[i * N + j], vec_tmp[(i + 1) * N + j]);
|
||||
}
|
||||
i += 1;
|
||||
} else {
|
||||
for (int j = 0; j < N; ++j) {
|
||||
vectors[i * N + j] = to_complex(vec_tmp[i * N + j], T(0.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EigWork<std::complex<float>> {
|
||||
using T = std::complex<float>;
|
||||
using R = float;
|
||||
using O = T;
|
||||
|
||||
char jobl;
|
||||
char jobr;
|
||||
int N;
|
||||
int lwork;
|
||||
int lrwork;
|
||||
int info;
|
||||
std::vector<array::Data> buffers;
|
||||
|
||||
EigWork(char jobl_, char jobr_, int N_, bool compute_eigenvectors)
|
||||
: jobl(jobl_), jobr(jobr_), N(N_), lwork(-1), lrwork(2 * N_) {
|
||||
T work;
|
||||
R rwork;
|
||||
int n_vecs_l = compute_eigenvectors ? N_ : 1;
|
||||
int n_vecs_r = 1;
|
||||
geev<T>(
|
||||
&jobl,
|
||||
&jobr,
|
||||
&N,
|
||||
nullptr,
|
||||
&N,
|
||||
nullptr,
|
||||
nullptr,
|
||||
&n_vecs_l,
|
||||
nullptr,
|
||||
&n_vecs_r,
|
||||
&work,
|
||||
&lwork,
|
||||
&rwork,
|
||||
&info);
|
||||
lwork = static_cast<int>(work.real());
|
||||
buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
|
||||
buffers.emplace_back(allocator::malloc(sizeof(R) * lrwork));
|
||||
}
|
||||
|
||||
void run(T* a, T* values, T* vectors) {
|
||||
int n_vecs_l = vectors ? N : 1;
|
||||
int n_vecs_r = 1;
|
||||
geev<T>(
|
||||
&jobl,
|
||||
&jobr,
|
||||
&N,
|
||||
a,
|
||||
&N,
|
||||
values,
|
||||
vectors,
|
||||
&n_vecs_l,
|
||||
nullptr,
|
||||
&n_vecs_r,
|
||||
static_cast<T*>(buffers[0].buffer.raw_ptr()),
|
||||
&lwork,
|
||||
static_cast<R*>(buffers[1].buffer.raw_ptr()),
|
||||
&info);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void eig_impl(
|
||||
array& a,
|
||||
@@ -19,102 +180,39 @@ void eig_impl(
|
||||
array& values,
|
||||
bool compute_eigenvectors,
|
||||
Stream stream) {
|
||||
using OT = std::complex<T>;
|
||||
auto a_ptr = a.data<T>();
|
||||
auto eig_ptr = values.data<OT>();
|
||||
auto val_ptr = values.data<complex64_t>();
|
||||
|
||||
auto& encoder = cpu::get_command_encoder(stream);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_output_array(values);
|
||||
OT* vec_ptr = nullptr;
|
||||
complex64_t* vec_ptr = nullptr;
|
||||
if (compute_eigenvectors) {
|
||||
encoder.set_output_array(vectors);
|
||||
vec_ptr = vectors.data<OT>();
|
||||
vec_ptr = vectors.data<complex64_t>();
|
||||
}
|
||||
encoder.dispatch([a_ptr,
|
||||
val_ptr,
|
||||
vec_ptr,
|
||||
eig_ptr,
|
||||
compute_eigenvectors,
|
||||
N = vectors.shape(-1),
|
||||
size = vectors.size()]() mutable {
|
||||
// Work query
|
||||
char jobr = 'N';
|
||||
char jobl = compute_eigenvectors ? 'V' : 'N';
|
||||
int n_vecs_r = 1;
|
||||
int n_vecs_l = compute_eigenvectors ? N : 1;
|
||||
int lwork = -1;
|
||||
int info;
|
||||
{
|
||||
T work;
|
||||
int iwork;
|
||||
geev<T>(
|
||||
&jobl,
|
||||
&jobr,
|
||||
&N,
|
||||
nullptr,
|
||||
&N,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
&n_vecs_l,
|
||||
nullptr,
|
||||
&n_vecs_r,
|
||||
&work,
|
||||
&lwork,
|
||||
&info);
|
||||
lwork = static_cast<int>(work);
|
||||
}
|
||||
|
||||
auto eig_tmp_data = array::Data{allocator::malloc(sizeof(T) * N * 2)};
|
||||
auto vec_tmp_data =
|
||||
array::Data{allocator::malloc(vec_ptr ? sizeof(T) * N * N * 2 : 0)};
|
||||
auto eig_tmp = static_cast<T*>(eig_tmp_data.buffer.raw_ptr());
|
||||
auto vec_tmp = static_cast<T*>(vec_tmp_data.buffer.raw_ptr());
|
||||
auto work_buf = array::Data{allocator::malloc(sizeof(T) * lwork)};
|
||||
EigWork<T> work(jobl, jobr, N, compute_eigenvectors);
|
||||
|
||||
for (size_t i = 0; i < size / (N * N); ++i) {
|
||||
geev<T>(
|
||||
&jobl,
|
||||
&jobr,
|
||||
&N,
|
||||
a_ptr,
|
||||
&N,
|
||||
eig_tmp,
|
||||
eig_tmp + N,
|
||||
vec_tmp,
|
||||
&n_vecs_l,
|
||||
nullptr,
|
||||
&n_vecs_r,
|
||||
static_cast<T*>(work_buf.buffer.raw_ptr()),
|
||||
&lwork,
|
||||
&info);
|
||||
for (int i = 0; i < N; ++i) {
|
||||
eig_ptr[i] = {eig_tmp[i], eig_tmp[N + i]};
|
||||
}
|
||||
work.run(a_ptr, val_ptr, vec_ptr);
|
||||
a_ptr += N * N;
|
||||
val_ptr += N;
|
||||
if (vec_ptr) {
|
||||
for (int i = 0; i < N; ++i) {
|
||||
if (eig_ptr[i].imag() != 0) {
|
||||
// This vector and the next are a pair
|
||||
for (int j = 0; j < N; ++j) {
|
||||
vec_ptr[i * N + j] = {
|
||||
vec_tmp[i * N + j], -vec_tmp[(i + 1) * N + j]};
|
||||
vec_ptr[(i + 1) * N + j] = {
|
||||
vec_tmp[i * N + j], vec_tmp[(i + 1) * N + j]};
|
||||
}
|
||||
i += 1;
|
||||
} else {
|
||||
for (int j = 0; j < N; ++j) {
|
||||
vec_ptr[i * N + j] = {vec_tmp[i * N + j], 0};
|
||||
}
|
||||
}
|
||||
}
|
||||
vec_ptr += N * N;
|
||||
}
|
||||
a_ptr += N * N;
|
||||
eig_ptr += N;
|
||||
if (info != 0) {
|
||||
if (work.info != 0) {
|
||||
std::stringstream msg;
|
||||
msg << "[Eig::eval_cpu] Eigenvalue decomposition failed with error code "
|
||||
<< info;
|
||||
<< work.info;
|
||||
throw std::runtime_error(msg.str());
|
||||
}
|
||||
}
|
||||
@@ -166,8 +264,17 @@ void Eig::eval_cpu(
|
||||
case float32:
|
||||
eig_impl<float>(a_copy, vectors, values, compute_eigenvectors_, stream());
|
||||
break;
|
||||
case float64:
|
||||
eig_impl<double>(
|
||||
a_copy, vectors, values, compute_eigenvectors_, stream());
|
||||
break;
|
||||
case complex64:
|
||||
eig_impl<std::complex<float>>(
|
||||
a_copy, vectors, values, compute_eigenvectors_, stream());
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("[Eig::eval_cpu] only supports float32.");
|
||||
throw std::runtime_error(
|
||||
"[Eig::eval_cpu] only supports float32, float64, or complex64.");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
// Copyright © 2023-2024 Apple Inc.
|
||||
|
||||
#include <Accelerate/Accelerate.h>
|
||||
|
||||
#include "mlx/array.h"
|
||||
@@ -49,9 +48,15 @@ void matmul_bnns(
|
||||
size_t K = a_shape[ndim - 1];
|
||||
|
||||
BNNSDataType bnns_dtype = to_bnns_dtype<T>();
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
||||
if (beta != 1.0 && beta != 0.0) {
|
||||
// scale the output
|
||||
for (auto i = 0; i < batch_size * M * N; ++i) {
|
||||
out[i] *= beta;
|
||||
}
|
||||
beta = 1.0;
|
||||
}
|
||||
const BNNSLayerParametersBroadcastMatMul gemm_params{
|
||||
/* float alpha = */ alpha,
|
||||
/* float beta = */ beta,
|
||||
|
||||
@@ -747,4 +747,108 @@ void ScatterAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
});
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void masked_scatter_impl(const array& mask, const array& src, array& out) {
|
||||
ContiguousIterator mask_it(mask);
|
||||
ContiguousIterator src_it(src);
|
||||
ContiguousIterator out_it(out);
|
||||
|
||||
const bool* mask_ptr = mask.data<bool>();
|
||||
const T* src_ptr = src.data<T>();
|
||||
T* dst_ptr = out.data<T>();
|
||||
|
||||
const size_t batch_count = mask.shape(0);
|
||||
const size_t mask_batch_size = mask.size() / batch_count;
|
||||
const size_t src_batch_size = src.size() / batch_count;
|
||||
|
||||
for (uint b = 0; b < batch_count; ++b) {
|
||||
size_t src_consumed = 0;
|
||||
src_it.seek(b * src_batch_size);
|
||||
|
||||
for (size_t i = 0; i < mask_batch_size; ++i) {
|
||||
if (mask_ptr[mask_it.loc]) {
|
||||
if (src_consumed >= src_batch_size) {
|
||||
throw std::runtime_error(
|
||||
"[MaskedScatter::eval_cpu] Source does not have enough elements for mask.");
|
||||
}
|
||||
dst_ptr[out_it.loc] = src_ptr[src_it.loc];
|
||||
src_it.step();
|
||||
++src_consumed;
|
||||
}
|
||||
mask_it.step();
|
||||
out_it.step();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MaskedScatter::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 3);
|
||||
|
||||
auto& dst = inputs[0];
|
||||
auto& mask = inputs[1];
|
||||
auto& src = inputs[2];
|
||||
|
||||
// Copy src into out (copy allocates memory for out)
|
||||
auto ctype =
|
||||
dst.flags().row_contiguous ? CopyType::Vector : CopyType::General;
|
||||
copy_cpu(dst, out, ctype, stream());
|
||||
|
||||
if (mask.size() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto& encoder = cpu::get_command_encoder(stream());
|
||||
encoder.set_input_array(mask);
|
||||
encoder.set_input_array(src);
|
||||
encoder.set_output_array(out);
|
||||
encoder.dispatch([mask = array::unsafe_weak_copy(mask),
|
||||
src = array::unsafe_weak_copy(src),
|
||||
out = array::unsafe_weak_copy(out)]() mutable {
|
||||
switch (out.dtype()) {
|
||||
case bool_:
|
||||
masked_scatter_impl<bool>(mask, src, out);
|
||||
break;
|
||||
case uint8:
|
||||
masked_scatter_impl<uint8_t>(mask, src, out);
|
||||
break;
|
||||
case uint16:
|
||||
masked_scatter_impl<uint16_t>(mask, src, out);
|
||||
break;
|
||||
case uint32:
|
||||
masked_scatter_impl<uint32_t>(mask, src, out);
|
||||
break;
|
||||
case uint64:
|
||||
masked_scatter_impl<uint64_t>(mask, src, out);
|
||||
break;
|
||||
case int8:
|
||||
masked_scatter_impl<int8_t>(mask, src, out);
|
||||
break;
|
||||
case int16:
|
||||
masked_scatter_impl<int16_t>(mask, src, out);
|
||||
break;
|
||||
case int32:
|
||||
masked_scatter_impl<int32_t>(mask, src, out);
|
||||
break;
|
||||
case int64:
|
||||
masked_scatter_impl<int64_t>(mask, src, out);
|
||||
break;
|
||||
case float16:
|
||||
masked_scatter_impl<float16_t>(mask, src, out);
|
||||
break;
|
||||
case float32:
|
||||
masked_scatter_impl<float>(mask, src, out);
|
||||
break;
|
||||
case float64:
|
||||
masked_scatter_impl<double>(mask, src, out);
|
||||
break;
|
||||
case bfloat16:
|
||||
masked_scatter_impl<bfloat16_t>(mask, src, out);
|
||||
break;
|
||||
case complex64:
|
||||
masked_scatter_impl<complex64_t>(mask, src, out);
|
||||
break;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace mlx::core
|
||||
|
||||
@@ -45,9 +45,7 @@
|
||||
INSTANTIATE_LAPACK_REAL(geqrf)
|
||||
INSTANTIATE_LAPACK_REAL(orgqr)
|
||||
INSTANTIATE_LAPACK_REAL(syevd)
|
||||
INSTANTIATE_LAPACK_REAL(geev)
|
||||
INSTANTIATE_LAPACK_REAL(potrf)
|
||||
INSTANTIATE_LAPACK_REAL(gesdd)
|
||||
INSTANTIATE_LAPACK_REAL(getrf)
|
||||
INSTANTIATE_LAPACK_REAL(getri)
|
||||
INSTANTIATE_LAPACK_REAL(trtri)
|
||||
@@ -63,3 +61,20 @@ INSTANTIATE_LAPACK_REAL(trtri)
|
||||
}
|
||||
|
||||
INSTANTIATE_LAPACK_COMPLEX(heevd)
|
||||
|
||||
#define INSTANTIATE_LAPACK_ALL(FUNC) \
|
||||
template <typename T, typename... Args> \
|
||||
void FUNC(Args... args) { \
|
||||
if constexpr (std::is_same_v<T, float>) { \
|
||||
MLX_LAPACK_FUNC(s##FUNC)(std::forward<Args>(args)...); \
|
||||
} else if constexpr (std::is_same_v<T, double>) { \
|
||||
MLX_LAPACK_FUNC(d##FUNC)(std::forward<Args>(args)...); \
|
||||
} else if constexpr (std::is_same_v<T, std::complex<float>>) { \
|
||||
MLX_LAPACK_FUNC(c##FUNC)(std::forward<Args>(args)...); \
|
||||
} else if constexpr (std::is_same_v<T, std::complex<double>>) { \
|
||||
MLX_LAPACK_FUNC(z##FUNC)(std::forward<Args>(args)...); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSTANTIATE_LAPACK_ALL(geev)
|
||||
INSTANTIATE_LAPACK_ALL(gesdd)
|
||||
|
||||
@@ -215,18 +215,18 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
const void* a_mask_ptr;
|
||||
const void* b_mask_ptr;
|
||||
const void* out_mask_ptr;
|
||||
const void* a_mask_ptr = nullptr;
|
||||
const void* b_mask_ptr = nullptr;
|
||||
const void* out_mask_ptr = nullptr;
|
||||
Shape a_mask_shape;
|
||||
Shape b_mask_shape;
|
||||
Shape out_mask_shape;
|
||||
Strides a_mask_strides;
|
||||
Strides b_mask_strides;
|
||||
Strides out_mask_strides;
|
||||
bool a_mask_bool;
|
||||
bool b_mask_bool;
|
||||
bool out_mask_bool;
|
||||
bool a_mask_bool = false;
|
||||
bool b_mask_bool = false;
|
||||
bool out_mask_bool = false;
|
||||
if (has_op_mask) {
|
||||
auto& a_mask = inputs[inputs.size() - 2];
|
||||
auto& b_mask = inputs[inputs.size() - 1];
|
||||
@@ -423,7 +423,6 @@ void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
auto& rhs_indices = inputs[3];
|
||||
|
||||
auto batch_shape = get_batch_dims(out.shape());
|
||||
int batch_ndim = batch_shape.size();
|
||||
|
||||
auto batch_shape_A = get_batch_dims(a.shape());
|
||||
auto batch_strides_A = get_batch_dims(a.strides());
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
#include <cstring>
|
||||
#include "mlx/array.h"
|
||||
#include "mlx/backend/cpu/binary.h"
|
||||
#include "mlx/backend/cpu/binary_ops.h"
|
||||
#include "mlx/backend/cpu/copy.h"
|
||||
#include "mlx/backend/cpu/encoder.h"
|
||||
#include "mlx/backend/cpu/gemm.h"
|
||||
@@ -91,7 +93,6 @@ void matmul_general(
|
||||
auto [b_transposed, ldb, b] = check_transpose(b_pre);
|
||||
size_t M = a.shape(-2);
|
||||
size_t N = b.shape(-1);
|
||||
size_t K = a.shape(-1);
|
||||
if (M == 0 || N == 0) {
|
||||
return;
|
||||
}
|
||||
@@ -136,15 +137,29 @@ void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Handle empty matrix case (K=0)
|
||||
if (inputs[0].shape(-1) == 0) {
|
||||
auto& c = inputs[2];
|
||||
if (beta_ == 1.0f) {
|
||||
CopyType ctype = c.data_size() == 1
|
||||
? CopyType::Scalar
|
||||
: (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
|
||||
copy_cpu(c, out, ctype, stream());
|
||||
} else {
|
||||
array beta_scalar = array(beta_, c.dtype());
|
||||
auto& encoder = cpu::get_command_encoder(stream());
|
||||
binary_float_op_cpu(c, beta_scalar, out, detail::Multiply(), stream());
|
||||
encoder.add_temporary(std::move(beta_scalar));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Fill output with C
|
||||
auto& c = inputs[2];
|
||||
CopyType ctype = c.data_size() == 1
|
||||
? CopyType::Scalar
|
||||
: (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
|
||||
copy_cpu(c, out, ctype, stream());
|
||||
if (inputs[0].shape(-1) == 0) {
|
||||
return;
|
||||
}
|
||||
matmul_general(inputs[0], inputs[1], out, stream(), alpha_, beta_);
|
||||
}
|
||||
|
||||
|
||||
@@ -333,7 +333,7 @@ void Reshape::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
|
||||
void DynamicSlice::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
if (out.size() == 0) {
|
||||
out.set_data(nullptr);
|
||||
out.set_data(allocator::malloc(0));
|
||||
return;
|
||||
}
|
||||
auto& in = inputs[0];
|
||||
@@ -361,7 +361,7 @@ void DynamicSliceUpdate::eval_cpu(
|
||||
const std::vector<array>& inputs,
|
||||
array& out) {
|
||||
if (out.size() == 0) {
|
||||
out.set_data(nullptr);
|
||||
out.set_data(allocator::malloc(0));
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -396,7 +396,7 @@ void DynamicSliceUpdate::eval_cpu(
|
||||
void SliceUpdate::eval_cpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() == 2);
|
||||
if (out.size() == 0) {
|
||||
out.set_data(nullptr);
|
||||
out.set_data(allocator::malloc(0));
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
// Copyright © 2023 Apple Inc.
|
||||
|
||||
#include "mlx/backend/common/unary.h"
|
||||
#include "mlx/backend/cpu/copy.h"
|
||||
#include "mlx/backend/cpu/encoder.h"
|
||||
#include "mlx/backend/cpu/simd/simd.h"
|
||||
#include "mlx/backend/cpu/unary.h"
|
||||
#include "mlx/backend/cpu/unary_ops.h"
|
||||
#include "mlx/fast_primitives.h"
|
||||
#include "mlx/primitives.h"
|
||||
#include "mlx/utils.h"
|
||||
@@ -445,7 +448,6 @@ void mxfp4_qmm(
|
||||
int K) {
|
||||
constexpr int group_size = 32;
|
||||
constexpr int pack_factor = get_pack_factor(4, 8);
|
||||
constexpr int bytes_per_pack = get_bytes_per_pack(4);
|
||||
constexpr int packs_in_group = group_size / pack_factor;
|
||||
|
||||
for (int m = 0; m < M; m++) {
|
||||
@@ -487,7 +489,6 @@ void mxfp4_qmm_t(
|
||||
int K) {
|
||||
constexpr int group_size = 32;
|
||||
constexpr int pack_factor = get_pack_factor(4, 8);
|
||||
constexpr int bytes_per_pack = get_bytes_per_pack(4);
|
||||
constexpr int packs_in_group = group_size / pack_factor;
|
||||
|
||||
for (int m = 0; m < M; m++) {
|
||||
@@ -1104,4 +1105,44 @@ void fast::Quantize::eval_cpu(
|
||||
});
|
||||
}
|
||||
|
||||
void fast::ConvertFP8::eval_cpu(
|
||||
const std::vector<array>& inputs,
|
||||
std::vector<array>& outputs) {
|
||||
auto& in = inputs[0];
|
||||
auto& out = outputs[0];
|
||||
set_unary_output_data(in, out);
|
||||
auto& encoder = cpu::get_command_encoder(stream());
|
||||
encoder.set_input_array(in);
|
||||
encoder.set_output_array(out);
|
||||
encoder.dispatch([in = array::unsafe_weak_copy(in),
|
||||
out = array::unsafe_weak_copy(out),
|
||||
to_fp8 = to_fp8_]() mutable {
|
||||
if (to_fp8) {
|
||||
switch (in.dtype()) {
|
||||
case float16:
|
||||
unary_op<float16_t, uint8_t>(in, out, detail::ToFP8());
|
||||
break;
|
||||
case bfloat16:
|
||||
unary_op<bfloat16_t, uint8_t>(in, out, detail::ToFP8());
|
||||
break;
|
||||
default:
|
||||
unary_op<float, uint8_t>(in, out, detail::ToFP8());
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
switch (out.dtype()) {
|
||||
case float16:
|
||||
unary_op<uint8_t, float16_t>(in, out, detail::FromFP8());
|
||||
break;
|
||||
case bfloat16:
|
||||
unary_op<uint8_t, bfloat16_t>(in, out, detail::FromFP8());
|
||||
break;
|
||||
default:
|
||||
unary_op<uint8_t, float>(in, out, detail::FromFP8());
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace mlx::core
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <simd/math.h>
|
||||
#include <simd/vector.h>
|
||||
|
||||
@@ -200,6 +201,15 @@ SIMD_DEFAULT_COMPARISONS(<=)
|
||||
SIMD_DEFAULT_COMPARISONS(==)
|
||||
SIMD_DEFAULT_COMPARISONS(!=)
|
||||
|
||||
template <typename T, int N>
|
||||
Simd<T, N> clz(Simd<T, N> x) {
|
||||
auto a = *(uint32x4_t*)(&x);
|
||||
auto b = *((uint32x4_t*)(&x) + 1);
|
||||
a = vclzq_u32(a);
|
||||
b = vclzq_u32(b);
|
||||
return asd::make_uint8(a, b);
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
Simd<T, N> atan2(Simd<T, N> a, Simd<T, N> b) {
|
||||
return asd::atan2(a.value, b.value);
|
||||
@@ -207,14 +217,20 @@ Simd<T, N> atan2(Simd<T, N> a, Simd<T, N> b) {
|
||||
|
||||
template <typename T, int N>
|
||||
Simd<T, N> maximum(Simd<T, N> a, Simd<T, N> b) {
|
||||
// TODO add isnan
|
||||
return asd::max(a.value, b.value);
|
||||
auto out = Simd<T, N>(asd::max(a.value, b.value));
|
||||
if constexpr (!std::is_integral_v<T>) {
|
||||
out = select(isnan(b), b, select(isnan(a), a, out));
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
Simd<T, N> minimum(Simd<T, N> a, Simd<T, N> b) {
|
||||
// TODO add isnan
|
||||
return asd::min(a.value, b.value);
|
||||
auto out = Simd<T, N>(asd::min(a.value, b.value));
|
||||
if constexpr (!std::is_integral_v<T>) {
|
||||
out = select(isnan(b), b, select(isnan(a), a, out));
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
|
||||
@@ -171,6 +171,11 @@ DEFAULT_BINARY(&)
|
||||
DEFAULT_BINARY(&&)
|
||||
DEFAULT_BINARY(||)
|
||||
|
||||
template <typename T>
|
||||
Simd<T, 1> clz(Simd<T, 1> x_) {
|
||||
return __builtin_clz(x_.value);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Simd<T, 1> remainder(Simd<T, 1> a_, Simd<T, 1> b_) {
|
||||
T a = a_.value;
|
||||
|
||||
@@ -39,7 +39,7 @@ struct StridedIterator {
|
||||
StridedIterator() = default;
|
||||
|
||||
explicit StridedIterator(T* ptr, int64_t stride, difference_type offset = 0)
|
||||
: ptr_(ptr + offset * stride), stride_(stride) {}
|
||||
: stride_(stride), ptr_(ptr + offset * stride) {}
|
||||
|
||||
explicit StridedIterator(array& arr, int axis, difference_type offset = 0)
|
||||
: StridedIterator(arr.data<T>(), arr.strides()[axis], offset) {}
|
||||
|
||||
@@ -8,6 +8,183 @@
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
template <typename T, class Enable = void>
|
||||
struct SVDWork {};
|
||||
|
||||
template <typename T>
|
||||
struct SVDWork<
|
||||
T,
|
||||
typename std::enable_if<std::is_floating_point<T>::value>::type> {
|
||||
using R = T;
|
||||
|
||||
int N;
|
||||
int M;
|
||||
int K;
|
||||
int lda;
|
||||
int ldu;
|
||||
int ldvt;
|
||||
char jobz;
|
||||
std::vector<array::Data> buffers;
|
||||
int lwork;
|
||||
|
||||
SVDWork(int N, int M, int K, char jobz)
|
||||
: N(N), M(M), K(K), lda(N), ldu(N), ldvt(M), jobz(jobz) {
|
||||
T workspace_dimension = 0;
|
||||
|
||||
// Will contain the indices of eigenvectors that failed to converge (not
|
||||
// used here but required by lapack).
|
||||
buffers.emplace_back(allocator::malloc(sizeof(int) * 8 * K));
|
||||
|
||||
int lwork_query = -1;
|
||||
int info;
|
||||
|
||||
// Compute workspace size.
|
||||
gesdd<T>(
|
||||
/* jobz = */ &jobz,
|
||||
// M and N are swapped since lapack expects column-major.
|
||||
/* m = */ &N,
|
||||
/* n = */ &M,
|
||||
/* a = */ nullptr,
|
||||
/* lda = */ &lda,
|
||||
/* s = */ nullptr,
|
||||
/* u = */ nullptr,
|
||||
/* ldu = */ &ldu,
|
||||
/* vt = */ nullptr,
|
||||
/* ldvt = */ &ldvt,
|
||||
/* work = */ &workspace_dimension,
|
||||
/* lwork = */ &lwork_query,
|
||||
/* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
|
||||
/* info = */ &info);
|
||||
|
||||
if (info != 0) {
|
||||
std::stringstream ss;
|
||||
ss << "[SVD::eval_cpu] workspace calculation failed with code " << info;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
|
||||
lwork = workspace_dimension;
|
||||
buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
|
||||
}
|
||||
|
||||
void run(T* a, R* s, T* u, T* vt) {
|
||||
int info;
|
||||
gesdd<T>(
|
||||
/* jobz = */ &jobz,
|
||||
// M and N are swapped since lapack expects column-major.
|
||||
/* m = */ &N,
|
||||
/* n = */ &M,
|
||||
/* a = */ a,
|
||||
/* lda = */ &lda,
|
||||
/* s = */ s,
|
||||
// According to the identity above, lapack will write Vᵀᵀ as U.
|
||||
/* u = */ u,
|
||||
/* ldu = */ &ldu,
|
||||
// According to the identity above, lapack will write Uᵀ as Vᵀ.
|
||||
/* vt = */ vt,
|
||||
/* ldvt = */ &ldvt,
|
||||
/* work = */ static_cast<T*>(buffers[1].buffer.raw_ptr()),
|
||||
/* lwork = */ &lwork,
|
||||
/* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
|
||||
/* info = */ &info);
|
||||
|
||||
if (info != 0) {
|
||||
std::stringstream ss;
|
||||
ss << "svd_impl: sgesvdx_ failed with code " << info;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct SVDWork<std::complex<float>> {
|
||||
using T = std::complex<float>;
|
||||
using R = float;
|
||||
|
||||
int N;
|
||||
int M;
|
||||
int K;
|
||||
int lda;
|
||||
int ldu;
|
||||
int ldvt;
|
||||
char jobz;
|
||||
std::vector<array::Data> buffers;
|
||||
int lwork;
|
||||
|
||||
SVDWork(int N, int M, int K, char jobz)
|
||||
: N(N), M(M), K(K), lda(N), ldu(N), ldvt(M), jobz(jobz) {
|
||||
T workspace_dimension = 0;
|
||||
|
||||
// Will contain the indices of eigenvectors that failed to converge (not
|
||||
// used here but required by lapack).
|
||||
buffers.emplace_back(allocator::malloc(sizeof(int) * 8 * K));
|
||||
|
||||
const int lrwork =
|
||||
jobz == 'A' ? std::max(1, 5 * K * K + 5 * K) : std::max(1, 7 * K);
|
||||
buffers.emplace_back(allocator::malloc(sizeof(float) * lrwork));
|
||||
|
||||
int lwork_query = -1;
|
||||
int work_query = -1;
|
||||
int info;
|
||||
|
||||
// Compute workspace size.
|
||||
gesdd<T>(
|
||||
/* jobz = */ &jobz,
|
||||
// M and N are swapped since lapack expects column-major.
|
||||
/* m = */ &N,
|
||||
/* n = */ &M,
|
||||
/* a = */ nullptr,
|
||||
/* lda = */ &lda,
|
||||
/* s = */ nullptr,
|
||||
/* u = */ nullptr,
|
||||
/* ldu = */ &ldu,
|
||||
/* vt = */ nullptr,
|
||||
/* ldvt = */ &ldvt,
|
||||
/* work = */ &workspace_dimension,
|
||||
/* lwork = */ &lwork_query,
|
||||
/* rwork = */ static_cast<float*>(buffers[1].buffer.raw_ptr()),
|
||||
/* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
|
||||
/* info = */ &info);
|
||||
|
||||
if (info != 0) {
|
||||
std::stringstream ss;
|
||||
ss << "[SVD::eval_cpu] workspace calculation failed with code " << info;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
|
||||
lwork = workspace_dimension.real();
|
||||
buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
|
||||
}
|
||||
|
||||
void run(T* a, R* s, T* u, T* vt) {
|
||||
int info;
|
||||
gesdd<T>(
|
||||
/* jobz = */ &jobz,
|
||||
// M and N are swapped since lapack expects column-major.
|
||||
/* m = */ &N,
|
||||
/* n = */ &M,
|
||||
/* a = */ a,
|
||||
/* lda = */ &lda,
|
||||
/* s = */ s,
|
||||
// According to the identity above, lapack will write Vᵀᵀ as U.
|
||||
/* u = */ u,
|
||||
/* ldu = */ &ldu,
|
||||
// According to the identity above, lapack will write Uᵀ as Vᵀ.
|
||||
/* vt = */ vt,
|
||||
/* ldvt = */ &ldvt,
|
||||
/* work = */ static_cast<T*>(buffers[2].buffer.raw_ptr()),
|
||||
/* lwork = */ &lwork,
|
||||
/* rwork = */ static_cast<float*>(buffers[1].buffer.raw_ptr()),
|
||||
/* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
|
||||
/* info = */ &info);
|
||||
|
||||
if (info != 0) {
|
||||
std::stringstream ss;
|
||||
ss << "svd_impl: sgesvdx_ failed with code " << info;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void svd_impl(
|
||||
const array& a,
|
||||
@@ -27,6 +204,8 @@ void svd_impl(
|
||||
const int N = a.shape(-1);
|
||||
const int K = std::min(M, N);
|
||||
|
||||
using R = typename SVDWork<T>::R;
|
||||
|
||||
size_t num_matrices = a.size() / (M * N);
|
||||
|
||||
// lapack clobbers the input, so we have to make a copy.
|
||||
@@ -42,7 +221,7 @@ void svd_impl(
|
||||
encoder.set_input_array(a);
|
||||
auto in_ptr = in.data<T>();
|
||||
T* u_ptr;
|
||||
T* s_ptr;
|
||||
R* s_ptr;
|
||||
T* vt_ptr;
|
||||
|
||||
if (compute_uv) {
|
||||
@@ -58,7 +237,7 @@ void svd_impl(
|
||||
encoder.set_output_array(s);
|
||||
encoder.set_output_array(vt);
|
||||
|
||||
s_ptr = s.data<T>();
|
||||
s_ptr = s.data<R>();
|
||||
u_ptr = u.data<T>();
|
||||
vt_ptr = vt.data<T>();
|
||||
} else {
|
||||
@@ -68,98 +247,26 @@ void svd_impl(
|
||||
|
||||
encoder.set_output_array(s);
|
||||
|
||||
s_ptr = s.data<T>();
|
||||
s_ptr = s.data<R>();
|
||||
u_ptr = nullptr;
|
||||
vt_ptr = nullptr;
|
||||
}
|
||||
|
||||
encoder.dispatch([in_ptr, u_ptr, s_ptr, vt_ptr, M, N, K, num_matrices]() {
|
||||
// A of shape M x N. The leading dimension is N since lapack receives Aᵀ.
|
||||
const int lda = N;
|
||||
// U of shape M x M. (N x N in lapack).
|
||||
const int ldu = N;
|
||||
// Vᵀ of shape N x N. (M x M in lapack).
|
||||
const int ldvt = M;
|
||||
|
||||
auto jobz = (u_ptr) ? "A" : "N";
|
||||
|
||||
// Will contain the number of singular values after the call has returned.
|
||||
int ns = 0;
|
||||
T workspace_dimension = 0;
|
||||
|
||||
// Will contain the indices of eigenvectors that failed to converge (not
|
||||
// used here but required by lapack).
|
||||
auto iwork = array::Data{allocator::malloc(sizeof(int) * 8 * K)};
|
||||
|
||||
static const int lwork_query = -1;
|
||||
|
||||
int info;
|
||||
|
||||
// Compute workspace size.
|
||||
gesdd<T>(
|
||||
/* jobz = */ jobz,
|
||||
// M and N are swapped since lapack expects column-major.
|
||||
/* m = */ &N,
|
||||
/* n = */ &M,
|
||||
/* a = */ nullptr,
|
||||
/* lda = */ &lda,
|
||||
/* s = */ nullptr,
|
||||
/* u = */ nullptr,
|
||||
/* ldu = */ &ldu,
|
||||
/* vt = */ nullptr,
|
||||
/* ldvt = */ &ldvt,
|
||||
/* work = */ &workspace_dimension,
|
||||
/* lwork = */ &lwork_query,
|
||||
/* iwork = */ static_cast<int*>(iwork.buffer.raw_ptr()),
|
||||
/* info = */ &info);
|
||||
|
||||
if (info != 0) {
|
||||
std::stringstream ss;
|
||||
ss << "[SVD::eval_cpu] workspace calculation failed with code " << info;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
|
||||
const int lwork = workspace_dimension;
|
||||
auto scratch = array::Data{allocator::malloc(sizeof(T) * lwork)};
|
||||
|
||||
auto jobz = (u_ptr) ? 'A' : 'N';
|
||||
SVDWork<T> svd_work(N, M, K, jobz);
|
||||
// Loop over matrices.
|
||||
for (int i = 0; i < num_matrices; i++) {
|
||||
gesdd<T>(
|
||||
/* jobz = */ jobz,
|
||||
// M and N are swapped since lapack expects column-major.
|
||||
/* m = */ &N,
|
||||
/* n = */ &M,
|
||||
/* a = */ in_ptr + M * N * i,
|
||||
/* lda = */ &lda,
|
||||
/* s = */ s_ptr + K * i,
|
||||
// According to the identity above, lapack will write Vᵀᵀ as U.
|
||||
/* u = */ vt_ptr ? vt_ptr + N * N * i : nullptr,
|
||||
/* ldu = */ &ldu,
|
||||
// According to the identity above, lapack will write Uᵀ as Vᵀ.
|
||||
/* vt = */ u_ptr ? u_ptr + M * M * i : nullptr,
|
||||
/* ldvt = */ &ldvt,
|
||||
/* work = */ static_cast<T*>(scratch.buffer.raw_ptr()),
|
||||
/* lwork = */ &lwork,
|
||||
/* iwork = */ static_cast<int*>(iwork.buffer.raw_ptr()),
|
||||
/* info = */ &info);
|
||||
|
||||
if (info != 0) {
|
||||
std::stringstream ss;
|
||||
ss << "svd_impl: sgesvdx_ failed with code " << info;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
svd_work.run(
|
||||
in_ptr + M * N * i,
|
||||
s_ptr + K * i,
|
||||
vt_ptr ? vt_ptr + N * N * i : nullptr,
|
||||
u_ptr ? u_ptr + M * M * i : nullptr);
|
||||
}
|
||||
});
|
||||
encoder.add_temporary(in);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void compute_svd(
|
||||
const array& a,
|
||||
bool compute_uv,
|
||||
std::vector<array>& outputs,
|
||||
Stream stream) {}
|
||||
|
||||
void SVD::eval_cpu(
|
||||
const std::vector<array>& inputs,
|
||||
std::vector<array>& outputs) {
|
||||
@@ -170,9 +277,12 @@ void SVD::eval_cpu(
|
||||
case float64:
|
||||
svd_impl<double>(inputs[0], outputs, compute_uv_, stream());
|
||||
break;
|
||||
case complex64:
|
||||
svd_impl<std::complex<float>>(inputs[0], outputs, compute_uv_, stream());
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
"[SVD::eval_cpu] only supports float32 or float64.");
|
||||
"[SVD::eval_cpu] only supports float32, float64, or complex64.");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -24,9 +24,9 @@ void unary_op(const array& a, array& out, Op) {
|
||||
auto ndim = a.ndim();
|
||||
if (a.flags().contiguous) {
|
||||
auto size = a.data_size();
|
||||
constexpr int N = simd::max_size<T>;
|
||||
constexpr int N = std::min(simd::max_size<T>, simd::max_size<U>);
|
||||
while (size >= N) {
|
||||
simd::store(dst, Op{}(simd::load<T, N>(src)));
|
||||
simd::store(dst, simd::Simd<U, N>(Op{}(simd::load<T, N>(src))));
|
||||
size -= N;
|
||||
src += N;
|
||||
dst += N;
|
||||
|
||||
@@ -108,4 +108,73 @@ struct Square {
|
||||
SINGLE()
|
||||
};
|
||||
|
||||
template <int N>
|
||||
Simd<float, N> fp32_from_bits(Simd<uint32_t, N> x) {
|
||||
return *(Simd<float, N>*)(&x);
|
||||
}
|
||||
template <int N>
|
||||
Simd<uint32_t, N> fp32_to_bits(Simd<float, N> x) {
|
||||
return *(Simd<uint32_t, N>*)(&x);
|
||||
}
|
||||
|
||||
struct ToFP8 {
|
||||
template <typename T, int N>
|
||||
Simd<uint8_t, N> operator()(Simd<T, N> f) {
|
||||
uint32_t fp8_max = 543 << 21;
|
||||
auto denorm_mask = Simd<uint32_t, N>(141 << 23);
|
||||
Simd<uint32_t, N> f_bits;
|
||||
Simd<float, N> f32 = f;
|
||||
f_bits = fp32_to_bits(f32);
|
||||
Simd<uint8_t, N> result = 0u;
|
||||
auto sign = f_bits & 0x80000000;
|
||||
f_bits = f_bits ^ sign;
|
||||
|
||||
auto f_bits_low =
|
||||
fp32_to_bits(fp32_from_bits(f_bits) + fp32_from_bits(denorm_mask));
|
||||
auto result_low = Simd<uint8_t, N>(f_bits_low - denorm_mask);
|
||||
|
||||
auto mant_odd = Simd<uint8_t, N>((f_bits >> 20) & 1);
|
||||
auto f_bits_high = f_bits + (((uint32_t)(7 - 127) << 23) + 0x7FFFF);
|
||||
f_bits_high = f_bits_high + Simd<uint32_t, N>(mant_odd);
|
||||
|
||||
auto result_high = Simd<uint8_t, N>(f_bits_high >> 20);
|
||||
result = select(f_bits < (121 << 23), result_low, result_high);
|
||||
|
||||
auto result_sat = Simd<uint8_t, N>(0x7E);
|
||||
result = select(f_bits >= fp8_max, result_sat, result);
|
||||
return result | Simd<uint8_t, N>(sign >> 24);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint8_t operator()(T x) {
|
||||
return (*this)(Simd<T, 1>(x)).value;
|
||||
}
|
||||
};
|
||||
|
||||
struct FromFP8 {
|
||||
template <int N>
|
||||
Simd<float, N> operator()(Simd<uint8_t, N> x) {
|
||||
auto w = Simd<uint32_t, N>(x) << 24;
|
||||
auto sign = w & 0x80000000;
|
||||
auto nonsign = w & 0x7FFFFFFF;
|
||||
|
||||
auto renorm_shift = clz(nonsign);
|
||||
renorm_shift = simd::select(
|
||||
renorm_shift > Simd<uint32_t, N>{4},
|
||||
renorm_shift - Simd<uint32_t, N>{4},
|
||||
Simd<uint32_t, N>{0});
|
||||
|
||||
Simd<int32_t, N> inf_nan_mask =
|
||||
(Simd<int32_t, N>(nonsign + 0x01000000) >> 8) & 0x7F800000;
|
||||
auto zero_mask = Simd<int32_t, N>(nonsign - 1) >> 31;
|
||||
auto result = sign |
|
||||
((((nonsign << renorm_shift >> 4) + ((0x78 - renorm_shift) << 23)) |
|
||||
inf_nan_mask) &
|
||||
~zero_mask);
|
||||
return fp32_from_bits(result);
|
||||
}
|
||||
float operator()(uint8_t x) {
|
||||
return (*this)(Simd<uint8_t, 1>(x)).value;
|
||||
}
|
||||
};
|
||||
} // namespace mlx::core::detail
|
||||
|
||||
@@ -32,6 +32,7 @@ target_sources(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kernel_utils.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/layer_norm.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
|
||||
@@ -43,6 +44,7 @@ target_sources(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/reduce/row_reduce.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/rms_norm.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/rope.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/scan.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
|
||||
@@ -51,12 +53,19 @@ target_sources(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/quantized/fp_quantize.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/quantized/convert_fp8.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
|
||||
|
||||
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/binary)
|
||||
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unary)
|
||||
|
||||
# fp4 is not available on < 12.8
|
||||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8.0)
|
||||
target_include_directories(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/quantized/)
|
||||
endif()
|
||||
|
||||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9.0)
|
||||
target_sources(
|
||||
mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm_batched_12_9.cu)
|
||||
@@ -114,10 +123,21 @@ if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
|
||||
mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--compress-mode=size>")
|
||||
endif()
|
||||
|
||||
# Compute capability >= 7.0 is required for synchronization between CPU/GPU with
|
||||
# managed memory.
|
||||
# Use native CUDA arch by default.
|
||||
if(NOT DEFINED MLX_CUDA_ARCHITECTURES)
|
||||
set(MLX_CUDA_ARCHITECTURES "native")
|
||||
execute_process(
|
||||
COMMAND __nvcc_device_query
|
||||
OUTPUT_VARIABLE MLX_CUDA_ARCHITECTURES
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
set(UPGRADABLE_ARCHITECTURES "90;100;121")
|
||||
if(MLX_CUDA_ARCHITECTURES STREQUAL "")
|
||||
message(
|
||||
FATAL_ERROR
|
||||
"Can not get native CUDA arch, must set MLX_CUDA_ARCHITECTURES")
|
||||
elseif(MLX_CUDA_ARCHITECTURES IN_LIST UPGRADABLE_ARCHITECTURES)
|
||||
# Use arch-specific compute capability whenever possible.
|
||||
set(MLX_CUDA_ARCHITECTURES "${MLX_CUDA_ARCHITECTURES}a")
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
|
||||
set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
|
||||
@@ -129,6 +149,7 @@ FetchContent_Declare(
|
||||
URL "https://github.com/NVIDIA/cccl/releases/download/v2.8.1/cccl-v2.8.1.zip")
|
||||
FetchContent_MakeAvailable(cccl)
|
||||
target_include_directories(mlx BEFORE PRIVATE "${cccl_SOURCE_DIR}/include")
|
||||
set_target_properties(mlx PROPERTIES CCCL_DIR "${cccl_SOURCE_DIR}/include")
|
||||
|
||||
# Use fixed version of NVTX.
|
||||
FetchContent_Declare(
|
||||
@@ -154,7 +175,7 @@ target_link_libraries(mlx PRIVATE CUDA::nvrtc CUDA::cuda_driver)
|
||||
FetchContent_Declare(
|
||||
cudnn
|
||||
GIT_REPOSITORY https://github.com/NVIDIA/cudnn-frontend.git
|
||||
GIT_TAG v1.14.0
|
||||
GIT_TAG v1.16.0
|
||||
GIT_SHALLOW TRUE
|
||||
EXCLUDE_FROM_ALL)
|
||||
set(CUDNN_FRONTEND_SKIP_JSON_LIB ON)
|
||||
@@ -170,11 +191,6 @@ target_link_libraries(mlx PRIVATE CUDNN::cudnn_all)
|
||||
# Suppress nvcc warnings on MLX headers.
|
||||
target_compile_options(mlx PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe
|
||||
--diag_suppress=997>)
|
||||
# Supress warnings: note: parameter passing for argument of type
|
||||
# ‘std::pair<float, float>’ when C++17 is enabled changed to match C++14 in GCC
|
||||
# 10.1
|
||||
target_compile_options(mlx PRIVATE -Wno-psabi)
|
||||
|
||||
# Install CCCL headers for JIT.
|
||||
install(DIRECTORY ${cccl_SOURCE_DIR}/include/cuda
|
||||
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cccl)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/backend/cuda/allocator.h"
|
||||
#include "mlx/backend/cuda/device.h"
|
||||
#include "mlx/backend/cuda/utils.h"
|
||||
#include "mlx/utils.h"
|
||||
|
||||
@@ -67,6 +68,7 @@ CudaBuffer* SmallSizePool::malloc() {
|
||||
next_free_ = next_free_->next;
|
||||
b->buf.data = static_cast<char*>(data_) + i * small_block_size;
|
||||
b->buf.size = small_block_size;
|
||||
b->buf.device = -1;
|
||||
return &b->buf;
|
||||
}
|
||||
|
||||
@@ -88,16 +90,42 @@ CudaAllocator::CudaAllocator()
|
||||
page_size,
|
||||
[](CudaBuffer* buf) { return buf->size; },
|
||||
[this](CudaBuffer* buf) { cuda_free(buf); }) {
|
||||
// TODO: Set memory limit for multi-device.
|
||||
size_t free, total;
|
||||
CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total));
|
||||
memory_limit_ = total * 0.95;
|
||||
memory_limit_ = total * 0.9;
|
||||
max_pool_size_ = memory_limit_;
|
||||
|
||||
int device_count = 0;
|
||||
CHECK_CUDA_ERROR(cudaGetDeviceCount(&device_count));
|
||||
int curr;
|
||||
CHECK_CUDA_ERROR(cudaGetDevice(&curr));
|
||||
for (int i = 0; i < device_count; ++i) {
|
||||
CHECK_CUDA_ERROR(cudaSetDevice(i));
|
||||
cudaStream_t s;
|
||||
CHECK_CUDA_ERROR(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));
|
||||
free_streams_.push_back(s);
|
||||
}
|
||||
CHECK_CUDA_ERROR(cudaSetDevice(curr));
|
||||
}
|
||||
|
||||
Buffer CudaAllocator::malloc(size_t size) {
|
||||
void copy_to_managed(CudaBuffer& buf) {
|
||||
// TODO maybe make this async on a i/o stream to avoid synchronizing the
|
||||
// device on malloc/and free
|
||||
void* new_data;
|
||||
CHECK_CUDA_ERROR(cudaMallocManaged(&new_data, buf.size));
|
||||
buf.device = -1;
|
||||
CHECK_CUDA_ERROR(cudaMemcpy(new_data, buf.data, buf.size, cudaMemcpyDefault));
|
||||
CHECK_CUDA_ERROR(cudaFree(buf.data));
|
||||
buf.data = new_data;
|
||||
}
|
||||
|
||||
Buffer
|
||||
CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
|
||||
if (size == 0) {
|
||||
return Buffer{new CudaBuffer{nullptr, 0, -1}};
|
||||
}
|
||||
|
||||
// Find available buffer from cache.
|
||||
auto orig_size = size;
|
||||
std::unique_lock lock(mutex_);
|
||||
if (size <= small_block_size) {
|
||||
size = 8;
|
||||
@@ -107,6 +135,10 @@ Buffer CudaAllocator::malloc(size_t size) {
|
||||
size = page_size * ((size + page_size - 1) / page_size);
|
||||
}
|
||||
|
||||
if (size <= small_block_size || stream == nullptr) {
|
||||
device = -1;
|
||||
}
|
||||
|
||||
CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
|
||||
if (!buf) {
|
||||
// If we have a lot of memory pressure try to reclaim memory from the cache.
|
||||
@@ -122,30 +154,51 @@ Buffer CudaAllocator::malloc(size_t size) {
|
||||
}
|
||||
lock.unlock();
|
||||
if (!buf) {
|
||||
buf = new CudaBuffer{nullptr, size};
|
||||
cudaError_t err = cudaMallocManaged(&buf->data, size);
|
||||
cudaError_t err;
|
||||
void* data = nullptr;
|
||||
if (device == -1) {
|
||||
err = cudaMallocManaged(&data, size);
|
||||
} else {
|
||||
err = cudaMallocAsync(&data, size, stream);
|
||||
}
|
||||
if (err != cudaSuccess && err != cudaErrorMemoryAllocation) {
|
||||
throw std::runtime_error(fmt::format(
|
||||
"cudaMallocManaged failed: {}.", cudaGetErrorString(err)));
|
||||
}
|
||||
if (!data) {
|
||||
return Buffer{nullptr};
|
||||
}
|
||||
buf = new CudaBuffer{data, size, device};
|
||||
}
|
||||
lock.lock();
|
||||
}
|
||||
active_memory_ += size;
|
||||
active_memory_ += buf->size;
|
||||
peak_memory_ = std::max(active_memory_, peak_memory_);
|
||||
|
||||
// Maintain the cache below the requested limit.
|
||||
if (get_cache_memory() > max_pool_size_) {
|
||||
buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
|
||||
}
|
||||
// Copy to managed here if the buffer is not on the right device
|
||||
if (buf->device >= 0 && buf->device != device) {
|
||||
copy_to_managed(*buf);
|
||||
}
|
||||
return Buffer{buf};
|
||||
}
|
||||
|
||||
Buffer CudaAllocator::malloc(size_t size) {
|
||||
return malloc_async(size, -1, nullptr);
|
||||
}
|
||||
|
||||
void CudaAllocator::free(Buffer buffer) {
|
||||
auto* buf = static_cast<CudaBuffer*>(buffer.ptr());
|
||||
if (!buf) {
|
||||
return;
|
||||
}
|
||||
if (buf->size == 0) {
|
||||
delete buf;
|
||||
return;
|
||||
}
|
||||
|
||||
std::unique_lock lock(mutex_);
|
||||
active_memory_ -= buf->size;
|
||||
@@ -169,7 +222,11 @@ void CudaAllocator::cuda_free(CudaBuffer* buf) {
|
||||
if (scalar_pool_.in_pool(buf)) {
|
||||
scalar_pool_.free(buf);
|
||||
} else {
|
||||
cudaFree(buf->data);
|
||||
if (buf->device >= 0) {
|
||||
CHECK_CUDA_ERROR(cudaFreeAsync(buf->data, free_streams_[buf->device]));
|
||||
} else {
|
||||
CHECK_CUDA_ERROR(cudaFree(buf->data));
|
||||
}
|
||||
delete buf;
|
||||
}
|
||||
}
|
||||
@@ -220,6 +277,17 @@ CudaAllocator& allocator() {
|
||||
return *allocator_;
|
||||
}
|
||||
|
||||
Buffer malloc_async(size_t size, CommandEncoder& encoder) {
|
||||
auto buffer = allocator().malloc_async(
|
||||
size, encoder.device().cuda_device(), encoder.stream());
|
||||
if (size && !buffer.ptr()) {
|
||||
std::ostringstream msg;
|
||||
msg << "[malloc_async] Unable to allocate " << size << " bytes.";
|
||||
throw std::runtime_error(msg.str());
|
||||
}
|
||||
return buffer;
|
||||
}
|
||||
|
||||
} // namespace cu
|
||||
|
||||
namespace allocator {
|
||||
@@ -232,7 +300,11 @@ void* Buffer::raw_ptr() {
|
||||
if (!ptr_) {
|
||||
return nullptr;
|
||||
}
|
||||
return static_cast<cu::CudaBuffer*>(ptr_)->data;
|
||||
auto& cbuf = *static_cast<cu::CudaBuffer*>(ptr_);
|
||||
if (cbuf.device != -1) {
|
||||
copy_to_managed(cbuf);
|
||||
}
|
||||
return cbuf.data;
|
||||
}
|
||||
|
||||
} // namespace allocator
|
||||
|
||||
@@ -4,19 +4,24 @@
|
||||
|
||||
#include "mlx/allocator.h"
|
||||
#include "mlx/backend/common/buffer_cache.h"
|
||||
#include "mlx/backend/cuda/cuda_utils.h"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <mutex>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
|
||||
namespace mlx::core::cu {
|
||||
|
||||
class CommandEncoder;
|
||||
|
||||
using allocator::Buffer;
|
||||
|
||||
// Stores cuda-managed unified memory.
|
||||
struct CudaBuffer {
|
||||
void* data;
|
||||
size_t size;
|
||||
int device; // -1 for managed
|
||||
};
|
||||
|
||||
class SmallSizePool {
|
||||
@@ -45,6 +50,7 @@ class SmallSizePool {
|
||||
class CudaAllocator : public allocator::Allocator {
|
||||
public:
|
||||
Buffer malloc(size_t size) override;
|
||||
Buffer malloc_async(size_t size, int device, cudaStream_t stream);
|
||||
void free(Buffer buffer) override;
|
||||
size_t size(Buffer buffer) const override;
|
||||
|
||||
@@ -69,9 +75,12 @@ class CudaAllocator : public allocator::Allocator {
|
||||
BufferCache<CudaBuffer> buffer_cache_;
|
||||
size_t active_memory_{0};
|
||||
size_t peak_memory_{0};
|
||||
std::vector<cudaStream_t> free_streams_;
|
||||
SmallSizePool scalar_pool_;
|
||||
};
|
||||
|
||||
CudaAllocator& allocator();
|
||||
|
||||
Buffer malloc_async(size_t size, CommandEncoder& encoder);
|
||||
|
||||
} // namespace mlx::core::cu
|
||||
|
||||
@@ -41,9 +41,8 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
if (out.size() == 0) {
|
||||
return;
|
||||
}
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
|
||||
auto& encoder = cu::get_command_encoder(stream());
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
encoder.set_output_array(out);
|
||||
|
||||
dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
|
||||
@@ -58,7 +57,7 @@ void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
out.data<OutType>(),
|
||||
gpu_ptr<OutType>(out),
|
||||
out.data_size(),
|
||||
static_cast<CTYPE>(start_),
|
||||
static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_));
|
||||
|
||||
@@ -140,8 +140,10 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
nvtx3::scoped_range r("ArgReduce::eval_gpu");
|
||||
assert(inputs.size() == 1);
|
||||
auto& in = inputs[0];
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
|
||||
auto& s = stream();
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
|
||||
// Prepare the shapes, strides and axis arguments.
|
||||
Shape shape = remove_index(in.shape(), axis_);
|
||||
@@ -154,7 +156,6 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
int32_t ndim = shape.size();
|
||||
|
||||
// ArgReduce.
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
encoder.set_input_array(in);
|
||||
encoder.set_output_array(out);
|
||||
dispatch_real_types(in.dtype(), "ArgReduce", [&](auto type_tag) {
|
||||
@@ -172,8 +173,8 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
num_blocks,
|
||||
block_dim(),
|
||||
0,
|
||||
in.data<T>(),
|
||||
out.data<uint32_t>(),
|
||||
gpu_ptr<T>(in),
|
||||
gpu_ptr<uint32_t>(out),
|
||||
out.size(),
|
||||
const_param(shape),
|
||||
const_param(in_strides),
|
||||
|
||||
@@ -292,9 +292,9 @@ void binary_op_gpu_inplace(
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out.data<OutType>(),
|
||||
gpu_ptr<InType>(a),
|
||||
gpu_ptr<InType>(b),
|
||||
gpu_ptr<OutType>(out),
|
||||
rest,
|
||||
const_param<dims_constant()>(shape),
|
||||
const_param<dims_constant()>(a_strides),
|
||||
@@ -310,9 +310,9 @@ void binary_op_gpu_inplace(
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out.data<OutType>(),
|
||||
gpu_ptr<InType>(a),
|
||||
gpu_ptr<InType>(b),
|
||||
gpu_ptr<OutType>(out),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(a_strides),
|
||||
@@ -339,9 +339,9 @@ void binary_op_gpu_inplace(
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out.data<OutType>(),
|
||||
gpu_ptr<InType>(a),
|
||||
gpu_ptr<InType>(b),
|
||||
gpu_ptr<OutType>(out),
|
||||
out.data_size());
|
||||
});
|
||||
}
|
||||
@@ -365,7 +365,10 @@ void binary_op_gpu(
|
||||
auto& a = inputs[0];
|
||||
auto& b = inputs[1];
|
||||
auto bopt = get_binary_op_type(a, b);
|
||||
set_binary_op_output_data(a, b, out, bopt);
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
|
||||
set_binary_op_output_data(
|
||||
a, b, out, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
|
||||
binary_op_gpu_inplace<Op>(inputs, out, op, s);
|
||||
}
|
||||
|
||||
|
||||
@@ -245,14 +245,16 @@ void binary_two_op_gpu_inplace(
|
||||
auto& out_a = outputs[0];
|
||||
auto& out_b = outputs[1];
|
||||
auto bopt = get_binary_op_type(a, b);
|
||||
set_binary_op_output_data(a, b, out_a, bopt);
|
||||
set_binary_op_output_data(a, b, out_b, bopt);
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
set_binary_op_output_data(
|
||||
a, b, out_a, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
|
||||
set_binary_op_output_data(
|
||||
a, b, out_b, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
|
||||
|
||||
if (out_a.size() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
encoder.set_input_array(a);
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out_a);
|
||||
@@ -313,10 +315,10 @@ void binary_two_op_gpu_inplace(
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out_a.data<OutType>(),
|
||||
out_b.data<OutType>(),
|
||||
gpu_ptr<InType>(a),
|
||||
gpu_ptr<InType>(b),
|
||||
gpu_ptr<OutType>(out_a),
|
||||
gpu_ptr<OutType>(out_b),
|
||||
rest,
|
||||
const_param<dims_constant()>(shape),
|
||||
const_param<dims_constant()>(a_strides),
|
||||
@@ -332,10 +334,10 @@ void binary_two_op_gpu_inplace(
|
||||
{num_blocks_x, num_blocks_y},
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out_a.data<OutType>(),
|
||||
out_b.data<OutType>(),
|
||||
gpu_ptr<InType>(a),
|
||||
gpu_ptr<InType>(b),
|
||||
gpu_ptr<OutType>(out_a),
|
||||
gpu_ptr<OutType>(out_b),
|
||||
rest,
|
||||
const_param(shape),
|
||||
const_param(a_strides),
|
||||
@@ -366,10 +368,10 @@ void binary_two_op_gpu_inplace(
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
a.data<InType>(),
|
||||
b.data<InType>(),
|
||||
out_a.data<OutType>(),
|
||||
out_b.data<OutType>(),
|
||||
gpu_ptr<InType>(a),
|
||||
gpu_ptr<InType>(b),
|
||||
gpu_ptr<OutType>(out_a),
|
||||
gpu_ptr<OutType>(out_b),
|
||||
out_a.data_size());
|
||||
});
|
||||
}
|
||||
|
||||
@@ -293,8 +293,13 @@ void Compiled::eval_gpu(
|
||||
}
|
||||
}
|
||||
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
|
||||
// Put outputs.
|
||||
compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);
|
||||
compiled_allocate_outputs(
|
||||
inputs, outputs, is_constant_, contiguous, [&](auto n) {
|
||||
return cu::malloc_async(n, encoder);
|
||||
});
|
||||
for (auto& x : outputs) {
|
||||
args.append(x);
|
||||
}
|
||||
@@ -324,7 +329,6 @@ void Compiled::eval_gpu(
|
||||
kernel_name += fmt::format(
|
||||
"_strided<{}, {}, {}>", shape.size(), index_type, work_per_thread);
|
||||
}
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
for (const auto& in : inputs) {
|
||||
encoder.set_input_array(in);
|
||||
}
|
||||
|
||||
@@ -15,19 +15,16 @@ namespace mlx::core {
|
||||
|
||||
namespace {
|
||||
|
||||
// Alias for better readability.
|
||||
#define CONV_FORWARD CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR
|
||||
#define CONV_BACKWARD_INPUT \
|
||||
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR
|
||||
#define CONV_BACKWARD_WEIGHT \
|
||||
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR
|
||||
|
||||
// Custom placeholder representing fallback kernel.
|
||||
#define CONV_FALLBACK static_cast<cudnnBackendDescriptorType_t>(-1)
|
||||
enum ConvBackendType {
|
||||
CONV_FALLBACK,
|
||||
CONV_FORWARD,
|
||||
CONV_BACKWARD_INPUT,
|
||||
CONV_BACKWARD_WEIGHT,
|
||||
};
|
||||
|
||||
struct ConvCacheKey {
|
||||
int device_id;
|
||||
cudnnDataType_t cudnn_dtype;
|
||||
fe::DataType_t cudnn_dtype;
|
||||
std::array<int, MAX_NDIM> input_shape;
|
||||
std::array<int, MAX_NDIM> weight_shape;
|
||||
std::array<int, MAX_NDIM> stride;
|
||||
@@ -44,15 +41,13 @@ struct ConvCacheKey {
|
||||
auto& conv_cache() {
|
||||
static LRUBytesKeyCache<
|
||||
ConvCacheKey,
|
||||
std::pair<
|
||||
cudnnBackendDescriptorType_t,
|
||||
std::optional<cudnn_frontend::ExecutionPlan>>>
|
||||
std::pair<ConvBackendType, std::optional<DnnGraph>>>
|
||||
cache("MLX_CUDA_CONV_CACHE_SIZE", /* default_capacity */ 128);
|
||||
return cache;
|
||||
}
|
||||
|
||||
auto get_conv_op_settings(
|
||||
cudnnBackendDescriptorType_t backend_type,
|
||||
auto get_conv_settings(
|
||||
ConvBackendType backend_type,
|
||||
array& x,
|
||||
array& w,
|
||||
array& y,
|
||||
@@ -68,8 +63,8 @@ auto get_conv_op_settings(
|
||||
for (int i = 0; i < padding_lo.size(); ++i) {
|
||||
int wt_size = 1 + kernel_dilation[i] * (w.shape(1 + i) - 1);
|
||||
padding_lo[i] = wt_size - padding_lo[i] - 1;
|
||||
int in_size = 1 + kernel_strides[i] * (x.shape(1 + i) - 1);
|
||||
int out_size = 1 + input_dilation[i] * (y.shape(1 + i) - 1);
|
||||
int in_size = 1 + kernel_strides[i] * (y.shape(1 + i) - 1);
|
||||
int out_size = 1 + input_dilation[i] * (x.shape(1 + i) - 1);
|
||||
padding_hi[i] = out_size - in_size + padding_hi[i];
|
||||
}
|
||||
return std::make_tuple(
|
||||
@@ -95,49 +90,57 @@ auto get_conv_op_settings(
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<cudnn_frontend::OperationGraph> build_conv_op_graph(
|
||||
std::optional<DnnGraph> build_conv_graph(
|
||||
cu::CommandEncoder& encoder,
|
||||
cudnnBackendDescriptorType_t backend_type,
|
||||
ConvBackendType backend_type,
|
||||
Dtype dtype,
|
||||
array& x,
|
||||
array& w,
|
||||
array& y,
|
||||
const SmallVector<int64_t>& stride,
|
||||
const SmallVector<int64_t>& padding_lo,
|
||||
const SmallVector<int64_t>& padding_hi,
|
||||
const SmallVector<int64_t>& dilation) {
|
||||
try {
|
||||
auto compute_dtype = (dtype == float16 || dtype == bfloat16)
|
||||
? CUDNN_DATA_FLOAT
|
||||
: dtype_to_cudnn_type(dtype);
|
||||
auto conv_desc = cudnn_frontend::ConvDescBuilder()
|
||||
.setDataType(compute_dtype)
|
||||
.setMathMode(CUDNN_CROSS_CORRELATION)
|
||||
.setNDims(stride.size())
|
||||
.setStrides(stride.size(), stride.data())
|
||||
.setPrePadding(padding_lo.size(), padding_lo.data())
|
||||
.setPostPadding(padding_hi.size(), padding_hi.data())
|
||||
.setDilation(dilation.size(), dilation.data())
|
||||
.build();
|
||||
const std::vector<int64_t>& stride,
|
||||
const std::vector<int64_t>& padding_lo,
|
||||
const std::vector<int64_t>& padding_hi,
|
||||
const std::vector<int64_t>& dilation) {
|
||||
auto compute_dtype =
|
||||
(dtype == float16 || dtype == bfloat16) ? float32 : dtype;
|
||||
DnnGraph graph(encoder.device().cudnn_handle(), dtype, compute_dtype);
|
||||
auto x_ = graph.tensor_nchw("X", 'x', x);
|
||||
auto w_ = graph.tensor_nchw("W", 'w', w);
|
||||
|
||||
auto op = cudnn_frontend::OperationBuilder(backend_type)
|
||||
.setxDesc(build_cudnn_tensor_nchw('x', x))
|
||||
.setwDesc(build_cudnn_tensor_nchw('w', w))
|
||||
.setyDesc(build_cudnn_tensor_nchw('y', y))
|
||||
.setcDesc(conv_desc)
|
||||
.build();
|
||||
auto set_options = [&](auto& options) {
|
||||
options.set_compute_data_type(dtype_to_cudnn_type(compute_dtype))
|
||||
.set_convolution_mode(fe::ConvolutionMode_t::CROSS_CORRELATION)
|
||||
.set_stride(stride)
|
||||
.set_pre_padding(padding_lo)
|
||||
.set_post_padding(padding_hi)
|
||||
.set_dilation(dilation);
|
||||
};
|
||||
|
||||
std::array<cudnn_frontend::Operation const*, 1> ops = {&op};
|
||||
return cudnn_frontend::OperationGraphBuilder()
|
||||
.setHandle(encoder.device().cudnn_handle())
|
||||
.setOperationGraph(ops.size(), ops.data())
|
||||
.build();
|
||||
} catch (cudnn_frontend::cudnnException& error) {
|
||||
if (error.getCudnnStatus() != CUDNN_STATUS_BAD_PARAM) {
|
||||
throw;
|
||||
}
|
||||
std::shared_ptr<fe::graph::Tensor_attributes> y_;
|
||||
if (backend_type == CONV_FORWARD) {
|
||||
auto options = fe::graph::Conv_fprop_attributes();
|
||||
set_options(options);
|
||||
y_ = graph.conv_fprop(x_, w_, options);
|
||||
} else if (backend_type == CONV_BACKWARD_INPUT) {
|
||||
auto options = fe::graph::Conv_dgrad_attributes();
|
||||
set_options(options);
|
||||
y_ = graph.conv_dgrad(x_, w_, options);
|
||||
} else if (backend_type == CONV_BACKWARD_WEIGHT) {
|
||||
auto options = fe::graph::Conv_wgrad_attributes();
|
||||
set_options(options);
|
||||
y_ = graph.conv_wgrad(w_, x_, options);
|
||||
}
|
||||
graph.tensor_nchw(y_, 'y', y)->set_output(true);
|
||||
|
||||
if (graph.prepare().is_bad()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
graph.deselect_numeric_notes({fe::NumericalNote_t::DOWN_CONVERT_INPUTS});
|
||||
if (dtype == float32 && !env::enable_tf32()) {
|
||||
graph.deselect_numeric_notes({fe::NumericalNote_t::TENSOR_CORE});
|
||||
}
|
||||
CHECK_CUDNN_FE_ERROR(graph.build());
|
||||
return graph;
|
||||
}
|
||||
|
||||
// Transpose from (C_out, H, W, C_in / groups) to (C_in, H, W, C_out / groups).
|
||||
@@ -181,7 +184,7 @@ array group_transpose(
|
||||
// eval_gpu, with cost of possible redundant copies.
|
||||
std::tuple<array, array, array> prepare_args(
|
||||
cu::CommandEncoder& encoder,
|
||||
cudnnBackendDescriptorType_t backend_type,
|
||||
ConvBackendType backend_type,
|
||||
array in,
|
||||
array wt,
|
||||
array out,
|
||||
@@ -221,27 +224,11 @@ std::tuple<array, array, array> prepare_args(
|
||||
return {std::move(in), std::move(wt), std::move(out)};
|
||||
}
|
||||
|
||||
// Get the x/w/y args from the in/wt/out args depending on backend type.
|
||||
inline std::tuple<array&, array&, array&> dispatch_args(
|
||||
cudnnBackendDescriptorType_t backend_type,
|
||||
array& in,
|
||||
array& wt,
|
||||
array& out) {
|
||||
switch (backend_type) {
|
||||
case CONV_BACKWARD_INPUT:
|
||||
return {out, wt, in};
|
||||
case CONV_BACKWARD_WEIGHT:
|
||||
return {in, out, wt};
|
||||
default:
|
||||
return {in, wt, out};
|
||||
}
|
||||
}
|
||||
|
||||
// Register inputs and outputs before actually running conv op. Can only be
|
||||
// called once per eval_gpu.
|
||||
void register_args(
|
||||
cu::CommandEncoder& encoder,
|
||||
cudnnBackendDescriptorType_t backend_type,
|
||||
ConvBackendType backend_type,
|
||||
array& in,
|
||||
array& wt,
|
||||
array& intermediate_out,
|
||||
@@ -270,19 +257,19 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
|
||||
if (out_.size() == 0) {
|
||||
return;
|
||||
}
|
||||
auto& s = stream();
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
|
||||
assert(inputs.size() == 2);
|
||||
array in = inputs[0];
|
||||
array wt = inputs[1];
|
||||
array out = out_;
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
Dtype dtype = out.dtype();
|
||||
|
||||
auto& s = stream();
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
|
||||
// Search cache.
|
||||
ConvCacheKey cache_key{
|
||||
BytesKey<ConvCacheKey> cache_key;
|
||||
cache_key.pod = {
|
||||
encoder.device().cuda_device(),
|
||||
dtype_to_cudnn_type(dtype),
|
||||
vector_key(in.shape()),
|
||||
@@ -297,16 +284,19 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
|
||||
get_alignment(wt),
|
||||
get_alignment(out)};
|
||||
if (auto it = conv_cache().find(cache_key); it != conv_cache().end()) {
|
||||
auto& [backend_type, plan] = it->second;
|
||||
if (plan) {
|
||||
// Run cached plan.
|
||||
auto& [backend_type, graph] = it->second;
|
||||
if (graph) {
|
||||
// Run cached graph.
|
||||
std::tie(in, wt, out) =
|
||||
prepare_args(encoder, backend_type, in, wt, out, groups_, s);
|
||||
register_args(encoder, backend_type, in, wt, out, out_);
|
||||
auto [x, w, y] = dispatch_args(backend_type, in, wt, out);
|
||||
if (!encode_cudnn_plan(encoder, *plan, {'x', 'w', 'y'}, x, w, y)) {
|
||||
throw std::runtime_error("[conv] Cached plan failed to execute.");
|
||||
}
|
||||
CHECK_CUDNN_FE_ERROR(graph->encode_capturing(
|
||||
encoder,
|
||||
{
|
||||
{'x', gpu_ptr<void>(in)},
|
||||
{'w', gpu_ptr<void>(wt)},
|
||||
{'y', gpu_ptr<void>(out)},
|
||||
}));
|
||||
} else {
|
||||
// Run fallback kernel.
|
||||
gemm_conv(
|
||||
@@ -327,7 +317,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
|
||||
|
||||
// There is no reliable way to deduce the proper cuDNN backend for the
|
||||
// convolution, so we make a best guess and then try.
|
||||
SmallVector<cudnnBackendDescriptorType_t, 2> try_backends;
|
||||
SmallVector<ConvBackendType, 2> try_backends;
|
||||
if (flip_) {
|
||||
// When weight is flipped, we assume it is backward input convolution.
|
||||
try_backends.push_back(CONV_BACKWARD_INPUT);
|
||||
@@ -345,13 +335,12 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
|
||||
}
|
||||
|
||||
// Try to build op graph.
|
||||
cudnnBackendDescriptorType_t backend_type;
|
||||
std::optional<cudnn_frontend::OperationGraph> op_graph;
|
||||
ConvBackendType backend_type;
|
||||
std::optional<DnnGraph> graph;
|
||||
for (auto try_backend : try_backends) {
|
||||
auto [in_copy, wt_copy, out_copy] =
|
||||
auto [x, w, y] =
|
||||
prepare_args(encoder, try_backend, in, wt, out, groups_, s);
|
||||
auto [x, w, y] = dispatch_args(try_backend, in_copy, wt_copy, out_copy);
|
||||
auto [stride, padding_lo, padding_hi, dilation] = get_conv_op_settings(
|
||||
auto [stride, padding_lo, padding_hi, dilation] = get_conv_settings(
|
||||
try_backend,
|
||||
x,
|
||||
w,
|
||||
@@ -361,7 +350,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
|
||||
padding_hi_,
|
||||
kernel_dilation_,
|
||||
input_dilation_);
|
||||
op_graph = build_conv_op_graph(
|
||||
graph = build_conv_graph(
|
||||
encoder,
|
||||
try_backend,
|
||||
dtype,
|
||||
@@ -372,30 +361,27 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
|
||||
padding_lo,
|
||||
padding_hi,
|
||||
dilation);
|
||||
if (op_graph) {
|
||||
if (graph) {
|
||||
backend_type = try_backend;
|
||||
in = std::move(in_copy);
|
||||
wt = std::move(wt_copy);
|
||||
out = std::move(out_copy);
|
||||
in = std::move(x);
|
||||
wt = std::move(w);
|
||||
out = std::move(y);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (op_graph) {
|
||||
// Find a plan for the graph and execute it.
|
||||
auto plan = find_cudnn_plan_from_op_graph(
|
||||
encoder.device().cudnn_handle(), backend_type, dtype, *op_graph);
|
||||
if (plan) {
|
||||
// Setup inputs and outputs.
|
||||
register_args(encoder, backend_type, in, wt, out, out_);
|
||||
|
||||
auto [x, w, y] = dispatch_args(backend_type, in, wt, out);
|
||||
if (encode_cudnn_plan(encoder, *plan, {'x', 'w', 'y'}, x, w, y)) {
|
||||
conv_cache().emplace(
|
||||
cache_key, std::make_pair(backend_type, std::move(*plan)));
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (graph) {
|
||||
register_args(encoder, backend_type, in, wt, out, out_);
|
||||
CHECK_CUDNN_FE_ERROR(graph->encode_capturing(
|
||||
encoder,
|
||||
{
|
||||
{'x', gpu_ptr<void>(in)},
|
||||
{'w', gpu_ptr<void>(wt)},
|
||||
{'y', gpu_ptr<void>(out)},
|
||||
}));
|
||||
conv_cache().emplace(
|
||||
cache_key, std::make_pair(backend_type, std::move(*graph)));
|
||||
return;
|
||||
}
|
||||
|
||||
// Use fallback kernel for settings not supported by cuDNN.
|
||||
|
||||
@@ -86,7 +86,7 @@ array unfold_inputs_nd(
|
||||
int mat_N,
|
||||
ConvParams<NDIM>& params) {
|
||||
array unfolded({mat_M, mat_K}, in.dtype(), nullptr, {});
|
||||
unfolded.set_data(allocator::malloc(unfolded.nbytes()));
|
||||
unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder));
|
||||
encoder.add_temporary(unfolded);
|
||||
|
||||
int filter_size = params.C;
|
||||
@@ -118,8 +118,8 @@ array unfold_inputs_nd(
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
in.data<DataType>(),
|
||||
unfolded.data<DataType>(),
|
||||
gpu_ptr<DataType>(in),
|
||||
gpu_ptr<DataType>(unfolded),
|
||||
filter_size,
|
||||
out_pixels,
|
||||
params);
|
||||
|
||||
@@ -89,7 +89,7 @@ array grouped_unfold_transpose_inputs_nd(
|
||||
int mat_N,
|
||||
ConvParams<NDIM>& params) {
|
||||
array unfolded({mat_M, mat_K * params.groups}, in.dtype(), nullptr, {});
|
||||
unfolded.set_data(allocator::malloc(unfolded.nbytes()));
|
||||
unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder));
|
||||
encoder.add_temporary(unfolded);
|
||||
|
||||
int filter_size = params.C;
|
||||
@@ -121,8 +121,8 @@ array grouped_unfold_transpose_inputs_nd(
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
in.data<DataType>(),
|
||||
unfolded.data<DataType>(),
|
||||
gpu_ptr<DataType>(in),
|
||||
gpu_ptr<DataType>(unfolded),
|
||||
filter_size,
|
||||
out_pixels,
|
||||
params);
|
||||
|
||||
@@ -5,6 +5,21 @@
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
bool donated = set_copy_output_data(
|
||||
in, out, ctype, [&](auto n) { return cu::malloc_async(n, encoder); });
|
||||
if (donated && in.dtype() == out.dtype()) {
|
||||
// If the output has the same type as the input then there is nothing to
|
||||
// copy, just use the buffer.
|
||||
return;
|
||||
}
|
||||
if (ctype == CopyType::GeneralGeneral) {
|
||||
ctype = CopyType::General;
|
||||
}
|
||||
copy_gpu_inplace(in, out, ctype, s);
|
||||
}
|
||||
|
||||
void copy_gpu_inplace(
|
||||
const array& in,
|
||||
array& out,
|
||||
@@ -87,11 +102,31 @@ void fill_gpu(const array& in, array& out, const Stream& s) {
|
||||
if (out.size() == 0) {
|
||||
return;
|
||||
}
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
encoder.set_input_array(in);
|
||||
encoder.set_output_array(out);
|
||||
copy_contiguous(encoder, CopyType::Scalar, in, out, 0, 0);
|
||||
}
|
||||
|
||||
void reshape_gpu(const array& in, array& out, Stream s) {
|
||||
auto [copy_necessary, out_strides] = prepare_reshape(in, out);
|
||||
if (copy_necessary) {
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
copy_gpu_inplace(
|
||||
in,
|
||||
out,
|
||||
in.shape(),
|
||||
in.strides(),
|
||||
make_contiguous_strides(in.shape()),
|
||||
0,
|
||||
0,
|
||||
CopyType::General,
|
||||
s);
|
||||
} else {
|
||||
shared_buffer_reshape(in, out_strides, out);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace mlx::core
|
||||
|
||||
@@ -77,8 +77,8 @@ void copy_contiguous(
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
in.data<InType>() + in_offset,
|
||||
out.data<OutType>() + out_offset,
|
||||
gpu_ptr<InType>(in) + in_offset,
|
||||
gpu_ptr<OutType>(out) + out_offset,
|
||||
out.data_size());
|
||||
});
|
||||
});
|
||||
|
||||
@@ -106,8 +106,8 @@ void copy_general(
|
||||
using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
|
||||
using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
|
||||
using IdxT = std::conditional_t<large(), int64_t, int32_t>;
|
||||
const InType* in_ptr = in.data<InType>() + offset_in;
|
||||
OutType* out_ptr = out.data<OutType>() + offset_out;
|
||||
const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
|
||||
OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;
|
||||
int ndim = shape.size();
|
||||
size_t data_size = 1;
|
||||
for (auto& s : shape)
|
||||
|
||||
@@ -69,8 +69,8 @@ void copy_general_dynamic(
|
||||
using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
|
||||
using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
|
||||
using IdxT = std::conditional_t<large(), int64_t, int32_t>;
|
||||
const InType* in_ptr = in.data<InType>() + offset_in;
|
||||
OutType* out_ptr = out.data<OutType>() + offset_out;
|
||||
const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
|
||||
OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;
|
||||
int ndim = shape.size();
|
||||
if (ndim <= 3) {
|
||||
dispatch_1_2_3(ndim, [&](auto dims_constant) {
|
||||
@@ -90,8 +90,8 @@ void copy_general_dynamic(
|
||||
const_param<dims_constant()>(shape),
|
||||
const_param<dims_constant()>(strides_in),
|
||||
const_param<dims_constant()>(strides_out),
|
||||
dynamic_offset_in.data<int64_t>(),
|
||||
dynamic_offset_out.data<int64_t>());
|
||||
gpu_ptr<int64_t>(dynamic_offset_in),
|
||||
gpu_ptr<int64_t>(dynamic_offset_out));
|
||||
});
|
||||
} else { // ndim >= 4
|
||||
auto [num_blocks, block_dims] = get_launch_args(out, large());
|
||||
@@ -107,8 +107,8 @@ void copy_general_dynamic(
|
||||
const_param(strides_in),
|
||||
const_param(strides_out),
|
||||
ndim,
|
||||
dynamic_offset_in.data<int64_t>(),
|
||||
dynamic_offset_out.data<int64_t>());
|
||||
gpu_ptr<int64_t>(dynamic_offset_in),
|
||||
gpu_ptr<int64_t>(dynamic_offset_out));
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -92,8 +92,8 @@ void copy_general_input(
|
||||
using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
|
||||
using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
|
||||
using IdxT = std::conditional_t<large(), int64_t, int32_t>;
|
||||
const InType* in_ptr = in.data<InType>() + offset_in;
|
||||
OutType* out_ptr = out.data<OutType>() + offset_out;
|
||||
const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
|
||||
OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;
|
||||
int ndim = shape.size();
|
||||
int work_per_thread = 1;
|
||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||
|
||||
89
mlx/backend/cuda/cuda_utils.h
Normal file
89
mlx/backend/cuda/cuda_utils.h
Normal file
@@ -0,0 +1,89 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cublasLt.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cudnn.h>
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
// Throw exception if the cuda API does not succeed.
|
||||
void check_cublas_error(const char* name, cublasStatus_t err);
|
||||
void check_cuda_error(const char* name, cudaError_t err);
|
||||
void check_cuda_error(const char* name, CUresult err);
|
||||
void check_cudnn_error(const char* name, cudnnStatus_t err);
|
||||
|
||||
// The macro version that prints the command that failed.
|
||||
#define CHECK_CUBLAS_ERROR(cmd) check_cublas_error(#cmd, (cmd))
|
||||
#define CHECK_CUDA_ERROR(cmd) check_cuda_error(#cmd, (cmd))
|
||||
#define CHECK_CUDNN_ERROR(cmd) check_cudnn_error(#cmd, (cmd))
|
||||
|
||||
// Base class for RAII managed CUDA resources.
|
||||
template <typename Handle, cudaError_t (*Destroy)(Handle)>
|
||||
class CudaHandle {
|
||||
public:
|
||||
CudaHandle(Handle handle = nullptr) : handle_(handle) {}
|
||||
|
||||
CudaHandle(CudaHandle&& other) : handle_(other.handle_) {
|
||||
assert(this != &other);
|
||||
other.handle_ = nullptr;
|
||||
}
|
||||
|
||||
~CudaHandle() {
|
||||
// Skip if there was an error to avoid throwing in the destructors
|
||||
if (cudaPeekAtLastError() != cudaSuccess) {
|
||||
return;
|
||||
}
|
||||
reset();
|
||||
}
|
||||
|
||||
CudaHandle(const CudaHandle&) = delete;
|
||||
CudaHandle& operator=(const CudaHandle&) = delete;
|
||||
|
||||
CudaHandle& operator=(CudaHandle&& other) {
|
||||
assert(this != &other);
|
||||
reset();
|
||||
std::swap(handle_, other.handle_);
|
||||
return *this;
|
||||
}
|
||||
|
||||
void reset() {
|
||||
if (handle_ != nullptr) {
|
||||
CHECK_CUDA_ERROR(Destroy(handle_));
|
||||
handle_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
operator Handle() const {
|
||||
return handle_;
|
||||
}
|
||||
|
||||
protected:
|
||||
Handle handle_;
|
||||
};
|
||||
|
||||
namespace cu {
|
||||
class Device;
|
||||
}; // namespace cu
|
||||
|
||||
// Wrappers of CUDA resources.
|
||||
class CudaGraph : public CudaHandle<cudaGraph_t, cudaGraphDestroy> {
|
||||
public:
|
||||
using CudaHandle::CudaHandle;
|
||||
explicit CudaGraph(cu::Device& device);
|
||||
void end_capture(cudaStream_t stream);
|
||||
};
|
||||
|
||||
class CudaGraphExec : public CudaHandle<cudaGraphExec_t, cudaGraphExecDestroy> {
|
||||
public:
|
||||
void instantiate(cudaGraph_t graph);
|
||||
};
|
||||
|
||||
class CudaStream : public CudaHandle<cudaStream_t, cudaStreamDestroy> {
|
||||
public:
|
||||
explicit CudaStream(cu::Device& device);
|
||||
};
|
||||
|
||||
} // namespace mlx::core
|
||||
@@ -7,32 +7,26 @@ namespace mlx::core {
|
||||
|
||||
namespace {
|
||||
|
||||
// Create a cudnn tensor descriptor.
|
||||
template <typename Vec>
|
||||
inline cudnn_frontend::Tensor build_cudnn_tensor(
|
||||
int64_t id,
|
||||
const array& x,
|
||||
const Vec& shape,
|
||||
const Vec& strides) {
|
||||
return cudnn_frontend::TensorBuilder()
|
||||
.setDim(shape.size(), shape.data())
|
||||
.setStrides(strides.size(), strides.data())
|
||||
.setId(id)
|
||||
.setAlignment(get_alignment(x))
|
||||
.setDataType(dtype_to_cudnn_type(x.dtype()))
|
||||
.build();
|
||||
}
|
||||
#define RETURN_IF_ERROR(cmd) \
|
||||
if (auto ret = cmd; ret.is_bad()) { \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
// In MLX a singleton dim (shape[dim] == 1) can have any stride, but in cuDNN
|
||||
// whether a tensor is contiguous is determined with:
|
||||
// shape[dim] == shape[dim + 1] * strides[dim + 1]
|
||||
// So a contiguous array with singleton dims in MLX may be mistakenly treated
|
||||
// as strided in cuDNN, and we work around it by normalizing the strides.
|
||||
Strides normalized_strides(const array& x) {
|
||||
if (!x.flags().row_contiguous || x.ndim() < 2) {
|
||||
return x.strides();
|
||||
std::vector<int64_t> normalized_strides(const array& x) {
|
||||
std::vector<int64_t> strides(x.strides().begin(), x.strides().end());
|
||||
if (std::all_of(
|
||||
strides.begin(), strides.end(), [](int64_t s) { return s == 0; })) {
|
||||
strides.back() = 1;
|
||||
return strides;
|
||||
}
|
||||
if (!x.flags().row_contiguous || x.ndim() < 2) {
|
||||
return strides;
|
||||
}
|
||||
Strides strides = x.strides();
|
||||
for (int i = x.ndim() - 2; i >= 0; --i) {
|
||||
if (x.shape(i) == 1) {
|
||||
strides[i] = x.shape(i + 1) * strides[i + 1];
|
||||
@@ -42,7 +36,9 @@ Strides normalized_strides(const array& x) {
|
||||
}
|
||||
|
||||
// Return the shape and strides after transposing from NHWC to NCHW.
|
||||
auto nhwc_to_nchw(SmallVector<int64_t> shape, SmallVector<int64_t> strides) {
|
||||
inline auto nhwc_to_nchw(const array& x) {
|
||||
auto shape = convert_vector<int64_t>(x.shape());
|
||||
auto strides = normalized_strides(x);
|
||||
assert(shape.size() >= 3);
|
||||
shape.insert(shape.begin() + 1, shape.back());
|
||||
shape.erase(shape.end() - 1);
|
||||
@@ -51,225 +47,95 @@ auto nhwc_to_nchw(SmallVector<int64_t> shape, SmallVector<int64_t> strides) {
|
||||
return std::make_tuple(std::move(shape), std::move(strides));
|
||||
}
|
||||
|
||||
inline auto nhwc_to_nchw(const array& x) {
|
||||
return nhwc_to_nchw(
|
||||
convert_vector<int64_t>(x.shape()), normalized_strides(x));
|
||||
}
|
||||
|
||||
// Return available engines for a |op_graph|.
|
||||
cudnn_frontend::EngineConfigList get_cudnn_engine_configs(
|
||||
cudnnBackendDescriptorType_t backend_type,
|
||||
Dtype dtype,
|
||||
cudnn_frontend::OperationGraph& op_graph,
|
||||
bool use_fallback = true) {
|
||||
SmallVector<cudnn_frontend::GeneratorSource, 2> sources;
|
||||
sources.push_back([](auto& op_graph) {
|
||||
auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
|
||||
.setOperationGraph(op_graph)
|
||||
.setHeurMode(CUDNN_HEUR_MODE_A)
|
||||
.build();
|
||||
return heuristics.getEngineConfig(heuristics.getEngineConfigCount());
|
||||
});
|
||||
if (use_fallback) {
|
||||
sources.push_back([&backend_type](auto& op_graph) {
|
||||
auto fallback = cudnn_frontend::EngineFallbackListBuilder()
|
||||
.setOperationGraph(op_graph)
|
||||
.setOperation(backend_type)
|
||||
.build();
|
||||
return fallback.getFallbackList();
|
||||
});
|
||||
}
|
||||
|
||||
auto configs =
|
||||
cudnn_frontend::EngineConfigGenerator(sources.size(), sources.data())
|
||||
.generate_engine_config(op_graph);
|
||||
|
||||
cudnn_frontend::EngineConfigList filtered_configs;
|
||||
cudnn_frontend::filter(configs, filtered_configs, [dtype](auto c) {
|
||||
if (cudnn_frontend::hasNumericalNote<
|
||||
CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(c)) {
|
||||
return true;
|
||||
}
|
||||
if (cudnn_frontend::hasNumericalNote<CUDNN_NUMERICAL_NOTE_TENSOR_CORE>(c) &&
|
||||
dtype == float32 && !env::enable_tf32()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
return filtered_configs;
|
||||
}
|
||||
|
||||
// Take |engine_configs| and |op_graph| and find a working execution plans
|
||||
// from them.
|
||||
std::optional<cudnn_frontend::ExecutionPlan>
|
||||
find_cudnn_plan_from_engine_configs(
|
||||
cudnnHandle_t handle,
|
||||
const cudnn_frontend::EngineConfigList& engine_configs,
|
||||
const cudnn_frontend::OperationGraph& op_graph) {
|
||||
auto op_graph_tag = op_graph.getTag();
|
||||
for (const auto& config : engine_configs) {
|
||||
try {
|
||||
return cudnn_frontend::ExecutionPlanBuilder()
|
||||
.setHandle(handle)
|
||||
.setEngineConfig(config, op_graph_tag)
|
||||
.build();
|
||||
} catch (cudnn_frontend::cudnnException& error) {
|
||||
if (error.getCudnnStatus() != CUDNN_STATUS_NOT_SUPPORTED) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Prepare workspace and args to execute plan.
|
||||
template <typename F>
|
||||
bool prepare_cudnn_plan(
|
||||
cu::CommandEncoder& encoder,
|
||||
cudnn_frontend::ExecutionPlan& plan,
|
||||
int num_args,
|
||||
const int64_t* uids,
|
||||
void** data_ptrs,
|
||||
F&& execute) {
|
||||
int workspace_size = plan.getWorkspaceSize();
|
||||
array workspace(
|
||||
workspace_size > 0 ? allocator::malloc(workspace_size)
|
||||
: allocator::Buffer(nullptr),
|
||||
{workspace_size},
|
||||
uint8);
|
||||
|
||||
auto args = cudnn_frontend::VariantPackBuilder()
|
||||
.setWorkspacePointer(workspace.data<void>())
|
||||
.setDataPointers(num_args, data_ptrs)
|
||||
.setUids(num_args, uids)
|
||||
.build();
|
||||
|
||||
auto handle = encoder.device().cudnn_handle();
|
||||
cudnnSetStream(handle, encoder.stream());
|
||||
|
||||
if (!execute(handle, plan.get_raw_desc(), args.get_raw_desc())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
encoder.add_temporary(workspace);
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
cudnn_frontend::Tensor build_cudnn_tensor(int64_t id, const array& x) {
|
||||
auto shape = convert_vector<int64_t>(x.shape());
|
||||
return build_cudnn_tensor(id, x, shape, normalized_strides(x));
|
||||
fe::error_t DnnGraph::prepare() {
|
||||
RETURN_IF_ERROR(validate());
|
||||
try {
|
||||
RETURN_IF_ERROR(build_operation_graph(handle_));
|
||||
} catch (cudnn_frontend::cudnnException& error) {
|
||||
// cuDNN bug: they did not catch all exceptions in the API.
|
||||
return {fe::error_code_t::CUDNN_BACKEND_API_FAILED, error.what()};
|
||||
}
|
||||
RETURN_IF_ERROR(create_execution_plans({fe::HeurMode_t::A}));
|
||||
return {};
|
||||
}
|
||||
|
||||
cudnn_frontend::Tensor build_cudnn_tensor_nchw(int64_t id, const array& x) {
|
||||
fe::error_t DnnGraph::build() {
|
||||
RETURN_IF_ERROR(check_support(handle_));
|
||||
RETURN_IF_ERROR(build_plans(handle_));
|
||||
return {};
|
||||
}
|
||||
|
||||
fe::error_t DnnGraph::encode_graph(
|
||||
cu::CommandEncoder& encoder,
|
||||
std::unordered_map<int64_t, void*> variant_pack) {
|
||||
cudnnSetStream(handle_, encoder.stream());
|
||||
CudaGraph cuda_graph(encoder.device());
|
||||
RETURN_IF_ERROR(populate_cuda_graph(
|
||||
handle_, variant_pack, prepare_workspace(encoder), cuda_graph));
|
||||
encoder.add_graph_node(cuda_graph);
|
||||
return {};
|
||||
}
|
||||
|
||||
fe::error_t DnnGraph::encode_capturing(
|
||||
cu::CommandEncoder& encoder,
|
||||
std::unordered_map<int64_t, void*> variant_pack) {
|
||||
auto* workspace_ptr = prepare_workspace(encoder);
|
||||
auto capture = encoder.capture_context();
|
||||
cudnnSetStream(handle_, encoder.stream());
|
||||
auto ret = execute(handle_, variant_pack, workspace_ptr);
|
||||
if (ret.is_bad()) {
|
||||
capture.discard = true;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void* DnnGraph::prepare_workspace(cu::CommandEncoder& encoder) {
|
||||
int64_t workspace_size = 0;
|
||||
CHECK_CUDNN_FE_ERROR(get_workspace_size(workspace_size));
|
||||
if (workspace_size > 0) {
|
||||
array workspace(
|
||||
cu::malloc_async(workspace_size, encoder),
|
||||
{static_cast<int>(workspace_size)},
|
||||
uint8);
|
||||
encoder.add_temporary(workspace);
|
||||
return gpu_ptr<void>(workspace);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void DnnGraph::set_tensor_attrs(
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
|
||||
int64_t uid,
|
||||
const array& x,
|
||||
const std::vector<int64_t>& shape,
|
||||
const std::vector<int64_t>& strides) {
|
||||
tensor->set_uid(uid)
|
||||
.set_alignment(get_alignment(x))
|
||||
.set_data_type(dtype_to_cudnn_type(x.dtype()))
|
||||
.set_dim(shape)
|
||||
.set_stride(strides);
|
||||
}
|
||||
|
||||
void DnnGraph::set_tensor_attrs(
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
|
||||
int64_t uid,
|
||||
const array& x) {
|
||||
set_tensor_attrs(
|
||||
tensor,
|
||||
uid,
|
||||
x,
|
||||
convert_vector<int64_t>(x.shape()),
|
||||
normalized_strides(x));
|
||||
}
|
||||
|
||||
void DnnGraph::set_tensor_attrs_nchw(
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
|
||||
int64_t uid,
|
||||
const array& x) {
|
||||
auto [shape, strides] = nhwc_to_nchw(x);
|
||||
return build_cudnn_tensor(id, x, shape, strides);
|
||||
set_tensor_attrs(tensor, uid, x, shape, strides);
|
||||
}
|
||||
|
||||
cudnn_frontend::Tensor build_cudnn_tensor_4d_nchw(int64_t id, const array& x) {
|
||||
if (x.ndim() == 0) {
|
||||
SmallVector<int64_t, 4> scalar_dims = {1, 1, 1, 1};
|
||||
return build_cudnn_tensor(id, x, scalar_dims, scalar_dims);
|
||||
}
|
||||
if (x.ndim() == 1) {
|
||||
int64_t s = x.shape(0);
|
||||
SmallVector<int64_t, 4> shape = {1, x.shape(0), 1, 1};
|
||||
SmallVector<int64_t, 4> strides = {s, 1, s, s};
|
||||
return build_cudnn_tensor(id, x, shape, strides);
|
||||
}
|
||||
if (x.ndim() == 2) {
|
||||
int64_t s =
|
||||
x.flags().row_contiguous ? x.shape(1) * x.strides(1) : x.strides(0);
|
||||
SmallVector<int64_t, 4> shape = {x.shape(0), x.shape(1), 1, 1};
|
||||
SmallVector<int64_t, 4> strides = {s, x.strides(1), s, s};
|
||||
return build_cudnn_tensor(id, x, shape, strides);
|
||||
}
|
||||
if (x.ndim() == 3 || x.ndim() == 4) {
|
||||
return build_cudnn_tensor_nchw(id, x);
|
||||
}
|
||||
throw std::runtime_error(
|
||||
fmt::format("Unsupported array with {} dims.", x.ndim()));
|
||||
}
|
||||
|
||||
cudnn_frontend::Tensor build_cudnn_scalar_4d(int64_t id, Dtype dtype) {
|
||||
SmallVector<int64_t, 4> scalar_dims = {1, 1, 1, 1};
|
||||
return cudnn_frontend::TensorBuilder()
|
||||
.setDim(scalar_dims.size(), scalar_dims.data())
|
||||
.setStrides(scalar_dims.size(), scalar_dims.data())
|
||||
.setId(id)
|
||||
.setAlignment(16)
|
||||
.setDataType(dtype_to_cudnn_type(dtype))
|
||||
.setByValue(true)
|
||||
.build();
|
||||
}
|
||||
|
||||
std::optional<cudnn_frontend::ExecutionPlan> find_cudnn_plan_from_op_graph(
|
||||
cudnnHandle_t handle,
|
||||
cudnnBackendDescriptorType_t backend_type,
|
||||
Dtype dtype,
|
||||
cudnn_frontend::OperationGraph& op_graph) {
|
||||
auto engine_configs = get_cudnn_engine_configs(backend_type, dtype, op_graph);
|
||||
if (engine_configs.empty()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
return find_cudnn_plan_from_engine_configs(handle, engine_configs, op_graph);
|
||||
}
|
||||
|
||||
bool encode_cudnn_plan_with_capturing(
|
||||
cu::CommandEncoder& encoder,
|
||||
cudnn_frontend::ExecutionPlan& plan,
|
||||
int num_args,
|
||||
const int64_t* uids,
|
||||
void** data_ptrs) {
|
||||
return prepare_cudnn_plan(
|
||||
encoder,
|
||||
plan,
|
||||
num_args,
|
||||
uids,
|
||||
data_ptrs,
|
||||
[&](auto handle, auto plan, auto args) {
|
||||
auto capture = encoder.capture_context();
|
||||
if (cudnnBackendExecute(handle, plan, args) != CUDNN_STATUS_SUCCESS) {
|
||||
// Discard the captured graph when failed.
|
||||
capture.discard = true;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
#if CUDNN_VERSION >= 90500
|
||||
bool encode_cudnn_plan_with_graph_api(
|
||||
cu::CommandEncoder& encoder,
|
||||
cudnn_frontend::ExecutionPlan& plan,
|
||||
CudaGraph& graph,
|
||||
int num_args,
|
||||
const int64_t* uids,
|
||||
void** data_ptrs) {
|
||||
return prepare_cudnn_plan(
|
||||
encoder,
|
||||
plan,
|
||||
num_args,
|
||||
uids,
|
||||
data_ptrs,
|
||||
[&](auto handle, auto plan, auto args) {
|
||||
if (!graph) {
|
||||
graph = CudaGraph(encoder.device());
|
||||
if (cudnnBackendPopulateCudaGraph(handle, plan, args, graph) !=
|
||||
CUDNN_STATUS_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (cudnnBackendUpdateCudaGraph(handle, plan, args, graph) !=
|
||||
CUDNN_STATUS_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
encoder.add_graph_node(graph);
|
||||
return true;
|
||||
});
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace mlx::core
|
||||
|
||||
@@ -2,28 +2,34 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "mlx/array.h"
|
||||
#include "mlx/backend/cuda/device/config.h"
|
||||
#include "mlx/backend/cuda/utils.h"
|
||||
#include "mlx/dtype_utils.h"
|
||||
|
||||
#include <cudnn_frontend.h>
|
||||
#include <cudnn_frontend_find_plan.h>
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
namespace cu {
|
||||
class CommandEncoder;
|
||||
}
|
||||
|
||||
namespace fe = cudnn_frontend;
|
||||
|
||||
#define CHECK_CUDNN_FE_ERROR(cmd) \
|
||||
do { \
|
||||
auto error = cmd; \
|
||||
if (!error.is_good()) { \
|
||||
throw std::runtime_error( \
|
||||
fmt::format("{} failed: {}.", #cmd, error.get_message())); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Return pointer alignment of |x|'s data.
|
||||
inline uint8_t get_alignment(const array& x) {
|
||||
uint8_t alignment = 1;
|
||||
uintptr_t address = reinterpret_cast<uintptr_t>(x.data<void>());
|
||||
uintptr_t address = reinterpret_cast<uintptr_t>(gpu_ptr<void>(x));
|
||||
for (; alignment < 32; alignment *= 2) {
|
||||
if (address % (alignment * 2)) {
|
||||
return alignment;
|
||||
@@ -34,8 +40,31 @@ inline uint8_t get_alignment(const array& x) {
|
||||
|
||||
// Convert the type of elements in |vec| to |T|.
|
||||
template <typename T, typename Vec>
|
||||
inline SmallVector<T> convert_vector(const Vec& vec) {
|
||||
return SmallVector<T>(vec.begin(), vec.end());
|
||||
inline std::vector<T> convert_vector(const Vec& vec) {
|
||||
return std::vector<T>(vec.begin(), vec.end());
|
||||
}
|
||||
|
||||
// Map dtype to cudnn data type.
|
||||
inline fe::DataType_t dtype_to_cudnn_type(Dtype dtype) {
|
||||
switch (dtype) {
|
||||
case int8:
|
||||
return fe::DataType_t::INT8;
|
||||
case int32:
|
||||
return fe::DataType_t::INT32;
|
||||
case uint8:
|
||||
return fe::DataType_t::UINT8;
|
||||
case float16:
|
||||
return fe::DataType_t::HALF;
|
||||
case bfloat16:
|
||||
return fe::DataType_t::BFLOAT16;
|
||||
case float32:
|
||||
return fe::DataType_t::FLOAT;
|
||||
case float64:
|
||||
return fe::DataType_t::DOUBLE;
|
||||
default:
|
||||
throw std::runtime_error(fmt::format(
|
||||
"Unsupported dtype in cuDNN: {}.", dtype_to_string(dtype)));
|
||||
}
|
||||
}
|
||||
|
||||
// Return an array that can be used as map key for |vec| with size <= MAX_NDIM.
|
||||
@@ -43,122 +72,100 @@ inline SmallVector<T> convert_vector(const Vec& vec) {
|
||||
// There are 2 differences from the const_param util from kernel_utils.cuh:
|
||||
// 1. The rest of array is filled with 0.
|
||||
// 2. This util can be used in .cpp files.
|
||||
template <typename T, template <typename U> class Vec>
|
||||
inline std::array<T, MAX_NDIM> vector_key(const Vec<T>& vec) {
|
||||
if (vec.size() > MAX_NDIM) {
|
||||
template <int NDIM = MAX_NDIM, typename T, template <typename U> class Vec>
|
||||
inline std::array<T, NDIM> vector_key(const Vec<T>& vec) {
|
||||
if (vec.size() > NDIM) {
|
||||
throw std::runtime_error(
|
||||
fmt::format("ndim can not be larger than {}.", MAX_NDIM));
|
||||
fmt::format("ndim can not be larger than {}.", NDIM));
|
||||
}
|
||||
std::array<T, MAX_NDIM> result = {};
|
||||
std::array<T, NDIM> result = {};
|
||||
std::copy_n(vec.begin(), vec.size(), result.begin());
|
||||
return result;
|
||||
}
|
||||
|
||||
// Helpers used by get_data_ptrs to get pointers.
|
||||
inline void* get_data_ptr(const array& arr) {
|
||||
return const_cast<void*>(arr.data<void>());
|
||||
}
|
||||
|
||||
template <typename T, typename = std::enable_if_t<std::is_scalar_v<T>>>
|
||||
inline void* get_data_ptr(T& scalar) {
|
||||
return &scalar;
|
||||
}
|
||||
|
||||
// Return an array filled with data pointers of args.
|
||||
template <typename... Args>
|
||||
inline std::array<void*, sizeof...(Args)> get_data_ptrs(Args&... args) {
|
||||
return {get_data_ptr(args)...};
|
||||
}
|
||||
|
||||
// Map dtype to cudnn data type.
|
||||
inline cudnnDataType_t dtype_to_cudnn_type(Dtype dtype) {
|
||||
switch (dtype) {
|
||||
case int8:
|
||||
return CUDNN_DATA_INT8;
|
||||
case int32:
|
||||
return CUDNN_DATA_INT32;
|
||||
case uint8:
|
||||
return CUDNN_DATA_UINT8;
|
||||
case float16:
|
||||
return CUDNN_DATA_HALF;
|
||||
case bfloat16:
|
||||
return CUDNN_DATA_BFLOAT16;
|
||||
case float32:
|
||||
return CUDNN_DATA_FLOAT;
|
||||
case float64:
|
||||
return CUDNN_DATA_DOUBLE;
|
||||
default:
|
||||
throw std::runtime_error(fmt::format(
|
||||
"Unsupported dtype in Convolution: {}.", dtype_to_string(dtype)));
|
||||
// Extends cuDNN graph with helpers.
|
||||
class DnnGraph : public fe::graph::Graph {
|
||||
public:
|
||||
DnnGraph(cudnnHandle_t handle, Dtype io_dtype, Dtype compute_dtype = float32)
|
||||
: handle_(handle) {
|
||||
set_io_data_type(dtype_to_cudnn_type(io_dtype));
|
||||
set_intermediate_data_type(dtype_to_cudnn_type(compute_dtype));
|
||||
set_compute_data_type(dtype_to_cudnn_type(compute_dtype));
|
||||
}
|
||||
}
|
||||
|
||||
// Create a tensor descriptor from |x|.
|
||||
cudnn_frontend::Tensor build_cudnn_tensor(int64_t id, const array& x);
|
||||
// Create a cuDNN tensor description from MLX array |x|.
|
||||
auto& tensor(
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>& attrs,
|
||||
int64_t uid,
|
||||
const array& x) {
|
||||
set_tensor_attrs(attrs, uid, x);
|
||||
return attrs;
|
||||
}
|
||||
auto tensor(const char* name, int64_t uid, const array& x) {
|
||||
auto attrs = Graph::tensor(fe::graph::Tensor_attributes().set_name(name));
|
||||
tensor(attrs, uid, x);
|
||||
return attrs;
|
||||
}
|
||||
|
||||
// Create a tensor descriptor from |x|, and transpose from NHWC to NCHW.
|
||||
cudnn_frontend::Tensor build_cudnn_tensor_nchw(int64_t id, const array& x);
|
||||
// Create a cuDNN tensor description from MLX array |x|, and transpose it from
|
||||
// NHWC layout to NCHW.
|
||||
auto& tensor_nchw(
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>& attrs,
|
||||
int64_t uid,
|
||||
const array& x) {
|
||||
set_tensor_attrs_nchw(attrs, uid, x);
|
||||
return attrs;
|
||||
}
|
||||
auto tensor_nchw(const char* name, int64_t uid, const array& x) {
|
||||
auto attrs = Graph::tensor(fe::graph::Tensor_attributes().set_name(name));
|
||||
tensor_nchw(attrs, uid, x);
|
||||
return attrs;
|
||||
}
|
||||
|
||||
// Create a tensor descriptor from |x|, make sure it is 4D, and transpose it
|
||||
// from NHWC to NCHW.
|
||||
cudnn_frontend::Tensor build_cudnn_tensor_4d_nchw(int64_t id, const array& x);
|
||||
// Create a cuDNN tensor for scalar.
|
||||
auto scalar(const char* name, int64_t uid, Dtype dtype) {
|
||||
return Graph::tensor(fe::graph::Tensor_attributes()
|
||||
.set_name(name)
|
||||
.set_uid(uid)
|
||||
.set_dim({1, 1, 1, 1})
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_is_pass_by_value(true)
|
||||
.set_data_type(dtype_to_cudnn_type(dtype)));
|
||||
}
|
||||
|
||||
// Create a 4D scalar tensor descriptor, which is passed by value.
|
||||
cudnn_frontend::Tensor build_cudnn_scalar_4d(int64_t id, Dtype dtype);
|
||||
// Call this before setting notes.
|
||||
fe::error_t prepare();
|
||||
// Call this after setting notes.
|
||||
fe::error_t build();
|
||||
|
||||
// Find a working plan for |op_graph|.
|
||||
std::optional<cudnn_frontend::ExecutionPlan> find_cudnn_plan_from_op_graph(
|
||||
cudnnHandle_t handle,
|
||||
cudnnBackendDescriptorType_t backend_type,
|
||||
Dtype dtype,
|
||||
cudnn_frontend::OperationGraph& op_graph);
|
||||
// Add cuDNN graph to CUDA graph, using native CUDA graph API.
|
||||
fe::error_t encode_graph(
|
||||
cu::CommandEncoder& encoder,
|
||||
std::unordered_map<int64_t, void*> variant_pack);
|
||||
// Add cuDNN graph to CUDA graph, using stream capture.
|
||||
fe::error_t encode_capturing(
|
||||
cu::CommandEncoder& encoder,
|
||||
std::unordered_map<int64_t, void*> variant_pack);
|
||||
|
||||
// Encode the plan to command buffer by capturing.
|
||||
bool encode_cudnn_plan_with_capturing(
|
||||
cu::CommandEncoder& encoder,
|
||||
cudnn_frontend::ExecutionPlan& plan,
|
||||
int num_args,
|
||||
const int64_t* uids,
|
||||
void** data_ptrs);
|
||||
private:
|
||||
void* prepare_workspace(cu::CommandEncoder& encoder);
|
||||
|
||||
#if CUDNN_VERSION >= 90500
|
||||
// Encode the plan to command buffer by using native graph api of cudnn. If the
|
||||
// |graph| is empty it will be populated, otherwise it will be updated.
|
||||
bool encode_cudnn_plan_with_graph_api(
|
||||
cu::CommandEncoder& encoder,
|
||||
cudnn_frontend::ExecutionPlan& plan,
|
||||
CudaGraph& graph,
|
||||
int num_args,
|
||||
const int64_t* uids,
|
||||
void** data_ptrs);
|
||||
#endif
|
||||
void set_tensor_attrs(
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
|
||||
int64_t uid,
|
||||
const array& x,
|
||||
const std::vector<int64_t>& shape,
|
||||
const std::vector<int64_t>& strides);
|
||||
void set_tensor_attrs(
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
|
||||
int64_t uid,
|
||||
const array& x);
|
||||
void set_tensor_attrs_nchw(
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
|
||||
int64_t uid,
|
||||
const array& x);
|
||||
|
||||
// Helpers to make calls like encode_cudnn_plan(..., {'x', 'y', 'z'}, x, y, z).
|
||||
template <typename... Args>
|
||||
bool encode_cudnn_plan(
|
||||
cu::CommandEncoder& encoder,
|
||||
cudnn_frontend::ExecutionPlan& plan,
|
||||
std::initializer_list<int64_t> uids,
|
||||
Args&... args) {
|
||||
assert(uids.size() == sizeof...(args));
|
||||
auto data_ptrs = get_data_ptrs(args...);
|
||||
return encode_cudnn_plan_with_capturing(
|
||||
encoder, plan, uids.size(), uids.begin(), data_ptrs.data());
|
||||
}
|
||||
|
||||
#if CUDNN_VERSION >= 90500
|
||||
template <typename... Args>
|
||||
bool encode_cudnn_plan(
|
||||
cu::CommandEncoder& encoder,
|
||||
cudnn_frontend::ExecutionPlan& plan,
|
||||
CudaGraph& graph,
|
||||
std::initializer_list<int64_t> uids,
|
||||
Args&... args) {
|
||||
assert(uids.size() == sizeof...(args));
|
||||
auto data_ptrs = get_data_ptrs(args...);
|
||||
return encode_cudnn_plan_with_graph_api(
|
||||
encoder, plan, graph, uids.size(), uids.begin(), data_ptrs.data());
|
||||
}
|
||||
#endif
|
||||
cudnnHandle_t handle_;
|
||||
};
|
||||
|
||||
} // namespace mlx::core
|
||||
|
||||
@@ -57,7 +57,7 @@ std::string build_kernel(
|
||||
const std::vector<std::string>& output_names,
|
||||
const std::vector<Dtype>& output_dtypes,
|
||||
const std::vector<std::pair<std::string, TemplateArg>>& template_args,
|
||||
const std::vector<CustomKernelShapeInfo>& shape_infos) {
|
||||
const std::vector<std::tuple<bool, bool, bool>>& shape_infos) {
|
||||
std::string kernel_source;
|
||||
kernel_source.reserve(header.size() + source.size() + 8192);
|
||||
kernel_source += default_header;
|
||||
@@ -81,17 +81,17 @@ std::string build_kernel(
|
||||
kernel_source += ",\n";
|
||||
// Add input shape, strides and ndim if present in the source
|
||||
if (arr.ndim() > 0) {
|
||||
if (shape_infos[i].shape) {
|
||||
if (std::get<0>(shape_infos[i])) {
|
||||
kernel_source += " const __grid_constant__ Shape ";
|
||||
kernel_source += name;
|
||||
kernel_source += "_shape,\n";
|
||||
}
|
||||
if (shape_infos[i].strides) {
|
||||
if (std::get<1>(shape_infos[i])) {
|
||||
kernel_source += " const __grid_constant__ Strides ";
|
||||
kernel_source += name;
|
||||
kernel_source += "_strides,\n";
|
||||
}
|
||||
if (shape_infos[i].ndim) {
|
||||
if (std::get<2>(shape_infos[i])) {
|
||||
kernel_source += " const __grid_constant__ int ";
|
||||
kernel_source += name;
|
||||
kernel_source += "_ndim,\n";
|
||||
@@ -154,12 +154,12 @@ CustomKernelFunction cuda_kernel(
|
||||
"[custom_kernel] Must specify at least one output.");
|
||||
}
|
||||
|
||||
std::vector<CustomKernelShapeInfo> shape_infos;
|
||||
std::vector<std::tuple<bool, bool, bool>> shape_infos;
|
||||
for (auto& n : input_names) {
|
||||
CustomKernelShapeInfo shape_info;
|
||||
shape_info.shape = source.find(n + "_shape") != std::string::npos;
|
||||
shape_info.strides = source.find(n + "_strides") != std::string::npos;
|
||||
shape_info.ndim = source.find(n + "_ndim") != std::string::npos;
|
||||
std::tuple<bool, bool, bool> shape_info;
|
||||
std::get<0>(shape_info) = source.find(n + "_shape") != std::string::npos;
|
||||
std::get<1>(shape_info) = source.find(n + "_strides") != std::string::npos;
|
||||
std::get<2>(shape_info) = source.find(n + "_ndim") != std::string::npos;
|
||||
shape_infos.push_back(shape_info);
|
||||
}
|
||||
|
||||
@@ -254,8 +254,8 @@ std::vector<array> precompiled_cuda_kernel(
|
||||
std::optional<float> init_value,
|
||||
bool ensure_row_contiguous,
|
||||
StreamOrDevice s) {
|
||||
std::vector<CustomKernelShapeInfo> shape_infos(
|
||||
inputs.size(), CustomKernelShapeInfo{false, false, false});
|
||||
std::vector<std::tuple<bool, bool, bool>> shape_infos(
|
||||
inputs.size(), {false, false, false});
|
||||
return array::make_arrays(
|
||||
output_shapes,
|
||||
output_dtypes,
|
||||
@@ -279,6 +279,7 @@ void CustomKernel::eval_gpu(
|
||||
std::vector<array>& outputs) {
|
||||
nvtx3::scoped_range r("CustomKernel::eval_gpu");
|
||||
auto& s = stream();
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
|
||||
std::vector<array> copies;
|
||||
|
||||
@@ -288,7 +289,7 @@ void CustomKernel::eval_gpu(
|
||||
copies.emplace_back(init_value_.value(), out.dtype());
|
||||
fill_gpu(copies.back(), out, s);
|
||||
} else {
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -326,13 +327,13 @@ void CustomKernel::eval_gpu(
|
||||
const array& in = checked_inputs[i];
|
||||
auto& shape_info = shape_infos_[i];
|
||||
args.append(in);
|
||||
if (shape_info.shape) {
|
||||
if (std::get<0>(shape_info)) {
|
||||
args.append_ndim(in.shape());
|
||||
}
|
||||
if (shape_info.strides) {
|
||||
if (std::get<1>(shape_info)) {
|
||||
args.append_ndim(in.strides());
|
||||
}
|
||||
if (shape_info.ndim) {
|
||||
if (std::get<2>(shape_info)) {
|
||||
args.append<int32_t>(in.ndim());
|
||||
}
|
||||
}
|
||||
@@ -356,7 +357,6 @@ void CustomKernel::eval_gpu(
|
||||
dim3 grid((gx + tx - 1) / tx, (gy + ty - 1) / ty, (gz + tz - 1) / tz);
|
||||
|
||||
// Call the kernel
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
for (const auto& in : checked_inputs) {
|
||||
encoder.set_input_array(in);
|
||||
}
|
||||
|
||||
@@ -14,20 +14,20 @@ namespace mlx::core::cu {
|
||||
|
||||
namespace {
|
||||
|
||||
#define CHECK_CUDNN_ERROR(cmd) check_cudnn_error(#cmd, (cmd))
|
||||
|
||||
void check_cudnn_error(const char* name, cudnnStatus_t err) {
|
||||
if (err != CUDNN_STATUS_SUCCESS) {
|
||||
throw std::runtime_error(
|
||||
fmt::format("{} failed: {}.", name, cudnnGetErrorString(err)));
|
||||
}
|
||||
bool use_cuda_graphs() {
|
||||
static bool use_graphs = env::get_var("MLX_USE_CUDA_GRAPHS", true);
|
||||
return use_graphs;
|
||||
}
|
||||
|
||||
bool use_cuda_graphs() {
|
||||
static bool use_graphs = []() {
|
||||
return env::get_var("MLX_USE_CUDA_GRAPHS", true);
|
||||
const char* save_cuda_graphs_dot_file() {
|
||||
static const char* filename = []() -> const char* {
|
||||
const char* env = std::getenv("MLX_SAVE_CUDA_GRAPHS_DOT_FILE");
|
||||
if (env && std::strlen(env) == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
return env;
|
||||
}();
|
||||
return use_graphs;
|
||||
return filename;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -46,6 +46,7 @@ Device::Device(int device) : device_(device) {
|
||||
"Device {} does not support synchronization in managed memory.",
|
||||
device_));
|
||||
}
|
||||
|
||||
// The cublasLt handle is used by matmul.
|
||||
make_current();
|
||||
CHECK_CUBLAS_ERROR(cublasLtCreate(<_));
|
||||
@@ -86,7 +87,7 @@ CommandEncoder::CaptureContext::CaptureContext(CommandEncoder& enc) : enc(enc) {
|
||||
return;
|
||||
}
|
||||
CHECK_CUDA_ERROR(
|
||||
cudaStreamBeginCapture(enc.stream(), cudaStreamCaptureModeGlobal));
|
||||
cudaStreamBeginCapture(enc.stream(), cudaStreamCaptureModeThreadLocal));
|
||||
}
|
||||
|
||||
CommandEncoder::CaptureContext::~CaptureContext() {
|
||||
@@ -114,18 +115,17 @@ CommandEncoder::ConcurrentContext::~ConcurrentContext() {
|
||||
}
|
||||
|
||||
// Use an empty graph node for synchronization
|
||||
CommandEncoder::GraphNode empty{NULL, 'E', std::to_string(enc.node_count_++)};
|
||||
enc.empty_node_count_++;
|
||||
CommandEncoder::GraphNode empty{NULL, "E", std::to_string(enc.node_count_++)};
|
||||
CHECK_CUDA_ERROR(cudaGraphAddEmptyNode(&empty.node, enc.graph_, NULL, 0));
|
||||
|
||||
// Insert the concurrent -> empty node dependencies
|
||||
for (auto& from : enc.concurrent_nodes_) {
|
||||
enc.from_nodes_.push_back(from.node);
|
||||
enc.to_nodes_.push_back(empty.node);
|
||||
enc.graph_key_ += from.id;
|
||||
enc.graph_key_ += from.node_type;
|
||||
enc.graph_key_ += empty.id;
|
||||
enc.graph_key_ += empty.node_type;
|
||||
enc.graph_deps_key_ += from.id;
|
||||
enc.graph_deps_key_ += "-";
|
||||
enc.graph_deps_key_ += empty.id;
|
||||
enc.graph_deps_key_ += "-";
|
||||
}
|
||||
|
||||
// Insert the input -> concurrent node dependencies without updating output
|
||||
@@ -140,9 +140,6 @@ CommandEncoder::ConcurrentContext::~ConcurrentContext() {
|
||||
}
|
||||
|
||||
void CommandEncoder::insert_graph_dependencies(GraphNode node) {
|
||||
if (node.node_type == 'G') {
|
||||
graph_node_count_++;
|
||||
}
|
||||
node.id = std::to_string(node_count_++);
|
||||
if (in_concurrent_) {
|
||||
concurrent_nodes_.push_back(std::move(node));
|
||||
@@ -154,6 +151,10 @@ void CommandEncoder::insert_graph_dependencies(GraphNode node) {
|
||||
}
|
||||
|
||||
void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) {
|
||||
for (auto& node : nodes) {
|
||||
graph_nodes_key_ += node.node_type;
|
||||
graph_nodes_key_ += "-";
|
||||
}
|
||||
std::vector<GraphNode> deps;
|
||||
{
|
||||
// Dependencies must be added in the same order to produce a consistent
|
||||
@@ -181,20 +182,49 @@ void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) {
|
||||
for (auto& to : nodes) {
|
||||
from_nodes_.push_back(from.node);
|
||||
to_nodes_.push_back(to.node);
|
||||
graph_key_ += from.id;
|
||||
graph_key_ += from.node_type;
|
||||
graph_key_ += to.id;
|
||||
graph_key_ += to.node_type;
|
||||
graph_deps_key_ += from.id;
|
||||
graph_deps_key_ += "-";
|
||||
graph_deps_key_ += to.id;
|
||||
graph_deps_key_ += "-";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Can be tuned with MLX_MAX_OPS_PER_BUFFER, MLX_MAX_MB_PER_BUFFER
|
||||
std::pair<int, int> get_graph_limits(Device& d) {
|
||||
auto cc =
|
||||
d.compute_capability_major() * 100 + d.compute_capability_minor() * 10;
|
||||
int ops = 20;
|
||||
int mb = 100;
|
||||
switch (cc) {
|
||||
case 800: // A100
|
||||
ops = 20;
|
||||
mb = 400;
|
||||
break;
|
||||
case 900: // H100
|
||||
ops = 30;
|
||||
mb = 400;
|
||||
break;
|
||||
case 1000: // B200
|
||||
ops = 50;
|
||||
mb = 500;
|
||||
break;
|
||||
case 1210: // DGX Spark
|
||||
ops = 20;
|
||||
mb = 25;
|
||||
break;
|
||||
}
|
||||
return {env::max_ops_per_buffer(ops), env::max_mb_per_buffer(mb)};
|
||||
}
|
||||
|
||||
CommandEncoder::CommandEncoder(Device& d)
|
||||
: device_(d),
|
||||
stream_(d),
|
||||
graph_(d),
|
||||
worker_(d),
|
||||
graph_cache_("MLX_CUDA_GRAPH_CACHE_SIZE", /* default_capacity */ 400) {}
|
||||
graph_cache_("MLX_CUDA_GRAPH_CACHE_SIZE", /* default_capacity */ 400) {
|
||||
std::tie(max_ops_per_graph_, max_mb_per_graph_) = get_graph_limits(d);
|
||||
}
|
||||
|
||||
void CommandEncoder::add_completed_handler(std::function<void()> task) {
|
||||
worker_.add_task(std::move(task));
|
||||
@@ -204,6 +234,7 @@ void CommandEncoder::set_input_array(const array& arr) {
|
||||
if (!use_cuda_graphs()) {
|
||||
return;
|
||||
}
|
||||
bytes_in_graph_ += arr.data_size();
|
||||
auto id = reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
|
||||
active_deps_.push_back(id);
|
||||
}
|
||||
@@ -278,13 +309,55 @@ void CommandEncoder::add_kernel_node(
|
||||
void CommandEncoder::add_kernel_node(const cudaKernelNodeParams& params) {
|
||||
cudaGraphNode_t node;
|
||||
CHECK_CUDA_ERROR(cudaGraphAddKernelNode(&node, graph_, NULL, 0, ¶ms));
|
||||
insert_graph_dependencies(GraphNode{node, 'K'});
|
||||
insert_graph_dependencies(GraphNode{node, "K"});
|
||||
}
|
||||
|
||||
void CommandEncoder::add_kernel_node(const CUDA_KERNEL_NODE_PARAMS& params) {
|
||||
CUgraphNode node;
|
||||
CHECK_CUDA_ERROR(cuGraphAddKernelNode(&node, graph_, NULL, 0, ¶ms));
|
||||
insert_graph_dependencies(GraphNode{node, 'K'});
|
||||
insert_graph_dependencies(GraphNode{node, "K"});
|
||||
}
|
||||
|
||||
bool is_graph_updatable(cudaGraph_t graph, int& cluster_dim_x) {
|
||||
// CUDA graphs do not get updated correctly if a kernel node getting updated
|
||||
// has a different cluster shape than the node it's being updated with.
|
||||
size_t num_nodes = 0;
|
||||
CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, nullptr, &num_nodes));
|
||||
if (num_nodes == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<cudaGraphNode_t> nodes(num_nodes);
|
||||
CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, nodes.data(), &num_nodes));
|
||||
for (const auto& node : nodes) {
|
||||
cudaGraphNodeType type;
|
||||
CHECK_CUDA_ERROR(cudaGraphNodeGetType(node, &type));
|
||||
if (type == cudaGraphNodeTypeGraph) {
|
||||
// Try to be updatable for a structure like graph -> graph -> kernel
|
||||
if (num_nodes > 1) {
|
||||
return false;
|
||||
}
|
||||
cudaGraph_t child;
|
||||
CHECK_CUDA_ERROR(cudaGraphChildGraphNodeGetGraph(node, &child));
|
||||
return is_graph_updatable(child, cluster_dim_x);
|
||||
} else if (type != cudaGraphNodeTypeKernel) {
|
||||
return false;
|
||||
} else {
|
||||
cudaLaunchAttributeValue cluster_dim;
|
||||
CHECK_CUDA_ERROR(cudaGraphKernelNodeGetAttribute(
|
||||
node, cudaLaunchAttributeClusterDimension, &cluster_dim));
|
||||
// Only dim.x can be greater than 1
|
||||
if (cluster_dim.clusterDim.y > 1 || cluster_dim.clusterDim.z > 1) {
|
||||
return false;
|
||||
}
|
||||
// Only one child node allowed when subgraph uses clusters
|
||||
if (cluster_dim.clusterDim.x > 0 && num_nodes > 1) {
|
||||
return false;
|
||||
}
|
||||
cluster_dim_x = cluster_dim.clusterDim.x;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void CommandEncoder::add_graph_node(cudaGraph_t child) {
|
||||
@@ -297,12 +370,16 @@ void CommandEncoder::add_graph_node(cudaGraph_t child) {
|
||||
return;
|
||||
}
|
||||
cudaGraphNode_t node;
|
||||
int cluster_dim_x = 0;
|
||||
is_graph_updatable_ &= is_graph_updatable(child, cluster_dim_x);
|
||||
CHECK_CUDA_ERROR(cudaGraphAddChildGraphNode(&node, graph_, NULL, 0, child));
|
||||
insert_graph_dependencies(GraphNode{node, 'G'});
|
||||
insert_graph_dependencies(
|
||||
GraphNode{node, "G" + std::to_string(cluster_dim_x)});
|
||||
}
|
||||
|
||||
int CommandEncoder::get_num_ops() {
|
||||
return node_count_;
|
||||
bool CommandEncoder::needs_commit() {
|
||||
return (node_count_ > max_ops_per_graph_) ||
|
||||
((bytes_in_graph_ >> 20) > max_mb_per_graph_);
|
||||
}
|
||||
|
||||
void CommandEncoder::commit() {
|
||||
@@ -322,53 +399,63 @@ void CommandEncoder::commit() {
|
||||
from_nodes_.size()));
|
||||
}
|
||||
|
||||
graph_key_ += ".";
|
||||
graph_key_ += std::to_string(node_count_);
|
||||
graph_key_ += ".";
|
||||
graph_key_ += std::to_string(graph_node_count_);
|
||||
graph_key_ += ".";
|
||||
graph_key_ += std::to_string(empty_node_count_);
|
||||
|
||||
CudaGraphExec& graph_exec = graph_cache_[graph_key_];
|
||||
|
||||
if (graph_exec != nullptr) {
|
||||
cudaGraphExecUpdateResult update_result;
|
||||
#if CUDART_VERSION >= 12000
|
||||
cudaGraphExecUpdateResultInfo info;
|
||||
cudaGraphExecUpdate(graph_exec, graph_, &info);
|
||||
update_result = info.result;
|
||||
#else
|
||||
cudaGraphNode_t error_node;
|
||||
cudaGraphExecUpdate(graph_exec, graph_, &error_node, &update_result);
|
||||
#endif // CUDART_VERSION >= 12000
|
||||
if (update_result != cudaGraphExecUpdateSuccess) {
|
||||
cudaGetLastError(); // reset error
|
||||
graph_exec.reset();
|
||||
}
|
||||
}
|
||||
if (graph_exec == nullptr) {
|
||||
graph_exec.instantiate(graph_);
|
||||
}
|
||||
device_.make_current();
|
||||
CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
|
||||
|
||||
if (!is_graph_updatable_) {
|
||||
CudaGraphExec graph_exec;
|
||||
graph_exec.instantiate(graph_);
|
||||
CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
|
||||
} else {
|
||||
auto graph_key = graph_nodes_key_ + ":" + graph_deps_key_;
|
||||
auto& graph_exec = graph_cache_[graph_key];
|
||||
|
||||
if (graph_exec != nullptr) {
|
||||
cudaGraphExecUpdateResult update_result;
|
||||
#if CUDART_VERSION >= 12000
|
||||
cudaGraphExecUpdateResultInfo info;
|
||||
cudaGraphExecUpdate(graph_exec, graph_, &info);
|
||||
update_result = info.result;
|
||||
#else
|
||||
cudaGraphNode_t error_node;
|
||||
cudaGraphExecUpdate(graph_exec, graph_, &error_node, &update_result);
|
||||
#endif // CUDART_VERSION >= 12000
|
||||
if (update_result != cudaGraphExecUpdateSuccess) {
|
||||
cudaGetLastError(); // reset error
|
||||
graph_exec.reset();
|
||||
}
|
||||
}
|
||||
if (graph_exec == nullptr) {
|
||||
graph_exec.instantiate(graph_);
|
||||
}
|
||||
|
||||
CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
|
||||
}
|
||||
|
||||
// Save cuda graph to dot file
|
||||
if (const char* filename = save_cuda_graphs_dot_file(); filename) {
|
||||
static int count = 0;
|
||||
auto path = fmt::format("{}_{}.dot", filename, ++count);
|
||||
CHECK_CUDA_ERROR(cudaGraphDebugDotPrint(graph_, path.c_str(), 0));
|
||||
}
|
||||
|
||||
// Reset state
|
||||
graph_node_count_ = 0;
|
||||
empty_node_count_ = 0;
|
||||
from_nodes_.clear();
|
||||
to_nodes_.clear();
|
||||
graph_key_.clear();
|
||||
graph_deps_key_.clear();
|
||||
graph_nodes_key_.clear();
|
||||
node_map_.clear();
|
||||
graph_ = CudaGraph(device_);
|
||||
is_graph_updatable_ = true;
|
||||
}
|
||||
|
||||
// Put completion handlers in a batch.
|
||||
worker_.commit(stream_);
|
||||
node_count_ = 0;
|
||||
bytes_in_graph_ = 0;
|
||||
}
|
||||
|
||||
void CommandEncoder::synchronize() {
|
||||
cudaStreamSynchronize(stream_);
|
||||
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
|
||||
auto p = std::make_shared<std::promise<void>>();
|
||||
std::future<void> f = p->get_future();
|
||||
add_completed_handler([p = std::move(p)]() { p->set_value(); });
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "mlx/array.h"
|
||||
#include "mlx/backend/cuda/allocator.h"
|
||||
#include "mlx/backend/cuda/lru_cache.h"
|
||||
#include "mlx/backend/cuda/worker.h"
|
||||
#include "mlx/stream.h"
|
||||
@@ -83,7 +84,7 @@ class CommandEncoder {
|
||||
}
|
||||
|
||||
void add_completed_handler(std::function<void()> task);
|
||||
int get_num_ops();
|
||||
bool needs_commit();
|
||||
void commit();
|
||||
|
||||
Device& device() {
|
||||
@@ -105,8 +106,9 @@ class CommandEncoder {
|
||||
cudaGraphNode_t node;
|
||||
// K = kernel
|
||||
// E = empty
|
||||
// G = subgraph
|
||||
char node_type;
|
||||
// G* = subgraph (with metadata)
|
||||
// Symbols ':', '-' are reserved as separators
|
||||
std::string node_type;
|
||||
std::string id;
|
||||
};
|
||||
|
||||
@@ -118,18 +120,21 @@ class CommandEncoder {
|
||||
CudaGraph graph_;
|
||||
Worker worker_;
|
||||
char node_count_{0};
|
||||
char graph_node_count_{0};
|
||||
char empty_node_count_{0};
|
||||
bool in_concurrent_{false};
|
||||
std::vector<cudaGraphNode_t> from_nodes_;
|
||||
std::vector<cudaGraphNode_t> to_nodes_;
|
||||
std::string graph_key_;
|
||||
std::string graph_nodes_key_;
|
||||
std::string graph_deps_key_;
|
||||
std::vector<GraphNode> concurrent_nodes_;
|
||||
std::vector<std::shared_ptr<array::Data>> temporaries_;
|
||||
LRUCache<std::string, CudaGraphExec> graph_cache_;
|
||||
std::vector<std::uintptr_t> active_deps_;
|
||||
std::vector<std::uintptr_t> active_outputs_;
|
||||
std::unordered_map<std::uintptr_t, GraphNode> node_map_;
|
||||
size_t bytes_in_graph_{0};
|
||||
bool is_graph_updatable_{true};
|
||||
int max_ops_per_graph_;
|
||||
int max_mb_per_graph_;
|
||||
};
|
||||
|
||||
class Device {
|
||||
@@ -165,6 +170,7 @@ class Device {
|
||||
int device_;
|
||||
int compute_capability_major_;
|
||||
int compute_capability_minor_;
|
||||
std::string device_name_;
|
||||
cublasLtHandle_t lt_;
|
||||
cudnnHandle_t cudnn_;
|
||||
std::unordered_map<int, CommandEncoder> encoders_;
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda_fp8.h>
|
||||
|
||||
#include "mlx/backend/cuda/device/fp16_math.cuh"
|
||||
#include "mlx/backend/cuda/device/utils.cuh"
|
||||
|
||||
@@ -334,4 +336,17 @@ struct Tanh {
|
||||
}
|
||||
};
|
||||
|
||||
struct ToFP8 {
|
||||
template <typename T>
|
||||
__device__ uint8_t operator()(T x) {
|
||||
return __nv_fp8_e4m3(x).__x;
|
||||
}
|
||||
};
|
||||
|
||||
struct FromFP8 {
|
||||
__device__ float operator()(uint8_t x) {
|
||||
return float(*(__nv_fp8_e4m3*)(&x));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace mlx::core::cu
|
||||
|
||||
@@ -15,8 +15,10 @@ void AllReduce::eval_gpu(
|
||||
assert(inputs.size() == 1);
|
||||
assert(outputs.size() == 1);
|
||||
|
||||
auto set_input_output =
|
||||
[s = stream()](const array& in, array& out) -> std::pair<array, array> {
|
||||
auto& s = stream();
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
auto set_input_output = [&](const array& in,
|
||||
array& out) -> std::pair<array, array> {
|
||||
if (!in.flags().row_contiguous) {
|
||||
copy_gpu(in, out, CopyType::General, s);
|
||||
return {out, out};
|
||||
@@ -24,19 +26,17 @@ void AllReduce::eval_gpu(
|
||||
out.copy_shared_buffer(in);
|
||||
return {in, out};
|
||||
} else {
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
return {in, out};
|
||||
}
|
||||
};
|
||||
|
||||
auto [input, output] = set_input_output(inputs[0], outputs[0]);
|
||||
|
||||
auto& encoder = cu::get_command_encoder(stream());
|
||||
encoder.set_input_array(input);
|
||||
encoder.set_output_array(output);
|
||||
|
||||
auto capture = encoder.capture_context();
|
||||
auto& s = stream();
|
||||
|
||||
switch (reduce_type_) {
|
||||
case Sum:
|
||||
@@ -53,4 +53,69 @@ void AllReduce::eval_gpu(
|
||||
"Only all reduce sum, max, and min are supported.");
|
||||
}
|
||||
}
|
||||
|
||||
void AllGather::eval_gpu(
|
||||
const std::vector<array>& inputs,
|
||||
std::vector<array>& outputs) {
|
||||
assert(inputs.size() == 1);
|
||||
assert(outputs.size() == 1);
|
||||
|
||||
auto& s = stream();
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
|
||||
auto ensure_contiguous = [&s, &encoder](const array& x) {
|
||||
if (x.flags().row_contiguous) {
|
||||
return x;
|
||||
} else {
|
||||
array x_copy = contiguous_copy_gpu(x, s);
|
||||
encoder.add_temporary(x_copy);
|
||||
return x_copy;
|
||||
}
|
||||
};
|
||||
|
||||
auto input = ensure_contiguous(inputs[0]);
|
||||
outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder));
|
||||
|
||||
encoder.set_input_array(input);
|
||||
encoder.set_output_array(outputs[0]);
|
||||
|
||||
auto capture = encoder.capture_context();
|
||||
distributed::detail::all_gather(group(), input, outputs[0], s);
|
||||
}
|
||||
|
||||
void ReduceScatter::eval_gpu(
|
||||
const std::vector<array>& inputs,
|
||||
std::vector<array>& outputs) {
|
||||
assert(inputs.size() == 1);
|
||||
assert(outputs.size() == 1);
|
||||
|
||||
auto& s = stream();
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
|
||||
auto ensure_contiguous = [&s, &encoder](const array& x) {
|
||||
if (x.flags().row_contiguous) {
|
||||
return x;
|
||||
} else {
|
||||
array x_copy = contiguous_copy_gpu(x, s);
|
||||
encoder.add_temporary(x_copy);
|
||||
return x_copy;
|
||||
}
|
||||
};
|
||||
|
||||
auto input = ensure_contiguous(inputs[0]);
|
||||
outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder));
|
||||
|
||||
encoder.set_input_array(input);
|
||||
encoder.set_output_array(outputs[0]);
|
||||
|
||||
auto capture = encoder.capture_context();
|
||||
|
||||
switch (reduce_type_) {
|
||||
case Sum:
|
||||
distributed::detail::sum_scatter(group(), input, outputs[0], s);
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Only sum scatter is supported. ");
|
||||
}
|
||||
}
|
||||
} // namespace mlx::core::distributed
|
||||
|
||||
@@ -11,9 +11,6 @@
|
||||
|
||||
namespace mlx::core::gpu {
|
||||
|
||||
// Can be tuned with MLX_MAX_OPS_PER_BUFFER
|
||||
constexpr int default_max_nodes_per_graph = 20;
|
||||
|
||||
bool is_available() {
|
||||
return true;
|
||||
}
|
||||
@@ -53,8 +50,7 @@ void eval(array& arr) {
|
||||
encoder.add_temporary(s);
|
||||
}
|
||||
|
||||
if (encoder.get_num_ops() >=
|
||||
env::max_ops_per_buffer(default_max_nodes_per_graph)) {
|
||||
if (encoder.needs_commit()) {
|
||||
scheduler::notify_new_task(stream);
|
||||
encoder.add_completed_handler(
|
||||
[stream]() { scheduler::notify_task_completion(stream); });
|
||||
|
||||
@@ -305,6 +305,7 @@ void Event::wait() {
|
||||
} else {
|
||||
event->atomic->wait(value());
|
||||
}
|
||||
CHECK_CUDA_ERROR(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
void Event::wait(Stream s) {
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
|
||||
#include "mlx/fence.h"
|
||||
#include "mlx/backend/cuda/allocator.h"
|
||||
#include "mlx/backend/cuda/device.h"
|
||||
#include "mlx/backend/cuda/event.h"
|
||||
|
||||
namespace mlx::core {
|
||||
@@ -20,8 +22,24 @@ void Fence::wait(Stream s, const array&) {
|
||||
fence->event.wait(fence->count);
|
||||
}
|
||||
|
||||
void Fence::update(Stream s, const array&) {
|
||||
void Fence::update(Stream s, const array& a, bool cross_device) {
|
||||
auto* fence = static_cast<FenceImpl*>(fence_.get());
|
||||
if (cross_device) {
|
||||
// Move to managed memory if there is a device switch
|
||||
auto& cbuf =
|
||||
*static_cast<cu::CudaBuffer*>(const_cast<array&>(a).buffer().ptr());
|
||||
if (cbuf.device != -1) {
|
||||
void* new_data;
|
||||
CHECK_CUDA_ERROR(cudaMallocManaged(&new_data, cbuf.size));
|
||||
cbuf.device = -1;
|
||||
auto& encoder = cu::device(s.device).get_command_encoder(s);
|
||||
encoder.commit();
|
||||
CHECK_CUDA_ERROR(cudaMemcpyAsync(
|
||||
new_data, cbuf.data, cbuf.size, cudaMemcpyDefault, encoder.stream()));
|
||||
CHECK_CUDA_ERROR(cudaFreeAsync(cbuf.data, encoder.stream()));
|
||||
cbuf.data = new_data;
|
||||
}
|
||||
}
|
||||
fence->count++;
|
||||
fence->event.signal(s, fence->count);
|
||||
}
|
||||
|
||||
@@ -241,7 +241,7 @@ void CublasGemm::set_bias(cu::CommandEncoder& encoder, const array& bias) {
|
||||
CUBLASLT_MATMUL_DESC_EPILOGUE,
|
||||
&epilogue,
|
||||
sizeof(epilogue)));
|
||||
auto* bias_ptr = bias.data<void>();
|
||||
auto* bias_ptr = gpu_ptr<void>(bias);
|
||||
CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
|
||||
matmul_desc_,
|
||||
CUBLASLT_MATMUL_DESC_BIAS_POINTER,
|
||||
@@ -278,9 +278,9 @@ void CublasGemm::run(
|
||||
|
||||
execute(
|
||||
encoder,
|
||||
out.data<void>(),
|
||||
a.data<void>(),
|
||||
b.data<void>(),
|
||||
gpu_ptr<void>(out),
|
||||
gpu_ptr<void>(a),
|
||||
gpu_ptr<void>(b),
|
||||
nullptr,
|
||||
alpha);
|
||||
}
|
||||
@@ -321,10 +321,10 @@ void CublasGemm::run(
|
||||
|
||||
execute(
|
||||
encoder,
|
||||
out.data<void>(),
|
||||
a.data<void>(),
|
||||
b.data<void>(),
|
||||
c.data<void>(),
|
||||
gpu_ptr<void>(out),
|
||||
gpu_ptr<void>(a),
|
||||
gpu_ptr<void>(b),
|
||||
gpu_ptr<void>(c),
|
||||
alpha,
|
||||
beta);
|
||||
}
|
||||
@@ -370,11 +370,11 @@ void CublasGemm::execute(
|
||||
// Ensure workspace is 256-byte aligned
|
||||
int nbytes = cuda::ceil_div(heuristic_.workspaceSize, 256) * 256;
|
||||
array workspace(
|
||||
allocator::malloc(nbytes),
|
||||
cu::malloc_async(nbytes, encoder),
|
||||
{static_cast<int>(heuristic_.workspaceSize)},
|
||||
int8);
|
||||
encoder.add_temporary(workspace);
|
||||
workspace_ptr = workspace.data<void>();
|
||||
workspace_ptr = gpu_ptr<void>(workspace);
|
||||
}
|
||||
|
||||
auto capture = encoder.capture_context();
|
||||
|
||||
@@ -25,9 +25,10 @@ void CublasGemm::run_batched(
|
||||
for (size_t i = 0; i < nbatch; ++i) {
|
||||
execute(
|
||||
encoder,
|
||||
out.data<int8_t>() + out.itemsize() * i * batch_shape.back() * M_ * N_,
|
||||
a.data<int8_t>() + a.itemsize() * a_it.loc,
|
||||
b.data<int8_t>() + b.itemsize() * b_it.loc,
|
||||
gpu_ptr<int8_t>(out) +
|
||||
out.itemsize() * i * batch_shape.back() * M_ * N_,
|
||||
gpu_ptr<int8_t>(a) + a.itemsize() * a_it.loc,
|
||||
gpu_ptr<int8_t>(b) + b.itemsize() * b_it.loc,
|
||||
nullptr,
|
||||
alpha);
|
||||
a_it.step();
|
||||
@@ -60,10 +61,11 @@ void CublasGemm::run_batched(
|
||||
for (size_t i = 0; i < nbatch; ++i) {
|
||||
execute(
|
||||
encoder,
|
||||
out.data<int8_t>() + out.itemsize() * i * batch_shape.back() * M_ * N_,
|
||||
a.data<int8_t>() + a.itemsize() * a_it.loc,
|
||||
b.data<int8_t>() + b.itemsize() * b_it.loc,
|
||||
c.data<int8_t>() + c.itemsize() * c_it.loc,
|
||||
gpu_ptr<int8_t>(out) +
|
||||
out.itemsize() * i * batch_shape.back() * M_ * N_,
|
||||
gpu_ptr<int8_t>(a) + a.itemsize() * a_it.loc,
|
||||
gpu_ptr<int8_t>(b) + b.itemsize() * b_it.loc,
|
||||
gpu_ptr<int8_t>(c) + c.itemsize() * c_it.loc,
|
||||
alpha,
|
||||
beta);
|
||||
a_it.step();
|
||||
|
||||
@@ -163,7 +163,7 @@ void CublasGemm::run_batched(
|
||||
|
||||
// Launch kernel to set device offsets
|
||||
auto pointers = array(
|
||||
allocator::malloc(batch_count * sizeof(void*) * 3),
|
||||
cu::malloc_async(batch_count * sizeof(void*) * 3, encoder),
|
||||
{batch_count * 3},
|
||||
uint64);
|
||||
|
||||
@@ -183,10 +183,10 @@ void CublasGemm::run_batched(
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
pointers.data<int8_t*>(),
|
||||
a.data<int8_t>(),
|
||||
b.data<int8_t>(),
|
||||
out.data<int8_t>(),
|
||||
gpu_ptr<int8_t*>(pointers),
|
||||
gpu_ptr<int8_t>(a),
|
||||
gpu_ptr<int8_t>(b),
|
||||
gpu_ptr<int8_t>(out),
|
||||
item_size,
|
||||
const_param<ndim_constant()>(batch_shape),
|
||||
const_param<ndim_constant()>(a_batch_strides),
|
||||
@@ -200,10 +200,10 @@ void CublasGemm::run_batched(
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
pointers.data<int8_t*>(),
|
||||
a.data<int8_t>(),
|
||||
b.data<int8_t>(),
|
||||
out.data<int8_t>(),
|
||||
gpu_ptr<int8_t*>(pointers),
|
||||
gpu_ptr<int8_t>(a),
|
||||
gpu_ptr<int8_t>(b),
|
||||
gpu_ptr<int8_t>(out),
|
||||
item_size,
|
||||
const_param(batch_shape),
|
||||
const_param(a_batch_strides),
|
||||
@@ -219,7 +219,7 @@ void CublasGemm::run_batched(
|
||||
encoder.set_input_array(b);
|
||||
encoder.set_output_array(out);
|
||||
|
||||
auto a_pointers = pointers.data<int8_t*>();
|
||||
auto a_pointers = gpu_ptr<int8_t*>(pointers);
|
||||
auto b_pointers = a_pointers + batch_count;
|
||||
auto out_pointers = b_pointers + batch_count;
|
||||
execute(
|
||||
@@ -251,7 +251,7 @@ void CublasGemm::run_batched(
|
||||
|
||||
// Launch kernel to set device offsets
|
||||
auto pointers = array(
|
||||
allocator::malloc(batch_count * sizeof(uint64_t) * 4),
|
||||
cu::malloc_async(batch_count * sizeof(uint64_t) * 4, encoder),
|
||||
{batch_count * 4},
|
||||
uint64);
|
||||
|
||||
@@ -271,11 +271,11 @@ void CublasGemm::run_batched(
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
pointers.data<int8_t*>(),
|
||||
a.data<int8_t>(),
|
||||
b.data<int8_t>(),
|
||||
c.data<int8_t>(),
|
||||
out.data<int8_t>(),
|
||||
gpu_ptr<int8_t*>(pointers),
|
||||
gpu_ptr<int8_t>(a),
|
||||
gpu_ptr<int8_t>(b),
|
||||
gpu_ptr<int8_t>(c),
|
||||
gpu_ptr<int8_t>(out),
|
||||
item_size,
|
||||
const_param<ndim_constant()>(batch_shape),
|
||||
const_param<ndim_constant()>(a_batch_strides),
|
||||
@@ -290,11 +290,11 @@ void CublasGemm::run_batched(
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
pointers.data<int8_t*>(),
|
||||
a.data<int8_t>(),
|
||||
b.data<int8_t>(),
|
||||
c.data<int8_t>(),
|
||||
out.data<int8_t>(),
|
||||
gpu_ptr<int8_t*>(pointers),
|
||||
gpu_ptr<int8_t>(a),
|
||||
gpu_ptr<int8_t>(b),
|
||||
gpu_ptr<int8_t>(c),
|
||||
gpu_ptr<int8_t>(out),
|
||||
item_size,
|
||||
const_param(batch_shape),
|
||||
const_param(a_batch_strides),
|
||||
@@ -312,7 +312,7 @@ void CublasGemm::run_batched(
|
||||
encoder.set_input_array(c);
|
||||
encoder.set_output_array(out);
|
||||
|
||||
auto a_pointers = pointers.data<int8_t*>();
|
||||
auto a_pointers = gpu_ptr<int8_t*>(pointers);
|
||||
auto b_pointers = a_pointers + batch_count;
|
||||
auto c_pointers = b_pointers + batch_count;
|
||||
auto out_pointers = c_pointers + batch_count;
|
||||
|
||||
@@ -149,13 +149,13 @@ void gemv(
|
||||
auto vec_strides = const_param(b_batch_strides);
|
||||
|
||||
if (M == 1) {
|
||||
mat = b.data<DataType>();
|
||||
vec = a.data<DataType>();
|
||||
mat = gpu_ptr<DataType>(b);
|
||||
vec = gpu_ptr<DataType>(a);
|
||||
rows = N;
|
||||
std::swap(mat_strides, vec_strides);
|
||||
} else {
|
||||
mat = a.data<DataType>();
|
||||
vec = b.data<DataType>();
|
||||
mat = gpu_ptr<DataType>(a);
|
||||
vec = gpu_ptr<DataType>(b);
|
||||
rows = M;
|
||||
}
|
||||
uint32_t num_blocks_x = (rows + rows_per_block - 1) / rows_per_block;
|
||||
@@ -177,7 +177,7 @@ void gemv(
|
||||
0,
|
||||
mat,
|
||||
vec,
|
||||
out.data<DataType>(),
|
||||
gpu_ptr<DataType>(out),
|
||||
rows,
|
||||
cols);
|
||||
} else {
|
||||
@@ -189,7 +189,7 @@ void gemv(
|
||||
0,
|
||||
mat,
|
||||
vec,
|
||||
out.data<DataType>(),
|
||||
gpu_ptr<DataType>(out),
|
||||
rows,
|
||||
cols,
|
||||
const_param(batch_shape),
|
||||
|
||||
@@ -31,7 +31,7 @@ void append_indices_arg(
|
||||
int idx_ndim) {
|
||||
SmallVector<const void*> indices(nidx);
|
||||
for (int i = 0; i < nidx; ++i) {
|
||||
indices[i] = inputs[i + 1].data<void>();
|
||||
indices[i] = gpu_ptr<void>(inputs[i + 1]);
|
||||
}
|
||||
args.append(std::move(indices));
|
||||
SmallVector<int32_t> indices_shape(nidx * idx_ndim);
|
||||
@@ -59,7 +59,9 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
assert(inputs.size() > 0);
|
||||
const auto& src = inputs[0];
|
||||
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
auto& s = stream();
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
if (out.size() == 0) {
|
||||
return;
|
||||
}
|
||||
@@ -80,7 +82,6 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
dtype_to_string(idx_dtype),
|
||||
nidx);
|
||||
|
||||
auto& s = stream();
|
||||
cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
|
||||
std::vector<std::string> kernel_names;
|
||||
for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
|
||||
@@ -121,7 +122,6 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
idx_ndim,
|
||||
large ? "int64_t" : "int32_t");
|
||||
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
for (const auto& in : inputs) {
|
||||
encoder.set_input_array(in);
|
||||
}
|
||||
@@ -239,7 +239,9 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
const auto& src = inputs[0];
|
||||
const auto& idx = inputs[1];
|
||||
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
auto& s = stream();
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
if (out.size() == 0) {
|
||||
return;
|
||||
}
|
||||
@@ -251,7 +253,6 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
dtype_to_string(out.dtype()),
|
||||
dtype_to_string(idx.dtype()));
|
||||
|
||||
auto& s = stream();
|
||||
cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
|
||||
std::vector<std::string> kernel_names;
|
||||
for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
|
||||
@@ -312,7 +313,6 @@ void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
idx.flags().row_contiguous,
|
||||
large ? "int64_t" : "int32_t");
|
||||
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
for (const auto& in : inputs) {
|
||||
encoder.set_input_array(in);
|
||||
}
|
||||
|
||||
@@ -279,11 +279,14 @@ void compile(
|
||||
// Compile program.
|
||||
std::vector<const char*> args;
|
||||
bool use_sass = compiler_supports_device_sass(device);
|
||||
auto cc = device.compute_capability_major();
|
||||
std::string arch_tag = (cc == 90 || cc == 100 || cc == 121) ? "a" : "";
|
||||
std::string compute = fmt::format(
|
||||
"--gpu-architecture={}_{}{}",
|
||||
"--gpu-architecture={}_{}{}{}",
|
||||
use_sass ? "sm" : "compute",
|
||||
device.compute_capability_major(),
|
||||
device.compute_capability_minor());
|
||||
cc,
|
||||
device.compute_capability_minor(),
|
||||
arch_tag);
|
||||
args.push_back(compute.c_str());
|
||||
std::string cccl_include = cccl_dir();
|
||||
if (!cccl_include.empty()) {
|
||||
|
||||
@@ -31,7 +31,7 @@ struct KernelArgs {
|
||||
}
|
||||
|
||||
void append(const array& a) {
|
||||
append(reinterpret_cast<CUdeviceptr>(a.data<void>()));
|
||||
append(reinterpret_cast<CUdeviceptr>(gpu_ptr<void>(a)));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <type_traits>
|
||||
|
||||
#include "mlx/array.h"
|
||||
#include "mlx/backend/cuda/allocator.h"
|
||||
#include "mlx/backend/cuda/device/utils.cuh"
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
@@ -230,9 +230,10 @@ void LayerNorm::eval_gpu(
|
||||
nvtx3::scoped_range r("LayerNorm::eval_gpu");
|
||||
auto& s = stream();
|
||||
auto& out = outputs[0];
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
|
||||
// Make sure that the last dimension is contiguous.
|
||||
auto set_output = [&s, &out](const array& x) {
|
||||
auto set_output = [&s, &out, &encoder](const array& x) {
|
||||
bool no_copy = x.flags().contiguous && x.strides()[x.ndim() - 1] == 1;
|
||||
if (no_copy && x.ndim() > 1) {
|
||||
auto s = x.strides()[x.ndim() - 2];
|
||||
@@ -243,7 +244,7 @@ void LayerNorm::eval_gpu(
|
||||
out.copy_shared_buffer(x);
|
||||
} else {
|
||||
out.set_data(
|
||||
allocator::malloc(x.data_size() * x.itemsize()),
|
||||
cu::malloc_async(x.data_size() * x.itemsize(), encoder),
|
||||
x.data_size(),
|
||||
x.strides(),
|
||||
x.flags());
|
||||
@@ -265,7 +266,6 @@ void LayerNorm::eval_gpu(
|
||||
int64_t w_stride = (w.ndim() == 1) ? w.strides()[0] : 0;
|
||||
int64_t b_stride = (b.ndim() == 1) ? b.strides()[0] : 0;
|
||||
|
||||
auto& encoder = cu::get_command_encoder(s);
|
||||
encoder.set_input_array(x);
|
||||
encoder.set_input_array(w);
|
||||
encoder.set_input_array(b);
|
||||
@@ -280,10 +280,10 @@ void LayerNorm::eval_gpu(
|
||||
n_rows,
|
||||
block_dim(),
|
||||
0,
|
||||
x.data<DataType>(),
|
||||
w.data<DataType>(),
|
||||
b.data<DataType>(),
|
||||
out.data<DataType>(),
|
||||
gpu_ptr<DataType>(x),
|
||||
gpu_ptr<DataType>(w),
|
||||
gpu_ptr<DataType>(b),
|
||||
gpu_ptr<DataType>(out),
|
||||
eps_,
|
||||
axis_size,
|
||||
w_stride,
|
||||
@@ -335,7 +335,7 @@ void LayerNormVJP::eval_gpu(
|
||||
gx.copy_shared_buffer(g);
|
||||
g_in_gx = true;
|
||||
} else {
|
||||
gx.set_data(allocator::malloc(gx.nbytes()));
|
||||
gx.set_data(cu::malloc_async(gx.nbytes(), encoder));
|
||||
}
|
||||
if (g_copied && !g_in_gx) {
|
||||
encoder.add_temporary(g);
|
||||
@@ -355,7 +355,7 @@ void LayerNormVJP::eval_gpu(
|
||||
g_in_gw = true;
|
||||
gw_temp.copy_shared_buffer(g);
|
||||
} else {
|
||||
gw_temp.set_data(allocator::malloc(gw_temp.nbytes()));
|
||||
gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder));
|
||||
encoder.add_temporary(gw_temp);
|
||||
}
|
||||
}
|
||||
@@ -393,11 +393,11 @@ void LayerNormVJP::eval_gpu(
|
||||
n_rows,
|
||||
block_dim(),
|
||||
0,
|
||||
x.data<DataType>(),
|
||||
w.data<DataType>(),
|
||||
g.data<DataType>(),
|
||||
gx.data<DataType>(),
|
||||
gw_temp.data<DataType>(),
|
||||
gpu_ptr<DataType>(x),
|
||||
gpu_ptr<DataType>(w),
|
||||
gpu_ptr<DataType>(g),
|
||||
gpu_ptr<DataType>(gx),
|
||||
gpu_ptr<DataType>(gw_temp),
|
||||
eps_,
|
||||
axis_size,
|
||||
w_stride);
|
||||
|
||||
60
mlx/backend/cuda/load.cpp
Normal file
60
mlx/backend/cuda/load.cpp
Normal file
@@ -0,0 +1,60 @@
|
||||
// Copyright © 2023 Apple Inc.
|
||||
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
|
||||
#include "mlx/backend/cuda/device.h"
|
||||
#include "mlx/backend/cuda/utils.h"
|
||||
#include "mlx/primitives.h"
|
||||
|
||||
namespace {
|
||||
|
||||
template <const uint8_t scalar_size>
|
||||
void swap_endianness(uint8_t* data_bytes, size_t N) {
|
||||
struct Elem {
|
||||
uint8_t bytes[scalar_size];
|
||||
};
|
||||
|
||||
Elem* data = reinterpret_cast<Elem*>(data_bytes);
|
||||
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
for (size_t j = 0; j < (scalar_size / 2); j++) {
|
||||
std::swap(data[i].bytes[j], data[i].bytes[scalar_size - j - 1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace mlx::core {
|
||||
|
||||
void Load::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
auto& encoder = cu::get_command_encoder(stream());
|
||||
auto size = out.size();
|
||||
auto nbytes = size * out.itemsize();
|
||||
out.set_data(cu::malloc_async(nbytes, encoder));
|
||||
auto out_ptr = malloc(nbytes);
|
||||
reader_->read(static_cast<char*>(out_ptr), nbytes, offset_);
|
||||
if (swap_endianness_) {
|
||||
switch (out.itemsize()) {
|
||||
case 2:
|
||||
swap_endianness<2>(reinterpret_cast<uint8_t*>(out_ptr), size);
|
||||
break;
|
||||
case 4:
|
||||
swap_endianness<4>(reinterpret_cast<uint8_t*>(out_ptr), size);
|
||||
break;
|
||||
case 8:
|
||||
swap_endianness<8>(reinterpret_cast<uint8_t*>(out_ptr), size);
|
||||
break;
|
||||
}
|
||||
}
|
||||
CHECK_CUDA_ERROR(cudaMemcpyAsync(
|
||||
gpu_ptr<void>(out),
|
||||
out_ptr,
|
||||
nbytes,
|
||||
cudaMemcpyDefault,
|
||||
encoder.stream()));
|
||||
CHECK_CUDA_ERROR(cudaLaunchHostFunc(encoder.stream(), free, out_ptr));
|
||||
}
|
||||
|
||||
} // namespace mlx::core
|
||||
@@ -115,7 +115,7 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
|
||||
auto in = ensure_contiguous(inputs[0]);
|
||||
if (in.flags().row_contiguous) {
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
} else {
|
||||
auto n = in.shape(-1);
|
||||
auto flags = in.flags();
|
||||
@@ -130,7 +130,7 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
}
|
||||
flags.col_contiguous = col_contig;
|
||||
out.set_data(
|
||||
allocator::malloc(in.nbytes() / n),
|
||||
cu::malloc_async(in.nbytes() / n, encoder),
|
||||
in.data_size() / n,
|
||||
std::move(strides),
|
||||
flags);
|
||||
@@ -151,8 +151,8 @@ void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
n_rows,
|
||||
block_dim(),
|
||||
0,
|
||||
in.data<DataType>(),
|
||||
out.data<DataType>(),
|
||||
gpu_ptr<DataType>(in),
|
||||
gpu_ptr<DataType>(out),
|
||||
axis_size);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -135,12 +135,19 @@ class LRUCache {
|
||||
};
|
||||
|
||||
// Turn a POD struct into a container key by doing bytes compare.
|
||||
//
|
||||
// Usage:
|
||||
// BytesKey<MyKey> key;
|
||||
// key.pod = { ... };
|
||||
template <typename T>
|
||||
struct BytesKey {
|
||||
T pod;
|
||||
static_assert(std::is_standard_layout_v<T>, "T is not POD");
|
||||
|
||||
BytesKey(T pod) : pod(std::move(pod)) {}
|
||||
BytesKey() {
|
||||
// Make sure the paddings between members are filled with 0.
|
||||
memset(&pod, 0, sizeof(T));
|
||||
}
|
||||
|
||||
BytesKey(const BytesKey& other) {
|
||||
memcpy(&pod, &other.pod, sizeof(T));
|
||||
|
||||
@@ -121,7 +121,7 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
return;
|
||||
}
|
||||
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
|
||||
int M = a_pre.shape(-2);
|
||||
int N = b_pre.shape(-1);
|
||||
@@ -163,7 +163,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
|
||||
if (beta_ == 1 && a.dtype() != complex64 && c.strides(-1) == 1 &&
|
||||
c.data_size() == out.shape(-1)) {
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
gemm_and_bias(
|
||||
encoder,
|
||||
M,
|
||||
@@ -187,10 +187,10 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
|
||||
auto sty = c.strides()[c.ndim() - 1];
|
||||
if (sty == 1 && stx == c.shape(-1)) {
|
||||
ldc = stx;
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
} else if (sty == 1 && stx == 0) {
|
||||
ldc = 0;
|
||||
out.set_data(allocator::malloc(out.nbytes()));
|
||||
out.set_data(cu::malloc_async(out.nbytes(), encoder));
|
||||
} else {
|
||||
// Copy C into out and set C to out
|
||||
ldc = c.shape(-1);
|
||||
|
||||
@@ -28,7 +28,6 @@ NO_GPU(FFT)
|
||||
NO_GPU(GatherMM)
|
||||
NO_GPU(GatherQMM)
|
||||
NO_GPU(Hadamard)
|
||||
NO_GPU(Load)
|
||||
NO_GPU_MULTI(LUF)
|
||||
NO_GPU_MULTI(QRF)
|
||||
NO_GPU(QuantizedMatmul)
|
||||
@@ -38,9 +37,9 @@ NO_GPU(Inverse)
|
||||
NO_GPU(Cholesky)
|
||||
NO_GPU_MULTI(Eig)
|
||||
NO_GPU_MULTI(Eigh)
|
||||
NO_GPU(MaskedScatter)
|
||||
|
||||
namespace distributed {
|
||||
NO_GPU_MULTI(AllGather)
|
||||
NO_GPU_MULTI(Send)
|
||||
NO_GPU_MULTI(Recv)
|
||||
} // namespace distributed
|
||||
|
||||
@@ -262,10 +262,10 @@ void affine_quantize(
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
w.data<T>(),
|
||||
wq.data<uint8_t>(),
|
||||
scales.data<T>(),
|
||||
biases.data<T>(),
|
||||
gpu_ptr<T>(w),
|
||||
gpu_ptr<uint8_t>(wq),
|
||||
gpu_ptr<T>(scales),
|
||||
gpu_ptr<T>(biases),
|
||||
w.size());
|
||||
});
|
||||
});
|
||||
@@ -306,7 +306,7 @@ void affine_dequantize(
|
||||
enc.set_input_array(scales);
|
||||
enc.set_input_array(biases);
|
||||
enc.set_output_array(w);
|
||||
dispatch_float_types(w.dtype(), "affine_quantize", [&](auto type_tag) {
|
||||
dispatch_float_types(w.dtype(), "affine_dequantize", [&](auto type_tag) {
|
||||
dispatch_groups(group_size_, [&](auto group_size) {
|
||||
dispatch_bits(bits_, [&](auto bits) {
|
||||
using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
|
||||
@@ -318,10 +318,10 @@ void affine_dequantize(
|
||||
num_blocks,
|
||||
block_dims,
|
||||
0,
|
||||
wq.data<uint8_t>(),
|
||||
scales.data<T>(),
|
||||
biases.data<T>(),
|
||||
w.data<T>(),
|
||||
gpu_ptr<uint8_t>(wq),
|
||||
gpu_ptr<T>(scales),
|
||||
gpu_ptr<T>(biases),
|
||||
gpu_ptr<T>(w),
|
||||
w.size());
|
||||
});
|
||||
});
|
||||
|
||||
19
mlx/backend/cuda/quantized/convert_fp8.cu
Normal file
19
mlx/backend/cuda/quantized/convert_fp8.cu
Normal file
@@ -0,0 +1,19 @@
|
||||
// Copyright © 2025 Apple Inc.
|
||||
#include "mlx/backend/cuda/unary/unary.cuh"
|
||||
#include "mlx/fast_primitives.h"
|
||||
|
||||
namespace mlx::core {
|
||||
void fast::ConvertFP8::eval_gpu(
|
||||
const std::vector<array>& inputs,
|
||||
std::vector<array>& outputs) {
|
||||
nvtx3::scoped_range r("ConvertFP8::eval_gpu");
|
||||
auto& in = inputs[0];
|
||||
auto& out = outputs[0];
|
||||
auto& s = out.primitive().stream();
|
||||
if (to_fp8_) {
|
||||
unary_op_gpu<cu::ToFP8>(inputs, out, name(), s);
|
||||
} else {
|
||||
unary_op_gpu<cu::FromFP8>(inputs, out, name(), s);
|
||||
}
|
||||
}
|
||||
} // namespace mlx::core
|
||||
83
mlx/backend/cuda/quantized/cuda_fp4.h
Normal file
83
mlx/backend/cuda/quantized/cuda_fp4.h
Normal file
@@ -0,0 +1,83 @@
|
||||
#pragma once
|
||||
|
||||
struct __nv_fp8_e8m0 {
|
||||
__device__ __nv_fp8_e8m0(float x) {
|
||||
if (!std::isfinite(x)) {
|
||||
__x = 0xFF;
|
||||
return;
|
||||
}
|
||||
if (x < 0.0f) {
|
||||
__x = 0x00;
|
||||
return;
|
||||
}
|
||||
float le = std::log2f(x);
|
||||
int n = static_cast<int>(std::nearbyintf(le));
|
||||
|
||||
n = n < -127 ? -127 : n;
|
||||
n = n > 127 ? 127 : n;
|
||||
__x = static_cast<uint8_t>(n + 127);
|
||||
}
|
||||
|
||||
__device__ operator float() {
|
||||
if (__x == 0xFF) {
|
||||
return std::numeric_limits<float>::quiet_NaN();
|
||||
}
|
||||
return std::ldexp(1.0f, static_cast<int>(__x) - 127);
|
||||
}
|
||||
|
||||
uint8_t __x{0};
|
||||
};
|
||||
|
||||
struct __nv_fp4_e2m1 {
|
||||
__device__ __nv_fp4_e2m1(float x) {
|
||||
if (std::isnan(x)) {
|
||||
__x = 0x7;
|
||||
return;
|
||||
}
|
||||
|
||||
const uint8_t sign_bit = (std::signbit(x)) ? 0x8 : 0x0;
|
||||
x = std::abs(x);
|
||||
|
||||
if (x > 5.0f) {
|
||||
__x = 0x7;
|
||||
} else if (x >= 3.5f) {
|
||||
__x = 0x6;
|
||||
} else if (x > 2.5f) {
|
||||
__x = 0x5;
|
||||
} else if (x >= 1.75f) {
|
||||
__x = 0x4;
|
||||
} else if (x > 1.25f) {
|
||||
__x = 0x3;
|
||||
} else if (x >= 0.75f) {
|
||||
__x = 0x2;
|
||||
} else if (x > 0.25f) {
|
||||
__x = 0x1;
|
||||
} else {
|
||||
__x = 0x0;
|
||||
}
|
||||
__x |= sign_bit;
|
||||
}
|
||||
|
||||
__device__ operator float() {
|
||||
static const float LUT[16] = {
|
||||
0.0f,
|
||||
0.5f,
|
||||
1.0f,
|
||||
1.5f,
|
||||
2.0f,
|
||||
3.0f,
|
||||
4.0f,
|
||||
6.0f,
|
||||
-0.0f,
|
||||
-0.5f,
|
||||
-1.0f,
|
||||
-1.5f,
|
||||
-2.0f,
|
||||
-3.0f,
|
||||
-4.0f,
|
||||
-6.0f};
|
||||
|
||||
return LUT[__x];
|
||||
}
|
||||
uint8_t __x{0};
|
||||
};
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user