mirror of
https://github.com/ml-explore/mlx.git
synced 2025-12-16 01:49:05 +08:00
Compare commits
3 Commits
997cfc7699
...
6245824d42
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6245824d42 | ||
|
|
39289ef025 | ||
|
|
aefc9bd3f6 |
3
.github/workflows/release.yml
vendored
3
.github/workflows/release.yml
vendored
@@ -131,6 +131,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
arch: ['x86_64', 'aarch64']
|
arch: ['x86_64', 'aarch64']
|
||||||
|
toolkit: ['cuda-12.9', 'cuda-13.0']
|
||||||
runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22-large' || 'ubuntu-22-large-arm' }}
|
runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22-large' || 'ubuntu-22-large-arm' }}
|
||||||
env:
|
env:
|
||||||
PYPI_RELEASE: 1
|
PYPI_RELEASE: 1
|
||||||
@@ -139,7 +140,7 @@ jobs:
|
|||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v6
|
||||||
- uses: ./.github/actions/setup-linux
|
- uses: ./.github/actions/setup-linux
|
||||||
with:
|
with:
|
||||||
toolkit: 'cuda-12.9'
|
toolkit: ${{ matrix.toolkit }}
|
||||||
- name: Build Python package
|
- name: Build Python package
|
||||||
uses: ./.github/actions/build-cuda-release
|
uses: ./.github/actions/build-cuda-release
|
||||||
with:
|
with:
|
||||||
|
|||||||
@@ -29,17 +29,20 @@ MLX has a CUDA backend which you can install with:
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
pip install mlx[cuda]
|
pip install mlx[cuda12]
|
||||||
|
|
||||||
|
|
||||||
To install the CUDA package from PyPi your system must meet the following
|
To install the CUDA package from PyPi your system must meet the following
|
||||||
requirements:
|
requirements:
|
||||||
|
|
||||||
- Nvidia architecture >= SM 7.0 (Volta)
|
- Nvidia architecture >= SM 7.5
|
||||||
- Nvidia driver >= 550.54.14
|
- Nvidia driver >= 550.54.14
|
||||||
- CUDA toolkit >= 12.0
|
- CUDA toolkit >= 12.0
|
||||||
- Linux distribution with glibc >= 2.35
|
- Linux distribution with glibc >= 2.35
|
||||||
- Python >= 3.10
|
- Python >= 3.10
|
||||||
|
|
||||||
|
For CUDA 13 use ``pip install mlx[cuda13]``. The CUDA 13 package requires
|
||||||
|
an Nvidia driver >= 580 or an appropriate CUDA compatibility package.
|
||||||
|
|
||||||
CPU-only (Linux)
|
CPU-only (Linux)
|
||||||
^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
target_sources(
|
target_sources(
|
||||||
mlx
|
mlx
|
||||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
|
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
|
||||||
|
|||||||
@@ -1,24 +0,0 @@
|
|||||||
// Copyright © 2023 Apple Inc.
|
|
||||||
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#include "mlx/allocator.h"
|
|
||||||
|
|
||||||
namespace mlx::core::allocator {
|
|
||||||
|
|
||||||
Buffer malloc(size_t size) {
|
|
||||||
auto buffer = allocator().malloc(size);
|
|
||||||
if (size && !buffer.ptr()) {
|
|
||||||
std::ostringstream msg;
|
|
||||||
msg << "[malloc] Unable to allocate " << size << " bytes.";
|
|
||||||
throw std::runtime_error(msg.str());
|
|
||||||
}
|
|
||||||
return buffer;
|
|
||||||
}
|
|
||||||
|
|
||||||
void free(Buffer buffer) {
|
|
||||||
allocator().free(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace mlx::core::allocator
|
|
||||||
@@ -28,10 +28,6 @@ class Buffer {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
Buffer malloc(size_t size);
|
|
||||||
|
|
||||||
void free(Buffer buffer);
|
|
||||||
|
|
||||||
class Allocator {
|
class Allocator {
|
||||||
/** Abstract base class for a memory allocator. */
|
/** Abstract base class for a memory allocator. */
|
||||||
public:
|
public:
|
||||||
@@ -49,4 +45,12 @@ class Allocator {
|
|||||||
|
|
||||||
Allocator& allocator();
|
Allocator& allocator();
|
||||||
|
|
||||||
|
inline Buffer malloc(size_t size) {
|
||||||
|
return allocator().malloc(size);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void free(Buffer buffer) {
|
||||||
|
allocator().free(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace mlx::core::allocator
|
} // namespace mlx::core::allocator
|
||||||
|
|||||||
@@ -157,16 +157,14 @@ CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
|
|||||||
cudaError_t err;
|
cudaError_t err;
|
||||||
void* data = nullptr;
|
void* data = nullptr;
|
||||||
if (device == -1) {
|
if (device == -1) {
|
||||||
err = cudaMallocManaged(&data, size);
|
CHECK_CUDA_ERROR(cudaMallocManaged(&data, size));
|
||||||
} else {
|
} else {
|
||||||
err = cudaMallocAsync(&data, size, stream);
|
CHECK_CUDA_ERROR(cudaMallocAsync(&data, size, stream));
|
||||||
}
|
|
||||||
if (err != cudaSuccess && err != cudaErrorMemoryAllocation) {
|
|
||||||
throw std::runtime_error(fmt::format(
|
|
||||||
"cudaMallocManaged failed: {}.", cudaGetErrorString(err)));
|
|
||||||
}
|
}
|
||||||
if (!data) {
|
if (!data) {
|
||||||
return Buffer{nullptr};
|
std::ostringstream msg;
|
||||||
|
msg << "[malloc] Unable to allocate " << size << " bytes.";
|
||||||
|
throw std::runtime_error(msg.str());
|
||||||
}
|
}
|
||||||
buf = new CudaBuffer{data, size, device};
|
buf = new CudaBuffer{data, size, device};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -95,11 +95,14 @@ void copy_general_input(
|
|||||||
const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
|
const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
|
||||||
OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;
|
OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;
|
||||||
int ndim = shape.size();
|
int ndim = shape.size();
|
||||||
int work_per_thread = 1;
|
|
||||||
|
int work_per_thread = 8;
|
||||||
auto dim0 = ndim > 0 ? shape.back() : 1;
|
auto dim0 = ndim > 0 ? shape.back() : 1;
|
||||||
auto rest = out.size() / dim0;
|
auto rest = out.size() / dim0;
|
||||||
if (dim0 >= 4) {
|
if (dim0 >= 4 && dim0 < 8) {
|
||||||
work_per_thread = 4;
|
work_per_thread = 4;
|
||||||
|
} else if (dim0 < 4) {
|
||||||
|
work_per_thread = 1;
|
||||||
}
|
}
|
||||||
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
|
||||||
auto block_dims = get_block_dims(dim0, rest, 1);
|
auto block_dims = get_block_dims(dim0, rest, 1);
|
||||||
@@ -110,7 +113,10 @@ void copy_general_input(
|
|||||||
dispatch_1_2_3(ndim, [&](auto dims_constant) {
|
dispatch_1_2_3(ndim, [&](auto dims_constant) {
|
||||||
auto kernel =
|
auto kernel =
|
||||||
cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 1>;
|
cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 1>;
|
||||||
if (work_per_thread == 4) {
|
if (work_per_thread == 8) {
|
||||||
|
kernel =
|
||||||
|
cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 8>;
|
||||||
|
} else if (work_per_thread == 4) {
|
||||||
kernel =
|
kernel =
|
||||||
cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 4>;
|
cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 4>;
|
||||||
}
|
}
|
||||||
@@ -127,7 +133,9 @@ void copy_general_input(
|
|||||||
});
|
});
|
||||||
} else { // ndim >= 4
|
} else { // ndim >= 4
|
||||||
auto kernel = cu::copy_g<InType, OutType, IdxT, 1>;
|
auto kernel = cu::copy_g<InType, OutType, IdxT, 1>;
|
||||||
if (work_per_thread == 4) {
|
if (work_per_thread == 8) {
|
||||||
|
kernel = cu::copy_g<InType, OutType, IdxT, 8>;
|
||||||
|
} else if (work_per_thread == 4) {
|
||||||
kernel = cu::copy_g<InType, OutType, IdxT, 4>;
|
kernel = cu::copy_g<InType, OutType, IdxT, 4>;
|
||||||
}
|
}
|
||||||
encoder.add_kernel_node(
|
encoder.add_kernel_node(
|
||||||
|
|||||||
@@ -7,8 +7,6 @@
|
|||||||
|
|
||||||
namespace mlx::core {
|
namespace mlx::core {
|
||||||
|
|
||||||
void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s);
|
|
||||||
|
|
||||||
void copy_gpu(const array& in, array& out, CopyType ctype) {
|
void copy_gpu(const array& in, array& out, CopyType ctype) {
|
||||||
copy_gpu(in, out, ctype, out.primitive().stream());
|
copy_gpu(in, out, ctype, out.primitive().stream());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -149,7 +149,9 @@ Buffer MetalAllocator::malloc(size_t size) {
|
|||||||
buf = device_->newBuffer(size, resource_options);
|
buf = device_->newBuffer(size, resource_options);
|
||||||
}
|
}
|
||||||
if (!buf) {
|
if (!buf) {
|
||||||
return Buffer{nullptr};
|
std::ostringstream msg;
|
||||||
|
msg << "[malloc] Unable to allocate " << size << " bytes.";
|
||||||
|
throw std::runtime_error(msg.str());
|
||||||
}
|
}
|
||||||
lk.lock();
|
lk.lock();
|
||||||
num_resources_++;
|
num_resources_++;
|
||||||
|
|||||||
40
setup.py
40
setup.py
@@ -7,13 +7,21 @@ import re
|
|||||||
import subprocess
|
import subprocess
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from subprocess import run
|
|
||||||
|
|
||||||
from setuptools import Command, Extension, find_namespace_packages, setup
|
from setuptools import Command, Extension, find_namespace_packages, setup
|
||||||
from setuptools.command.bdist_wheel import bdist_wheel
|
from setuptools.command.bdist_wheel import bdist_wheel
|
||||||
from setuptools.command.build_ext import build_ext
|
from setuptools.command.build_ext import build_ext
|
||||||
|
|
||||||
|
|
||||||
|
def cuda_toolkit_major_version():
|
||||||
|
out = subprocess.check_output(["nvcc", "--version"], stderr=subprocess.STDOUT)
|
||||||
|
text = out.decode()
|
||||||
|
m = re.search(r"release (\d+)", text)
|
||||||
|
if m:
|
||||||
|
return int(m.group(1))
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_version():
|
def get_version():
|
||||||
with open("mlx/version.h", "r") as fid:
|
with open("mlx/version.h", "r") as fid:
|
||||||
for l in fid:
|
for l in fid:
|
||||||
@@ -31,7 +39,7 @@ def get_version():
|
|||||||
version = f"{version}.dev{today.year}{today.month:02d}{today.day:02d}"
|
version = f"{version}.dev{today.year}{today.month:02d}{today.day:02d}"
|
||||||
if not pypi_release and not dev_release:
|
if not pypi_release and not dev_release:
|
||||||
git_hash = (
|
git_hash = (
|
||||||
run(
|
subprocess.run(
|
||||||
"git rev-parse --short HEAD".split(),
|
"git rev-parse --short HEAD".split(),
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
check=True,
|
check=True,
|
||||||
@@ -284,7 +292,11 @@ if __name__ == "__main__":
|
|||||||
install_requires.append(
|
install_requires.append(
|
||||||
f'mlx-metal=={version}; platform_system == "Darwin"'
|
f'mlx-metal=={version}; platform_system == "Darwin"'
|
||||||
)
|
)
|
||||||
extras["cuda"] = [f'mlx-cuda=={version}; platform_system == "Linux"']
|
extras["cuda"] = [f'mlx-cuda-12=={version}; platform_system == "Linux"']
|
||||||
|
for toolkit in [12, 13]:
|
||||||
|
extras[f"cuda{toolkit}"] = [
|
||||||
|
f'mlx-cuda-{toolkit}=={version}; platform_system == "Linux"'
|
||||||
|
]
|
||||||
extras["cpu"] = [f'mlx-cpu=={version}; platform_system == "Linux"']
|
extras["cpu"] = [f'mlx-cpu=={version}; platform_system == "Linux"']
|
||||||
|
|
||||||
_setup(
|
_setup(
|
||||||
@@ -299,13 +311,25 @@ if __name__ == "__main__":
|
|||||||
if build_macos:
|
if build_macos:
|
||||||
name = "mlx-metal"
|
name = "mlx-metal"
|
||||||
elif build_cuda:
|
elif build_cuda:
|
||||||
name = "mlx-cuda"
|
toolkit = cuda_toolkit_major_version()
|
||||||
|
name = f"mlx-cuda-{toolkit}"
|
||||||
|
if toolkit == 12:
|
||||||
|
install_requires += [
|
||||||
|
"nvidia-cublas-cu12==12.9.*",
|
||||||
|
"nvidia-cuda-nvrtc-cu12==12.9.*",
|
||||||
|
]
|
||||||
|
elif toolkit == 13:
|
||||||
|
install_requires += [
|
||||||
|
"nvidia-cublas-cu13",
|
||||||
|
"nvidia-cuda-nvrtc-cu13",
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown toolkit {toolkit}")
|
||||||
install_requires += [
|
install_requires += [
|
||||||
"nvidia-cublas-cu12==12.9.*",
|
f"nvidia-cudnn-cu{toolkit}==9.*",
|
||||||
"nvidia-cuda-nvrtc-cu12==12.9.*",
|
f"nvidia-nccl-cu{toolkit}",
|
||||||
"nvidia-cudnn-cu12==9.*",
|
|
||||||
"nvidia-nccl-cu12",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
name = "mlx-cpu"
|
name = "mlx-cpu"
|
||||||
_setup(
|
_setup(
|
||||||
|
|||||||
Reference in New Issue
Block a user