auto build linux release (#2341 )

[CUDA] Do vectorized store/load in binary ops (#2330 )
Build with all cpu cores by default (#2336 )
2025-12-16 01:49:05 +08:00 · 2025-07-07 09:29:23 -07:00 · 2025-07-07 08:44:14 -07:00 · 2025-07-07 06:06:45 -07:00 · 2025-07-07 06:06:01 -07:00
8 changed files with 167 additions and 61 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -41,7 +41,7 @@ jobs:
            pip install --upgrade pip
            pip install --upgrade cmake
            pip install -r docs/requirements.txt
-            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install . -v
+            pip install . -v
      - when:
          condition:
            not: << parameters.upload-docs >>
@@ -97,10 +97,8 @@ jobs:
          name: Install Python package
          command: |
            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              python3 setup.py build_ext --inplace
            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              python3 setup.py develop
      - run:
          name: Generate package stubs
@@ -157,8 +155,7 @@ jobs:
          name: Install Python package
          command: |
            source env/bin/activate
-            DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-            CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
              pip install -e . -v
      - run:
          name: Generate package stubs
@@ -208,7 +205,6 @@ jobs:
          name: Run Python tests with JIT
          command: |
            source env/bin/activate
-            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
              pip install -e . -v
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
@@ -228,7 +224,6 @@ jobs:
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
            python -m venv env
            source env/bin/activate
-            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
              pip install -e ".[dev]"
      - run:
@@ -278,7 +273,6 @@ jobs:
          command: |
            source env/bin/activate
            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
-              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
              pip install . -v
      - run:
          name: Generate package stubs
@@ -290,9 +284,7 @@ jobs:
          name: Build Python package
          command: |
            source env/bin/activate
-            << parameters.build_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
-              python -m build -w
+            << parameters.build_env >> python -m build -w
      - when:
          condition: << parameters.build_env >>
          steps:
@@ -340,14 +332,10 @@ jobs:
            pip install patchelf
            pip install build
            pip install twine
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              pip install . -v
+            << parameters.extra_env >> pip install . -v
            pip install typing_extensions
            python setup.py generate_stubs
-            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
-              python -m build --wheel
+            << parameters.extra_env >> python -m build --wheel
            auditwheel show dist/*
            auditwheel repair dist/* --plat manylinux_2_31_x86_64
      - run:
@@ -383,12 +371,10 @@ jobs:
            pip install build
            pip install twine
            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
              pip install ".[dev]" -v
            python setup.py generate_stubs
            << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
              python -m build --wheel
            bash python/scripts/repair_cuda.sh
@@ -506,6 +492,16 @@ workflows:
            branches:
              ignore: /.*/
          upload-docs: true
+      - build_linux_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              extra_env: ["PYPI_RELEASE=1"]

  prb:
    when:
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -88,20 +88,20 @@ Then simply build and install MLX using pip:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install .
+  pip install .

 For developing, install the package with development dependencies, and use an
 editable install:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -e ".[dev]"
+  pip install -e ".[dev]"

 Once the development dependencies are installed, you can build faster with:

 .. code-block:: shell

- CMAKE_BUILD_PARALLEL_LEVEL=8 python setup.py build_ext --inplace
+ python setup.py build_ext --inplace

 Run the tests with:

@@ -262,7 +262,7 @@ When building either the Python or C++ APIs make sure to pass the cmake flag

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL=8 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"
+  CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"

 To build the C++ package run:

--- a/mlx/backend/cuda/binary.cu
+++ b/mlx/backend/cuda/binary.cu
@@ -17,35 +17,106 @@ namespace cu {

 namespace cg = cooperative_groups;

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void binary_ss(const In* a, const In* b, Out* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[0], b[0]);
+  int remaining = size - index * N_READS;
+  if (remaining <= 0) {
+    return;
+  }
+
+  if (remaining < N_READS) {
+    for (int i = 0; i < remaining; ++i) {
+      IdxT offset = index * N_READS + i;
+      out[offset] = Op{}(a[0], b[0]);
+    }
+  } else {
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a[0], b[0]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
  }
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void binary_sv(const In* a, const In* b, Out* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[0], b[index]);
+  int remaining = size - index * N_READS;
+  if (remaining <= 0) {
+    return;
+  }
+
+  if (remaining < N_READS) {
+    for (int i = 0; i < remaining; ++i) {
+      IdxT offset = index * N_READS + i;
+      out[offset] = Op{}(a[0], b[offset]);
+    }
+  } else {
+    auto b_vec = load_vector<N_READS>(b, index);
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a[0], b_vec.val[i]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
  }
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void binary_vs(const In* a, const In* b, Out* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[index], b[0]);
+  int remaining = size - index * N_READS;
+  if (remaining <= 0) {
+    return;
+  }
+
+  if (remaining < N_READS) {
+    for (int i = 0; i < remaining; ++i) {
+      IdxT offset = index * N_READS + i;
+      out[offset] = Op{}(a[offset], b[0]);
+    }
+  } else {
+    auto a_vec = load_vector<N_READS>(a, index);
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a_vec.val[i], b[0]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
  }
 }

-template <typename Op, typename In, typename Out, typename IdxT>
+template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
 __global__ void binary_vv(const In* a, const In* b, Out* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();
-  if (index < size) {
-    out[index] = Op{}(a[index], b[index]);
+  int remaining = size - index * N_READS;
+  if (remaining <= 0) {
+    return;
+  }
+
+  if (remaining < N_READS) {
+    for (int i = 0; i < remaining; ++i) {
+      IdxT offset = index * N_READS + i;
+      out[offset] = Op{}(a[offset], b[offset]);
+    }
+  } else {
+    auto a_vec = load_vector<N_READS>(a, index);
+    auto b_vec = load_vector<N_READS>(b, index);
+
+    AlignedVector<Out, N_READS> out_vec;
+#pragma unroll
+    for (int i = 0; i < N_READS; ++i) {
+      out_vec.val[i] = Op{}(a_vec.val[i], b_vec.val[i]);
+    }
+
+    store_vector<N_READS>(out, index, out_vec);
  }
 }

@@ -198,16 +269,23 @@ void binary_op_gpu_inplace(
        } else {
          dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
-            auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
+            // TODO: Choose optimized value based on type size.
+            constexpr int N_READS = 4;
+            auto kernel = cu::binary_ss<Op, InType, OutType, IdxT, N_READS>;
            if (bopt == BinaryOpType::ScalarVector) {
-              kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
+              kernel = cu::binary_sv<Op, InType, OutType, IdxT, N_READS>;
            } else if (bopt == BinaryOpType::VectorScalar) {
-              kernel = cu::binary_vs<Op, InType, OutType, IdxT>;
+              kernel = cu::binary_vs<Op, InType, OutType, IdxT, N_READS>;
            } else if (bopt == BinaryOpType::VectorVector) {
-              kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
+              kernel = cu::binary_vv<Op, InType, OutType, IdxT, N_READS>;
            }
            auto [num_blocks, block_dims] = get_launch_args(
-                kernel, out.data_size(), out.shape(), out.strides(), large());
+                kernel,
+                out.data_size(),
+                out.shape(),
+                out.strides(),
+                large(),
+                N_READS);
            encoder.add_kernel_node(
                kernel,
                num_blocks,
--- a/mlx/backend/cuda/device/utils.cuh
+++ b/mlx/backend/cuda/device/utils.cuh
@@ -28,6 +28,27 @@ namespace mlx::core::cu {
 using Shape = cuda::std::array<int32_t, MAX_NDIM>;
 using Strides = cuda::std::array<int64_t, MAX_NDIM>;

+// Vectorized load/store.
+template <typename T, int N>
+struct alignas(sizeof(T) * N) AlignedVector {
+  T val[N];
+};
+
+template <int N, typename T>
+inline __device__ AlignedVector<T, N> load_vector(
+    const T* ptr,
+    uint32_t offset) {
+  auto* from = reinterpret_cast<const AlignedVector<T, N>*>(ptr);
+  return from[offset];
+}
+
+template <int N, typename T>
+inline __device__ void
+store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) {
+  auto* to = reinterpret_cast<AlignedVector<T, N>*>(ptr);
+  to[offset] = vec;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Type limits utils
 ///////////////////////////////////////////////////////////////////////////////
--- a/mlx/backend/metal/kernels/layer_norm.metal
+++ b/mlx/backend/metal/kernels/layer_norm.metal
@@ -31,6 +31,7 @@ inline void threadgroup_sum(
  for (int i = 0; i < N; i++) {
    x[i] = simd_sum(x[i]);
  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (simd_lane_id == 0) {
    for (int i = 0; i < N; i++) {
      xs[N * simd_group_id + i] = x[i];
--- a/python/mlx/extension.py
+++ b/python/mlx/extension.py
@@ -53,11 +53,7 @@ class CMakeBuild(build_ext):
        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
-            # self.parallel is a Python 3 only way to set parallel jobs by hand
-            # using -j in the build_ext call, not supported by pip or PyPA-build.
-            if hasattr(self, "parallel") and self.parallel:
-                # CMake 3.12+ only.
-                build_args += [f"-j{self.parallel}"]
+            build_args += [f"-j{os.cpu_count()}"]

        build_temp = Path(self.build_temp) / ext.name
        if not build_temp.exists():
--- a/python/src/fast.cpp
+++ b/python/src/fast.cpp
@@ -175,10 +175,11 @@ void init_fast(nb::module_& parent_module) {
        * `Grouped Query Attention <https://arxiv.org/abs/2305.13245>`_
        * `Multi-Query Attention <https://arxiv.org/abs/1911.02150>`_

-        Note: The softmax operation is performed in ``float32`` regardless of
-        the input precision.
+        .. note::

-        Note: For Grouped Query Attention and Multi-Query Attention, the ``k``
+          * The softmax operation is performed in ``float32`` regardless of
+            the input precision.
+          * For Grouped Query Attention and Multi-Query Attention, the ``k``
            and ``v`` inputs should not be pre-tiled to match ``q``.

        In the following the dimensions are given by:
@@ -195,13 +196,30 @@ void init_fast(nb::module_& parent_module) {
            k (array): Keys with shape ``[B, N_kv, T_kv, D]``.
            v (array): Values with shape ``[B, N_kv, T_kv, D]``.
            scale (float): Scale for queries (typically ``1.0 / sqrt(q.shape(-1)``)
-            mask (Union[None, str, array], optional): A causal, boolean or additive
-               mask to apply to the query-key scores. The mask can have at most 4
-               dimensions and must be broadcast-compatible with the shape
-               ``[B, N, T_q, T_kv]``. If an additive mask is given its type must
-               promote to the promoted type of ``q``, ``k``, and ``v``.
+            mask (Union[None, str, array], optional): The mask to apply to the
+               query-key scores. The mask can be an array or a string indicating
+               the mask type. The only supported string type is ``"causal"``. If
+               the mask is an array it can be a boolean or additive mask. The mask
+               can have at most 4 dimensions and must be broadcast-compatible with
+               the shape ``[B, N, T_q, T_kv]``. If an additive mask is given its
+               type must promote to the promoted type of ``q``, ``k``, and ``v``.
        Returns:
            array: The output array.
+
+        Example:
+
+          .. code-block:: python
+
+            B = 2
+            N_q = N_kv = 32
+            T_q = T_kv = 1000
+            D = 128
+
+            q = mx.random.normal(shape=(B, N_q, T_q, D))
+            k = mx.random.normal(shape=(B, N_kv, T_kv, D))
+            v = mx.random.normal(shape=(B, N_kv, T_kv, D))
+            scale = D ** -0.5
+            out = mx.fast.scaled_dot_product_attention(q, k, v, scale=scale, mask="causal")
      )pbdoc");

  m.def(
--- a/setup.py
+++ b/setup.py
@@ -97,11 +97,7 @@ class CMakeBuild(build_ext):
        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
-            # self.parallel is a Python 3 only way to set parallel jobs by hand
-            # using -j in the build_ext call, not supported by pip or PyPA-build.
-            if hasattr(self, "parallel") and self.parallel:
-                # CMake 3.12+ only.
-                build_args += [f"-j{self.parallel}"]
+            build_args += [f"-j{os.cpu_count()}"]

        build_temp = Path(self.build_temp) / ext.name
        if not build_temp.exists():
Author	SHA1	Message	Date
Awni Hannun	a4fcc893cd	auto build linux release (#2341 )	2025-07-07 09:29:23 -07:00
Cheng	9d10239af7	[CUDA] Do vectorized store/load in binary ops (#2330 )	2025-07-07 08:44:14 -07:00
Cheng	19facd4b20	Build with all cpu cores by default (#2336 )	2025-07-07 06:06:45 -07:00
Angelos Katharopoulos	f5299f72cd	Fix layernorm race condition (#2340 )	2025-07-07 06:06:01 -07:00