From 02efb310cac667bc547d1b96f21596c221f84fe7 Mon Sep 17 00:00:00 2001
From: Awni Hannun <awni@apple.com>
Date: Tue, 10 Sep 2024 15:15:17 -0700
Subject: [PATCH] Xcode 160 (#1384)

* xcode 16.0 with debug tests

* limit nproc for builds

* vmap bug

* assert bug

* run python tests in debug mode

* fix view, bool copies preserve bits

* actual view fix
---
 .circleci/config.yml              | 47 ++++++++++++++++++++-----------
 docs/src/install.rst              |  6 ++--
 mlx/backend/common/primitives.cpp | 13 +++++++--
 mlx/backend/metal/rope.cpp        |  1 -
 python/src/transforms.cpp         |  2 +-
 5 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index bd207ae03..8548c15c4 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -38,8 +38,12 @@ jobs:
       - run:
           name: Install Python package
           command: |
-            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" CMAKE_BUILD_PARALLEL_LEVEL="" python3 setup.py build_ext --inplace
-            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" CMAKE_BUILD_PARALLEL_LEVEL="" python3 setup.py develop
+            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              python3 setup.py build_ext --inplace
+            CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+              python3 setup.py develop
       - run:
           name: Generate package stubs
           command: |
@@ -53,7 +57,9 @@ jobs:
       - run:
           name: Build CPP only
           command: |
-            mkdir -p build && cd build && cmake .. -DMLX_BUILD_METAL=OFF && make -j
+            mkdir -p build && cd build 
+            cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
+            make -j `nproc`
       - run:
           name: Run CPP tests
           command: ./build/tests/tests
@@ -86,7 +92,7 @@ jobs:
           name: Install Python package
           command: |
             source env/bin/activate
-            CMAKE_BUILD_PARALLEL_LEVEL="" pip install -e . -v
+            DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install -e . -v
       - run:
           name: Generate package stubs
           command: |
@@ -113,7 +119,7 @@ jobs:
           name: Build CPP only
           command: |
             source env/bin/activate
-            mkdir -p build && cd build && cmake .. && make -j
+            mkdir -p build && cd build && cmake .. && make -j `sysctl -n hw.ncpu`
       - run:
           name: Run CPP tests
           command: |
@@ -123,14 +129,23 @@ jobs:
           command: |
             source env/bin/activate
             cd build/
-            cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel -DBUILD_SHARED_LIBS=ON -DMLX_BUILD_CPU=OFF -DMLX_BUILD_SAFETENSORS=OFF -DMLX_BUILD_GGUF=OFF -DMLX_METAL_JIT=ON
-            make -j
+            cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
+              -DBUILD_SHARED_LIBS=ON \
+              -DMLX_BUILD_CPU=OFF \
+              -DMLX_BUILD_SAFETENSORS=OFF \
+              -DMLX_BUILD_GGUF=OFF \
+              -DMLX_METAL_JIT=ON
+            make -j `sysctl -n hw.ncpu`
       - run:
           name: Run Python tests with JIT
           command: |
             source env/bin/activate
-            CMAKE_BUILD_PARALLEL_LEVEL="" CMAKE_ARGS="-DMLX_METAL_JIT=ON" pip install -e . -v
-            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu_jit
+            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
+              CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
+              pip install -e . -v
+            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
+              METAL_DEBUG_ERROR_MODE=0 \
+              python -m xmlrunner discover -v python/tests -o test-results/gpu_jit
 
   build_release:
     parameters:
@@ -167,7 +182,7 @@ jobs:
           command: |
             source env/bin/activate
             DEV_RELEASE=1 \
-              CMAKE_BUILD_PARALLEL_LEVEL="" \
+              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
               pip install . -v
       - run:
           name: Generate package stubs
@@ -180,7 +195,7 @@ jobs:
           command: |
             source env/bin/activate
             << parameters.build_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL="" \
+              CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
               python -m build -w
       - when:
           condition: << parameters.build_env >>
@@ -229,12 +244,12 @@ jobs:
             pip install patchelf
             pip install build
             << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL="" \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
               pip install . -v
             pip install typing_extensions
             python setup.py generate_stubs 
             << parameters.extra_env >> \
-              CMAKE_BUILD_PARALLEL_LEVEL="" \
+              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
               python -m build --wheel
             auditwheel show dist/*
             auditwheel repair dist/* --plat manylinux_2_31_x86_64
@@ -255,7 +270,7 @@ workflows:
       - mac_build_and_test:
           matrix:
             parameters:
-              xcode_version: ["15.0.0", "15.2.0"]
+              xcode_version: ["15.0.0", "15.2.0", "16.0.0"]
       - linux_build_and_test
 
   build_pypi_release:
@@ -290,7 +305,7 @@ workflows:
           requires: [ hold ]
           matrix:
             parameters:
-              xcode_version: ["15.0.0", "15.2.0"]
+              xcode_version: ["15.0.0", "15.2.0", "16.0.0"]
       - linux_build_and_test:
           requires: [ hold ]
   nightly_build:
@@ -314,7 +329,7 @@ workflows:
           matrix:
             parameters:
               python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
-              xcode_version: ["15.0.0", "15.2.0"]
+              xcode_version: ["15.0.0", "15.2.0", "16.0.0"]
               build_env: ["DEV_RELEASE=1"]
   linux_test_release:
     when:
diff --git a/docs/src/install.rst b/docs/src/install.rst
index c8cf5723b..edc3d6143 100644
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -74,20 +74,20 @@ Then simply build and install MLX using pip:
 
 .. code-block:: shell
 
-  CMAKE_BUILD_PARALLEL_LEVEL="" pip install .
+  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install .
 
 For developing, install the package with development dependencies, and use an
 editable install:
 
 .. code-block:: shell
 
-  CMAKE_BUILD_PARALLEL_LEVEL="" pip install -e ".[dev]"
+  CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -e ".[dev]"
 
 Once the development dependencies are installed, you can build faster with:
 
 .. code-block:: shell
 
- CMAKE_BUILD_PARALLEL_LEVEL="" python setup.py build_ext -j --inplace
+ CMAKE_BUILD_PARALLEL_LEVEL=8 python setup.py build_ext --inplace
 
 Run the tests with:
 
diff --git a/mlx/backend/common/primitives.cpp b/mlx/backend/common/primitives.cpp
index 14aa52bad..23c5efa19 100644
--- a/mlx/backend/common/primitives.cpp
+++ b/mlx/backend/common/primitives.cpp
@@ -612,11 +612,18 @@ void View::eval_cpu(const std::vector<array>& inputs, array& out) {
       strides[i] /= obytes;
     }
     out.copy_shared_buffer(
-        in, strides, in.flags(), in.data_size() * obytes / ibytes);
+        in, strides, in.flags(), in.data_size() * ibytes / obytes);
   } else {
-    auto tmp = array(in.shape(), in.dtype(), nullptr, {});
+    auto tmp = array(
+        in.shape(), in.dtype() == bool_ ? uint8 : in.dtype(), nullptr, {});
     tmp.set_data(allocator::malloc_or_wait(tmp.nbytes()));
-    copy_inplace(in, tmp, CopyType::General);
+    if (in.dtype() == bool_) {
+      auto in_tmp = array(in.shape(), uint8, nullptr, {});
+      in_tmp.copy_shared_buffer(in);
+      copy_inplace(in_tmp, tmp, CopyType::General);
+    } else {
+      copy_inplace(in, tmp, CopyType::General);
+    }
 
     auto flags = out.flags();
     flags.contiguous = true;
diff --git a/mlx/backend/metal/rope.cpp b/mlx/backend/metal/rope.cpp
index d1d07df2c..fc6aa347c 100644
--- a/mlx/backend/metal/rope.cpp
+++ b/mlx/backend/metal/rope.cpp
@@ -10,7 +10,6 @@ constexpr int n_per_thread = 4;
 void RoPE::eval_gpu(
     const std::vector<array>& inputs,
     std::vector<array>& outputs) {
-  assert(inputs.size() == 1);
   assert(outputs.size() == 1);
   auto& in = inputs[0];
   auto& out = outputs[0];
diff --git a/python/src/transforms.cpp b/python/src/transforms.cpp
index 32c5b94b8..82759cfcc 100644
--- a/python/src/transforms.cpp
+++ b/python/src/transforms.cpp
@@ -803,7 +803,7 @@ class PyCustomFunction {
             "[custom vmap] Function should only accept positional arguments");
       }
 
-      int arr_index;
+      int arr_index = 0;
       auto new_axes =
           nb::cast<nb::tuple>(tree_map(args, [&](nb::handle element) {
             int axis = axes[arr_index++];