diff --git a/docs/src/install.rst b/docs/src/install.rst index bdad740ea..11a70ca4e 100644 --- a/docs/src/install.rst +++ b/docs/src/install.rst @@ -186,8 +186,8 @@ should point to the path to the built metal library. Binary Size Minimization ~~~~~~~~~~~~~~~~~~~~~~~~ -To produce a smaller binary use the CMake flags `CMAKE_BUILD_TYPE=MinSizeRel` -and `BUILD_SHARED_LIBS=ON`. +To produce a smaller binary use the CMake flags ``CMAKE_BUILD_TYPE=MinSizeRel`` +and ``BUILD_SHARED_LIBS=ON``. The MLX CMake build has several additional options to make smaller binaries. For example, if you don't need the CPU backend or support for safetensors and GGUF, you can do: -DMLX_BUILD_GGUF=OFF \ -DMLX_METAL_JIT=ON -THE `MLX_METAL_JIT` flag minimizes the size of the MLX Metal library which +The ``MLX_METAL_JIT`` flag minimizes the size of the MLX Metal library which contains pre-built GPU kernels. This substantially reduces the size of the Metal library by run-time compiling kernels the first time they are used in MLX on a given machine. 
Note run-time compilation incurs a cold-start cost which can diff --git a/mlx/ops.cpp b/mlx/ops.cpp index 6606cb5f1..f0d155c96 100644 --- a/mlx/ops.cpp +++ b/mlx/ops.cpp @@ -4321,8 +4321,9 @@ array bitwise_impl( } auto inputs = broadcast_arrays(astype(a, out_type, s), astype(b, out_type, s), s); + auto& out_shape = inputs[0].shape(); return array( - a.shape(), + out_shape, out_type, std::make_shared<BitwiseBinary>(to_stream(s), op), std::move(inputs)); diff --git a/python/tests/test_ops.py b/python/tests/test_ops.py index 0256154dd..fb724d2c9 100644 --- a/python/tests/test_ops.py +++ b/python/tests/test_ops.py @@ -2291,6 +2291,13 @@ class TestOps(mlx_tests.MLXTestCase): out_np = getattr(np, op)(a_np, b_np) self.assertTrue(np.array_equal(np.array(out_mlx), out_np)) + # Check broadcasting + a = mx.ones((3, 1, 5), dtype=mx.bool_) + b = mx.zeros((1, 2, 5), dtype=mx.bool_) + c = a | b + self.assertEqual(c.shape, (3, 2, 5)) + self.assertTrue(mx.array_equal(c, mx.ones((3, 2, 5), dtype=mx.bool_))) + def test_conjugate(self): shape = (3, 5, 7) a = np.random.normal(size=shape) + 1j * np.random.normal(size=shape)