diff --git a/.circleci/config.yml b/.circleci/config.yml
index 33e83c6d4..62c1715eb 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -272,7 +272,17 @@ jobs:
           name: Build Python package
           command: |
             source env/bin/activate
-            << parameters.build_env >> python -m build -w
+            << parameters.build_env >> MLX_BUILD_STAGE=1 python -m build -w
+      - when:
+          condition:
+            equal: ["3.9", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  source env/bin/activate
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 python -m build -w
       - when:
           condition: << parameters.build_env >>
           steps:
@@ -292,42 +302,39 @@ jobs:
       build_env:
         type: string
         default: ""
-    docker:
-      - image: ubuntu:20.04
+    machine:
+      image: ubuntu-2204:current
+    resource_class: large
     steps:
       - checkout
       - run:
           name: Build wheel
           command: |
            PYTHON=python<< parameters.python_version >>
-           apt-get update
-           apt-get upgrade -y
-           DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
-           apt-get install -y apt-utils
-           apt-get install -y software-properties-common
-           add-apt-repository -y ppa:deadsnakes/ppa
-           apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
-           apt-get install -y libblas-dev liblapack-dev liblapacke-dev
-           apt-get install -y build-essential git
+           export DEBIAN_FRONTEND=noninteractive
+           export NEEDRESTART_MODE=a
+           sudo apt-get update
+           sudo apt-get upgrade -y
+           TZ=Etc/UTC sudo apt-get -y install tzdata
+           sudo apt-get install -y apt-utils
+           sudo apt-get install -y software-properties-common
+           sudo add-apt-repository -y ppa:deadsnakes/ppa
+           sudo apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
+           sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+           sudo apt-get install -y build-essential git
            $PYTHON -m venv env
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-           pip install nanobind==2.4.0
-           pip install --upgrade setuptools
-           pip install numpy
            pip install auditwheel
            pip install patchelf
            pip install build
            pip install twine
-           << parameters.build_env >> pip install . -v
+           << parameters.build_env >> pip install ".[dev]" -v
            pip install typing_extensions
            python setup.py generate_stubs
-           << parameters.build_env >> python -m build --wheel
-           auditwheel show dist/*
-           auditwheel repair dist/* --plat manylinux_2_31_x86_64
-           << parameters.build_env >> MLX_BUILD_COMMON=1 \
-             python -m build --wheel --outdir wheelhouse
+           MLX_BUILD_STAGE=1 << parameters.build_env >> python -m build -w
+           bash python/scripts/repair_linux.sh
      - when:
          condition:
            equal: ["3.9", << parameters.python_version >>]
@@ -336,8 +343,10 @@ jobs:
          steps:
            - run:
                name: Build common package
                command: |
                  source env/bin/activate
-                 << parameters.build_env >> MLX_BUILD_COMMON=1 \
-                   python -m build --wheel --outdir wheelhouse
+                 python setup.py clean --all
+                 << parameters.build_env >> MLX_BUILD_STAGE=2 \
+                   python -m build -w
+                 auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_x86_64
      - when:
          condition: << parameters.build_env >>
          steps:
@@ -371,9 +380,9 @@ jobs:
            pip install patchelf
            pip install build
            pip install twine
-           << parameters.build_env >> \
+           << parameters.build_env >> MLX_BUILD_STAGE=2 \
             CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
-             python -m build --wheel
+             python -m build -w
            bash python/scripts/repair_cuda.sh
      - when:
          condition: << parameters.build_env >>
          steps:
@@ -506,7 +515,6 @@ workflows:
              ignore: /.*/
          matrix:
            parameters:
-             python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
              build_env: ["PYPI_RELEASE=1"]

  prb:
@@ -587,20 +595,7 @@ workflows:
            xcode_version: "15.0.0"
            python_version: "3.13"
-     - build_linux_release:
-         filters:
-           tags:
-             only: /^v.*/
-           branches:
-             ignore: /.*/
-         matrix:
-           parameters:
-             python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-
-     - build_cuda_release:
-         filters:
-           tags:
-             only: /^v.*/
-           branches:
-             ignore: /.*/
          matrix:
            parameters:
              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+     - build_cuda_release
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4bf8d2d3e..9e67e4bf2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,10 +64,8 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
       message(WARNING "Building for x86_64 arch is not officially supported.")
     endif()
   endif()
-
 else()
   set(MLX_BUILD_METAL OFF)
-  message(WARNING "MLX is prioritised for Apple silicon systems using macOS.")
 endif()
 
 # ----------------------------- Lib -----------------------------
diff --git a/docs/src/install.rst b/docs/src/install.rst
index 7c1a02b62..70491ac64 100644
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -23,13 +23,6 @@ To install from PyPI you must meet the following requirements:
 
   MLX is only available on devices running macOS >= 13.5
   It is highly recommended to use macOS 14 (Sonoma)
 
-
-MLX is also available on conda-forge. To install MLX with conda do:
-
-.. code-block:: shell
-
-  conda install conda-forge::mlx
-
 CUDA
 ^^^^
 
@@ -40,7 +33,7 @@ and SM 7.0 (Volta) and up.
 To install MLX with CUDA support, run:
 
     pip install "mlx[cuda]"
 
-CPU only (Linux)
+CPU-only (Linux)
 ^^^^^^^^^^^^^^^^
 
 For a CPU-only version of MLX that runs on Linux use:
diff --git a/pyproject.toml b/pyproject.toml
index ad0d2e328..6fcd5d16c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [build-system]
 requires = [
-    "setuptools>=42",
+    "setuptools>=80",
     "nanobind==2.4.0",
     "cmake>=3.25",
 ]
diff --git a/python/scripts/repair_cuda.sh b/python/scripts/repair_cuda.sh
index 3584a7a8a..ec0a89930 100644
--- a/python/scripts/repair_cuda.sh
+++ b/python/scripts/repair_cuda.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 auditwheel repair dist/* \
-  --plat manylinux_2_35_x86_64 \
+  --plat manylinux_2_39_x86_64 \
   --exclude libcublas* \
   --exclude libnvrtc* \
   -w wheel_tmp
@@ -12,10 +12,12 @@ cd wheel_tmp
 repaired_wheel=$(find . -name "*.whl" -print -quit)
 unzip -q "${repaired_wheel}"
 rm "${repaired_wheel}"
-core_so=$(find mlx -name "core*.so" -print -quit)
-rpath=$(patchelf --print-rpath "${core_so}")
-rpath=$rpath:\$ORIGIN/../nvidia/cublas/lib:\$ORIGIN/../nvidia/cuda_nvrtc/lib
-patchelf --force-rpath --set-rpath "$rpath" "$core_so"
+mlx_so="mlx/lib/libmlx.so"
+rpath=$(patchelf --print-rpath "${mlx_so}")
+base="\$ORIGIN/../../nvidia"
+rpath=$rpath:${base}/cublas/lib:${base}/cuda_nvrtc/lib
+patchelf --force-rpath --set-rpath "$rpath" "$mlx_so"
+python ../python/scripts/repair_record.py ${mlx_so}
 
 # Re-zip the repaired wheel
 zip -r -q "../wheelhouse/${repaired_wheel}" .
diff --git a/python/scripts/repair_linux.sh b/python/scripts/repair_linux.sh
new file mode 100644
index 000000000..82cf49060
--- /dev/null
+++ b/python/scripts/repair_linux.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+auditwheel repair dist/* \
+  --plat manylinux_2_35_x86_64 \
+  --exclude libmlx* \
+  -w wheel_tmp
+
+mkdir wheelhouse
+cd wheel_tmp
+repaired_wheel=$(find . -name "*.whl" -print -quit)
+unzip -q "${repaired_wheel}"
+rm "${repaired_wheel}"
+core_so=$(find mlx -name "core*.so" -print -quit)
+rpath="\$ORIGIN/lib"
+patchelf --force-rpath --set-rpath "$rpath" "$core_so"
+python ../python/scripts/repair_record.py ${core_so}
+
+# Re-zip the repaired wheel
+zip -r -q "../wheelhouse/${repaired_wheel}" .
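Note on the repair scripts: both rewrite the RPATH of the binary they ship and then re-zip the wheel, so the RECORD entry for the patched file also has to be refreshed; that is what python/scripts/repair_record.py (added below) does. A quick sanity check after running repair_linux.sh might look like this (hypothetical commands, not part of this diff; the paths assume the wheel_tmp/wheelhouse layout used by the script above):

# expect the extension to resolve libmlx.so relative to itself, i.e. $ORIGIN/lib
patchelf --print-rpath wheel_tmp/mlx/core*.so
# the RECORD inside the repaired wheel should carry the new hash/size of the patched .so
unzip -p wheelhouse/*.whl '*dist-info/RECORD' | grep 'core.*\.so'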
diff --git a/python/scripts/repair_record.py b/python/scripts/repair_record.py
new file mode 100644
index 000000000..1738fd5ad
--- /dev/null
+++ b/python/scripts/repair_record.py
@@ -0,0 +1,33 @@
+import base64
+import glob
+import hashlib
+import sys
+
+filename = sys.argv[1]
+
+
+# Compute the new hash and size
+def urlsafe_b64encode(data: bytes) -> bytes:
+    return base64.urlsafe_b64encode(data).rstrip(b"=")
+
+
+hasher = hashlib.sha256()
+with open(filename, "rb") as f:
+    data = f.read()
+    hasher.update(data)
+hash_str = urlsafe_b64encode(hasher.digest()).decode("ascii")
+size = len(data)
+
+# Update the record file
+record_file = glob.glob("*/RECORD")[0]
+with open(record_file, "r") as f:
+    lines = [l.split(",") for l in f.readlines()]
+
+for l in lines:
+    if filename == l[0]:
+        l[1] = hash_str
+        l[2] = f"{size}\n"
+
+with open(record_file, "w") as f:
+    for l in lines:
+        f.write(",".join(l))
diff --git a/setup.py b/setup.py
index 4f141c517..6cc4015c3 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@ from pathlib import Path
 from subprocess import run
 
 from setuptools import Command, Extension, setup
+from setuptools.command.bdist_wheel import bdist_wheel
 from setuptools.command.build_ext import build_ext
 
@@ -42,6 +43,9 @@ def get_version():
     return version
 
 
+build_stage = int(os.environ.get("MLX_BUILD_STAGE", 0))
+
+
 # A CMakeExtension needs a sourcedir instead of a file list.
 # The name must be the _single_ output extension from the CMake build.
 # If you need multiple extensions, see scikit-build.
@@ -60,13 +64,22 @@ class CMakeBuild(build_ext):
         debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
         cfg = "Debug" if debug else "Release"
 
-        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
-        # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code
-        # from Python.
+        build_temp = Path(self.build_temp) / ext.name
+        if not build_temp.exists():
+            build_temp.mkdir(parents=True)
+
+        build_python = "ON"
+        install_prefix = f"{extdir}{os.sep}"
+        if build_stage == 1:
+            # Don't include MLX libraries in the wheel
+            install_prefix = f"{build_temp}"
+        elif build_stage == 2:
+            # Don't include Python bindings in the wheel
+            build_python = "OFF"
         cmake_args = [
-            f"-DCMAKE_INSTALL_PREFIX={extdir}{os.sep}",
+            f"-DCMAKE_INSTALL_PREFIX={install_prefix}",
             f"-DCMAKE_BUILD_TYPE={cfg}",
-            "-DMLX_BUILD_PYTHON_BINDINGS=ON",
+            f"-DMLX_BUILD_PYTHON_BINDINGS={build_python}",
             "-DMLX_BUILD_TESTS=OFF",
             "-DMLX_BUILD_BENCHMARKS=OFF",
             "-DMLX_BUILD_EXAMPLES=OFF",
@@ -100,10 +113,6 @@ class CMakeBuild(build_ext):
         if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
             build_args += [f"-j{os.cpu_count()}"]
 
-        build_temp = Path(self.build_temp) / ext.name
-        if not build_temp.exists():
-            build_temp.mkdir(parents=True)
-
         subprocess.run(
             ["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True
         )
@@ -159,15 +168,22 @@ class GenerateStubs(Command):
         subprocess.run(stub_cmd + ["-o", f"{out_path}/__init__.pyi"])
 
 
+class MLXBdistWheel(bdist_wheel):
+    def get_tag(self) -> tuple[str, str, str]:
+        impl, abi, plat_name = super().get_tag()
+        if build_stage == 2:
+            impl = self.python_tag
+            abi = "none"
+        return (impl, abi, plat_name)
+
+
 # Read the content of README.md
 with open(Path(__file__).parent / "README.md", encoding="utf-8") as f:
     long_description = f.read()
 
-# The information here can also be placed in setup.cfg - better separation of
-# logic and declaration, and simpler if you include description/version in a file.
+
 if __name__ == "__main__":
     package_dir = {"": "python"}
-    package_data = {"mlx": ["lib/*", "include/*", "share/*"], "mlx.core": ["*.pyi"]}
     packages = [
         "mlx",
         "mlx.nn",
@@ -175,10 +191,8 @@ if __name__ == "__main__":
         "mlx.optimizers",
     ]
 
-    is_release = "PYPI_RELEASE" in os.environ
     build_macos = platform.system() == "Darwin"
     build_cuda = "MLX_BUILD_CUDA=ON" in os.environ.get("CMAKE_ARGS", "")
-    build_common = "MLX_BUILD_COMMON" in os.environ
 
     install_requires = []
     if build_cuda:
@@ -195,19 +209,26 @@ if __name__ == "__main__":
         long_description_content_type="text/markdown",
         license="MIT",
         url="https://github.com/ml-explore/mlx",
+        include_package_data=True,
         package_dir=package_dir,
-        package_data=package_data,
         zip_safe=False,
         python_requires=">=3.9",
-        install_requires=install_requires,
+        ext_modules=[CMakeExtension("mlx.core")],
+        cmdclass={
+            "build_ext": CMakeBuild,
+            "generate_stubs": GenerateStubs,
+            "bdist_wheel": MLXBdistWheel,
+        },
     )
 
+    package_data = {"mlx": ["lib/*", "include/*", "share/*"], "mlx.core": ["*.pyi"]}
+
     extras = {
         "dev": [
             "nanobind==2.4.0",
             "numpy",
             "pre-commit",
-            "setuptools>=42",
+            "setuptools>=80",
             "torch",
             "typing_extensions",
         ],
@@ -219,32 +240,46 @@ if __name__ == "__main__":
         ]
     }
 
-    if not is_release or build_macos:
+    # Release builds for PyPI are in two stages.
+    # Each stage should be run from a clean build:
+    #   python setup.py clean --all
+    #
+    # Stage 1:
+    # - Triggered with `MLX_BUILD_STAGE=1`
+    # - Includes everything except backend-specific binaries (e.g. libmlx.so, mlx.metallib, etc.)
+    # - Wheel has Python ABI and platform tags
+    # - Wheel should be built for the cross-product of python version and platforms
+    # - Package name is mlx and it depends on the stage 2 subpackage (e.g. mlx-metal)
+    # Stage 2:
+    # - Triggered with `MLX_BUILD_STAGE=2`
+    # - Includes only backend-specific binaries (e.g. libmlx.so, mlx.metallib, etc.)
+    # - Wheel has only platform tags
+    # - Wheel should be built only for different platforms
+    # - Package name is backend-specific, e.g. mlx-metal
+    if build_stage != 2:
+        if build_stage == 1:
+            if build_macos:
+                install_requires += [f"mlx-metal=={version}"]
+            else:
+                extras["cuda"] = [f"mlx-cuda=={version}"]
+                extras["cpu"] = [f"mlx-cpu=={version}"]
+
         _setup(
             name="mlx",
-            include_package_data=True,
             packages=packages,
             extras_require=extras,
             entry_points=entry_points,
-            ext_modules=[CMakeExtension("mlx.core")],
-            cmdclass={"build_ext": CMakeBuild, "generate_stubs": GenerateStubs},
-        )
-    elif build_common:
-        extras["cpu"] = [f"mlx-cpu=={version}"]
-        extras["cuda"] = [f"mlx-cuda=={version}"]
-        _setup(
-            name="mlx",
-            packages=["mlx"],
-            extras_require=extras,
-            entry_points=entry_points,
-            exclude_package_data=package_data,
+            install_requires=install_requires,
+            package_data=package_data,
         )
     else:
+        if build_macos:
+            name = "mlx-metal"
+        elif build_cuda:
+            name = "mlx-cuda"
+        else:
+            name = "mlx-cpu"
         _setup(
-            name="mlx-cuda" if build_cuda else "mlx-cpu",
-            include_package_data=True,
-            packages=packages,
-            extras_require=extras,
-            ext_modules=[CMakeExtension("mlx.core")],
-            cmdclass={"build_ext": CMakeBuild, "generate_stubs": GenerateStubs},
+            name=name,
+            packages=["mlx"],
         )
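For reference, the two-stage flow described in the setup.py comments can be exercised locally with the same commands the CI uses (a sketch assembled from the config.yml changes above; PYPI_RELEASE and the CUDA CMAKE_ARGS are only needed for the release and CUDA variants):

# Stage 1: the `mlx` wheel -- Python bindings and stubs, no backend binaries
python setup.py clean --all
MLX_BUILD_STAGE=1 python -m build -w

# Stage 2: the backend wheel (mlx-metal on macOS, mlx-cuda or mlx-cpu on Linux)
python setup.py clean --all
MLX_BUILD_STAGE=2 python -m build -w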