diff --git a/.circleci/config.yml b/.circleci/config.yml
index 0ea9303db..205a930af 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -16,6 +16,9 @@ parameters:
   linux_release:
     type: boolean
     default: false
+  cuda_release:
+    type: boolean
+    default: false
 
 jobs:
   build_documentation:
@@ -104,7 +107,7 @@ jobs:
           command: |
             echo "stubs"
             pip install typing_extensions
-            python setup.py generate_stubs
+            python setup.py generate_stubs
       - run:
           name: Run Python tests
           command: |
@@ -162,7 +165,7 @@ jobs:
          command: |
            source env/bin/activate
            pip install typing_extensions
-           python setup.py generate_stubs
+           python setup.py generate_stubs
      - run:
          name: Run Python tests
          command: |
@@ -223,7 +226,6 @@ jobs:
          command: |
            sudo apt-get update
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
-           sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev
            python -m venv env
            source env/bin/activate
            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
@@ -283,7 +285,7 @@ jobs:
          command: |
            source env/bin/activate
            pip install typing_extensions
-           python setup.py generate_stubs
+           python setup.py generate_stubs
      - run:
          name: Build Python package
          command: |
@@ -342,7 +344,7 @@ jobs:
            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
            pip install . -v
            pip install typing_extensions
-           python setup.py generate_stubs
+           python setup.py generate_stubs
            << parameters.extra_env >> \
            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
            python -m build --wheel
@@ -356,6 +358,48 @@ jobs:
      - store_artifacts:
          path: wheelhouse/
 
+  build_cuda_release:
+    parameters:
+      python_version:
+        type: string
+        default: "3.9"
+      extra_env:
+        type: string
+        default: "DEV_RELEASE=1"
+    machine:
+      image: linux-cuda-12:default
+    resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - run:
+          name: Build wheel
+          command: |
+            sudo apt-get update
+            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            python -m venv env
+            source env/bin/activate
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            << parameters.extra_env >> \
+            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+            pip install ".[dev]" -v
+            python setup.py generate_stubs
+            << parameters.extra_env >> \
+            CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
+            CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+            python -m build --wheel
+            bash python/scripts/repair_cuda.sh
+      - run:
+          name: Upload package
+          command: |
+            source env/bin/activate
+            twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+
 workflows:
   build_and_test:
     when:
@@ -625,3 +669,14 @@ workflows:
           parameters:
             python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
             extra_env: ["PYPI_RELEASE=1"]
+  cuda_test_release:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.cuda_release >>
+    jobs:
+      - build_cuda_release:
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              extra_env: ["PYPI_RELEASE=1"]
diff --git a/docs/src/install.rst b/docs/src/install.rst
index 059b2cba4..22de94f90 100644
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -30,6 +30,16 @@ MLX is also available on conda-forge. To install MLX with conda do:
 
     conda install conda-forge::mlx
 
+CUDA
+^^^^
+
+MLX has a CUDA backend which you can use on any Linux platform with CUDA 12
+and SM 7.0 (Volta) and up. To install MLX with CUDA support, run:
+
+.. code-block:: shell
+
+    pip install mlx-cuda
+
 Troubleshooting
 ^^^^^^^^^^^^^^^
 
@@ -65,6 +75,8 @@ Build Requirements
 Python API
 ^^^^^^^^^^
 
+.. _python install:
+
 To build and install the MLX python library from source, first, clone MLX
 from `its GitHub repo <https://github.com/ml-explore/mlx>`_:
@@ -107,6 +119,8 @@ IDE:
 C++ API
 ^^^^^^^
 
+.. _cpp install:
+
 Currently, MLX must be built and installed from source.
 
 Similarly to the python library, to build and install the MLX C++ library start
@@ -185,6 +199,7 @@ should point to the path to the built metal library.
 
     xcrun -sdk macosx --show-sdk-version
 
+
 Binary Size Minimization
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -213,6 +228,50 @@ be anywhere from a few hundred milliseconds to a few seconds depending on the
 application. Once a kernel is compiled, it will be cached by the system. The
 Metal kernel cache persists across reboots.
 
+Linux
+^^^^^
+
+To build from source on Linux (CPU only), install the BLAS and LAPACK headers.
+For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+    apt-get update -y
+    apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+From here, follow the instructions to install either the
+:ref:`Python <python install>` or :ref:`C++ <cpp install>` APIs.
+
+CUDA
+^^^^
+
+To build from source on Linux with CUDA, install the BLAS and LAPACK headers
+and the CUDA toolkit. For example on Ubuntu, run the following:
+
+.. code-block:: shell
+
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+    dpkg -i cuda-keyring_1.1-1_all.deb
+    apt-get update -y
+    apt-get -y install cuda-toolkit-12-9
+    apt-get install libblas-dev liblapack-dev liblapacke-dev -y
+
+When building either the Python or C++ APIs make sure to pass the cmake flag
+``MLX_BUILD_CUDA=ON``. For example, to build the Python API run:
+
+.. code-block:: shell
+
+    CMAKE_BUILD_PARALLEL_LEVEL=8 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"
+
+To build the C++ package run:
+
+.. code-block:: shell
+
+    mkdir -p build && cd build
+    cmake .. -DMLX_BUILD_CUDA=ON && make -j
+
+
 Troubleshooting
 ^^^^^^^^^^^^^^^
 
diff --git a/mlx/backend/cuda/device.cpp b/mlx/backend/cuda/device.cpp
index fcf7fdf5e..ba31c0e45 100644
--- a/mlx/backend/cuda/device.cpp
+++ b/mlx/backend/cuda/device.cpp
@@ -114,7 +114,7 @@ void CommandEncoder::synchronize() {
   std::future<void> f = p->get_future();
   add_completed_handler([p = std::move(p)]() { p->set_value(); });
   worker_.end_batch();
-  worker_.commit();
+  commit();
   f.wait();
 }
 
diff --git a/python/scripts/repair_cuda.sh b/python/scripts/repair_cuda.sh
new file mode 100644
index 000000000..21e6a977a
--- /dev/null
+++ b/python/scripts/repair_cuda.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+auditwheel repair dist/* \
+  --plat manylinux_2_35_x86_64 \
+  --exclude libcublas* \
+  --exclude libnvrtc*
+
+cd wheelhouse
+repaired_wheel=$(find . -name "*.whl" -print -quit)
+unzip -q "${repaired_wheel}"
+core_so=$(find mlx -name "core*.so" -print -quit)
+rpath=$(patchelf --print-rpath "${core_so}")
+rpath=$rpath:\$ORIGIN/../nvidia/cublas/lib:\$ORIGIN/../nvidia/cuda_nvrtc/lib
+patchelf --force-rpath --set-rpath "$rpath" "$core_so"
+
+# Re-zip the repaired wheel
+zip -r -q "${repaired_wheel}" .
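In the ``build_cuda_release`` job above, ``repair_cuda.sh`` runs immediately
after ``python -m build --wheel``: ``auditwheel`` grafts external shared
libraries into the wheel while excluding cuBLAS and NVRTC, and ``patchelf``
then appends ``$ORIGIN``-relative entries to the extension module's rpath so
those two libraries are resolved at runtime from the NVIDIA pip packages
(declared in ``setup.py`` below) instead of being bundled. A quick spot check
of the result might look like the following; this is a hypothetical sketch,
run from inside the unpacked wheel in ``wheelhouse/``, and the exact ``.so``
filename depends on the Python version:

.. code-block:: shell

    # Print the patched rpath of the extension module (the script locates it
    # with `find mlx -name "core*.so"`, since the name varies by Python version).
    patchelf --print-rpath mlx/core*.so
    # The output should end with the entries appended by the script:
    #   $ORIGIN/../nvidia/cublas/lib:$ORIGIN/../nvidia/cuda_nvrtc/lib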
diff --git a/setup.py b/setup.py
index d742e6595..35f2e68ef 100644
--- a/setup.py
+++ b/setup.py
@@ -174,20 +174,26 @@ if __name__ == "__main__":
     )
     package_dir = {"": "python"}
     package_data = {"mlx": ["lib/*", "include/*", "share/*"], "mlx.core": ["*.pyi"]}
+    install_requires = []
+    build_cuda = "MLX_BUILD_CUDA=ON" in os.environ.get("CMAKE_ARGS", "")
+    if build_cuda:
+        install_requires = ["nvidia-cublas-cu12", "nvidia-cuda-nvrtc-cu12"]
 
     setup(
-        name="mlx",
+        name="mlx-cuda" if build_cuda else "mlx",
         version=get_version(),
         author="MLX Contributors",
         author_email="mlx@group.apple.com",
         description="A framework for machine learning on Apple silicon.",
         long_description=long_description,
         long_description_content_type="text/markdown",
+        license="MIT",
         url="https://github.com/ml-explore/mlx",
         packages=packages,
         package_dir=package_dir,
         package_data=package_data,
         include_package_data=True,
+        install_requires=install_requires,
         extras_require={
             "dev": [
                 "nanobind==2.4.0",
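Together, these ``setup.py`` changes mean a CUDA build is published under the
name ``mlx-cuda`` and declares ``nvidia-cublas-cu12`` and
``nvidia-cuda-nvrtc-cu12`` as install requirements, so the libraries excluded
by the repair script are supplied from PyPI at install time. A minimal smoke
test of the finished wheel might look like this; a hypothetical sketch
assuming a Linux machine with CUDA 12 and an SM 7.0+ GPU:

.. code-block:: shell

    pip install mlx-cuda
    # Import the package and run a tiny computation on the default device.
    python -c "import mlx.core as mx; print(mx.default_device()); print(mx.sum(mx.ones((4, 4))))"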