diff --git a/.circleci/config.yml b/.circleci/config.yml
index b5636fa6c..7157949b7 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -38,6 +38,11 @@ jobs:
           name: Run the python tests
           command: |
             python3 -m unittest discover python/tests
+      # TODO: Reenable when extension api becomes stable
+      # - run:
+      #     name: Build example extension
+      #     command: |
+      #       cd examples/extensions && python3 -m pip install .
       - run:
           name: Build CPP only
           command: |
@@ -78,6 +83,13 @@ jobs:
             conda activate runner-env
             DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
             DEVICE=gpu python -m xmlrunner discover -v python/tests -o test-results/gpu
+      # TODO: Reenable when extension api becomes stable
+      # - run:
+      #     name: Build example extension
+      #     command: |
+      #       eval "$(conda shell.bash hook)"
+      #       conda activate runner-env
+      #       cd examples/extensions && python -m pip install .
       - store_test_results:
           path: test-results
       - run:
diff --git a/examples/extensions/axpby/axpby.cpp b/examples/extensions/axpby/axpby.cpp
index 6da2ff591..732dc43b6 100644
--- a/examples/extensions/axpby/axpby.cpp
+++ b/examples/extensions/axpby/axpby.cpp
@@ -104,7 +104,10 @@ void axpby_impl(
 }
 
 /** Fall back implementation for evaluation on CPU */
-void Axpby::eval(const std::vector<array>& inputs, array& out) {
+void Axpby::eval(
+    const std::vector<array>& inputs,
+    std::vector<array>& out_arr) {
+  auto out = out_arr[0];
   // Check the inputs (registered in the op while constructing the out array)
   assert(inputs.size() == 2);
   auto& x = inputs[0];
@@ -175,7 +178,10 @@ void axpby_impl_accelerate(
 }
 
 /** Evaluate primitive on CPU using accelerate specializations */
-void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Axpby::eval_cpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outarr) {
+  auto out = outarr[0];
   assert(inputs.size() == 2);
   auto& x = inputs[0];
   auto& y = inputs[1];
@@ -189,13 +195,15 @@ void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
   }
 
   // Fall back to common backend if specializations are not available
-  eval(inputs, out);
+  eval(inputs, outarr);
 }
 
 #else // Accelerate not available
 
 /** Evaluate primitive on CPU falling back to common backend */
-void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Axpby::eval_cpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& out) {
   eval(inputs, out);
 }
 
@@ -208,8 +216,11 @@ void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
 #ifdef _METAL_
 
 /** Evaluate primitive on GPU */
-void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
+void Axpby::eval_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outarr) {
   // Prepare inputs
+  auto out = outarr[0];
   assert(inputs.size() == 2);
   auto& x = inputs[0];
   auto& y = inputs[1];
@@ -295,7 +306,9 @@ void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
 #else // Metal is not available
 
 /** Fail evaluation on GPU */
-void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
+void Axpby::eval_gpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& out) {
   throw std::runtime_error("Axpby has no GPU implementation.");
 }
 
@@ -306,7 +319,7 @@ void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
 ///////////////////////////////////////////////////////////////////////////////
 
 /** The Jacobian-vector product. */
-array Axpby::jvp(
+std::vector<array> Axpby::jvp(
     const std::vector<array>& primals,
     const std::vector<array>& tangents,
     const std::vector<int>& argnums) {
@@ -321,32 +334,33 @@ array Axpby::jvp(
   if (argnums.size() > 1) {
     auto scale = argnums[0] == 0 ? alpha_ : beta_;
     auto scale_arr = array(scale, tangents[0].dtype());
-    return multiply(scale_arr, tangents[0], stream());
+    return {multiply(scale_arr, tangents[0], stream())};
   }
   // If, argnums = {0, 1}, we take contributions from both
   // which gives us jvp = tangent_x * alpha + tangent_y * beta
   else {
-    return axpby(tangents[0], tangents[1], alpha_, beta_, stream());
+    return {axpby(tangents[0], tangents[1], alpha_, beta_, stream())};
   }
 }
 
 /** The vector-Jacobian product. */
 std::vector<array> Axpby::vjp(
     const std::vector<array>& primals,
-    const array& cotan,
-    const std::vector<int>& argnums) {
+    const std::vector<array>& cotangents,
+    const std::vector<int>& argnums,
+    const std::vector<array>&) {
   // Reverse mode diff
   std::vector<array> vjps;
   for (auto arg : argnums) {
     auto scale = arg == 0 ? alpha_ : beta_;
-    auto scale_arr = array(scale, cotan.dtype());
-    vjps.push_back(multiply(scale_arr, cotan, stream()));
+    auto scale_arr = array(scale, cotangents[0].dtype());
+    vjps.push_back(multiply(scale_arr, cotangents[0], stream()));
   }
   return vjps;
 }
 
 /** Vectorize primitive along given axis */
-std::pair<array, int> Axpby::vmap(
+std::pair<std::vector<array>, std::vector<int>> Axpby::vmap(
     const std::vector<array>& inputs,
     const std::vector<int>& axes) {
   throw std::runtime_error("Axpby has no vmap implementation.");
diff --git a/examples/extensions/axpby/axpby.h b/examples/extensions/axpby/axpby.h
index 2b85dadb2..649d9600a 100644
--- a/examples/extensions/axpby/axpby.h
+++ b/examples/extensions/axpby/axpby.h
@@ -42,11 +42,13 @@ class Axpby : public Primitive {
    * To avoid unnecessary allocations, the evaluation function
    * is responsible for allocating space for the array.
    */
-  void eval_cpu(const std::vector<array>& inputs, array& out) override;
-  void eval_gpu(const std::vector<array>& inputs, array& out) override;
+  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& out)
+      override;
+  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& out)
+      override;
 
   /** The Jacobian-vector product. */
-  array jvp(
+  std::vector<array> jvp(
       const std::vector<array>& primals,
       const std::vector<array>& tangents,
       const std::vector<int>& argnums) override;
@@ -54,8 +56,9 @@ class Axpby : public Primitive {
   /** The vector-Jacobian product. */
   std::vector<array> vjp(
       const std::vector<array>& primals,
-      const array& cotan,
-      const std::vector<int>& argnums) override;
+      const std::vector<array>& cotangents,
+      const std::vector<int>& argnums,
+      const std::vector<array>& outputs) override;
 
   /**
    * The primitive must know how to vectorize itself across
   * the given axis. The output is a pair containing the array
   * representing the vectorized computation and the axis which
   * corresponds to the output vectorized dimension.
    */
-  std::pair<array, int> vmap(
+  std::pair<std::vector<array>, std::vector<int>> vmap(
      const std::vector<array>& inputs,
      const std::vector<int>& axes) override;
 
@@ -80,7 +83,7 @@ class Axpby : public Primitive {
   float beta_;
 
   /** Fall back implementation for evaluation on CPU */
-  void eval(const std::vector<array>& inputs, array& out);
+  void eval(const std::vector<array>& inputs, std::vector<array>& out);
 };
 
 } // namespace mlx::core
\ No newline at end of file
diff --git a/examples/extensions/pyproject.toml b/examples/extensions/pyproject.toml
new file mode 100644
index 000000000..1c5302936
--- /dev/null
+++ b/examples/extensions/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools>=42", "pybind11>=2.10", "cmake>=3.24", "mlx @ git+https://github.com/ml-explore/mlx@main"]
+build-backend = "setuptools.build_meta"
\ No newline at end of file
diff --git a/mlx/io/CMakeLists.txt b/mlx/io/CMakeLists.txt
index f12b2bd85..8e80cc4c5 100644
--- a/mlx/io/CMakeLists.txt
+++ b/mlx/io/CMakeLists.txt
@@ -14,6 +14,11 @@ target_include_directories(
   $<BUILD_INTERFACE:${json_SOURCE_DIR}/single_include/nlohmann>
   $<INSTALL_INTERFACE:include/json>
 )
+install(
+  DIRECTORY ${json_SOURCE_DIR}/
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/json
+  COMPONENT json_source
+)
 
 MESSAGE(STATUS "Downloading gguflib")
 FetchContent_Declare(gguflib
@@ -26,6 +31,12 @@ target_include_directories(
   $<BUILD_INTERFACE:${gguflib_SOURCE_DIR}>
   $<INSTALL_INTERFACE:include/gguflib>
 )
+install(
+  DIRECTORY ${gguflib_SOURCE_DIR}/
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gguflib
+  COMPONENT gguflib_source
+)
+
 add_library(
   gguflib STATIC
   ${gguflib_SOURCE_DIR}/fp16.c
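
Note: taken together, this diff moves the Axpby example to the multi-output Primitive
interface (outputs are passed as std::vector<array>, with single-output primitives
reading element 0) and adds a pyproject.toml so the extension builds with a plain
pip install. Once the disabled "Build example extension" CI steps above are
re-enabled, the installed extension could be smoke-tested roughly as below. This is
a minimal sketch, assuming the example package is exposed as mlx_sample_extensions
and binds the axpby operation from axpby.cpp; those names come from the MLX
extensions example, not from this diff:

    # Hypothetical usage after `cd examples/extensions && pip install .`
    import mlx.core as mx
    from mlx_sample_extensions import axpby  # assumed package/binding names

    x = mx.ones((3, 4))
    y = mx.ones((3, 4))

    # axpby(x, y, alpha, beta) computes alpha * x + beta * y,
    # here 4.0 * 1 + 2.0 * 1 = 6.0 elementwise.
    z = axpby(x, y, 4.0, 2.0)
    print(z)  # expected: a (3, 4) array filled with 6.0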