From 4fe2fa2a6404725f6cab743d74a1c71d85f45844 Mon Sep 17 00:00:00 2001
From: Juarez Bochi
Date: Tue, 23 Jan 2024 18:43:57 -0500
Subject: [PATCH] GGUF: Avoid dequantization when format is compatible (#426)

* GGUF: Don't dequantize q4_1
* Fix weight order. First in low bits
* Add unpacking for q4_0
* Don't dequantize q8_0
* rebase quants and split file
* don't quantize every weight
* reapply patch
* error handling

---------

Co-authored-by: Awni Hannun
---
 mlx/io/CMakeLists.txt  |   1 +
 mlx/io/gguf.cpp        |  51 ++++++++-----
 mlx/io/gguf.h          |  20 ++++++
 mlx/io/gguf_quants.cpp | 158 +++++++++++++++++++++++++++++++++++++++++
 tests/metal_tests.cpp  |   2 -
 5 files changed, 211 insertions(+), 21 deletions(-)
 create mode 100644 mlx/io/gguf.h
 create mode 100644 mlx/io/gguf_quants.cpp

diff --git a/mlx/io/CMakeLists.txt b/mlx/io/CMakeLists.txt
index 8e80cc4c5..9b799dfa3 100644
--- a/mlx/io/CMakeLists.txt
+++ b/mlx/io/CMakeLists.txt
@@ -4,6 +4,7 @@ target_sources(
   ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/safetensor.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/gguf.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/gguf_quants.cpp
 )
 
 MESSAGE(STATUS "Downloading json")
diff --git a/mlx/io/gguf.cpp b/mlx/io/gguf.cpp
index 8f3c6871f..f4047d1a0 100644
--- a/mlx/io/gguf.cpp
+++ b/mlx/io/gguf.cpp
@@ -1,17 +1,10 @@
-// Copyright © 2023 Apple Inc.
+// Copyright © 2023-2024 Apple Inc.
 
 #include <cstdint>
 #include <optional>
 #include <sstream>
 
-#include "mlx/io.h"
-#include "mlx/primitives.h"
-#include "mlx/transforms.h"
-#include "mlx/utils.h"
-
-extern "C" {
-#include <gguflib.h>
-}
+#include <mlx/io/gguf.h>
 
 namespace mlx::core {
 
@@ -52,7 +45,16 @@ std::optional<Dtype> gguf_type_to_dtype(const uint32_t& gguf_type) {
   }
 }
 
-std::pair<allocator::Buffer, Dtype> extract_tensor_data(gguf_tensor* tensor) {
+std::vector<int> get_shape(const gguf_tensor& tensor) {
+  std::vector<int> shape;
+  // The dimension order in GGML is the reverse of the order used in MLX.
+  for (int i = tensor.ndim - 1; i >= 0; i--) {
+    shape.push_back(tensor.dim[i]);
+  }
+  return shape;
+}
+
+std::tuple<allocator::Buffer, Dtype> extract_tensor_data(gguf_tensor* tensor) {
   std::optional<Dtype> equivalent_dtype = gguf_type_to_dtype(tensor->type);
   // If there's an equivalent type, we can simply copy.
   if (equivalent_dtype.has_value()) {
@@ -203,16 +205,27 @@ std::unordered_map<std::string, GGUFMetaData> load_metadata(gguf_ctx* ctx) {
 std::unordered_map<std::string, array> load_arrays(gguf_ctx* ctx) {
   std::unordered_map<std::string, array> array_map;
   gguf_tensor tensor;
-  while (gguf_get_tensor(ctx, &tensor)) {
-    std::vector<int> shape;
-    // The dimension order in GGML is the reverse of the order used in MLX.
-    for (int i = tensor.ndim - 1; i >= 0; i--) {
-      shape.push_back(tensor.dim[i]);
+
+  auto check_insert = [](auto inserted) {
+    if (!inserted.second) {
+      std::ostringstream msg;
+      msg << "[load_gguf] Duplicate parameter name " << inserted.first->first
+          << "; this can happen when loading quantized tensors.";
+      throw std::runtime_error(msg.str());
+    }
+  };
+
+  while (gguf_get_tensor(ctx, &tensor)) {
+    if (tensor.type == GGUF_TYPE_Q4_0 || tensor.type == GGUF_TYPE_Q4_1 ||
+        tensor.type == GGUF_TYPE_Q8_0) {
+      gguf_load_quantized(array_map, tensor);
+    } else {
+      std::string name = std::string(tensor.name, tensor.namelen);
+
+      const auto& [data, dtype] = extract_tensor_data(&tensor);
+      array loaded_array = array(data, get_shape(tensor), dtype);
+      check_insert(array_map.insert({name, loaded_array}));
     }
-    const auto& [data, dtype] = extract_tensor_data(&tensor);
-    array loaded_array = array(data, shape, dtype);
-    std::string name = std::string(tensor.name, tensor.namelen);
-    array_map.insert({name, loaded_array});
   }
   return array_map;
 }
diff --git a/mlx/io/gguf.h b/mlx/io/gguf.h
new file mode 100644
index 000000000..170fd6b0a
--- /dev/null
+++ b/mlx/io/gguf.h
@@ -0,0 +1,20 @@
+// Copyright © 2023-2024 Apple Inc.
+#pragma once
+
+#include "mlx/io.h"
+#include "mlx/primitives.h"
+#include "mlx/transforms.h"
+#include "mlx/utils.h"
+
+extern "C" {
+#include <gguflib.h>
+}
+
+namespace mlx::core {
+
+std::vector<int> get_shape(const gguf_tensor& tensor);
+void gguf_load_quantized(
+    std::unordered_map<std::string, array>& a,
+    const gguf_tensor& tensor);
+
+} // namespace mlx::core
diff --git a/mlx/io/gguf_quants.cpp b/mlx/io/gguf_quants.cpp
new file mode 100644
index 000000000..636648bc7
--- /dev/null
+++ b/mlx/io/gguf_quants.cpp
@@ -0,0 +1,158 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include <cstdint>
+#include <sstream>
+
+#include <mlx/io/gguf.h>
+
+namespace mlx::core {
+
+// Unpacks one block of 32 4-bit weights into 16 output bytes, two weights
+// per byte with the lower-indexed weight in the low bits.
+void unpack_32_4(uint8_t* data, int8_t* dst) {
+  // The destination buffer is uninitialized, so clear it before OR-ing
+  // nibbles into it.
+  for (int64_t j = 0; j < 16; ++j) {
+    dst[j] = 0;
+  }
+  // First 16 weights come from the lower bits of the data bytes.
+  for (int64_t j = 0; j < 16; ++j) {
+    uint8_t x = (data[j + 2] & 0x0F); // j+2 to skip the scale bytes.
+    if (j % 2 != 0) {
+      x <<= 4;
+    }
+    dst[j / 2] |= x;
+  }
+  // Last 16 weights are in the higher bits.
+  for (int64_t j = 0; j < 16; ++j) {
+    uint8_t x = (data[j + 2] >> 4);
+    if (j % 2 != 0) {
+      x <<= 4;
+    }
+    dst[8 + j / 2] |= x;
+  }
+}
+
+// Extracts (weights, scales, biases) from Q4_0 tensors.
+// Data layout is: |16 bit scale|32 x 4bit weights|.
+void extract_q4_0_data(
+    const gguf_tensor& tensor,
+    array& weights_arr,
+    array& scales_arr,
+    array& biases_arr) {
+  const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights
+  auto data = static_cast<uint8_t*>(tensor.weights_data);
+  auto weights = weights_arr.data<int8_t>();
+  auto scales = scales_arr.data<float16_t>();
+  auto biases = biases_arr.data<float16_t>();
+  for (int64_t i = 0; i < scales_arr.size(); i++) {
+    scales[i] = *((float16_t*)data);
+    biases[i] = -8 * scales[i];
+    unpack_32_4(data, weights);
+    weights += 16;
+    data += bytes_per_block;
+  }
+}
+
+// Extracts (weights, scales, biases) from Q4_1 tensors.
+// Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|.
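+// Each 4-bit value q dequantizes as w = scale * q + bias, where the bias is
+// the block's fp16 minimum. This matches MLX's affine quantization, so the
+// packed weights can be reused without dequantizing them.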
+void extract_q4_1_data(
+    const gguf_tensor& tensor,
+    array& weights_arr,
+    array& scales_arr,
+    array& biases_arr) {
+  const uint64_t bytes_per_block =
+      20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights
+  auto data = static_cast<uint8_t*>(tensor.weights_data);
+  auto weights = weights_arr.data<int8_t>();
+  auto scales = scales_arr.data<float16_t>();
+  auto biases = biases_arr.data<float16_t>();
+  for (int64_t i = 0; i < scales_arr.size(); i++) {
+    scales[i] = *((float16_t*)data);
+    biases[i] = *((float16_t*)(data) + 1);
+    // Skip the two extra bias bytes so the packed weights start at data + 4.
+    unpack_32_4(data + 2, weights);
+    weights += 16;
+    data += bytes_per_block;
+  }
+}
+
+// Extracts (weights, scales, biases) from Q8_0 tensors.
+// Data layout is: |16 bit scale|32 x 8bit weights|.
+void extract_q8_0_data(
+    const gguf_tensor& tensor,
+    array& weights_arr,
+    array& scales_arr,
+    array& biases_arr) {
+  const uint64_t weights_per_block = 32;
+  const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights
+  auto data = static_cast<uint8_t*>(tensor.weights_data);
+  auto weights = weights_arr.data<uint8_t>();
+  auto scales = scales_arr.data<float16_t>();
+  auto biases = biases_arr.data<float16_t>();
+  for (int64_t i = 0; i < scales_arr.size(); i++) {
+    uint8_t* block_data = data + i * bytes_per_block;
+    scales[i] = *((float16_t*)block_data);
+    biases[i] = -128 * scales[i];
+    for (int64_t j = 0; j < weights_per_block; ++j) {
+      uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
+      // The original data is int8; flipping the sign bit maps it to the
+      // unsigned range [0, 255], and the -128 * scale bias compensates.
+      x ^= 1 << 7;
+      weights[i * weights_per_block + j] = x;
+    }
+  }
+}
+
+void gguf_load_quantized(
+    std::unordered_map<std::string, array>& a,
+    const gguf_tensor& tensor) {
+  uint64_t weights_per_byte;
+  if (tensor.type == GGUF_TYPE_Q4_0 || tensor.type == GGUF_TYPE_Q4_1) {
+    weights_per_byte = 2;
+  } else { // tensor.type == GGUF_TYPE_Q8_0
+    weights_per_byte = 1;
+  }
+
+  std::string name = std::string(tensor.name, tensor.namelen);
+  std::vector<int> shape = get_shape(tensor);
+  const uint64_t weights_per_block = 32;
+  if (shape[shape.size() - 1] % weights_per_block != 0) {
+    std::ostringstream msg;
+    msg << "[load_gguf] tensor " << name
+        << " has incompatible last dim shape: " << shape[shape.size() - 1];
+    throw std::runtime_error(msg.str());
+  }
+
+  // Pack the 4- or 8-bit weights into 32-bit words, as MLX's quantized ops
+  // expect: 8 weights per uint32 for 4-bit types, 4 for 8-bit.
+  std::vector<int> weights_shape = shape;
+  weights_shape.back() /= (weights_per_byte * 4);
+
+  array weights(std::move(weights_shape), uint32, nullptr, {});
+  weights.set_data(allocator::malloc(weights.nbytes()));
+
+  // There is one scale and one bias per block of 32 weights.
+  shape[shape.size() - 1] = shape[shape.size() - 1] / weights_per_block;
+  array scales(shape, float16, nullptr, {});
+  array biases(std::move(shape), float16, nullptr, {});
+  scales.set_data(allocator::malloc(scales.nbytes()));
+  biases.set_data(allocator::malloc(biases.nbytes()));
+
+  if (tensor.type == GGUF_TYPE_Q4_0) {
+    extract_q4_0_data(tensor, weights, scales, biases);
+  } else if (tensor.type == GGUF_TYPE_Q4_1) {
+    extract_q4_1_data(tensor, weights, scales, biases);
+  } else if (tensor.type == GGUF_TYPE_Q8_0) {
+    extract_q8_0_data(tensor, weights, scales, biases);
+  }
+
+  auto check_insert = [](auto inserted) {
+    if (!inserted.second) {
+      std::ostringstream msg;
+      msg << "[load_gguf] Duplicate parameter name " << inserted.first->first
+          << "; this can happen when loading quantized tensors.";
+      throw std::runtime_error(msg.str());
+    }
+  };
+
+  const std::string weight_suffix = ".weight";
+  const std::string name_prefix =
+      name.substr(0, name.length() - weight_suffix.length());
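+  // Register the packed weights under the original tensor name and the
+  // per-block scales and biases under "<prefix>.scales" / "<prefix>.biases",
+  // the parameter names used by MLX's quantized layers.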
+  check_insert(a.insert({name, weights}));
+  check_insert(a.insert({name_prefix + ".scales", scales}));
+  check_insert(a.insert({name_prefix + ".biases", biases}));
+}
+
+} // namespace mlx::core
diff --git a/tests/metal_tests.cpp b/tests/metal_tests.cpp
index 1c748268e..ff4e3bb0f 100644
--- a/tests/metal_tests.cpp
+++ b/tests/metal_tests.cpp
@@ -500,7 +500,6 @@ TEST_CASE("test metal enable/disable cache") {
     auto buf = a.malloc(size, false);
     auto buf_ptr = static_cast<MTL::Buffer*>(buf.ptr());
     unsigned char first_byte = *reinterpret_cast<unsigned char*>(buf_ptr);
-    printf("first byte: %d\n", first_byte);
 
     // Release a
     a.free(buf);
@@ -508,7 +507,6 @@
     // If released successfully, the first byte should be different from the
     // first byte before release
     unsigned char new_first_byte = *reinterpret_cast<unsigned char*>(buf_ptr);
-    printf("new first byte: %d\n", new_first_byte);
 
     CHECK_NE(new_first_byte, first_byte);
   }
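
For reference, the Q4_0 decode that unpack_32_4 and extract_q4_0_data implement
can be reproduced with the standalone sketch below. This is an illustration
only, not MLX code: decode_q4_0_block and the synthetic block in main are made
up for the example, and the fp16 scale is passed in as a plain float instead of
being read from the block's first two bytes.

// q4_0_example.cpp -- standalone illustration of the Q4_0 block format.
#include <cstdint>
#include <cstdio>

// Decode one Q4_0 block: 2 bytes of fp16 scale followed by 16 bytes holding
// 32 4-bit weights. The low nibbles of the 16 bytes are weights 0..15, the
// high nibbles are weights 16..31, and each weight dequantizes to
// w = scale * q + bias with bias = -8 * scale, the same values the loader
// stores in the packed weights, .scales, and .biases arrays.
void decode_q4_0_block(const uint8_t* block, float scale, float* out) {
  const uint8_t* qs = block + 2; // skip the fp16 scale bytes
  float bias = -8.0f * scale;
  for (int j = 0; j < 16; ++j) {
    out[j] = scale * static_cast<float>(qs[j] & 0x0F) + bias;
    out[j + 16] = scale * static_cast<float>(qs[j] >> 4) + bias;
  }
}

int main() {
  // Synthetic block: the two scale bytes are unused here (scale is passed
  // directly); byte 2 + j packs the value j into both nibbles, so weights
  // j and 16 + j both decode to scale * j - 8 * scale.
  uint8_t block[18] = {0, 0};
  for (int j = 0; j < 16; ++j) {
    block[j + 2] = static_cast<uint8_t>(j | (j << 4));
  }
  float w[32];
  decode_q4_0_block(block, 0.5f, w);
  for (int j = 0; j < 32; ++j) {
    std::printf("w[%2d] = %g\n", j, w[j]);
  }
  return 0;
}

With scale 0.5 this prints w = 0.5 * q - 4 for each 4-bit value q, which is
what MLX's dequantization recovers from the packed weights, scales, and biases
that gguf_load_quantized builds.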