From 4fe2fa2a6404725f6cab743d74a1c71d85f45844 Mon Sep 17 00:00:00 2001
From: Juarez Bochi
Date: Tue, 23 Jan 2024 18:43:57 -0500
Subject: [PATCH] GGUF: Avoid dequantization when format is compatible (#426)

* GGUF: Don't dequantize q4_1
* Fix weight order. First in low bits
* Add unpacking for q4_0
* Don't dequantize q8_0
* rebase quants and split file
* don't quantize every weight
* reapply patch
* error handling

---------

Co-authored-by: Awni Hannun
---
 mlx/io/CMakeLists.txt  |   1 +
 mlx/io/gguf.cpp        |  51 ++++++++-----
 mlx/io/gguf.h          |  20 ++++++
 mlx/io/gguf_quants.cpp | 158 +++++++++++++++++++++++++++++++++++++++++
 tests/metal_tests.cpp  |   2 -
 5 files changed, 211 insertions(+), 21 deletions(-)
 create mode 100644 mlx/io/gguf.h
 create mode 100644 mlx/io/gguf_quants.cpp

diff --git a/mlx/io/CMakeLists.txt b/mlx/io/CMakeLists.txt
index 8e80cc4c5..9b799dfa3 100644
--- a/mlx/io/CMakeLists.txt
+++ b/mlx/io/CMakeLists.txt
@@ -4,6 +4,7 @@ target_sources(
   ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/safetensor.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/gguf.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/gguf_quants.cpp
 )
 
 MESSAGE(STATUS "Downloading json")
diff --git a/mlx/io/gguf.cpp b/mlx/io/gguf.cpp
index 8f3c6871f..f4047d1a0 100644
--- a/mlx/io/gguf.cpp
+++ b/mlx/io/gguf.cpp
@@ -1,17 +1,10 @@
-// Copyright © 2023 Apple Inc.
+// Copyright © 2023-2024 Apple Inc.
 
 #include <cstdint>
 #include <optional>
 #include <sstream>
 
-#include "mlx/io.h"
-#include "mlx/primitives.h"
-#include "mlx/transforms.h"
-#include "mlx/utils.h"
-
-extern "C" {
-#include <gguflib.h>
-}
+#include <mlx/io/gguf.h>
 
 namespace mlx::core {
 
@@ -52,7 +45,16 @@ std::optional<Dtype> gguf_type_to_dtype(const uint32_t& gguf_type) {
   }
 }
 
-std::pair<allocator::Buffer, Dtype> extract_tensor_data(gguf_tensor* tensor) {
+std::vector<int> get_shape(const gguf_tensor& tensor) {
+  std::vector<int> shape;
+  // The dimension order in GGML is the reverse of the order used in MLX.
+  for (int i = tensor.ndim - 1; i >= 0; i--) {
+    shape.push_back(tensor.dim[i]);
+  }
+  return shape;
+}
+
+std::tuple<allocator::Buffer, Dtype> extract_tensor_data(gguf_tensor* tensor) {
   std::optional<Dtype> equivalent_dtype = gguf_type_to_dtype(tensor->type);
   // If there's an equivalent type, we can simply copy.
   if (equivalent_dtype.has_value()) {
@@ -203,16 +205,27 @@ std::unordered_map<std::string, GGUFMetaData> load_metadata(gguf_ctx* ctx) {
 std::unordered_map<std::string, array> load_arrays(gguf_ctx* ctx) {
   std::unordered_map<std::string, array> array_map;
   gguf_tensor tensor;
-  while (gguf_get_tensor(ctx, &tensor)) {
-    std::vector<int> shape;
-    // The dimension order in GGML is the reverse of the order used in MLX.
-    for (int i = tensor.ndim - 1; i >= 0; i--) {
-      shape.push_back(tensor.dim[i]);
+
+  auto check_insert = [](auto inserted) {
+    if (!inserted.second) {
+      std::ostringstream msg;
+      msg << "[load_gguf] Duplicate parameter name " << inserted.first->first
+          << "; this can happen when loading quantized tensors.";
+      throw std::runtime_error(msg.str());
+    }
+  };
+
+  while (gguf_get_tensor(ctx, &tensor)) {
+    if (tensor.type == GGUF_TYPE_Q4_0 || tensor.type == GGUF_TYPE_Q4_1 ||
+        tensor.type == GGUF_TYPE_Q8_0) {
+      gguf_load_quantized(array_map, tensor);
+    } else {
+      std::string name = std::string(tensor.name, tensor.namelen);
+
+      const auto& [data, dtype] = extract_tensor_data(&tensor);
+      array loaded_array = array(data, get_shape(tensor), dtype);
+      check_insert(array_map.insert({name, loaded_array}));
     }
-    const auto& [data, dtype] = extract_tensor_data(&tensor);
-    array loaded_array = array(data, shape, dtype);
-    std::string name = std::string(tensor.name, tensor.namelen);
-    array_map.insert({name, loaded_array});
   }
   return array_map;
 }
diff --git a/mlx/io/gguf.h b/mlx/io/gguf.h
new file mode 100644
index 000000000..170fd6b0a
--- /dev/null
+++ b/mlx/io/gguf.h
@@ -0,0 +1,20 @@
+// Copyright © 2023-2024 Apple Inc.
+#pragma once
+
+#include "mlx/io.h"
+#include "mlx/primitives.h"
+#include "mlx/transforms.h"
+#include "mlx/utils.h"
+
+extern "C" {
+#include <gguflib.h>
+}
+
+namespace mlx::core {
+
+std::vector<int> get_shape(const gguf_tensor& tensor);
+void gguf_load_quantized(
+    std::unordered_map<std::string, array>& a,
+    const gguf_tensor& tensor);
+
+} // namespace mlx::core
diff --git a/mlx/io/gguf_quants.cpp b/mlx/io/gguf_quants.cpp
new file mode 100644
index 000000000..636648bc7
--- /dev/null
+++ b/mlx/io/gguf_quants.cpp
@@ -0,0 +1,158 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include <cstdint>
+#include <sstream>
+
+#include <mlx/io/gguf.h>
+
+namespace mlx::core {
+
+// Unpacks one block of 32 4-bit weights into 16 output bytes, two weights
+// per byte with the lower-indexed weight in the low bits.
+void unpack_32_4(uint8_t* data, int8_t* dst) {
+  // The destination buffer is uninitialized, so clear it before OR-ing
+  // nibbles into it.
+  for (int64_t j = 0; j < 16; ++j) {
+    dst[j] = 0;
+  }
+  // First 16 weights come from the lower bits of the data bytes.
+  for (int64_t j = 0; j < 16; ++j) {
+    uint8_t x = (data[j + 2] & 0x0F); // j+2 to skip the scale bytes.
+    if (j % 2 != 0) {
+      x <<= 4;
+    }
+    dst[j / 2] |= x;
+  }
+  // Last 16 weights are in the higher bits.
+  for (int64_t j = 0; j < 16; ++j) {
+    uint8_t x = (data[j + 2] >> 4);
+    if (j % 2 != 0) {
+      x <<= 4;
+    }
+    dst[8 + j / 2] |= x;
+  }
+}
+
+// Extracts (weights, scales, biases) from Q4_0 tensors.
+// Data layout is: |16 bit scale|32 x 4bit weights|.
+void extract_q4_0_data(
+    const gguf_tensor& tensor,
+    array& weights_arr,
+    array& scales_arr,
+    array& biases_arr) {
+  const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights
+  auto data = static_cast<uint8_t*>(tensor.weights_data);
+  auto weights = weights_arr.data<int8_t>();
+  auto scales = scales_arr.data<float16_t>();
+  auto biases = biases_arr.data<float16_t>();
+  for (int64_t i = 0; i < scales_arr.size(); i++) {
+    scales[i] = *((float16_t*)data);
+    biases[i] = -8 * scales[i];
+    unpack_32_4(data, weights);
+    weights += 16;
+    data += bytes_per_block;
+  }
+}
+
+// Extracts (weights, scales, biases) from Q4_1 tensors.
+// Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|.
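+// Each 4-bit value q dequantizes as w = scale * q + bias, where the bias is
+// the block's fp16 minimum. This matches MLX's affine quantization, so the
+// packed weights can be reused without dequantizing them.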
+void extract_q4_1_data(
+    const gguf_tensor& tensor,
+    array& weights_arr,
+    array& scales_arr,
+    array& biases_arr) {
+  const uint64_t bytes_per_block =
+      20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights
+  auto data = static_cast<uint8_t*>(tensor.weights_data);
+  auto weights = weights_arr.data<int8_t>();
+  auto scales = scales_arr.data<float16_t>();
+  auto biases = biases_arr.data<float16_t>();
+  for (int64_t i = 0; i < scales_arr.size(); i++) {
+    scales[i] = *((float16_t*)data);
+    biases[i] = *((float16_t*)(data) + 1);
+    // Skip the two extra bias bytes so the packed weights start at data + 4.
+    unpack_32_4(data + 2, weights);
+    weights += 16;
+    data += bytes_per_block;
+  }
+}
+
+// Extracts (weights, scales, biases) from Q8_0 tensors.
+// Data layout is: |16 bit scale|32 x 8bit weights|.
+void extract_q8_0_data(
+    const gguf_tensor& tensor,
+    array& weights_arr,
+    array& scales_arr,
+    array& biases_arr) {
+  const uint64_t weights_per_block = 32;
+  const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights
+  auto data = static_cast<uint8_t*>(tensor.weights_data);
+  auto weights = weights_arr.data<uint8_t>();
+  auto scales = scales_arr.data<float16_t>();
+  auto biases = biases_arr.data<float16_t>();
+  for (int64_t i = 0; i < scales_arr.size(); i++) {
+    uint8_t* block_data = data + i * bytes_per_block;
+    scales[i] = *((float16_t*)block_data);
+    biases[i] = -128 * scales[i];
+    for (int64_t j = 0; j < weights_per_block; ++j) {
+      uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
+      // The original data is int8; flipping the sign bit maps it to the
+      // unsigned range [0, 255], and the -128 * scale bias compensates.
+      x ^= 1 << 7;
+      weights[i * weights_per_block + j] = x;
+    }
+  }
+}
+
+void gguf_load_quantized(
+    std::unordered_map<std::string, array>& a,
+    const gguf_tensor& tensor) {
+  uint64_t weights_per_byte;
+  if (tensor.type == GGUF_TYPE_Q4_0 || tensor.type == GGUF_TYPE_Q4_1) {
+    weights_per_byte = 2;
+  } else { // tensor.type == GGUF_TYPE_Q8_0
+    weights_per_byte = 1;
+  }
+
+  std::string name = std::string(tensor.name, tensor.namelen);
+  std::vector<int> shape = get_shape(tensor);
+  const uint64_t weights_per_block = 32;
+  if (shape[shape.size() - 1] % weights_per_block != 0) {
+    std::ostringstream msg;
+    msg << "[load_gguf] tensor " << name
+        << " has incompatible last dim shape: " << shape[shape.size() - 1];
+    throw std::runtime_error(msg.str());
+  }
+
+  // Pack the 4- or 8-bit weights into 32-bit words, as MLX's quantized ops
+  // expect: 8 weights per uint32 for 4-bit types, 4 for 8-bit.
+  std::vector<int> weights_shape = shape;
+  weights_shape.back() /= (weights_per_byte * 4);
+
+  array weights(std::move(weights_shape), uint32, nullptr, {});
+  weights.set_data(allocator::malloc(weights.nbytes()));
+
+  // There is one scale and one bias per block of 32 weights.
+  shape[shape.size() - 1] = shape[shape.size() - 1] / weights_per_block;
+  array scales(shape, float16, nullptr, {});
+  array biases(std::move(shape), float16, nullptr, {});
+  scales.set_data(allocator::malloc(scales.nbytes()));
+  biases.set_data(allocator::malloc(biases.nbytes()));
+
+  if (tensor.type == GGUF_TYPE_Q4_0) {
+    extract_q4_0_data(tensor, weights, scales, biases);
+  } else if (tensor.type == GGUF_TYPE_Q4_1) {
+    extract_q4_1_data(tensor, weights, scales, biases);
+  } else if (tensor.type == GGUF_TYPE_Q8_0) {
+    extract_q8_0_data(tensor, weights, scales, biases);
+  }
+
+  auto check_insert = [](auto inserted) {
+    if (!inserted.second) {
+      std::ostringstream msg;
+      msg << "[load_gguf] Duplicate parameter name " << inserted.first->first
+          << "; this can happen when loading quantized tensors.";
+      throw std::runtime_error(msg.str());
+    }
+  };
+
+  const std::string weight_suffix = ".weight";
+  const std::string name_prefix =
+      name.substr(0, name.length() - weight_suffix.length());
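+  // Register the packed weights under the original tensor name and the
+  // per-block scales and biases under "<prefix>.scales" / "<prefix>.biases",
+  // the parameter names used by MLX's quantized layers.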
+  check_insert(a.insert({name, weights}));
+  check_insert(a.insert({name_prefix + ".scales", scales}));
+  check_insert(a.insert({name_prefix + ".biases", biases}));
+}
+
+} // namespace mlx::core
diff --git a/tests/metal_tests.cpp b/tests/metal_tests.cpp
index 1c748268e..ff4e3bb0f 100644
--- a/tests/metal_tests.cpp
+++ b/tests/metal_tests.cpp
@@ -500,7 +500,6 @@ TEST_CASE("test metal enable/disable cache") {
     auto buf = a.malloc(size, false);
     auto buf_ptr = static_cast<MTL::Buffer*>(buf.ptr());
     unsigned char first_byte = *reinterpret_cast<unsigned char*>(buf_ptr);
-    printf("first byte: %d\n", first_byte);
 
     // Release a
     a.free(buf);
@@ -508,7 +507,6 @@
     // If released successfully, the first byte should be different from the
     // first byte before release
     unsigned char new_first_byte = *reinterpret_cast<unsigned char*>(buf_ptr);
-    printf("new first byte: %d\n", new_first_byte);
 
     CHECK_NE(new_first_byte, first_byte);
   }
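
For reference, the Q4_0 decode that unpack_32_4 and extract_q4_0_data implement
can be reproduced with the standalone sketch below. This is an illustration
only, not MLX code: decode_q4_0_block and the synthetic block in main are made
up for the example, and the fp16 scale is passed in as a plain float instead of
being read from the block's first two bytes.

// q4_0_example.cpp -- standalone illustration of the Q4_0 block format.
#include <cstdint>
#include <cstdio>

// Decode one Q4_0 block: 2 bytes of fp16 scale followed by 16 bytes holding
// 32 4-bit weights. The low nibbles of the 16 bytes are weights 0..15, the
// high nibbles are weights 16..31, and each weight dequantizes to
// w = scale * q + bias with bias = -8 * scale, the same values the loader
// stores in the packed weights, .scales, and .biases arrays.
void decode_q4_0_block(const uint8_t* block, float scale, float* out) {
  const uint8_t* qs = block + 2; // skip the fp16 scale bytes
  float bias = -8.0f * scale;
  for (int j = 0; j < 16; ++j) {
    out[j] = scale * static_cast<float>(qs[j] & 0x0F) + bias;
    out[j + 16] = scale * static_cast<float>(qs[j] >> 4) + bias;
  }
}

int main() {
  // Synthetic block: the two scale bytes are unused here (scale is passed
  // directly); byte 2 + j packs the value j into both nibbles, so weights
  // j and 16 + j both decode to scale * j - 8 * scale.
  uint8_t block[18] = {0, 0};
  for (int j = 0; j < 16; ++j) {
    block[j + 2] = static_cast<uint8_t>(j | (j << 4));
  }
  float w[32];
  decode_q4_0_block(block, 0.5f, w);
  for (int j = 0; j < 32; ++j) {
    std::printf("w[%2d] = %g\n", j, w[j]);
  }
  return 0;
}

With scale 0.5 this prints w = 0.5 * q - 4 for each 4-bit value q, which is
what MLX's dequantization recovers from the packed weights, scales, and biases
that gguf_load_quantized builds.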