mirror of
https://github.com/ml-explore/mlx.git
synced 2025-06-24 17:31:16 +08:00
GGUF: Avoid dequantization when format is compatible (#426)
* GGUF: Don't dequantize q4_1
* Fix weight order. First in low bits
* Add unpacking for q4_0
* Don't dequantize q8_0
* rebase quants and split file
* don't quantize every weight
* reapply patch
* error handling

---------

Co-authored-by: Awni Hannun <awni@apple.com>
This commit is contained in:
parent 37fc9db82c
commit 4fe2fa2a64
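This commit loads GGUF q4_0, q4_1, and q8_0 tensors directly as MLX quantized weights (packed weights plus per-block scales and biases) instead of expanding every weight to floats. The sketch below is a standalone illustration, not part of the commit, of why that is lossless for q4_0: each 18-byte block holds a float16 scale followed by 32 packed 4-bit weights (first 16 in the low nibbles), and the dequantized value is scale * q - 8 * scale, so keeping the packed nibbles with bias = -8 * scale reproduces the same numbers. (q4_1 stores an explicit bias, and q8_0 uses bias = -128 * scale after flipping each byte's sign bit.) A plain float scale is used here for brevity where GGUF stores float16.

// Standalone sketch (not part of the commit): q4_0 block layout and the
// scale/bias equivalence used by the loader.
#include <cassert>
#include <cstdint>

struct Q4_0Block {
  float scale;        // stored as float16 in GGUF; float here for simplicity
  uint8_t packed[16]; // 32 x 4-bit weights, first 16 in the low nibbles
};

// Eager dequantization (what the loader previously did for every weight).
void dequantize_block(const Q4_0Block& b, float out[32]) {
  for (int j = 0; j < 32; ++j) {
    uint8_t q = (j < 16) ? (b.packed[j] & 0x0F) : (b.packed[j - 16] >> 4);
    out[j] = b.scale * q - 8.0f * b.scale;
  }
}

int main() {
  Q4_0Block b{0.25f, {}};
  for (int j = 0; j < 16; ++j) {
    b.packed[j] = static_cast<uint8_t>(j | ((15 - j) << 4));
  }

  float dense[32];
  dequantize_block(b, dense);

  // Keeping the packed nibbles plus (scale, bias = -8 * scale), as this
  // commit does, reproduces exactly the same values on demand.
  float bias = -8.0f * b.scale;
  for (int j = 0; j < 32; ++j) {
    uint8_t q = (j < 16) ? (b.packed[j] & 0x0F) : (b.packed[j - 16] >> 4);
    assert(dense[j] == b.scale * q + bias);
  }
  return 0;
}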
@@ -4,6 +4,7 @@ target_sources(
   ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/safetensor.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/gguf.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/gguf_quants.cpp
 )
 
 MESSAGE(STATUS "Downloading json")
@@ -1,17 +1,10 @@
-// Copyright © 2023 Apple Inc.
+// Copyright © 2023-2024 Apple Inc.
 
 #include <cstdint>
 #include <cstring>
 #include <numeric>
 
-#include "mlx/io.h"
-#include "mlx/primitives.h"
-#include "mlx/transforms.h"
-#include "mlx/utils.h"
-
-extern "C" {
-#include <gguflib.h>
-}
+#include <mlx/io/gguf.h>
 
 namespace mlx::core {
 
@@ -52,7 +45,16 @@ std::optional<Dtype> gguf_type_to_dtype(const uint32_t& gguf_type) {
   }
 }
 
-std::pair<allocator::Buffer, Dtype> extract_tensor_data(gguf_tensor* tensor) {
+std::vector<int> get_shape(const gguf_tensor& tensor) {
+  std::vector<int> shape;
+  // The dimension order in GGML is the reverse of the order used in MLX.
+  for (int i = tensor.ndim - 1; i >= 0; i--) {
+    shape.push_back(tensor.dim[i]);
+  }
+  return shape;
+}
+
+std::tuple<allocator::Buffer, Dtype> extract_tensor_data(gguf_tensor* tensor) {
   std::optional<Dtype> equivalent_dtype = gguf_type_to_dtype(tensor->type);
   // If there's an equivalent type, we can simply copy.
   if (equivalent_dtype.has_value()) {
@@ -203,16 +205,27 @@ std::unordered_map<std::string, MetaData> load_metadata(gguf_ctx* ctx) {
 std::unordered_map<std::string, array> load_arrays(gguf_ctx* ctx) {
   std::unordered_map<std::string, array> array_map;
   gguf_tensor tensor;
-  while (gguf_get_tensor(ctx, &tensor)) {
-    std::vector<int> shape;
-    // The dimension order in GGML is the reverse of the order used in MLX.
-    for (int i = tensor.ndim - 1; i >= 0; i--) {
-      shape.push_back(tensor.dim[i]);
-    }
-    const auto& [data, dtype] = extract_tensor_data(&tensor);
-    array loaded_array = array(data, shape, dtype);
-    std::string name = std::string(tensor.name, tensor.namelen);
-    array_map.insert({name, loaded_array});
+
+  auto check_insert = [](auto inserted) {
+    if (!inserted.second) {
+      std::ostringstream msg;
+      msg << "[load_gguf] Duplicate parameter name " << inserted.first->second
+          << " this can happen when loading quantized tensors.";
+      throw std::runtime_error(msg.str());
+    }
+  };
+
+  while (gguf_get_tensor(ctx, &tensor)) {
+    if (tensor.type == GGUF_TYPE_Q4_0 || tensor.type == GGUF_TYPE_Q4_1 ||
+        tensor.type == GGUF_TYPE_Q8_0) {
+      gguf_load_quantized(array_map, tensor);
+    } else {
+      std::string name = std::string(tensor.name, tensor.namelen);
+
+      const auto& [data, dtype] = extract_tensor_data(&tensor);
+      array loaded_array = array(data, get_shape(tensor), dtype);
+      array_map.insert({name, loaded_array});
+    }
   }
   return array_map;
 }
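The check_insert helper relies on std::unordered_map::insert leaving an existing entry untouched and reporting the collision through the bool of the returned pair; that is how a duplicate parameter name (which, per the error message, can happen when quantized tensors add extra entries) is surfaced as an error instead of silently dropping a weight. A minimal standalone sketch of that standard-library behavior, with hypothetical key names, not part of the commit:

#include <cassert>
#include <string>
#include <unordered_map>

int main() {
  std::unordered_map<std::string, int> params;
  auto first = params.insert({"layer.0.scales", 1});
  assert(first.second);               // inserted: the key was new
  auto second = params.insert({"layer.0.scales", 2});
  assert(!second.second);             // rejected: the key already exists...
  assert(second.first->second == 1);  // ...and the old value is untouched
  return 0;
}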
mlx/io/gguf.h (new file, 20 lines)
@@ -0,0 +1,20 @@
+// Copyright © 2023-2024 Apple Inc.
+#pragma once
+
+#include "mlx/io.h"
+#include "mlx/primitives.h"
+#include "mlx/transforms.h"
+#include "mlx/utils.h"
+
+extern "C" {
+#include <gguflib.h>
+}
+
+namespace mlx::core {
+
+std::vector<int> get_shape(const gguf_tensor& tensor);
+void gguf_load_quantized(
+    std::unordered_map<std::string, array>& a,
+    const gguf_tensor& tensor);
+
+} // namespace mlx::core
mlx/io/gguf_quants.cpp (new file, 158 lines)
@@ -0,0 +1,158 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include <cstdint>
+#include <cstring>
+
+#include <mlx/io/gguf.h>
+
+namespace mlx::core {
+
+void unpack_32_4(uint8_t* data, int8_t* dst) {
+  for (int64_t j = 0; j < 16; ++j) {
+    uint8_t x = (data[j + 2] & 0x0F); // j+2 to skip scale bytes.
+    if (j % 2 != 0) {
+      x <<= 4;
+    }
+    dst[j / 2] += x;
+  }
+  // Last 16 weights are in the higher bits
+  for (int64_t j = 0; j < 16; ++j) {
+    uint8_t x = (data[j + 2] >> 4);
+    if (j % 2 != 0) {
+      x <<= 4;
+    }
+    dst[8 + j / 2] += x;
+  }
+}
+
+// Extracts (weight, scales, biases) from Q4_0 tensors.
+// Data layout is: |16 bit scale|32 x 4bit weights|.
+void extract_q4_0_data(
+    const gguf_tensor& tensor,
+    array& weights_arr,
+    array& scales_arr,
+    array& biases_arr) {
+  const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights
+  auto data = static_cast<uint8_t*>(tensor.weights_data);
+  auto weights = weights_arr.data<int8_t>();
+  auto scales = scales_arr.data<float16_t>();
+  auto biases = biases_arr.data<float16_t>();
+  for (int64_t i = 0; i < scales_arr.size(); i++) {
+    scales[i] = *((float16_t*)data);
+    biases[i] = -8 * scales[i];
+    unpack_32_4(data, weights);
+    weights += 16;
+    data += bytes_per_block;
+  }
+}
+
+// Extracts (weight, scales, biases) from Q4_1 tensors.
+// Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|.
+void extract_q4_1_data(
+    const gguf_tensor& tensor,
+    array& weights_arr,
+    array& scales_arr,
+    array& biases_arr) {
+  const uint64_t bytes_per_block =
+      20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights
+  auto data = static_cast<uint8_t*>(tensor.weights_data);
+  auto weights = weights_arr.data<int8_t>();
+  auto scales = scales_arr.data<float16_t>();
+  auto biases = biases_arr.data<float16_t>();
+  for (int64_t i = 0; i < scales_arr.size(); i++) {
+    scales[i] = *((float16_t*)data);
+    biases[i] = *((float16_t*)(data) + 1);
+    unpack_32_4(data, weights);
+    weights += 16;
+    data += bytes_per_block;
+  }
+}
+
+// Extracts (weight, scales, biases) from Q8_0 tensors.
+// Data layout is: |16 bit scale|32 x 8bit weights|.
+void extract_q8_0_data(
+    const gguf_tensor& tensor,
+    array& weights_arr,
+    array& scales_arr,
+    array& biases_arr) {
+  const uint64_t weights_per_block = 32;
+  const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights
+  auto data = static_cast<uint8_t*>(tensor.weights_data);
+  auto weights = weights_arr.data<int8_t>();
+  auto scales = scales_arr.data<float16_t>();
+  auto biases = biases_arr.data<float16_t>();
+  for (int64_t i = 0; i < scales_arr.size(); i++) {
+    uint8_t* block_data = data + i * bytes_per_block;
+    scales[i] = *((float16_t*)block_data);
+    biases[i] = -128 * scales[i];
+    for (int64_t j = 0; j < weights_per_block; ++j) {
+      uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
+      // Original data is in int8_t, so we add a bias of -128 and invert the
+      // first bit.
+      x ^= 1 << 7;
+      weights[i * weights_per_block + j] = x;
+    }
+  }
+}
+
+void gguf_load_quantized(
+    std::unordered_map<std::string, array>& a,
+    const gguf_tensor& tensor) {
+  uint64_t weights_per_byte;
+  if (tensor.type == GGUF_TYPE_Q4_0 || tensor.type == GGUF_TYPE_Q4_1) {
+    weights_per_byte = 2;
+  } else { // tensor.type == GGUF_TYPE_Q8_0
+    weights_per_byte = 1;
+  }
+
+  std::string name = std::string(tensor.name, tensor.namelen);
+  std::vector<int> shape = get_shape(tensor);
+  const uint64_t weights_per_block = 32;
+  if (shape[shape.size() - 1] % weights_per_block != 0) {
+    std::ostringstream msg;
+    msg << "[load_gguf] tensor " << name
+        << " has incompatible last dim shape: " << shape[shape.size() - 1];
+    throw std::runtime_error(msg.str());
+  }
+  const uint64_t num_blocks = tensor.num_weights / weights_per_block;
+
+  std::vector<int> weights_shape = shape;
+  weights_shape.back() /= (weights_per_byte * 4);
+
+  array weights(std::move(weights_shape), uint32, nullptr, {});
+  weights.set_data(allocator::malloc(weights.nbytes()));
+
+  // For scales and bias
+  shape[shape.size() - 1] = shape[shape.size() - 1] / weights_per_block;
+  array scales(shape, float16, nullptr, {});
+  array biases(std::move(shape), float16, nullptr, {});
+  scales.set_data(allocator::malloc(scales.nbytes()));
+  biases.set_data(allocator::malloc(biases.nbytes()));
+
+  if (tensor.type == GGUF_TYPE_Q4_0) {
+    extract_q4_0_data(tensor, weights, scales, biases);
+  } else if (tensor.type == GGUF_TYPE_Q4_1) {
+    extract_q4_1_data(tensor, weights, scales, biases);
+  } else if (tensor.type == GGUF_TYPE_Q8_0) {
+    extract_q8_0_data(tensor, weights, scales, biases);
+  }
+
+  a.insert({name, weights});
+
+  auto check_insert = [](auto inserted) {
+    if (!inserted.second) {
+      std::ostringstream msg;
+      msg << "[load_gguf] Duplicate parameter name " << inserted.first->second
+          << " this can happen when loading quantized tensors.";
+      throw std::runtime_error(msg.str());
+    }
+  };
+
+  const std::string weight_suffix = ".weight";
+  const std::string name_prefix =
+      name.substr(0, name.length() - weight_suffix.length());
+  check_insert(a.insert({name_prefix + ".scales", scales}));
+  check_insert(a.insert({name_prefix + ".biases", biases}));
+}
+
+} // namespace mlx::core
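For each quantized tensor, gguf_load_quantized keeps the packed weights under the tensor's original name (which ends in ".weight") and adds ".scales" and ".biases" arrays under the same prefix, with 32 weights per scale/bias group and, per the commit message, the first weight of each uint32 in the low bits. The sketch below is a hypothetical standalone consumer of that layout for a 4-bit row; it is not part of the commit, and it uses float scales and biases for brevity where the real arrays hold float16.

// Hypothetical sketch: expand one 4-bit row stored as packed uint32 words
// (8 nibbles per word, first weight in the low bits) with one (scale, bias)
// pair per group of 32 weights.
#include <cstdint>
#include <vector>

std::vector<float> dequantize_row_q4(
    const std::vector<uint32_t>& packed, // row_len / 8 words
    const std::vector<float>& scales,    // row_len / 32 entries
    const std::vector<float>& biases) {  // row_len / 32 entries
  const int group_size = 32;             // weights per (scale, bias) pair
  std::vector<float> out;
  out.reserve(packed.size() * 8);
  for (size_t w = 0; w < packed.size(); ++w) {
    uint32_t word = packed[w];
    for (int n = 0; n < 8; ++n) {              // 8 nibbles per uint32
      int idx = static_cast<int>(w) * 8 + n;   // flat weight index in the row
      float q = static_cast<float>((word >> (4 * n)) & 0xF);
      out.push_back(scales[idx / group_size] * q + biases[idx / group_size]);
    }
  }
  return out;
}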
@@ -500,7 +500,6 @@ TEST_CASE("test metal enable/disable cache") {
     auto buf = a.malloc(size, false);
     auto buf_ptr = static_cast<MTL::Buffer*>(buf.ptr());
     unsigned char first_byte = *reinterpret_cast<unsigned char*>(buf_ptr);
-    printf("first byte: %d\n", first_byte);
 
     // Release a
     a.free(buf);
@@ -508,7 +507,6 @@ TEST_CASE("test metal enable/disable cache") {
     // If release successfully, the first byte should be different from the
     // first byte before release
     unsigned char new_first_byte = *reinterpret_cast<unsigned char*>(buf_ptr);
-    printf("new first byte: %d\n", new_first_byte);
 
     CHECK_NE(new_first_byte, first_byte);
   }