Mirror of https://github.com/antirez/gguf-tools.git, synced 2025-09-17 02:28:07 +08:00.
Q4_K dequantization.

gguf-tools.c (11 changed lines)

@@ -9,6 +9,11 @@
 #include "sds.h"
 #include "fp16.h"
 
+/* Global options that can be used for all the subcommands. */
+struct {
+    int verbose;        // --verbose option
+} Opt = {0};
+
 /* ========================== Utility functions ============================ */
 
 /* Glob-style pattern matching. Return 1 on match, 0 otherwise. */
@@ -170,6 +175,8 @@ void gguf_tools_show(const char *filename) {
         return;
     }
 
+/* ======================= 'split-mixtral' subcommand ======================= */
+
 /* Read a Mixtral MoE model and create a new non-MoE GGUF file based
  * on the weights of the experts with IDs in the 'experts_id' array.
  * The array must contain 32 integers, one for each layer. */
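
A short caller sketch illustrates the 'experts_id' contract described above. The filenames are illustrative, and the third parameter of gguf_tools_split_mixtral() (truncated in the hunk header below) is assumed to be the output filename:

    /* Hypothetical caller: keep expert 3 for every one of the 32 layers. */
    int experts_id[32];
    for (int j = 0; j < 32; j++) experts_id[j] = 3;
    gguf_tools_split_mixtral(experts_id, "mixtral.gguf", "expert-3.gguf");
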
@@ -312,6 +319,8 @@ void gguf_tools_split_mixtral(int *experts_id, const char *mixtral_filename, con
     exit(0);
 }
 
+/* ====================== 'inspect-weights' subcommand ====================== */
+
 void gguf_tools_inspect_weights(const char *filename, const char *tname, uint64_t count) {
     gguf_ctx *ctx = gguf_init(filename);
     if (ctx == NULL) {
@@ -362,7 +371,7 @@ void gguf_tools_inspect_weights(const char *filename, const char *tname, uint64_
 /* ======================= Main and CLI options parsing ===================== */
 
 void gguf_tools_usage(const char *progname) {
-    printf("Usage: %s <subcommand> [options...]\n"
+    printf("Usage: %s <subcommand> [arguments...] [options...]\n"
            "Subcommands:\n"
            " show <filename> -- show GGUF model keys and tensors.\n"
            " inspect-tensor <filename> <tensor-name> [count] -- show tensor weights.\n"
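
With the updated usage string, typical invocations of the two subcommands listed above would look like this (model filename and tensor name are illustrative):

    ./gguf-tools show mixtral.gguf
    ./gguf-tools inspect-tensor mixtral.gguf blk.0.ffn_gate.0.weight 10
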

gguflib.c (79 changed lines)

@@ -511,9 +511,11 @@ float *gguf_tensor_to_float(gguf_tensor *tensor) {
         gguf_get_tensor_type_features(tensor->type);
     uint64_t block_size = tf->bytes_per_block;
     float *f = malloc(tensor->num_weights*sizeof(float));
-    if (tensor->type == GUFF_TYPE_Q8_0) {
+    if (tensor->type == GGUF_TYPE_Q8_0) {
+        /* Very simple layout: |16 bit delta|32 x 8 bit weights|
+         * Each weight is delta * quantized_weight[0..31] */
         int8_t *block = (int8_t*)tensor->weights_data;
-        uint64_t i = 0;
+        uint64_t i = 0; // i-th weight to dequantize.
         while(i < tensor->num_weights) {
             /* For each block get the delta and convert all the
              * weights in the block. */
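
The Q8_0 layout documented by the new comment is simple enough to dequantize in isolation. Below is a minimal standalone sketch, assuming a from_half() FP16-to-float helper such as the one gguflib pulls in from fp16.h; a Q8_0 block is 2 + 32 = 34 bytes and carries 32 weights:

    #include <stdint.h>

    /* Assumed helper: IEEE 754 half-precision to float (fp16.h provides
     * an equivalent routine in gguflib). */
    float from_half(uint16_t h);

    /* Dequantize 'nblocks' Q8_0 blocks: |16 bit delta|32 x 8 bit weights|.
     * Every output weight is delta * quantized_weight. */
    void dequant_q8_0(const uint8_t *data, uint64_t nblocks, float *out) {
        const uint64_t block_size = 2 + 32; // FP16 delta + 32 int8 weights.
        for (uint64_t b = 0; b < nblocks; b++) {
            const uint8_t *block = data + b*block_size;
            float delta = from_half(*(const uint16_t*)block);
            const int8_t *w = (const int8_t*)(block + 2);
            for (int j = 0; j < 32; j++) *out++ = delta * w[j];
        }
    }
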
@@ -524,6 +526,79 @@ float *gguf_tensor_to_float(gguf_tensor *tensor) {
             }
             block += block_size; // Go to the next block.
         }
+    } else if (tensor->type == GGUF_TYPE_Q4_K) {
+        uint8_t *block = (uint8_t*)tensor->weights_data;
+        uint64_t i = 0; // i-th weight to dequantize.
+        while(i < tensor->num_weights) {
+            /* Q4_K super-blocks have 256 total weights, split into 8
+             * sub-blocks. Each of the 8 sub-blocks has a different
+             * delta/min pair, so there are 16 total values for deltas/mins.
+             * The deltas/mins are themselves quantized (6 bits each) using
+             * two different deltas, delta_of_deltas and delta_of_mins,
+             * stored as two FP16 values at the start of the super-block, so:
+             *
+             * |FP16 d_of_deltas| +
+             * |FP16 d_of_mins| +
+             * |16 6-bit integer d,m pairs, one per sub-block of 32 elements| +
+             * |256 x 4-bit weights|
+             */
+            float deltas_delta = from_half(*((uint16_t*)block));
+            float mins_delta = from_half(*((uint16_t*)(block+2)));
+            block += 4;
+
+            /* Extract the 16 x 6-bit deltas/mins pairs. The encoding
+             * of those values is odd, for performance reasons:
+             *
+             * dddddddd dddddddd dddddddd dddddddd mmmmmmmm mmmmmmmm
+             * 44000000|55111111|66222222|77333333|44000000|55111111
+             *
+             * mmmmmmmm mmmmmmmm mmmmdddd mmmmdddd mmmmdddd mmmmdddd
+             * 66222222|77333333|44444444|55555555|66666666|77777777
+             *
+             * In the above diagram you can see the 12 bytes and the
+             * deltas/mins 6-bit encodings. */
+
+            /* Scale deltas/mins. */
+            float deltas[8], mins[8];
+            for (int j = 0; j < 8; j++) {
+                uint8_t d,m;
+                if (j < 4) {
+                    d = block[j] & 63;
+                    m = block[j+4] & 63;
+                } else {
+                    d = (block[j+4] & 0xF) | ((block[j-4] >> 6) << 4);
+                    m = (block[j+4] >> 4) | ((block[j-0] >> 6) << 4);
+                }
+                deltas[j] = d * deltas_delta;
+                mins[j] = m * mins_delta;
+            }
+            block += 12; // Seek 4-bit weights start.
+
+            /* Finally we can extract the 256 weights.
+             * We process two sub-blocks per iteration, since each run of
+             * 32 bytes stores 64 weights: the first sub-block's 32 weights
+             * are the lower 4 bits of each byte, and the second sub-block's
+             * 32 weights are the higher 4 bits of each byte. */
+            for (uint32_t b = 0; b < 8; b += 2) {
+                float delta = deltas[b], min = mins[b];
+                /* First set: lower bits, sub-block 'b'. */
+                for (uint32_t j = 0; j < 32; j++) {
+                    uint8_t w = block[j] & 0xf;
+                    f[i++] = w * delta - min;
+                    if (i == tensor->num_weights) return f;
+                }
+                /* Second set: higher bits, sub-block 'b+1'. */
+                delta = deltas[b+1], min = mins[b+1];
+                for (uint32_t j = 0; j < 32; j++) {
+                    uint8_t w = block[j] >> 4;
+                    f[i++] = w * delta - min;
+                    if (i == tensor->num_weights) return f;
+                }
+                block += 32; // Skip the two processed sub-blocks.
+            }
+        }
     } else {
         errno = EINVAL;
         return NULL;
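
The arithmetic behind the new branch adds up as follows: a Q4_K super-block is 2 + 2 + 12 + 128 = 144 bytes (two FP16 super-scales, sixteen 6-bit delta/min values packed into 12 bytes, and 256 weights at 4 bits each), so 256 weights cost 144 * 8 / 256 = 4.5 bits per weight. The 6-bit unpacking is the least obvious step, so here it is isolated as a standalone sketch; the helper name is ours, not part of the patch, and it mirrors the j < 4 / j >= 4 split used in the loop above:

    #include <stdint.h>

    /* Extract the j-th (0..7) quantized delta/min pair from the 12 packed
     * scale bytes of a Q4_K super-block. For j < 4 the 6 bits are stored
     * directly in the low bits of bytes 0..7; for j >= 4 the low 4 bits
     * come from bytes 8..11 and the high 2 bits from the top of bytes 0..7. */
    static void q4k_scale_min(int j, const uint8_t *p, uint8_t *d, uint8_t *m) {
        if (j < 4) {
            *d = p[j] & 63;
            *m = p[j+4] & 63;
        } else {
            *d = (p[j+4] & 0xF) | ((p[j-4] >> 6) << 4);
            *m = (p[j+4] >> 4) | ((p[j] >> 6) << 4);
        }
    }

For example, q4k_scale_min(5, scales, &d, &m) yields the sixth sub-block's quantized delta and min, which a caller then scales by deltas_delta and mins_delta exactly as the patch does.
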

gguflib.h (40 changed lines)

@@ -13,27 +13,27 @@
 /* ============================ Enums and structures ======================== */
 
 enum gguf_tensor_type {
-    GUFF_TYPE_F32 = 0,
-    GUFF_TYPE_F16 = 1,
-    GUFF_TYPE_Q4_0 = 2,
-    GUFF_TYPE_Q4_1 = 3,
-    // GUFF_TYPE_Q4_2 = 4, support has been removed
-    // GUFF_TYPE_Q4_3 (5) support has been removed
-    GUFF_TYPE_Q5_0 = 6,
-    GUFF_TYPE_Q5_1 = 7,
-    GUFF_TYPE_Q8_0 = 8,
-    GUFF_TYPE_Q8_1 = 9,
+    GGUF_TYPE_F32 = 0,
+    GGUF_TYPE_F16 = 1,
+    GGUF_TYPE_Q4_0 = 2,
+    GGUF_TYPE_Q4_1 = 3,
+    // GGUF_TYPE_Q4_2 = 4, support has been removed
+    // GGUF_TYPE_Q4_3 (5) support has been removed
+    GGUF_TYPE_Q5_0 = 6,
+    GGUF_TYPE_Q5_1 = 7,
+    GGUF_TYPE_Q8_0 = 8,
+    GGUF_TYPE_Q8_1 = 9,
     // k-quantizations
-    GUFF_TYPE_Q2_K = 10,
-    GUFF_TYPE_Q3_K = 11,
-    GUFF_TYPE_Q4_K = 12,
-    GUFF_TYPE_Q5_K = 13,
-    GUFF_TYPE_Q6_K = 14,
-    GUFF_TYPE_Q8_K = 15,
-    GUFF_TYPE_I8,
-    GUFF_TYPE_I16,
-    GUFF_TYPE_I32,
-    GUFF_TYPE_COUNT,
+    GGUF_TYPE_Q2_K = 10,
+    GGUF_TYPE_Q3_K = 11,
+    GGUF_TYPE_Q4_K = 12,
+    GGUF_TYPE_Q5_K = 13,
+    GGUF_TYPE_Q6_K = 14,
+    GGUF_TYPE_Q8_K = 15,
+    GGUF_TYPE_I8,
+    GGUF_TYPE_I16,
+    GGUF_TYPE_I32,
+    GGUF_TYPE_COUNT,
 };
 
 enum gguf_value_type {