F16 output for dequantization.

This commit is contained in:
antirez
2024-01-09 18:46:26 +01:00
parent 26e3a59233
commit eec3dc9f54
2 changed files with 90 additions and 23 deletions

112
gguflib.c
View File

@@ -504,9 +504,22 @@ int gguf_append_tensor_data(gguf_ctx *ctx, void *tensor, uint64_t tensor_size) {
/* ============================ GGUF dequantization ========================= */ /* ============================ GGUF dequantization ========================= */
/* This callback is used by dequantization functions to store dequantized
 * weights in a different format than f32. By default all the dequantization
 * functions will store f32 floats with just f[i] = weight, but if
 * a store callback is passed, the callback will be used instead.
 *
 * The dequantization functions invoke it as store_callback(dst, i, weight):
 * 'dst' is the destination buffer exactly as received by the dequantization
 * function, 'idx' is the index of the weight being stored, and 'f' is the
 * dequantized weight value. */
typedef void (*store_float_callback)(void *dst, uint64_t idx, float f);
/* Store callback for F16 output when dequantizing: 'dst' is treated as an
 * array of uint16_t and the slot at 'idx' receives to_half(f). */
void gguf_store_f16_callback(void *dst, uint64_t idx, float f) {
    ((uint16_t *)dst)[idx] = to_half(f);
}
/* Q8_0 blocks dequantization to floats. /* Q8_0 blocks dequantization to floats.
* 'y' is supposed to have enough space for 'count' weights. */ * 'dst' is supposed to have enough space for 'count' weights. */
void gguf_q8_0_to_float(void *weights_data, float *y, uint64_t count) { void gguf_q8_0_to_float(void *weights_data, void *dst, uint64_t count, store_float_callback store_callback) {
float *f = dst;
struct gguf_tensor_type_features *tf = struct gguf_tensor_type_features *tf =
gguf_get_tensor_type_features(GGUF_TYPE_Q8_0); gguf_get_tensor_type_features(GGUF_TYPE_Q8_0);
@@ -519,8 +532,12 @@ void gguf_q8_0_to_float(void *weights_data, float *y, uint64_t count) {
* weights in the block. */ * weights in the block. */
float scale = from_half(*((uint16_t*)block)); float scale = from_half(*((uint16_t*)block));
for (uint32_t j = 0; j < tf->items_per_block; j++) { for (uint32_t j = 0; j < tf->items_per_block; j++) {
y[i++] = block[j+2] * scale; // j+2 to skip the scale bytes. float weight = block[j+2] * scale; // j+2 to skip the scale bytes.
if (i == count) break; if (store_callback)
store_callback(dst,i,weight);
else
f[i] = weight;
if (++i == count) break;
} }
block += tf->bytes_per_block; // Go to the next block. block += tf->bytes_per_block; // Go to the next block.
} }
@@ -528,7 +545,8 @@ void gguf_q8_0_to_float(void *weights_data, float *y, uint64_t count) {
/* Q4_K blocks dequantization to floats. /* Q4_K blocks dequantization to floats.
* 'y' is supposed to have enough space for 'count' weights. */ * 'dst' is supposed to have enough space for 'count' weights. */
void gguf_q4_k_to_float(void *weights_data, float *y, uint64_t count) { void gguf_q4_k_to_float(void *weights_data, void *dst, uint64_t count, store_float_callback store_callback) {
float *f = dst;
uint8_t *block = weights_data; uint8_t *block = weights_data;
uint64_t i = 0; // i-th weight to dequantize. uint64_t i = 0; // i-th weight to dequantize.
while(i < count) { while(i < count) {
@@ -593,14 +611,22 @@ void gguf_q4_k_to_float(void *weights_data, float *y, uint64_t count) {
/* First set: higher bits. */ /* First set: higher bits. */
for (uint32_t j = 0; j < 32; j++) { for (uint32_t j = 0; j < 32; j++) {
uint8_t w = block[j] & 0xf; uint8_t w = block[j] & 0xf;
y[i++] = w * scale - min; float weight = w * scale - min;
if (i == count) return; if (store_callback)
store_callback(dst,i,weight);
else
f[i] = weight;
if (++i == count) return;
} }
/* Second set: lower bits. */ /* Second set: lower bits. */
for (uint32_t j = 0; j < 32; j++) { for (uint32_t j = 0; j < 32; j++) {
uint8_t w = block[j] >> 4; uint8_t w = block[j] >> 4;
y[i++] = w * scale - min; float weight = w * scale - min;
if (i == count) return; if (store_callback)
store_callback(dst,i,weight);
else
f[i] = weight;
if (++i == count) return;
} }
block += 32; // Skip the two processed blocks. block += 32; // Skip the two processed blocks.
} }
@@ -609,7 +635,8 @@ void gguf_q4_k_to_float(void *weights_data, float *y, uint64_t count) {
/* Q6_K blocks dequantization to floats. /* Q6_K blocks dequantization to floats.
* 'y' is supposed to have enough space for 'count' weights. */ * 'dst' is supposed to have enough space for 'count' weights. */
void gguf_q6_k_to_float(void *weights_data, float *y, uint64_t count) { void gguf_q6_k_to_float(void *weights_data, void *dst, uint64_t count, store_float_callback store_callback) {
float *f = dst;
uint8_t *block = weights_data; uint8_t *block = weights_data;
uint64_t i = 0; // i-th weight to dequantize. uint64_t i = 0; // i-th weight to dequantize.
while(i < count) { while(i < count) {
@@ -667,12 +694,16 @@ void gguf_q6_k_to_float(void *weights_data, float *y, uint64_t count) {
int8_t *scales = (int8_t*)block+128+64; int8_t *scales = (int8_t*)block+128+64;
for (int cluster = 0; cluster < 2; cluster++) { for (int cluster = 0; cluster < 2; cluster++) {
for (uint64_t j = 0; j < 128; j++) { for (uint64_t j = 0; j < 128; j++) {
y[i] = (super_scale * scales[j/16]) * float weight =
(super_scale * scales[j/16]) *
((int8_t) ((int8_t)
((((L[j%64] >> (j/64*4)) & 0xF) | ((((L[j%64] >> (j/64*4)) & 0xF) |
(((H[j%32] >> (j/32*2)) & 3) << 4)))-32); (((H[j%32] >> (j/32*2)) & 3) << 4)))-32);
i++; if (store_callback)
if (i == count) return; store_callback(dst,i,weight);
else
f[i] = weight;
if (++i == count) return;
} }
L += 64; L += 64;
H += 32; H += 32;
@@ -684,7 +715,8 @@ void gguf_q6_k_to_float(void *weights_data, float *y, uint64_t count) {
/* Q2_K blocks dequantization to floats. /* Q2_K blocks dequantization to floats.
* 'y' is supposed to have enough space for 'count' weights. */ * 'dst' is supposed to have enough space for 'count' weights. */
void gguf_q2_k_to_float(void *weights_data, float *y, uint64_t count) { void gguf_q2_k_to_float(void *weights_data, void *dst, uint64_t count, store_float_callback store_callback) {
float *f = dst;
uint8_t *block = weights_data; uint8_t *block = weights_data;
uint64_t i = 0; // i-th weight to dequantize. uint64_t i = 0; // i-th weight to dequantize.
while(i < count) { while(i < count) {
@@ -734,8 +766,12 @@ void gguf_q2_k_to_float(void *weights_data, float *y, uint64_t count) {
bn++; bn++;
} }
uint8_t q = (block[16+j%32+cluster*32] >> (j/32*2)) & 3; uint8_t q = (block[16+j%32+cluster*32] >> (j/32*2)) & 3;
y[i++] = q * scale - min; float weight = q * scale - min;
if (i == count) return; if (store_callback)
store_callback(dst,i,weight);
else
f[i] = weight;
if (++i == count) return;
} }
} }
block += 16+64+4; block += 16+64+4;
@@ -744,11 +780,16 @@ void gguf_q2_k_to_float(void *weights_data, float *y, uint64_t count) {
/* FP16 blocks dequantization to floats. /* FP16 blocks dequantization to floats.
* 'y' is supposed to have enough space for 'count' weights. */ * 'dst' is supposed to have enough space for 'count' weights. */
void gguf_f16_to_float(void *weights_data, float *y, uint64_t count) { void gguf_f16_to_float(void *weights_data, float *dst, uint64_t count, store_float_callback store_callback) {
float *f = dst;
uint64_t i = 0; // i-th weight to dequantize. uint64_t i = 0; // i-th weight to dequantize.
uint16_t *w16 = weights_data; uint16_t *w16 = weights_data;
while(i < count) { while(i < count) {
y[i] = from_half(w16[i]); float weight = from_half(w16[i]);
if (store_callback)
store_callback(dst,i,weight);
else
f[i] = weight;
i++; i++;
} }
} }
@@ -764,18 +805,43 @@ float *gguf_tensor_to_float(gguf_tensor *tensor) {
if (tensor->type == GGUF_TYPE_F32) { if (tensor->type == GGUF_TYPE_F32) {
memcpy(f, tensor->weights_data, tensor->num_weights*sizeof(float)); memcpy(f, tensor->weights_data, tensor->num_weights*sizeof(float));
} else if (tensor->type == GGUF_TYPE_F16) { } else if (tensor->type == GGUF_TYPE_F16) {
gguf_f16_to_float(tensor->weights_data, f, tensor->num_weights); gguf_f16_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
} else if (tensor->type == GGUF_TYPE_Q8_0) { } else if (tensor->type == GGUF_TYPE_Q8_0) {
gguf_q8_0_to_float(tensor->weights_data, f, tensor->num_weights); gguf_q8_0_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
} else if (tensor->type == GGUF_TYPE_Q4_K) { } else if (tensor->type == GGUF_TYPE_Q4_K) {
gguf_q4_k_to_float(tensor->weights_data, f, tensor->num_weights); gguf_q4_k_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
} else if (tensor->type == GGUF_TYPE_Q6_K) { } else if (tensor->type == GGUF_TYPE_Q6_K) {
gguf_q6_k_to_float(tensor->weights_data, f, tensor->num_weights); gguf_q6_k_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
} else if (tensor->type == GGUF_TYPE_Q2_K) { } else if (tensor->type == GGUF_TYPE_Q2_K) {
gguf_q2_k_to_float(tensor->weights_data, f, tensor->num_weights); gguf_q2_k_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
} else { } else {
errno = EINVAL; errno = EINVAL;
return NULL; return NULL;
} }
return f; return f;
} }
/* Same as gguf_tensor_to_float() but the result will be an f16 tensor, that is
 * an array of int16_t values: one element per weight, each holding either the
 * value returned by to_half() or, for tensors that are already F16, the raw
 * 16 bit value copied from the tensor data.
 *
 * The returned array has tensor->num_weights elements, is allocated with
 * malloc() and must be released by the caller with free().
 *
 * On error NULL is returned and nothing is leaked:
 *  - errno is set to EINVAL if the tensor type is not supported.
 *  - errno is set to ENOMEM if the size in bytes does not fit in a size_t.
 *  - if malloc() itself fails, errno is left as malloc() set it. */
int16_t *gguf_tensor_to_f16(gguf_tensor *tensor) {
    /* Guard against the multiplication below wrapping around, which would
     * allocate a buffer smaller than what we are going to write. */
    if (tensor->num_weights > SIZE_MAX / sizeof(int16_t)) {
        errno = ENOMEM;
        return NULL;
    }

    int16_t *f16 = malloc(tensor->num_weights*sizeof(int16_t));
    if (f16 == NULL) return NULL;

    if (tensor->type == GGUF_TYPE_F32) {
        /* No dequantization needed: convert each float directly. */
        float *f = (float*)tensor->weights_data;
        for (uint64_t j = 0; j < tensor->num_weights; j++)
            f16[j] = to_half(f[j]);
    } else if (tensor->type == GGUF_TYPE_F16) {
        /* Already in the target format: plain copy. */
        memcpy(f16, tensor->weights_data, tensor->num_weights*sizeof(int16_t));
    } else if (tensor->type == GGUF_TYPE_Q8_0) {
        gguf_q8_0_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback);
    } else if (tensor->type == GGUF_TYPE_Q4_K) {
        gguf_q4_k_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback);
    } else if (tensor->type == GGUF_TYPE_Q6_K) {
        gguf_q6_k_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback);
    } else if (tensor->type == GGUF_TYPE_Q2_K) {
        gguf_q2_k_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback);
    } else {
        /* Unsupported type: release the buffer before reporting the error.
         * free() is called first so that it cannot clobber errno. */
        free(f16);
        errno = EINVAL;
        return NULL;
    }
    return f16;
}

View File

@@ -184,5 +184,6 @@ int gguf_append_tensor_data(gguf_ctx *ctx, void *tensor, uint64_t tensor_size);
uint64_t gguf_get_alignment_padding(uint64_t alignment, uint64_t offset); uint64_t gguf_get_alignment_padding(uint64_t alignment, uint64_t offset);
void gguf_skip_key_values_section(gguf_ctx *ctx); void gguf_skip_key_values_section(gguf_ctx *ctx);
float *gguf_tensor_to_float(gguf_tensor *tensor); float *gguf_tensor_to_float(gguf_tensor *tensor);
int16_t *gguf_tensor_to_f16(gguf_tensor *tensor);
#endif #endif