From eec3dc9f544e818c39ba3f3500c7a7e8eb6e29c9 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 9 Jan 2024 18:46:26 +0100 Subject: [PATCH] F16 output for dequantization. --- gguflib.c | 112 +++++++++++++++++++++++++++++++++++++++++++----------- gguflib.h | 1 + 2 files changed, 90 insertions(+), 23 deletions(-) diff --git a/gguflib.c b/gguflib.c index f72cddb..8231b0b 100644 --- a/gguflib.c +++ b/gguflib.c @@ -504,9 +504,22 @@ int gguf_append_tensor_data(gguf_ctx *ctx, void *tensor, uint64_t tensor_size) { /* ============================ GGUF dequantization ========================= */ +/* This callback is used by dequantization functions to store dequantized + * weights in a different format than f32. By default all the dequantization + * functions will store f32 floats just just f[j] = weight, but if + * a store callback is passed, the function will be used. */ +typedef void (*store_float_callback)(void *dst, uint64_t idx, float f); + +/* Callback used to store F16 when dequantizing. */ +void gguf_store_f16_callback(void *dst, uint64_t idx, float f) { + uint16_t *f16 = dst; + f16[idx] = to_half(f); +} + /* Q8_0 blocks dequantization to floats. - * 'y' is supposed to have enough space for 'count' weights. */ -void gguf_q8_0_to_float(void *weights_data, float *y, uint64_t count) { + * 'dst' is supposed to have enough space for 'count' weights. */ +void gguf_q8_0_to_float(void *weights_data, void *dst, uint64_t count, store_float_callback store_callback) { + float *f = dst; struct gguf_tensor_type_features *tf = gguf_get_tensor_type_features(GGUF_TYPE_Q8_0); @@ -519,8 +532,12 @@ void gguf_q8_0_to_float(void *weights_data, float *y, uint64_t count) { * weights in the block. */ float scale = from_half(*((uint16_t*)block)); for (uint32_t j = 0; j < tf->items_per_block; j++) { - y[i++] = block[j+2] * scale; // j+2 to skip the scale bytes. - if (i == count) break; + float weight = block[j+2] * scale; // j+2 to skip the scale bytes. + if (store_callback) + store_callback(dst,i,weight); + else + f[i] = weight; + if (++i == count) break; } block += tf->bytes_per_block; // Go to the next block. } @@ -528,7 +545,8 @@ void gguf_q8_0_to_float(void *weights_data, float *y, uint64_t count) { /* Q4_K blocks dequantization to floats. * 'y' is supposed to have enough space for 'count' weights. */ -void gguf_q4_k_to_float(void *weights_data, float *y, uint64_t count) { +void gguf_q4_k_to_float(void *weights_data, void *dst, uint64_t count, store_float_callback store_callback) { + float *f = dst; uint8_t *block = weights_data; uint64_t i = 0; // i-th weight to dequantize. while(i < count) { @@ -593,14 +611,22 @@ void gguf_q4_k_to_float(void *weights_data, float *y, uint64_t count) { /* First set: higher bits. */ for (uint32_t j = 0; j < 32; j++) { uint8_t w = block[j] & 0xf; - y[i++] = w * scale - min; - if (i == count) return; + float weight = w * scale - min; + if (store_callback) + store_callback(dst,i,weight); + else + f[i] = weight; + if (++i == count) return; } /* Second set: lower bits. */ for (uint32_t j = 0; j < 32; j++) { uint8_t w = block[j] >> 4; - y[i++] = w * scale - min; - if (i == count) return; + float weight = w * scale - min; + if (store_callback) + store_callback(dst,i,weight); + else + f[i] = weight; + if (++i == count) return; } block += 32; // Skip the two processed blocks. } @@ -609,7 +635,8 @@ void gguf_q4_k_to_float(void *weights_data, float *y, uint64_t count) { /* Q6_K blocks dequantization to floats. * 'y' is supposed to have enough space for 'count' weights. */ -void gguf_q6_k_to_float(void *weights_data, float *y, uint64_t count) { +void gguf_q6_k_to_float(void *weights_data, void *dst, uint64_t count, store_float_callback store_callback) { + float *f = dst; uint8_t *block = weights_data; uint64_t i = 0; // i-th weight to dequantize. while(i < count) { @@ -667,12 +694,16 @@ void gguf_q6_k_to_float(void *weights_data, float *y, uint64_t count) { int8_t *scales = (int8_t*)block+128+64; for (int cluster = 0; cluster < 2; cluster++) { for (uint64_t j = 0; j < 128; j++) { - y[i] = (super_scale * scales[j/16]) * + float weight = + (super_scale * scales[j/16]) * ((int8_t) ((((L[j%64] >> (j/64*4)) & 0xF) | (((H[j%32] >> (j/32*2)) & 3) << 4)))-32); - i++; - if (i == count) return; + if (store_callback) + store_callback(dst,i,weight); + else + f[i] = weight; + if (++i == count) return; } L += 64; H += 32; @@ -684,7 +715,8 @@ void gguf_q6_k_to_float(void *weights_data, float *y, uint64_t count) { /* Q2_K blocks dequantization to floats. * 'y' is supposed to have enough space for 'count' weights. */ -void gguf_q2_k_to_float(void *weights_data, float *y, uint64_t count) { +void gguf_q2_k_to_float(void *weights_data, void *dst, uint64_t count, store_float_callback store_callback) { + float *f = dst; uint8_t *block = weights_data; uint64_t i = 0; // i-th weight to dequantize. while(i < count) { @@ -734,8 +766,12 @@ void gguf_q2_k_to_float(void *weights_data, float *y, uint64_t count) { bn++; } uint8_t q = (block[16+j%32+cluster*32] >> (j/32*2)) & 3; - y[i++] = q * scale - min; - if (i == count) return; + float weight = q * scale - min; + if (store_callback) + store_callback(dst,i,weight); + else + f[i] = weight; + if (++i == count) return; } } block += 16+64+4; @@ -744,11 +780,16 @@ void gguf_q2_k_to_float(void *weights_data, float *y, uint64_t count) { /* FP16 blocks dequantization to floats. * 'y' is supposed to have enough space for 'count' weights. */ -void gguf_f16_to_float(void *weights_data, float *y, uint64_t count) { +void gguf_f16_to_float(void *weights_data, float *dst, uint64_t count, store_float_callback store_callback) { + float *f = dst; uint64_t i = 0; // i-th weight to dequantize. uint16_t *w16 = weights_data; while(i < count) { - y[i] = from_half(w16[i]); + float weight = from_half(w16[i]); + if (store_callback) + store_callback(dst,i,weight); + else + f[i] = weight; i++; } } @@ -764,18 +805,43 @@ float *gguf_tensor_to_float(gguf_tensor *tensor) { if (tensor->type == GGUF_TYPE_F32) { memcpy(f, tensor->weights_data, tensor->num_weights*sizeof(float)); } else if (tensor->type == GGUF_TYPE_F16) { - gguf_f16_to_float(tensor->weights_data, f, tensor->num_weights); + gguf_f16_to_float(tensor->weights_data, f, tensor->num_weights, NULL); } else if (tensor->type == GGUF_TYPE_Q8_0) { - gguf_q8_0_to_float(tensor->weights_data, f, tensor->num_weights); + gguf_q8_0_to_float(tensor->weights_data, f, tensor->num_weights, NULL); } else if (tensor->type == GGUF_TYPE_Q4_K) { - gguf_q4_k_to_float(tensor->weights_data, f, tensor->num_weights); + gguf_q4_k_to_float(tensor->weights_data, f, tensor->num_weights, NULL); } else if (tensor->type == GGUF_TYPE_Q6_K) { - gguf_q6_k_to_float(tensor->weights_data, f, tensor->num_weights); + gguf_q6_k_to_float(tensor->weights_data, f, tensor->num_weights, NULL); } else if (tensor->type == GGUF_TYPE_Q2_K) { - gguf_q2_k_to_float(tensor->weights_data, f, tensor->num_weights); + gguf_q2_k_to_float(tensor->weights_data, f, tensor->num_weights, NULL); } else { errno = EINVAL; return NULL; } return f; } + +/* Same as gguf_tensor_to_float() but the result will be an f16 tensor, that is + * an array of int16_t values. */ +int16_t *gguf_tensor_to_f16(gguf_tensor *tensor) { + int16_t *f16 = malloc(tensor->num_weights*sizeof(int16_t)); + if (tensor->type == GGUF_TYPE_F32) { + float *f = (float*)tensor->weights_data; + for (uint64_t j = 0; j < tensor->num_weights; j++) + f16[j] = to_half(f[j]); + } else if (tensor->type == GGUF_TYPE_F16) { + memcpy(f16, tensor->weights_data, tensor->num_weights*sizeof(int16_t)); + } else if (tensor->type == GGUF_TYPE_Q8_0) { + gguf_q8_0_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback); + } else if (tensor->type == GGUF_TYPE_Q4_K) { + gguf_q4_k_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback); + } else if (tensor->type == GGUF_TYPE_Q6_K) { + gguf_q6_k_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback); + } else if (tensor->type == GGUF_TYPE_Q2_K) { + gguf_q2_k_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback); + } else { + errno = EINVAL; + return NULL; + } + return f16; +} diff --git a/gguflib.h b/gguflib.h index 7aea123..8eb9eae 100644 --- a/gguflib.h +++ b/gguflib.h @@ -184,5 +184,6 @@ int gguf_append_tensor_data(gguf_ctx *ctx, void *tensor, uint64_t tensor_size); uint64_t gguf_get_alignment_padding(uint64_t alignment, uint64_t offset); void gguf_skip_key_values_section(gguf_ctx *ctx); float *gguf_tensor_to_float(gguf_tensor *tensor); +int16_t *gguf_tensor_to_f16(gguf_tensor *tensor); #endif