Mirror of https://github.com/antirez/gguf-tools.git (synced 2025-09-17 02:28:07 +08:00)
Quantization functions refactoring.
Changed files:
gguflib.c | 98
@@ -500,45 +500,34 @@ int gguf_append_tensor_data(gguf_ctx *ctx, void *tensor, uint64_t tensor_size) {
 
 /* ============================ GGUF dequantization ========================= */
 
-/* Convert the specified tensor (quantized or not) into an array of
- * floats. The array is allocated with malloc(). If the tensor is already
- * in FP32 floats format, it is just memcpy()-ed to the destination array.
- *
- * On OOM, NULL is returned. If the tensor format is not yet supported,
- * NULL is returned as well, but errno is set to EINVAL. */
-float *gguf_tensor_to_float(gguf_tensor *tensor) {
+/* Q8_0 blocks dequantization to floats.
+ * 'y' is supposed to have enough space for 'count' weights. */
+void gguf_q8_0_to_float(void *weights_data, float *y, uint64_t count) {
     struct gguf_tensor_type_features *tf =
-        gguf_get_tensor_type_features(tensor->type);
-    uint64_t block_size = tf->bytes_per_block;
-    float *f = malloc(tensor->num_weights*sizeof(float));
-    if (tensor->type == GGUF_TYPE_F32) {
-        memcpy(f, tensor->weights_data, tensor->num_weights*sizeof(float));
-    } else if (tensor->type == GGUF_TYPE_F16) {
-        uint64_t i = 0; // i-th weight to dequantize.
-        uint16_t *w16 = (uint16_t*) tensor->weights_data;
-        while(i < tensor->num_weights) {
-            f[i] = from_half(w16[i]);
-            i++;
-        }
-    } else if (tensor->type == GGUF_TYPE_Q8_0) {
+        gguf_get_tensor_type_features(GGUF_TYPE_Q8_0);
     /* Very simple layout: |16 bit scale|32 x 8bit weights|
      * Each weight is scale * quantized_weight[0..31] */
-        int8_t *block = (int8_t*)tensor->weights_data;
+    int8_t *block = weights_data;
     uint64_t i = 0; // i-th weight to dequantize.
-        while(i < tensor->num_weights) {
+    while(i < count) {
         /* For each block get the scale and convert all the
          * weights in the block. */
         float scale = from_half(*((uint16_t*)block));
         for (uint32_t j = 0; j < tf->items_per_block; j++) {
-            f[i++] = block[j+2] * scale; // j+2 to skip the scale bytes.
-            if (i == tensor->num_weights) break;
+            y[i++] = block[j+2] * scale; // j+2 to skip the scale bytes.
+            if (i == count) break;
         }
-        block += block_size; // Go to the next block.
+        block += tf->bytes_per_block; // Go to the next block.
     }
-    } else if (tensor->type == GGUF_TYPE_Q4_K) {
-        uint8_t *block = (uint8_t*)tensor->weights_data;
+}
+
+/* Q4_K blocks dequantization to floats.
+ * 'y' is supposed to have enough space for 'count' weights. */
+void gguf_q4_k_to_float(void *weights_data, float *y, uint64_t count) {
+    uint8_t *block = weights_data;
     uint64_t i = 0; // i-th weight to dequantize.
-    while(i < tensor->num_weights) {
+    while(i < count) {
         /* Q4_K super-blocks have 256 total weights, split in 8 sub-blocks.
          * Each of the 8 sub-blocks has a different set of scales/mins, so
          * there are 16 total values for scales/mins, but the scales/mins
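To make the Q8_0 layout in this hunk concrete: each 34-byte block is a half-precision scale followed by 32 signed 8-bit weights, and every weight dequantizes as scale * q. Below is a minimal standalone sketch, not part of the commit; q8_0_block and half_to_float are illustrative stand-ins for the library's raw-byte walking and its from_half() helper.

    #include <stdint.h>
    #include <string.h>

    /* Illustrative view of one Q8_0 block: |fp16 scale|32 x int8|.
     * The library instead walks raw bytes, using tf->items_per_block (32)
     * and tf->bytes_per_block (34) from the tensor type features table. */
    typedef struct {
        uint16_t d;     /* half-precision scale for the whole block */
        int8_t q[32];   /* quantized weights */
    } q8_0_block;

    /* Stand-in for the library's from_half(): IEEE 754 half to float. */
    static float half_to_float(uint16_t h) {
        uint32_t sign = (uint32_t)(h >> 15) << 31;
        uint32_t exp = (h >> 10) & 0x1F, man = h & 0x3FF;
        uint32_t bits; float f;
        if (exp == 0) { /* zero/subnormal: value is man * 2^-24 */
            f = man * (1.0f/16777216.0f);
            return (h >> 15) ? -f : f;
        }
        if (exp == 31) bits = sign | 0x7F800000u | (man << 13); /* inf/NaN */
        else bits = sign | ((exp + 112) << 23) | (man << 13);
        memcpy(&f, &bits, sizeof(f));
        return f;
    }

    /* Dequantize a single block: weight = scale * quantized value. */
    static void dequant_one_q8_0_block(const q8_0_block *b, float *out) {
        float scale = half_to_float(b->d);
        for (int j = 0; j < 32; j++) out[j] = scale * b->q[j];
    }

The early if (i == count) break; in the real loop guards the last block, which may be only partially used when the weight count is not a multiple of 32.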
@@ -600,22 +589,26 @@ float *gguf_tensor_to_float(gguf_tensor *tensor) {
             /* First set: higher bits. */
             for (uint32_t j = 0; j < 32; j++) {
                 uint8_t w = block[j] & 0xf;
-                f[i++] = w * scale - min;
-                if (i == tensor->num_weights) return f;
+                y[i++] = w * scale - min;
+                if (i == count) return;
             }
             /* Second set: lower bits. */
             for (uint32_t j = 0; j < 32; j++) {
                 uint8_t w = block[j] >> 4;
-                f[i++] = w * scale - min;
-                if (i == tensor->num_weights) return f;
+                y[i++] = w * scale - min;
+                if (i == count) return;
             }
             block += 32; // Skip the two processed blocks.
         }
     }
-    } else if (tensor->type == GGUF_TYPE_Q6_K) {
-        uint8_t *block = (uint8_t*)tensor->weights_data;
+}
+
+/* Q6_K blocks dequantization to floats.
+ * 'y' is supposed to have enough space for 'count' weights. */
+void gguf_q6_k_to_float(void *weights_data, float *y, uint64_t count) {
+    uint8_t *block = weights_data;
     uint64_t i = 0; // i-th weight to dequantize.
-    while(i < tensor->num_weights) {
+    while(i < count) {
         /* Q6_K super-blocks have 256 total weights, split in 16 sub-blocks
          * of 16 elements. There are no mins, just scales. Each sub-block
          * has a block-specific scale quantized at 8 bits via a single
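The two j-loops in this hunk read the same 32 bytes twice: one pass takes each byte's low nibble (& 0xf), the other its high nibble (>> 4), and each 4-bit value maps to w * scale - min. A small sketch of just that nibble unpacking follows; a single already-decoded scale/min pair is assumed here purely for illustration, while the real code derives them from the super-block's packed scales/mins.

    #include <stdint.h>

    /* Unpack 64 Q4_K weights stored as 32 bytes of paired nibbles,
     * mirroring the two loops in the diff. 'scale' and 'min' are
     * assumed to be already decoded (an illustrative simplification). */
    static void dequant_q4_nibbles(const uint8_t *packed, float scale,
                                   float min, float *out) {
        for (int j = 0; j < 32; j++)
            out[j] = (packed[j] & 0xf) * scale - min;     /* low nibbles  */
        for (int j = 0; j < 32; j++)
            out[32+j] = (packed[j] >> 4) * scale - min;   /* high nibbles */
    }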
@@ -670,12 +663,12 @@ float *gguf_tensor_to_float(gguf_tensor *tensor) {
         int8_t *scales = (int8_t*)block+128+64;
         for (int cluster = 0; cluster < 2; cluster++) {
             for (uint64_t j = 0; j < 128; j++) {
-                f[i] = (super_scale * scales[j/16]) *
+                y[i] = (super_scale * scales[j/16]) *
                        ((int8_t)
                         ((((L[j%64] >> (j/64*4)) & 0xF) |
                          (((H[j%32] >> (j/32*2)) & 3) << 4)))-32);
                 i++;
-                if (i == tensor->num_weights) return f;
+                if (i == count) return;
             }
             L += 64;
             H += 32;
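The dense expression above reassembles each 6-bit quantized value from two packed arrays: L supplies the lower 4 bits (two fields per byte) and H the upper 2 bits (four fields per byte), and the result is re-centered by subtracting 32 before being multiplied by super_scale * scales[j/16]. An equivalent sketch of just that bit surgery (the function name is illustrative):

    #include <stdint.h>

    /* Rebuild one 6-bit Q6_K weight from its packed halves.
     * lo_byte holds two 4-bit fields (lo_shift is 0 or 4);
     * hi_byte holds four 2-bit fields (hi_shift is 0, 2, 4 or 6).
     * Stored values are biased by +32, so subtracting 32 recovers a
     * signed weight in [-32, 31]. */
    static int8_t q6_k_weight(uint8_t lo_byte, int lo_shift,
                              uint8_t hi_byte, int hi_shift) {
        uint8_t lo = (lo_byte >> lo_shift) & 0xF;  /* lower 4 bits */
        uint8_t hi = (hi_byte >> hi_shift) & 3;    /* upper 2 bits */
        return (int8_t)((lo | (hi << 4)) - 32);
    }

In the diff, lo_shift is j/64*4 and hi_shift is j/32*2, which is how each L byte serves two weights and each H byte serves four.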
@@ -683,6 +676,37 @@ float *gguf_tensor_to_float(gguf_tensor *tensor) {
         }
         block += 128+64+16+2; // Go to the next block.
     }
+}
+
+/* FP16 blocks dequantization to floats.
+ * 'y' is supposed to have enough space for 'count' weights. */
+void gguf_f16_to_float(void *weights_data, float *y, uint64_t count) {
+    uint64_t i = 0; // i-th weight to dequantize.
+    uint16_t *w16 = weights_data;
+    while(i < count) {
+        y[i] = from_half(w16[i]);
+        i++;
+    }
+}
+
+/* Convert the specified tensor (quantized or not) into an array of
+ * floats. The array is allocated with malloc(). If the tensor is already
+ * in FP32 floats format, it is just memcpy()-ed to the destination array.
+ *
+ * On OOM, NULL is returned. If the tensor format is not yet supported,
+ * NULL is returned as well, but errno is set to EINVAL. */
+float *gguf_tensor_to_float(gguf_tensor *tensor) {
+    float *f = malloc(tensor->num_weights*sizeof(float));
+    if (tensor->type == GGUF_TYPE_F32) {
+        memcpy(f, tensor->weights_data, tensor->num_weights*sizeof(float));
+    } else if (tensor->type == GGUF_TYPE_F16) {
+        gguf_f16_to_float(tensor->weights_data, f, tensor->num_weights);
+    } else if (tensor->type == GGUF_TYPE_Q8_0) {
+        gguf_q8_0_to_float(tensor->weights_data, f, tensor->num_weights);
+    } else if (tensor->type == GGUF_TYPE_Q4_K) {
+        gguf_q4_k_to_float(tensor->weights_data, f, tensor->num_weights);
+    } else if (tensor->type == GGUF_TYPE_Q6_K) {
+        gguf_q6_k_to_float(tensor->weights_data, f, tensor->num_weights);
     } else {
         errno = EINVAL;
         return NULL;
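After this refactoring, gguf_tensor_to_float() is a thin dispatcher over the per-format helpers, and its contract is unchanged: a malloc()ed float array on success, NULL on OOM, and NULL with errno set to EINVAL for unsupported formats. A hedged usage sketch follows; how 'tensor' is obtained is assumed, see gguflib.h for the actual iteration API.

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include "gguflib.h"

    /* Dump the first few weights of a tensor, illustrating the
     * error-handling contract of gguf_tensor_to_float(). */
    static void print_first_weights(gguf_tensor *tensor) {
        float *f = gguf_tensor_to_float(tensor);
        if (f == NULL) {
            fprintf(stderr, errno == EINVAL ?
                "tensor format not yet supported\n" : "out of memory\n");
            return;
        }
        for (uint64_t j = 0; j < 4 && j < tensor->num_weights; j++)
            printf("weight[%llu] = %f\n", (unsigned long long)j, f[j]);
        free(f); /* the array is malloc()ed: the caller owns it */
    }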