Merge pull request #8 from jbochi/q4

Add support for q4_0 and q4_1 quantizations
This commit is contained in:
Salvatore Sanfilippo
2024-01-10 00:10:54 +01:00
committed by GitHub

View File

@@ -778,6 +778,89 @@ void gguf_q2_k_to_float(void *weights_data, void *dst, uint64_t count, store_flo
}
}
/* Q4_0 blocks dequantization to floats.
 * 'dst' is supposed to have enough space for 'count' weights. */
void gguf_q4_0_to_float(void *weights_data, void *dst, uint64_t count, store_float_callback store_callback) {
    float *out = dst;
    struct gguf_tensor_type_features *tf =
        gguf_get_tensor_type_features(GGUF_TYPE_Q4_0);
    /* Block layout: |16 bit scale|32 x 4bit weights|.
     * Each weight dequantizes as scale * (nibble - 8). The low nibbles
     * of the 16 payload bytes hold weights 0..15 of the block, the high
     * nibbles hold weights 16..31. */
    uint8_t *block = weights_data;
    uint64_t idx = 0; /* Index of the next weight to emit. */
    while (idx < count) {
        /* Per-block scale stored as an IEEE half float. */
        float scale = from_half(*((uint16_t*)block));
        uint8_t *payload = block + 2; /* Skip the two scale bytes. */
        for (uint32_t j = 0; j < 32 && idx < count; j++, idx++) {
            /* j in 0..15 -> low nibble of byte j;
             * j in 16..31 -> high nibble of byte j-16. */
            uint8_t nibble = (j < 16) ? (uint8_t)(payload[j] & 0xf)
                                      : (uint8_t)(payload[j-16] >> 4);
            float weight = ((int8_t)nibble - 8) * scale;
            if (store_callback)
                store_callback(dst, idx, weight);
            else
                out[idx] = weight;
        }
        block += tf->bytes_per_block; /* Advance to the next block. */
    }
}
/* Q4_1 blocks dequantization to floats.
 * 'dst' is supposed to have enough space for 'count' weights. */
void gguf_q4_1_to_float(void *weights_data, void *dst, uint64_t count, store_float_callback store_callback) {
    float *f = dst;
    struct gguf_tensor_type_features *tf =
        gguf_get_tensor_type_features(GGUF_TYPE_Q4_1);
    /* Very simple layout: |16 bit scale|16 bit bias|32 x 4bit weights|
     * Each weight is scale * quantized_weight[0..31] + bias */
    uint8_t *block = weights_data;
    uint64_t i = 0; // i-th weight to dequantize.
    while(i < count) {
        /* For each block get the scale/bias and convert all the
         * weights in the block. */
        float scale = from_half(*((uint16_t*)block));
        /* BUGFIX: the bias is the second half-float field, at byte
         * offset 2. The previous '*((uint16_t*)block+2)' performed the
         * pointer addition AFTER the cast, dereferencing byte offset 4
         * (the first two payload bytes) and producing a garbage bias. */
        float bias = from_half(*((uint16_t*)(block+2)));
        /* First 16 weights are in the lower bits */
        for (uint32_t j = 0; j < 16; j++) {
            uint8_t value = block[j+4]; // j+4 to skip the scale and bias bytes.
            value &= 0xf; // lower bits
            float weight = value * scale + bias;
            if (store_callback)
                store_callback(dst,i,weight);
            else
                f[i] = weight;
            if (++i == count) break;
        }
        /* Last 16 weights are in the higher bits */
        for (uint32_t j = 0; j < 16; j++) {
            uint8_t value = block[j+4]; // j+4 to skip the scale and bias bytes.
            value >>= 4; // higher bits
            float weight = value * scale + bias;
            if (store_callback)
                store_callback(dst,i,weight);
            else
                f[i] = weight;
            if (++i == count) break;
        }
        block += tf->bytes_per_block; // Go to the next block.
    }
}
/* FP16 blocks dequantization to floats.
* 'y' is supposed to have enough space for 'count' weights. */
void gguf_f16_to_float(void *weights_data, float *dst, uint64_t count, store_float_callback store_callback) {
@@ -814,6 +897,10 @@ float *gguf_tensor_to_float(gguf_tensor *tensor) {
gguf_q6_k_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
} else if (tensor->type == GGUF_TYPE_Q2_K) {
gguf_q2_k_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
} else if (tensor->type == GGUF_TYPE_Q4_0) {
gguf_q4_0_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
} else if (tensor->type == GGUF_TYPE_Q4_1) {
gguf_q4_1_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
} else {
errno = EINVAL;
return NULL;
@@ -839,6 +926,10 @@ int16_t *gguf_tensor_to_f16(gguf_tensor *tensor) {
gguf_q6_k_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback);
} else if (tensor->type == GGUF_TYPE_Q2_K) {
gguf_q2_k_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback);
} else if (tensor->type == GGUF_TYPE_Q4_0) {
gguf_q4_0_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback);
} else if (tensor->type == GGUF_TYPE_Q4_1) {
gguf_q4_1_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback);
} else {
errno = EINVAL;
return NULL;