Merge pull request #14 from jart/update

Add BF16 support and fix warnings
2025-12-16 00:18:52 +08:00 · 2024-05-26 09:22:00 +02:00
parent 3e5c0a464d ede59bb742
commit 4e6455ecaf
6 changed files with 201 additions and 27 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+gguf-tools
--- a/4
+++ b/4
@@ -1,8 +1,8 @@
 all: gguf-tools

-gguf-tools: gguf-tools.c gguflib.c gguflib.h sds.c sds.h sdsalloc.h fp16.h
+gguf-tools: gguf-tools.c gguflib.c gguflib.h sds.c sds.h sdsalloc.h fp16.h bf16.h
 	$(CC) gguf-tools.c gguflib.c sds.c fp16.c \
-		-march=native -flto -ffast-math \
+		-march=native -ffast-math \
 		-g -ggdb -Wall -W -pedantic -O3 -o gguf-tools

 clean:
--- a/bf16.h
+++ b/bf16.h
@@ -0,0 +1,78 @@
+#ifndef BF16_h
+#define BF16_h
+#include <stdint.h>
+
+/**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───┐
+ *     0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───────────────────┐
+ *     0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ *       ┌sign
+ *       │
+ *       │  ┌exponent
+ *       │  │
+ *       │  │    ┌mantissa
+ *       │  │    │
+ *       │┌─┴─┐┌─┴──────┐
+ *     0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+static inline float from_brain(uint16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h << 16; /* bf16 bits become the high half; low 16 bits are 0 */
+    return u.f; /* read the same 32 bits back as a float */
+}
+
+/**
+ * Converts float32 to brain16.
+ *
+ * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
+ * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This code should vectorize nicely if using modern compilers.
+ */
+static inline uint16_t to_brain(float s) {
+    uint16_t h;
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.f = s; /* view the float's bits as a uint32_t through the union */
+    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan: exponent all ones, mantissa != 0 */
+        h = (u.i >> 16) | 64; /* force to quiet by setting the top bf16 mantissa bit */
+        return h;
+    }
+    if (!(u.i & 0x7f800000)) { /* exponent is zero: +-0 or subnormal */
+        h = (u.i & 0x80000000) >> 16; /* flush to zero, keeping only the sign bit */
+        return h;
+    }
+    return (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; /* round to nearest, ties to even */
+}
+
+#endif
--- a/gguf-tools.c
+++ b/gguf-tools.c
@@ -5,6 +5,7 @@
 #include <assert.h>
 #include <errno.h>
 #include <math.h>
+#include <inttypes.h>

 #include "gguflib.h"
 #include "sds.h"
@@ -143,7 +144,7 @@ int strmatch(const char *pattern, int patternLen,
 void gguf_tools_show(const char *filename) {
    gguf_ctx *ctx = gguf_open(filename);
    if (ctx == NULL) {
-        perror("Opening GGUF file");
+        perror(filename);
        exit(1);
    }

@@ -166,16 +167,16 @@ void gguf_tools_show(const char *filename) {
    gguf_tensor tensor;
    uint64_t params = 0;
    while (gguf_get_tensor(ctx,&tensor)) {
-        printf("%s tensor %.*s @%llu, %llu weights, dims ",
+        printf("%s tensor %.*s @%" PRIu64 ", %" PRIu64 " weights, dims ",
            gguf_get_tensor_type_name(tensor.type),
            (int)tensor.namelen,
            tensor.name,
            tensor.offset,
            tensor.num_weights);
        for (uint32_t j = 0; j < tensor.ndim; j++) {
-            printf("%s%llu",(j == 0) ? "[" : ",", tensor.dim[j]);
+            printf("%s%" PRIu64 "",(j == 0) ? "[" : ",", tensor.dim[j]);
        }
-        printf("], %llu bytes\n", tensor.bsize);
+        printf("], %" PRIu64 " bytes\n", tensor.bsize);

        params += tensor.num_weights;
    }
@@ -192,13 +193,13 @@ void gguf_tools_show(const char *filename) {
 void gguf_tools_split_mixtral(int *experts_id, const char *mixtral_filename, const char *output_filename) {
    gguf_ctx *mixtral = gguf_open(mixtral_filename);
    if (mixtral == NULL) {
-        perror("Opening Mixtral file");
+        perror(mixtral_filename);
        exit(1);
    }

    gguf_ctx *output = gguf_create(output_filename, GGUF_NONE);
    if (output == NULL) {
-        perror("Opening the output file");
+        perror(output_filename);
        exit(1);
    }

@@ -312,7 +313,7 @@ void gguf_tools_split_mixtral(int *experts_id, const char *mixtral_filename, con
        }
        tensor_off += tensors[j].orig_info.bsize;
    }
-    printf("Output file: after writing tensors info, file size is: %llu\n", output->size);
+    printf("Output file: after writing tensors info, file size is: %" PRIu64 "\n", output->size);

    /* Finally, append the tensors weights. */
    for (uint32_t j = 0; j < num_tensors; j++) {
@@ -333,7 +334,7 @@ void gguf_tools_split_mixtral(int *experts_id, const char *mixtral_filename, con
 void gguf_tools_inspect_weights(const char *filename, const char *tname, uint64_t count) {
    gguf_ctx *ctx = gguf_open(filename);
    if (ctx == NULL) {
-        perror("Opening GGUF file");
+        perror(filename);
        exit(1);
    }

@@ -424,8 +425,8 @@ int tensors_avg_diff(gguf_tensor *t1, gguf_tensor *t2, double *diff) {
    float *weights1 = gguf_tensor_to_float(t1);
    float *weights2 = gguf_tensor_to_float(t2);
    if (weights1 == NULL || weights2 == NULL) {
-        if (weights1) free(weights1);
-        if (weights2) free(weights2);
+        free(weights1);
+        free(weights2);
        return 0;
    }

@@ -444,7 +445,7 @@ int tensors_avg_diff(gguf_tensor *t1, gguf_tensor *t2, double *diff) {
    double avg_diff = tot_diff / t1->num_weights;

    /* Multiply by 75 to normalize the difference of a
-     * random varialbe between -N and +N to 0 - 100% */
+     * random variable between -N and +N to 0 - 100% */
    *diff = avg_diff / avg_mag * 75;

    free(weights1);
@@ -454,9 +455,14 @@ int tensors_avg_diff(gguf_tensor *t1, gguf_tensor *t2, double *diff) {

 void gguf_tools_compare(const char *file1, const char *file2) {
    gguf_ctx *ctx1 = gguf_open(file1);
+    if (ctx1 == NULL) {
+        perror(file1);
+        exit(1);
+    }
+
    gguf_ctx *ctx2 = gguf_open(file2);
-    if (ctx1 == NULL || ctx2 == NULL) {
-        perror("Opening GGUF files");
+    if (ctx2 == NULL) {
+        perror(file2);
        exit(1);
    }

--- a/gguflib.c
+++ b/gguflib.c
@@ -8,9 +8,11 @@
 #include <unistd.h>
 #include <string.h>
 #include <assert.h>
+#include <inttypes.h>

 #include "gguflib.h"
 #include "fp16.h"
+#include "bf16.h"

 /* ============================ Low level functions ========================= */

@@ -43,9 +45,21 @@ struct gguf_tensor_type_features {
    {"q5_k", 256, 176},
    {"q6_k", 256, 210},
    {"q8_k", 256, 292},
+    {"iq2_xxs", 256, 66},
+    {"iq2_xs", 256, 74},
+    {"iq3_xxs", 256, 98},
+    {"iq1_s", 256, 110},
+    {"iq4_nl", 256, 50},
+    {"iq3_s", 256, 110},
+    {"iq2_s", 256, 82},
+    {"iq4_xs", 256, 136},
    {"i8", 1, 1},
    {"i16", 1, 2},
    {"i32", 1, 4},
+    {"i64", 1, 8},
+    {"f64", 1, 8},
+    {"iq1_m", 256, 56},
+    {"bf16", 1, 2},
 };

 /* Return the value type name given the type ID. */
@@ -101,8 +115,8 @@ gguf_ctx *gguf_open(const char *filename) {
    if (fd == -1) return NULL;

    /* Mapping successful. We can create our context object. */
-    gguf_ctx *ctx = malloc(sizeof(*ctx));
-    memset(ctx,0,sizeof(*ctx));
+    gguf_ctx *ctx = calloc(1, sizeof(*ctx));
+    if (!ctx) return NULL;
    ctx->fd = fd;
    ctx->alignment = 32; // Default alignment of GGUF files.
    ctx->data_off = 0;   // Set later.
@@ -363,8 +377,8 @@ void gguf_print_value_callback(void *privdata, uint32_t type, union gguf_value *
    struct gguf_print_options *po = privdata;
    if (po && po->max_array_items && in_array > po->max_array_items) {
        if (in_array-1 == po->max_array_items)
-            printf("... %llu more items of %llu", array_len-in_array+1,
-                                                  array_len);
+            printf("... %" PRIu64 " more items of %" PRIu64 "",
+                   array_len-in_array+1, array_len);
        return;
    }

@@ -396,9 +410,9 @@ void gguf_print_value_callback(void *privdata, uint32_t type, union gguf_value *
        case GGUF_VALUE_TYPE_STRING:
            printf("%.*s", (int)val->string.len, val->string.string); break;
        case GGUF_VALUE_TYPE_UINT64:
-            printf("%llu", val->uint64); break;
+            printf("%" PRIu64 "", val->uint64); break;
        case GGUF_VALUE_TYPE_INT64:
-            printf("%lld", val->int64); break;
+            printf("%" PRId64 "", val->int64); break;
        case GGUF_VALUE_TYPE_FLOAT64:
            printf("%lf", val->float64); break;
        default:
@@ -516,6 +530,12 @@ void gguf_store_f16_callback(void *dst, uint64_t idx, float f) {
    f16[idx] = to_half(f);
 }

+/* Store callback used when dequantizing: writes 'f' as BF16 at dst[idx]. */
+void gguf_store_bf16_callback(void *dst, uint64_t idx, float f) {
+    /* 'dst' is treated as an array of 16-bit BF16 bit patterns. */
+    ((uint16_t *)dst)[idx] = to_brain(f);
+}
+
 /* Q8_0 blocks dequantization to floats.
 * 'dst' is supposed to have enough space for 'count' weights. */
 void gguf_q8_0_to_float(void *weights_data, void *dst, uint64_t count, store_float_callback store_callback) {
@@ -755,7 +775,7 @@ void gguf_q2_k_to_float(void *weights_data, void *dst, uint64_t count, store_flo
        float scale_of_scales = from_half(*((uint16_t*)(block+16+64)));
        float scale_of_mins = from_half(*((uint16_t*)(block+16+64+2)));

-        float scale, min;
+        float scale = 0, min = 0;
        int bn = 0; // Block number
        for (uint64_t cluster = 0; cluster < 2; cluster++) {
            for (uint64_t j = 0; j < 128; j++) {
@@ -863,7 +883,8 @@ void gguf_q4_1_to_float(void *weights_data, void *dst, uint64_t count, store_flo

 /* FP16 blocks dequantization to floats.
 * 'y' is supposed to have enough space for 'count' weights. */
-void gguf_f16_to_float(void *weights_data, float *dst, uint64_t count, store_float_callback store_callback) {
+static void gguf_f16_to_float(void *weights_data, void *dst, uint64_t count,
+                              store_float_callback store_callback) {
    float *f = dst;
    uint64_t i = 0; // i-th weight to dequantize.
    uint16_t *w16 = weights_data;
@@ -877,6 +898,23 @@ void gguf_f16_to_float(void *weights_data, float *dst, uint64_t count, store_flo
    }
 }

+/* BF16 blocks dequantization to floats.
+ * 'dst' is supposed to have enough space for 'count' weights. */
+static void gguf_bf16_to_float(void *weights_data, void *dst, uint64_t count,
+                               store_float_callback store_callback) {
+    const uint16_t *src = weights_data;
+    float *out = dst;
+    for (uint64_t i = 0; i < count; i++) {
+        /* Widen one bf16 value, then pass it to the callback or store it. */
+        float weight = from_brain(src[i]);
+        if (store_callback) {
+            store_callback(dst,i,weight);
+        } else {
+            out[i] = weight;
+        }
+    }
+}
+
 /* Convert the specified tensor (quantized or not) into an array of
 * floats. The array is allocated with malloc(). If the tensor is already
 * in FP32 floats format, it is just memcpy()-ed to the destination array.
@@ -885,10 +923,13 @@ void gguf_f16_to_float(void *weights_data, float *dst, uint64_t count, store_flo
 * NULL is returned as well, but errno is set to EINVAL. */
 float *gguf_tensor_to_float(gguf_tensor *tensor) {
    float *f = malloc(tensor->num_weights*sizeof(float));
+    if (!f) return NULL;
    if (tensor->type == GGUF_TYPE_F32) {
        memcpy(f, tensor->weights_data, tensor->num_weights*sizeof(float));
    } else if (tensor->type == GGUF_TYPE_F16) {
        gguf_f16_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
+    } else if (tensor->type == GGUF_TYPE_BF16) {
+        gguf_bf16_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
    } else if (tensor->type == GGUF_TYPE_Q8_0) {
        gguf_q8_0_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
    } else if (tensor->type == GGUF_TYPE_Q4_K) {
@@ -913,12 +954,15 @@ float *gguf_tensor_to_float(gguf_tensor *tensor) {
 * an array of int16_t values. */
 int16_t *gguf_tensor_to_f16(gguf_tensor *tensor) {
    int16_t *f16 = malloc(tensor->num_weights*sizeof(int16_t));
+    if (!f16) return NULL;
    if (tensor->type == GGUF_TYPE_F32) {
        float *f = (float*)tensor->weights_data;
        for (uint64_t j = 0; j < tensor->num_weights; j++)
            f16[j] = to_half(f[j]);
    } else if (tensor->type == GGUF_TYPE_F16) {
        memcpy(f16, tensor->weights_data, tensor->num_weights*sizeof(int16_t));
+    } else if (tensor->type == GGUF_TYPE_BF16) {
+        gguf_bf16_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback);
    } else if (tensor->type == GGUF_TYPE_Q8_0) {
        gguf_q8_0_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback);
    } else if (tensor->type == GGUF_TYPE_Q4_K) {
@@ -938,3 +982,36 @@ int16_t *gguf_tensor_to_f16(gguf_tensor *tensor) {
    }
    return f16;
 }
+
+/* Same as gguf_tensor_to_float() but the result is a malloc()ed bf16 tensor (an
+ * array of int16_t bit patterns). NULL on OOM, or unsupported type (EINVAL). */
+int16_t *gguf_tensor_to_bf16(gguf_tensor *tensor) {
+    int16_t *bf16 = malloc(tensor->num_weights*sizeof(int16_t));
+    if (!bf16) return NULL;
+    if (tensor->type == GGUF_TYPE_F32) {
+        float *f = (float*)tensor->weights_data;
+        for (uint64_t j = 0; j < tensor->num_weights; j++)
+            bf16[j] = to_brain(f[j]); /* Must be to_brain(): to_half() yields fp16. */
+    } else if (tensor->type == GGUF_TYPE_F16) {
+        gguf_f16_to_float(tensor->weights_data, bf16, tensor->num_weights, gguf_store_bf16_callback);
+    } else if (tensor->type == GGUF_TYPE_BF16) {
+        memcpy(bf16, tensor->weights_data, tensor->num_weights*sizeof(int16_t)); /* Already bf16. */
+    } else if (tensor->type == GGUF_TYPE_Q8_0) {
+        gguf_q8_0_to_float(tensor->weights_data, bf16, tensor->num_weights, gguf_store_bf16_callback);
+    } else if (tensor->type == GGUF_TYPE_Q4_K) {
+        gguf_q4_k_to_float(tensor->weights_data, bf16, tensor->num_weights, gguf_store_bf16_callback);
+    } else if (tensor->type == GGUF_TYPE_Q6_K) {
+        gguf_q6_k_to_float(tensor->weights_data, bf16, tensor->num_weights, gguf_store_bf16_callback);
+    } else if (tensor->type == GGUF_TYPE_Q2_K) {
+        gguf_q2_k_to_float(tensor->weights_data, bf16, tensor->num_weights, gguf_store_bf16_callback);
+    } else if (tensor->type == GGUF_TYPE_Q4_0) {
+        gguf_q4_0_to_float(tensor->weights_data, bf16, tensor->num_weights, gguf_store_bf16_callback);
+    } else if (tensor->type == GGUF_TYPE_Q4_1) {
+        gguf_q4_1_to_float(tensor->weights_data, bf16, tensor->num_weights, gguf_store_bf16_callback);
+    } else {
+        free(bf16);
+        errno = EINVAL;
+        return NULL;
+    }
+    return bf16;
+}
--- a/gguflib.h
+++ b/gguflib.h
@@ -27,16 +27,27 @@ enum gguf_tensor_type {
    GGUF_TYPE_Q5_1 = 7,
    GGUF_TYPE_Q8_0 = 8,
    GGUF_TYPE_Q8_1 = 9,
-    // k-quantizations
    GGUF_TYPE_Q2_K = 10,
    GGUF_TYPE_Q3_K = 11,
    GGUF_TYPE_Q4_K = 12,
    GGUF_TYPE_Q5_K = 13,
    GGUF_TYPE_Q6_K = 14,
    GGUF_TYPE_Q8_K = 15,
-    GGUF_TYPE_I8,
-    GGUF_TYPE_I16,
-    GGUF_TYPE_I32,
+    GGUF_TYPE_IQ2_XXS = 16,
+    GGUF_TYPE_IQ2_XS = 17,
+    GGUF_TYPE_IQ3_XXS = 18,
+    GGUF_TYPE_IQ1_S = 19,
+    GGUF_TYPE_IQ4_NL = 20,
+    GGUF_TYPE_IQ3_S = 21,
+    GGUF_TYPE_IQ2_S = 22,
+    GGUF_TYPE_IQ4_XS = 23,
+    GGUF_TYPE_I8 = 24,
+    GGUF_TYPE_I16 = 25,
+    GGUF_TYPE_I32 = 26,
+    GGUF_TYPE_I64 = 27,
+    GGUF_TYPE_F64 = 28,
+    GGUF_TYPE_IQ1_M = 29,
+    GGUF_TYPE_BF16 = 30,
    GGUF_TYPE_COUNT,
 };

@@ -185,5 +196,6 @@ uint64_t gguf_get_alignment_padding(uint64_t alignment, uint64_t offset);
 void gguf_skip_key_values_section(gguf_ctx *ctx);
 float *gguf_tensor_to_float(gguf_tensor *tensor);
 int16_t *gguf_tensor_to_f16(gguf_tensor *tensor);
+int16_t *gguf_tensor_to_bf16(gguf_tensor *tensor);

 #endif