From bd4ecbda94c5108712a238e276252819b384f4cb Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Wed, 27 Dec 2023 15:13:42 +0100
Subject: [PATCH] FP16 added. Split-mixtral improved.

---
 Makefile     |  4 +--
 README.md    | 14 ++++++++-
 fp16.c       | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fp16.h       |  5 ++++
 gguf-tools.c | 43 ++++++++++++++++++++++----
 gguflib.h    |  1 +
 6 files changed, 144 insertions(+), 8 deletions(-)
 create mode 100644 fp16.c
 create mode 100644 fp16.h

diff --git a/Makefile b/Makefile
index 358f488..3a29ffe 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 all: gguf-tools
 
-gguf-tools: gguf-tools.c gguflib.c gguflib.h sds.c sds.h sdsalloc.h
-	$(CC) gguf-tools.c gguflib.c sds.c \
+gguf-tools: gguf-tools.c gguflib.c gguflib.h sds.c sds.h sdsalloc.h fp16.c fp16.h
+	$(CC) gguf-tools.c gguflib.c sds.c fp16.c \
 		-g -ggdb -Wall -W -pedantic -O2 -o gguf-tools
 
 clean:
diff --git a/README.md b/README.md
index 96248cf..5933ac0 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,18 @@
 # GGUF tools
 
-Todo... still not clear what this is going to be.
+This is a work-in-progress library to manipulate GGUF files.
+The program 'gguf-tools' uses the library to implement both useful and
+useless stuff, to show the library usage.
+
+    gguf-tools show file.gguf
+
+shows detailed info about the GGUF file. This will include all the key-value pairs, including arrays, and detailed tensor information. Tensor offsets will be relative to the start *of the file*, not the start of the data section like in the GGUF format (absolute file offsets are more useful and simpler to use).
+
+    gguf-tools split-mixtral 65230776370407150546470161412165 mixtral.gguf out.gguf
+
+Extracts a 7B model `out.gguf` from Mixtral 7B MoE using the specified MoE ID for each layer (there are 32 digits in the sequence 652...).
+
+Note that split-mixtral is quite useless as models obtained in this way will not perform any useful action. This is just an experiment and a non-trivial task to show how to use the library. Likely it will be removed soon, once I have more interesting and useful examples to show.
 
 ## Specification documents
 
diff --git a/fp16.c b/fp16.c
new file mode 100644
index 0000000..9a62719
--- /dev/null
+++ b/fp16.c
@@ -0,0 +1,85 @@
+#include <stdint.h>
+#include <math.h>
+
+/* This code comes originally from:
+ * https://github.com/Maratyszcza/FP16/blob/master/include/fp16/fp16.h
+ *
+ * The original code is MIT licensed. */
+
+static inline float fp32_from_bits(uint32_t w) { /* Reinterpret the bits of 'w' as a float. */
+    union {
+        float value;
+        uint32_t raw;
+    } pun;
+    pun.raw = w; /* Write through the integer member, read back via the float member. */
+    return pun.value;
+}
+
+static inline uint32_t fp32_to_bits(float f) { /* Return the raw 32-bit pattern of 'f'. */
+    union {
+        uint32_t raw;
+        float value;
+    } pun;
+    pun.value = f; /* Store the float, then read the same bytes as an integer. */
+    return pun.raw;
+}
+
+float from_half(uint16_t h) { /* Convert IEEE half-precision bits 'h' to a float. */
+    const uint32_t w = (uint32_t) h << 16; /* Half bits placed in the top 16 bits of a 32-bit word. */
+    const uint32_t sign = w & UINT32_C(0x80000000); /* Sign bit, already at the float sign position. */
+    const uint32_t two_w = w + w; /* Shift left by one: the sign bit is dropped. */
+
+    const uint32_t exp_offset = UINT32_C(0xE0) << 23; /* 0xE0 = 224: lifts a half exponent of 31 (Inf/NaN) to the float maximum 255. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float exp_scale = 0x1.0p-112f; /* 2^-112: with the +224 offset, a half exponent e ends up as 2^(e-15). */
+#else
+    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); /* Same 2^-112 constant, spelled as raw bits. */
+#endif
+    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; /* Used when the half exponent field is non-zero. */
+
+    const uint32_t magic_mask = UINT32_C(126) << 23; /* Float exponent field 126, i.e. the bit pattern of 0.5. */
+    const float magic_bias = 0.5f;
+    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; /* (0.5 + m * 2^-24) - 0.5 = m * 2^-24 for a 10-bit mantissa m. */
+
+    const uint32_t denormalized_cutoff = UINT32_C(1) << 27; /* two_w below this means the half exponent field is zero. */
+    const uint32_t result = sign |
+        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+    return fp32_from_bits(result); /* Magnitude from the selected path, with the original sign bit. */
+}
+
+uint16_t to_half(float f) { /* Convert float 'f' to IEEE half-precision bits. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float scale_to_inf = 0x1.0p+112f; /* 2^112 */
+    const float scale_to_zero = 0x1.0p-110f; /* 2^-110 */
+#else
+    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); /* 2^112 as raw bits. */
+    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); /* 2^-110 as raw bits. */
+#endif
+    float base = (fabsf(f) * scale_to_inf) * scale_to_zero; /* In float arithmetic the first multiply overflows to Inf when |f| >= 2^16. */
+
+    const uint32_t w = fp32_to_bits(f);
+    const uint32_t shl1_w = w + w; /* Shift left by one: the sign bit is dropped. */
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    uint32_t bias = shl1_w & UINT32_C(0xFF000000); /* Exponent field of f, now in the top 8 bits. */
+    if (bias < UINT32_C(0x71000000)) {
+        bias = UINT32_C(0x71000000); /* Clamp to a minimum exponent field of 0x71 (113). */
+    }
+
+    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; /* Adds a power of two with exponent field (bias + 15); the float addition rounds 'base'. */
+    const uint32_t bits = fp32_to_bits(base);
+    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); /* Low 5 bits of the float exponent field, moved to the half exponent position. */
+    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); /* Low 12 bits; bits 10-11 overlap the exponent and are summed into it below. */
+    const uint32_t nonsign = exp_bits + mantissa_bits;
+    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); /* NaN input (exponent all ones, mantissa non-zero) maps to 0x7E00; sign goes to bit 15. */
+}
+
+#ifdef TEST_MAIN
+#include <stdio.h>
+int main(void) { /* Round-trip smoke test: float -> half -> float, both values printed. */
+    float original = 1.2345;
+    uint16_t encoded = to_half(original);
+    float decoded = from_half(encoded);
+    printf("%f %f\n", original, decoded);
+    return 0;
+}
+#endif
diff --git a/fp16.h b/fp16.h
new file mode 100644
index 0000000..32543b6
--- /dev/null
+++ b/fp16.h
@@ -0,0 +1,5 @@
+#ifndef FP16_h
+#define FP16_h /* Include guard. This header uses uint16_t: include <stdint.h> before it. */
+float from_half(uint16_t h); /* IEEE half-precision bits -> float. */
+uint16_t to_half(float f);   /* float -> IEEE half-precision bits. */
+#endif /* FP16_h */
diff --git a/gguf-tools.c b/gguf-tools.c
index aa012f9..0b0521d 100644
--- a/gguf-tools.c
+++ b/gguf-tools.c
@@ -169,8 +169,9 @@ void gguf_tools_show(const char *filename) {
 }
 
 /* Read a Mixtral MoE model and creates a new non-MoE GGUF file based
- * on the weights of the expert with id 'expert_id'. */
-void gguf_tools_split_mixtral(int expert_id, const char *mixtral_filename, const char *output_filename) {
+ * on the weights of the experts with IDs in the array of 'experts_id'.
+ * The array must contain 32 integers, one for each layer. */
+void gguf_tools_split_mixtral(int *experts_id, const char *mixtral_filename, const char *output_filename) {
     gguf_ctx *mixtral = gguf_init(mixtral_filename);
     if (mixtral == NULL) {
         perror("Opening Mixtral file");
@@ -238,6 +239,20 @@ void gguf_tools_split_mixtral(int expert_id, const char *mixtral_filename, const
         /* The tensor is a feed-forward tensor? We want to copy only
          * the ones of our expert ID. */
         if (strstr(tn,".ffn_") != NULL && strstr(tn,".ffn_norm") == NULL) {
+            /* Extract the block number this FFN tensor belongs to. */
+            int block;
+            assert(memcmp(tn,"blk.",4) == 0); // Must start with blk.<block>
+            char *p = strchr(tn+4,'.'); // First '.' at or after offset 4.
+            assert(p != NULL);
+            *p = 0; // Temporarily terminate the string at that dot.
+            block = atoi(tn+4);
+            *p = '.'; // Restore the tensor name.
+            assert(block >= 0 && block < 32); // experts_id holds 32 entries.
+
+            /* Now that we have the block, we can select the corresponding
+             * expert ID we want to use for this block. */
+            int expert_id = experts_id[block];
+
             char match[32];
             snprintf(match,sizeof(match),".%d.weight",expert_id);
             char *match_ptr = strstr(tn,match);
@@ -283,7 +298,8 @@ void gguf_tools_split_mixtral(int expert_id, const char *mixtral_filename, const
 
     /* Finally, append the tensors weights. */
     for (uint32_t j = 0; j < num_tensors; j++) {
-        printf("Writing tensor %s\n", tensors[j].dest_name);
+        printf("Writing tensor %s (weights from %.*s)\n", tensors[j].dest_name,
+            (int)tensors[j].orig_info.namelen, tensors[j].orig_info.name);
         if (gguf_append_tensor_data(output,tensors[j].orig_info.weights_data,
             tensors[j].orig_info.bsize) == 0)
         {
@@ -300,7 +316,9 @@ void gguf_tools_usage(const char *progname) {
     printf("Usage: %s <subcommand> [options...]\n"
            "Subcommands:\n"
            "  show <filename> -- show GGUF model keys and tensors.\n"
-           "  split-mixtral <id> mixtral.gguf out.gguf -- extract expert.\n"
+           "  split-mixtral <ids...> mixtral.gguf out.gguf -- extract expert.\n"
+           "Example:\n"
+           "  split-mixtral 65230776370407150546470161412165 mixtral.gguf out.gguf\n"
            , progname);
     exit(1);
 }
@@ -311,7 +329,22 @@ int main(int argc, char **argv) {
     if (!strcmp(argv[1],"show") && argc == 3) {
         gguf_tools_show(argv[2]);
     } else if (!strcmp(argv[1],"split-mixtral") && argc == 5) {
-        gguf_tools_split_mixtral(atoi(argv[2]),argv[3],argv[4]);
+        int experts[32]; /* One expert ID (0-7) per layer, parsed from argv[2]. */
+        size_t elen = strlen(argv[2]);
+        for (size_t j = 0; j < 32; j++) {
+            if (j < elen) {
+                experts[j] = argv[2][j] - '0'; /* ASCII digit -> integer. */
+                if (experts[j] < 0 || experts[j] > 7) {
+                    fprintf(stderr,"Invalid expert ID: %d\n", experts[j]);
+                    exit(1);
+                }
+            } else {
+                /* Fewer than 32 digits given: repeat the last digit up to
+                 * the last layer (expert 0 if the string is empty). */
+                experts[j] = j > 0 ? experts[j-1] : 0;
+            }
+        }
+        gguf_tools_split_mixtral(experts,argv[3],argv[4]);
     } else {
         gguf_tools_usage(argv[0]);
     }
diff --git a/gguflib.h b/gguflib.h
index cf10367..09f0998 100644
--- a/gguflib.h
+++ b/gguflib.h
@@ -6,6 +6,7 @@
  */
 
 #ifndef GGUFLIB_H
+#define GGUFLIB_H
 
 #include <stdint.h>