diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8eb55dd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+gguf-tools
diff --git a/Makefile b/Makefile
index fb79066..b1f9127 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 all: gguf-tools
 
-gguf-tools: gguf-tools.c gguflib.c gguflib.h sds.c sds.h sdsalloc.h fp16.h
+gguf-tools: gguf-tools.c gguflib.c gguflib.h sds.c sds.h sdsalloc.h fp16.h bf16.h
 	$(CC) gguf-tools.c gguflib.c sds.c fp16.c \
-		-march=native -flto -ffast-math \
+		-march=native -ffast-math \
 		-g -ggdb -Wall -W -pedantic -O3 -o gguf-tools
 
 clean:
diff --git a/bf16.h b/bf16.h
new file mode 100644
index 0000000..c63305f
--- /dev/null
+++ b/bf16.h
@@ -0,0 +1,78 @@
+#ifndef BF16_h
+#define BF16_h
+#include <stdint.h>
+
+/**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───┐
+ *     0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───────────────────┐
+ *     0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ *       ┌sign
+ *       │
+ *       │  ┌exponent
+ *       │  │
+ *       │  │    ┌mantissa
+ *       │  │    │
+ *       │┌─┴─┐┌─┴──────┐
+ *     0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+static inline float from_brain(uint16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h << 16;
+    return u.f;
+}
+
+/**
+ * Converts float32 to brain16.
+ *
+ * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
+ * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This code should vectorize nicely if using modern compilers.
+ */
+static inline uint16_t to_brain(float s) {
+    uint16_t h;
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.f = s;
+    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+        h = (u.i >> 16) | 64; /* force to quiet */
+        return h;
+    }
+    if (!(u.i & 0x7f800000)) { /* subnormal */
+        h = (u.i & 0x80000000) >> 16; /* flush to zero */
+        return h;
+    }
+    return (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+}
+
+#endif
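Since a bf16 value is just the top half of a binary32 value, both helpers are nearly free: from_brain() shifts the 16 bits back into place, and to_brain() rounds to nearest-even before truncating (the 0x7fff-plus-lowest-surviving-bit trick makes ties round toward an even mantissa instead of always truncating). A quick round trip, purely illustrative and not part of the patch:

    #include <stdio.h>
    #include <stdint.h>
    #include "bf16.h"

    int main(void) {
        float x = 3.14159265f;     /* pi, 0x40490fdb as binary32 */
        uint16_t b = to_brain(x);  /* rounds to nearest even: 0x4049 */
        float y = from_brain(b);   /* re-attach 16 zero mantissa bits */
        /* Prints: 3.14159274 -> 0x4049 -> 3.14062500
         * bf16 keeps the full 8-bit exponent but only 7 mantissa bits,
         * so roughly 2-3 significant decimal digits survive. */
        printf("%.8f -> 0x%04x -> %.8f\n", x, b, y);
        return 0;
    }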
"[" : ",", tensor.dim[j]); } - printf("], %llu bytes\n", tensor.bsize); + printf("], %" PRIu64 " bytes\n", tensor.bsize); params += tensor.num_weights; } @@ -192,13 +193,13 @@ void gguf_tools_show(const char *filename) { void gguf_tools_split_mixtral(int *experts_id, const char *mixtral_filename, const char *output_filename) { gguf_ctx *mixtral = gguf_open(mixtral_filename); if (mixtral == NULL) { - perror("Opening Mixtral file"); + perror(mixtral_filename); exit(1); } gguf_ctx *output = gguf_create(output_filename, GGUF_NONE); if (output == NULL) { - perror("Opening the output file"); + perror(output_filename); exit(1); } @@ -312,7 +313,7 @@ void gguf_tools_split_mixtral(int *experts_id, const char *mixtral_filename, con } tensor_off += tensors[j].orig_info.bsize; } - printf("Output file: after writing tensors info, file size is: %llu\n", output->size); + printf("Output file: after writing tensors info, file size is: %" PRIu64 "\n", output->size); /* Finally, append the tensors weights. */ for (uint32_t j = 0; j < num_tensors; j++) { @@ -333,7 +334,7 @@ void gguf_tools_split_mixtral(int *experts_id, const char *mixtral_filename, con void gguf_tools_inspect_weights(const char *filename, const char *tname, uint64_t count) { gguf_ctx *ctx = gguf_open(filename); if (ctx == NULL) { - perror("Opening GGUF file"); + perror(filename); exit(1); } @@ -424,8 +425,8 @@ int tensors_avg_diff(gguf_tensor *t1, gguf_tensor *t2, double *diff) { float *weights1 = gguf_tensor_to_float(t1); float *weights2 = gguf_tensor_to_float(t2); if (weights1 == NULL || weights2 == NULL) { - if (weights1) free(weights1); - if (weights2) free(weights2); + free(weights1); + free(weights2); return 0; } @@ -444,7 +445,7 @@ int tensors_avg_diff(gguf_tensor *t1, gguf_tensor *t2, double *diff) { double avg_diff = tot_diff / t1->num_weights; /* Multiply by 75 to normalize the difference of a - * random varialbe between -N and +N to 0 - 100% */ + * random variable between -N and +N to 0 - 100% */ *diff = avg_diff / avg_mag * 75; free(weights1); @@ -454,9 +455,14 @@ int tensors_avg_diff(gguf_tensor *t1, gguf_tensor *t2, double *diff) { void gguf_tools_compare(const char *file1, const char *file2) { gguf_ctx *ctx1 = gguf_open(file1); + if (ctx1 == NULL) { + perror(file1); + exit(1); + } + gguf_ctx *ctx2 = gguf_open(file2); - if (ctx1 == NULL || ctx2 == NULL) { - perror("Opening GGUF files"); + if (ctx2 == NULL) { + perror(file2); exit(1); } diff --git a/gguflib.c b/gguflib.c index 96e66b1..aa68527 100644 --- a/gguflib.c +++ b/gguflib.c @@ -8,9 +8,11 @@ #include #include #include +#include #include "gguflib.h" #include "fp16.h" +#include "bf16.h" /* ============================ Low level functions ========================= */ @@ -43,9 +45,21 @@ struct gguf_tensor_type_features { {"q5_k", 256, 176}, {"q6_k", 256, 210}, {"q8_k", 256, 292}, + {"iq2_xxs", 256, 66}, + {"iq2_xs", 256, 74}, + {"iq3_xxs", 256, 98}, + {"iq1_s", 256, 110}, + {"iq4_nl", 256, 50}, + {"iq3_s", 256, 110}, + {"iq2_s", 256, 82}, + {"iq4_xs", 256, 136}, {"i8", 1, 1}, {"i16", 1, 2}, {"i32", 1, 4}, + {"i64", 1, 8}, + {"f64", 1, 8}, + {"iq1_m", 256, 56}, + {"bf16", 1, 2}, }; /* Return the value type name given the type ID. */ @@ -101,8 +115,8 @@ gguf_ctx *gguf_open(const char *filename) { if (fd == -1) return NULL; /* Mapping successful. We can create our context object. 
diff --git a/gguflib.c b/gguflib.c
index 96e66b1..aa68527 100644
--- a/gguflib.c
+++ b/gguflib.c
@@ -8,9 +8,11 @@
 #include <sys/mman.h>
 #include <fcntl.h>
 #include <unistd.h>
+#include <inttypes.h>
 
 #include "gguflib.h"
 #include "fp16.h"
+#include "bf16.h"
 
 /* ============================ Low level functions ========================= */
 
@@ -43,9 +45,21 @@ struct gguf_tensor_type_features {
     {"q5_k", 256, 176},
     {"q6_k", 256, 210},
     {"q8_k", 256, 292},
+    {"iq2_xxs", 256, 66},
+    {"iq2_xs", 256, 74},
+    {"iq3_xxs", 256, 98},
+    {"iq1_s", 256, 110},
+    {"iq4_nl", 256, 50},
+    {"iq3_s", 256, 110},
+    {"iq2_s", 256, 82},
+    {"iq4_xs", 256, 136},
     {"i8", 1, 1},
     {"i16", 1, 2},
     {"i32", 1, 4},
+    {"i64", 1, 8},
+    {"f64", 1, 8},
+    {"iq1_m", 256, 56},
+    {"bf16", 1, 2},
 };
 
 /* Return the value type name given the type ID. */
@@ -101,8 +115,8 @@ gguf_ctx *gguf_open(const char *filename) {
     if (fd == -1) return NULL;
 
     /* Mapping successful. We can create our context object. */
-    gguf_ctx *ctx = malloc(sizeof(*ctx));
-    memset(ctx,0,sizeof(*ctx));
+    gguf_ctx *ctx = calloc(1, sizeof(*ctx));
+    if (!ctx) return NULL;
     ctx->fd = fd;
     ctx->alignment = 32; // Default alignment of GGUF files.
     ctx->data_off = 0;   // Set later.
@@ -363,8 +377,8 @@ void gguf_print_value_callback(void *privdata, uint32_t type, union gguf_value *
     struct gguf_print_options *po = privdata;
     if (po && po->max_array_items && in_array > po->max_array_items) {
         if (in_array-1 == po->max_array_items)
-            printf("... %llu more items of %llu", array_len-in_array+1,
-                array_len);
+            printf("... %" PRIu64 " more items of %" PRIu64 "",
+                array_len-in_array+1, array_len);
         return;
     }
 
@@ -396,9 +410,9 @@ void gguf_print_value_callback(void *privdata, uint32_t type, union gguf_value *
     case GGUF_VALUE_TYPE_STRING:
         printf("%.*s", (int)val->string.len, val->string.string); break;
     case GGUF_VALUE_TYPE_UINT64:
-        printf("%llu", val->uint64); break;
+        printf("%" PRIu64 "", val->uint64); break;
     case GGUF_VALUE_TYPE_INT64:
-        printf("%lld", val->int64); break;
+        printf("%" PRId64 "", val->int64); break;
     case GGUF_VALUE_TYPE_FLOAT64:
         printf("%lf", val->float64); break;
     default:
@@ -516,6 +530,12 @@ void gguf_store_f16_callback(void *dst, uint64_t idx, float f) {
     f16[idx] = to_half(f);
 }
 
+/* Callback used to store BF16 when dequantizing. */
+void gguf_store_bf16_callback(void *dst, uint64_t idx, float f) {
+    uint16_t *f16 = dst;
+    f16[idx] = to_brain(f);
+}
+
 /* Q8_0 blocks dequantization to floats.
  * 'dst' is supposed to have enough space for 'count' weights. */
 void gguf_q8_0_to_float(void *weights_data, void *dst, uint64_t count, store_float_callback store_callback) {
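The store_float_callback indirection that gguf_store_bf16_callback() plugs into is the whole extension mechanism here: every dequantizer produces plain floats and lets the callback decide the destination encoding, so supporting another output format costs one small function instead of touching each *_to_float() dequantizer. A hypothetical example, not part of this patch, for a float64 destination:

    /* Hypothetical: store a dequantized weight into a double array.
     * The dequantizers pass 'dst' through untouched, so it can point
     * to elements of whatever width the callback expects. */
    void gguf_store_f64_callback(void *dst, uint64_t idx, float f) {
        double *f64 = dst;
        f64[idx] = f; /* widening fp32 -> fp64 is exact */
    }

Passing NULL instead of a callback keeps the default behavior of writing fp32 directly into the destination buffer.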
@@ -755,7 +775,7 @@ void gguf_q2_k_to_float(void *weights_data, void *dst, uint64_t count, store_flo
     float scale_of_scales = from_half(*((uint16_t*)(block+16+64)));
     float scale_of_mins = from_half(*((uint16_t*)(block+16+64+2)));
 
-    float scale, min;
+    float scale = 0, min = 0;
     int bn = 0; // Block number
     for (uint64_t cluster = 0; cluster < 2; cluster++) {
         for (uint64_t j = 0; j < 128; j++) {
@@ -863,7 +883,8 @@ void gguf_q4_1_to_float(void *weights_data, void *dst, uint64_t count, store_flo
 
 /* FP16 blocks dequantization to floats.
  * 'y' is supposed to have enough space for 'count' weights. */
-void gguf_f16_to_float(void *weights_data, float *dst, uint64_t count, store_float_callback store_callback) {
+static void gguf_f16_to_float(void *weights_data, void *dst, uint64_t count,
+                              store_float_callback store_callback) {
     float *f = dst;
     uint64_t i = 0; // i-th weight to dequantize.
     uint16_t *w16 = weights_data;
@@ -877,6 +898,23 @@ void gguf_f16_to_float(void *weights_data, float *dst, uint64_t count, store_flo
     }
 }
 
+/* BF16 blocks dequantization to floats.
+ * 'y' is supposed to have enough space for 'count' weights. */
+static void gguf_bf16_to_float(void *weights_data, void *dst, uint64_t count,
+                               store_float_callback store_callback) {
+    float *f = dst;
+    uint64_t i = 0; // i-th weight to dequantize.
+    uint16_t *w16 = weights_data;
+    while(i < count) {
+        float weight = from_brain(w16[i]);
+        if (store_callback)
+            store_callback(dst,i,weight);
+        else
+            f[i] = weight;
+        i++;
+    }
+}
+
 /* Convert the specified tensor (quantized or not) into an array of
  * floats. The array is allocated with malloc(). If the tensor is already
  * in FP32 floats format, it is just memcpy()-ed to the destination array.
@@ -885,10 +923,13 @@
  * NULL is returned as well, but errno is set to EINVAL.
  */
 float *gguf_tensor_to_float(gguf_tensor *tensor) {
     float *f = malloc(tensor->num_weights*sizeof(float));
+    if (!f) return NULL;
     if (tensor->type == GGUF_TYPE_F32) {
         memcpy(f, tensor->weights_data, tensor->num_weights*sizeof(float));
     } else if (tensor->type == GGUF_TYPE_F16) {
         gguf_f16_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
+    } else if (tensor->type == GGUF_TYPE_BF16) {
+        gguf_bf16_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
     } else if (tensor->type == GGUF_TYPE_Q8_0) {
         gguf_q8_0_to_float(tensor->weights_data, f, tensor->num_weights, NULL);
     } else if (tensor->type == GGUF_TYPE_Q4_K) {
@@ -913,12 +954,15 @@
  * an array of int16_t values. */
 int16_t *gguf_tensor_to_f16(gguf_tensor *tensor) {
     int16_t *f16 = malloc(tensor->num_weights*sizeof(int16_t));
+    if (!f16) return NULL;
     if (tensor->type == GGUF_TYPE_F32) {
         float *f = (float*)tensor->weights_data;
         for (uint64_t j = 0; j < tensor->num_weights; j++)
             f16[j] = to_half(f[j]);
     } else if (tensor->type == GGUF_TYPE_F16) {
         memcpy(f16, tensor->weights_data, tensor->num_weights*sizeof(int16_t));
+    } else if (tensor->type == GGUF_TYPE_BF16) {
+        gguf_bf16_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback);
     } else if (tensor->type == GGUF_TYPE_Q8_0) {
         gguf_q8_0_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_f16_callback);
     } else if (tensor->type == GGUF_TYPE_Q4_K) {
@@ -938,3 +982,36 @@ int16_t *gguf_tensor_to_f16(gguf_tensor *tensor) {
     }
     return f16;
 }
+
+/* Same as gguf_tensor_to_float() but the result will be a bf16 tensor, that is
+ * an array of int16_t values. */
+int16_t *gguf_tensor_to_bf16(gguf_tensor *tensor) {
+    int16_t *f16 = malloc(tensor->num_weights*sizeof(int16_t));
+    if (!f16) return NULL;
+    if (tensor->type == GGUF_TYPE_F32) {
+        float *f = (float*)tensor->weights_data;
+        for (uint64_t j = 0; j < tensor->num_weights; j++)
+            f16[j] = to_brain(f[j]);
+    } else if (tensor->type == GGUF_TYPE_F16) {
+        gguf_f16_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_bf16_callback);
+    } else if (tensor->type == GGUF_TYPE_BF16) {
+        memcpy(f16, tensor->weights_data, tensor->num_weights*sizeof(int16_t));
+    } else if (tensor->type == GGUF_TYPE_Q8_0) {
+        gguf_q8_0_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_bf16_callback);
+    } else if (tensor->type == GGUF_TYPE_Q4_K) {
+        gguf_q4_k_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_bf16_callback);
+    } else if (tensor->type == GGUF_TYPE_Q6_K) {
+        gguf_q6_k_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_bf16_callback);
+    } else if (tensor->type == GGUF_TYPE_Q2_K) {
+        gguf_q2_k_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_bf16_callback);
+    } else if (tensor->type == GGUF_TYPE_Q4_0) {
+        gguf_q4_0_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_bf16_callback);
+    } else if (tensor->type == GGUF_TYPE_Q4_1) {
+        gguf_q4_1_to_float(tensor->weights_data, f16, tensor->num_weights, gguf_store_bf16_callback);
+    } else {
+        free(f16);
+        errno = EINVAL;
+        return NULL;
+    }
+    return f16;
+}
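Together with the gguflib.h change below, gguf_tensor_to_bf16() rounds out the public conversion family (to_float, to_f16, to_bf16). A minimal caller, sketched only from what the header declares and not taken from the patch:

    #include <stdio.h>
    #include <stdlib.h>
    #include "gguflib.h"

    int main(int argc, char **argv) {
        if (argc != 2) return 1;
        gguf_ctx *ctx = gguf_open(argv[1]);
        if (ctx == NULL) { perror(argv[1]); return 1; }
        gguf_skip_key_values_section(ctx); /* position cursor at tensors */
        gguf_tensor tensor;
        while (gguf_get_tensor(ctx,&tensor)) {
            int16_t *bf16 = gguf_tensor_to_bf16(&tensor);
            if (bf16 == NULL) continue; /* OOM or unsupported tensor type */
            printf("converted %.*s\n", (int)tensor.namelen, tensor.name);
            free(bf16); /* buffers come from malloc(); caller frees */
        }
        return 0;
    }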
diff --git a/gguflib.h b/gguflib.h
index 1119130..4cb973a 100644
--- a/gguflib.h
+++ b/gguflib.h
@@ -27,16 +27,27 @@ enum gguf_tensor_type {
     GGUF_TYPE_Q5_1 = 7,
     GGUF_TYPE_Q8_0 = 8,
     GGUF_TYPE_Q8_1 = 9,
-    // k-quantizations
     GGUF_TYPE_Q2_K = 10,
     GGUF_TYPE_Q3_K = 11,
     GGUF_TYPE_Q4_K = 12,
     GGUF_TYPE_Q5_K = 13,
     GGUF_TYPE_Q6_K = 14,
     GGUF_TYPE_Q8_K = 15,
-    GGUF_TYPE_I8,
-    GGUF_TYPE_I16,
-    GGUF_TYPE_I32,
+    GGUF_TYPE_IQ2_XXS = 16,
+    GGUF_TYPE_IQ2_XS = 17,
+    GGUF_TYPE_IQ3_XXS = 18,
+    GGUF_TYPE_IQ1_S = 19,
+    GGUF_TYPE_IQ4_NL = 20,
+    GGUF_TYPE_IQ3_S = 21,
+    GGUF_TYPE_IQ2_S = 22,
+    GGUF_TYPE_IQ4_XS = 23,
+    GGUF_TYPE_I8 = 24,
+    GGUF_TYPE_I16 = 25,
+    GGUF_TYPE_I32 = 26,
+    GGUF_TYPE_I64 = 27,
+    GGUF_TYPE_F64 = 28,
+    GGUF_TYPE_IQ1_M = 29,
+    GGUF_TYPE_BF16 = 30,
     GGUF_TYPE_COUNT,
 };
 
@@ -185,5 +196,6 @@ uint64_t gguf_get_alignment_padding(uint64_t alignment, uint64_t offset);
 void gguf_skip_key_values_section(gguf_ctx *ctx);
 float *gguf_tensor_to_float(gguf_tensor *tensor);
 int16_t *gguf_tensor_to_f16(gguf_tensor *tensor);
+int16_t *gguf_tensor_to_bf16(gguf_tensor *tensor);
 
 #endif
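Two closing notes on the header change. First, the explicit = 16 ... = 30 assignments matter: with the old implicit numbering, GGUF_TYPE_I8 silently took the value 16, which no longer matches the type IDs actually stored in GGUF files now that the IQ quants occupy 16-23; pinning every value keeps the enum safe against future insertions. Second, these IDs combine with the gguf_tensor_type_features table in gguflib.c (items per block, bytes per block) to give tensor byte sizes: bsize = num_weights / items_per_block * bytes_per_block. As a worked example with made-up dimensions, a 4096x4096 tensor holds 16,777,216 weights; stored as bf16 ({1, 2}) that is 33,554,432 bytes, while as iq4_xs ({256, 136}) it is 65,536 blocks * 136 = 8,912,896 bytes, i.e. 4.25 bits per weight.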