2023-12-27 18:54:36 +01:00
|
|
|
/* Conversion from floats to FP16 and the other way around.
 * This is useful as in GGUF files we have both FP16 tensors
 * and quantized blocks where half-precision floats are used
 * to store the scaling factor (delta) and other parameters.
 *
 * This code comes originally from:
 * https://github.com/Maratyszcza/FP16/blob/master/include/fp16/fp16.h
 *
 * The original code is MIT licensed. */
|
|
|
|
|
2023-12-27 18:54:36 +01:00
|
|
|
#include <stdint.h>
|
|
|
|
#include <math.h>
|
|
|
|
|
2023-12-27 15:13:42 +01:00
|
|
|
/* Reinterpret the 32-bit pattern 'w' as a float and return it. No numeric
 * conversion takes place: the bits are stored through the integer member of
 * a union and read back through the float member, which avoids the pointer
 * cast that would violate the aliasing rules. */
static inline float fp32_from_bits(uint32_t w) {
    union {
        uint32_t as_bits;
        float as_value;
    } fp32;
    fp32.as_bits = w;
    return fp32.as_value;
}
|
|
|
|
|
|
|
|
/* Return the 32-bit pattern that represents the float 'f'. This is the
 * inverse of fp32_from_bits(): the value is stored through the float member
 * of a union and read back through the integer member, so the bits are
 * copied verbatim and no numeric conversion takes place. */
static inline uint32_t fp32_to_bits(float f) {
    union {
        float as_value;
        uint32_t as_bits;
    } fp32;
    fp32.as_value = f;
    return fp32.as_bits;
}
|
|
|
|
|
|
|
|
/* Convert a half-precision float, given as its 16-bit pattern 'h', to a
 * 32-bit float.
 *
 * Two candidate results are computed without branching, one valid when the
 * exponent field of the half is non-zero (normalized numbers, Inf, NaN) and
 * one valid when it is zero (zero and denormals). The right one is selected
 * at the end and the sign bit is put back. */
float from_half(uint16_t h) {
    /* Place the half in the upper 16 bits, so that its sign bit is at the
     * position of the sign bit of a 32-bit float. */
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    /* Doubling shifts the sign out: the 5 exponent bits are now bits 31..27
     * and the 10 mantissa bits are bits 26..17. */
    const uint32_t two_w = w + w;

    /* Normalized path. Shifting right by 4 puts the half exponent in the low
     * 5 bits of the float exponent field and the half mantissa at the top of
     * the float mantissa. Adding 0xE0 to the exponent field turns an
     * all-ones half exponent (Inf/NaN) into an all-ones float exponent; the
     * multiplication by 2^-112 then compensates for the difference between
     * the resulting exponent and the one the value should have. */
    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || (defined(__GNUC__) && !defined(__STRICT_ANSI__))
    const float exp_scale = 0x1.0p-112f;
#else
    /* Same 2^-112 constant, for compilers without hex float literals. */
    const float exp_scale = fp32_from_bits(UINT32_C(0x07800000));
#endif
    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

    /* Denormalized path. Shifting right by 17 leaves the half mantissa in
     * the low 10 bits; OR-ing the bit pattern of 0.5f (exponent field 126)
     * builds the float 0.5 + mantissa * 2^-24, and subtracting 0.5 leaves
     * mantissa * 2^-24, the value of the half denormal. */
    const uint32_t magic_mask = UINT32_C(126) << 23;
    const float magic_bias = 0.5f;
    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

    /* two_w is below 1 << 27 exactly when the five exponent bits are zero. */
    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
    const uint32_t result = sign |
        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
    return fp32_from_bits(result);
}
|
|
|
|
|
|
|
|
/* Convert the 32-bit float 'f' to a half-precision float, returned as its
 * 16-bit pattern. Any NaN input is returned as 0x7E00 combined with the
 * sign bit of the input. */
uint16_t to_half(float f) {
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || (defined(__GNUC__) && !defined(__STRICT_ANSI__))
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
#else
    /* Same 2^112 and 2^-110 constants, for compilers without hex float
     * literals. */
    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
    /* The first multiplication makes inputs of magnitude 2^16 or more
     * overflow to infinity; the second one scales back, so that finite
     * values end up multiplied by 4. */
    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;

    const uint32_t w = fp32_to_bits(f);
    /* Doubling shifts the sign out, leaving the exponent in the top 8 bits. */
    const uint32_t shl1_w = w + w;
    const uint32_t sign = w & UINT32_C(0x80000000);
    /* Exponent field of the input, clamped from below to 0x71. */
    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
    if (bias < UINT32_C(0x71000000)) {
        bias = UINT32_C(0x71000000);
    }

    /* Build a float with zero mantissa whose exponent field is the clamped
     * exponent plus 15, and add it to the scaled magnitude: the mantissa
     * bits that cannot be represented in the sum are rounded away by the
     * floating point addition itself. */
    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
    const uint32_t bits = fp32_to_bits(base);
    /* Bits 27..23 of the sum become the half exponent field (bits 14..10). */
    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
    /* The low 12 bits are added, not OR-ed: bits 10 and 11 overlap the
     * exponent field and carry into it. */
    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
    const uint32_t nonsign = exp_bits + mantissa_bits;
    /* shl1_w above 0xFF000000 means all-ones exponent and non-zero mantissa,
     * that is, the input is a NaN. */
    return (uint16_t) ((sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign));
}
|
|
|
|
|
|
|
|
#ifdef TEST_MAIN
#include <stdio.h>

/* Smoke test, compiled only when TEST_MAIN is defined: convert a float to
 * half precision and back, then print the original and the round-tripped
 * value side by side. */
int main(void) {
    float f = 1.2345f;
    uint16_t half = to_half(f);
    float f2 = from_half(half);
    printf("%f %f\n", f, f2);
    return 0;
}
#endif
|