/* Conversion from floats to FP16 and the other way around. * This is useful as in GGUF files we have both FP16 tensors * and quantized blocks where half-precisions floats are used * to store the scaling factor (delta) and other parameters. * * This code comes originally from: * https://github.com/Maratyszcza/FP16/blob/master/include/fp16/fp16.h * * The original code is MIT licensed. */ #include #include static inline float fp32_from_bits(uint32_t w) { union { uint32_t as_bits; float as_value; } fp32; fp32.as_bits = w; return fp32.as_value; } static inline uint32_t fp32_to_bits(float f) { union { float as_value; uint32_t as_bits; } fp32; fp32.as_value = f; return fp32.as_bits; } float from_half(uint16_t h) { const uint32_t w = (uint32_t) h << 16; const uint32_t sign = w & UINT32_C(0x80000000); const uint32_t two_w = w + w; const uint32_t exp_offset = UINT32_C(0xE0) << 23; #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) const float exp_scale = 0x1.0p-112f; #else const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); #endif const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; const uint32_t magic_mask = UINT32_C(126) << 23; const float magic_bias = 0.5f; const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; const uint32_t denormalized_cutoff = UINT32_C(1) << 27; const uint32_t result = sign | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); return fp32_from_bits(result); } uint16_t to_half(float f) { #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) const float scale_to_inf = 0x1.0p+112f; const float scale_to_zero = 0x1.0p-110f; #else const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); #endif float base = (fabsf(f) * scale_to_inf) * scale_to_zero; const uint32_t w = fp32_to_bits(f); const uint32_t shl1_w = w + w; const uint32_t sign = w & UINT32_C(0x80000000); uint32_t bias = shl1_w & UINT32_C(0xFF000000); if (bias < UINT32_C(0x71000000)) { bias = UINT32_C(0x71000000); } base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; const uint32_t bits = fp32_to_bits(base); const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); const uint32_t nonsign = exp_bits + mantissa_bits; return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); } #ifdef TEST_MAIN #include int main(void) { float f = 1.2345; uint16_t half = to_half(f); float f2 = from_half(half); printf("%f %f\n", f, f2); return 0; } #endif