gguf-tools/fp16.c

/* Conversion from floats to FP16 and the other way around.
 * This is useful as in GGUF files we have both FP16 tensors
 * and quantized blocks where half-precision floats are used
 * to store the scaling factor (delta) and other parameters.
 *
 * This code comes originally from:
 * https://github.com/Maratyszcza/FP16/blob/master/include/fp16/fp16.h
 *
 * The original code is MIT licensed. */

#include <stdint.h>
#include <math.h>

/* Reinterpret the 32 bits of 'w' as a float, without any numeric
 * conversion. The pun goes through a union so that no pointer cast
 * between unrelated types is needed. */
static inline float fp32_from_bits(uint32_t w) {
    const union {
        uint32_t bits;
        float value;
    } pun = { .bits = w };
    return pun.value;
}

/* Return the raw 32-bit pattern that represents the float 'f', without
 * any numeric conversion. The pun goes through a union so that no
 * pointer cast between unrelated types is needed. */
static inline uint32_t fp32_to_bits(float f) {
    const union {
        float value;
        uint32_t bits;
    } pun = { .value = f };
    return pun.bits;
}

/* Convert the 16-bit half-precision bit pattern 'h' into a float.
 *
 * Two candidate results are computed with plain float arithmetic, one
 * valid when the half exponent field is non-zero and one valid when it
 * is zero; the right one is then selected and the sign is put back. */
float from_half(uint16_t h) {
    /* Move the half into the upper 16 bits, so that its sign bit sits
     * on the float sign bit. */
    const uint32_t widened = (uint32_t) h << 16;
    const uint32_t sign_bit = widened & UINT32_C(0x80000000);
    /* Doubling shifts the sign bit out: exponent and mantissa are now
     * left-aligned in the 32-bit word. */
    const uint32_t no_sign = widened + widened;

    /* Candidate for a non-zero exponent field: drop exponent and
     * mantissa into the float fields, add 0xE0 to the exponent, then
     * scale by 2^-112 to compensate. */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float rescale = 0x1.0p-112f;
#else
    const float rescale = fp32_from_bits(UINT32_C(0x7800000));
#endif
    const uint32_t exponent_adjust = UINT32_C(0xE0) << 23;
    const float as_normal = fp32_from_bits((no_sign >> 4) + exponent_adjust) * rescale;

    /* Candidate for a zero exponent field: OR the low bits into the
     * bit pattern of 0.5f (126 << 23) and subtract 0.5f again. */
    const float half_bias = 0.5f;
    const uint32_t half_bias_bits = UINT32_C(126) << 23;
    const float as_subnormal = fp32_from_bits((no_sign >> 17) | half_bias_bits) - half_bias;

    /* With the sign removed, a value below 1 << 27 means all five
     * exponent bits of the half are zero. */
    uint32_t magnitude;
    if (no_sign >= (UINT32_C(1) << 27)) {
        magnitude = fp32_to_bits(as_normal);
    } else {
        magnitude = fp32_to_bits(as_subnormal);
    }
    return fp32_from_bits(sign_bit | magnitude);
}

/* Convert the float 'f' into a 16-bit half-precision bit pattern.
 *
 * Any NaN input yields the sign of 'f' combined with 0x7E00. Values
 * too large for half precision are expected to become infinity: this
 * relies on the multiplication by 2^112 below overflowing in float
 * arithmetic. Rounding is performed by the float addition, so it
 * follows the floating-point rounding mode in effect. */
uint16_t to_half(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
#else
    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif

    /* The first product must be rounded to float range so that large
     * inputs overflow to infinity. When the compiler evaluates float
     * expressions with extra range/precision (e.g. x87 code generation),
     * a single chained expression would keep the intermediate result
     * finite, so force it through a float-sized object in that case. */
#if (defined(_MSC_VER) && defined(_M_IX86_FP) && (_M_IX86_FP == 0)) || \
    (defined(__GNUC__) && defined(__FLT_EVAL_METHOD__) && (__FLT_EVAL_METHOD__ != 0))
    const volatile float saturated = fabsf(f) * scale_to_inf;
#else
    const float saturated = fabsf(f) * scale_to_inf;
#endif
    float base = saturated * scale_to_zero;

    const uint32_t w = fp32_to_bits(f);
    /* Doubling shifts the sign bit out, leaving the exponent in the
     * top byte. */
    const uint32_t shl1_w = w + w;
    const uint32_t sign = w & UINT32_C(0x80000000);
    /* Keep only the exponent, clamped from below to 0x71000000. */
    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
    if (bias < UINT32_C(0x71000000)) {
        bias = UINT32_C(0x71000000);
    }

    /* Adding a power of two derived from the input exponent makes the
     * float addition round 'base' at the position where the half
     * mantissa ends. */
    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
    const uint32_t bits = fp32_to_bits(base);
    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
    const uint32_t nonsign = exp_bits + mantissa_bits;
    /* shl1_w above 0xFF000000 means all exponent bits set and a
     * non-zero mantissa, that is, 'f' is a NaN. */
    return (uint16_t) ((sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign));
}

#ifdef TEST_MAIN
#include <stdio.h>
/* Smoke test, built only when TEST_MAIN is defined: convert one value
 * to half precision and back, then print the original value next to
 * the round-tripped one. */
int main(void) {
    float original = 1.2345;
    float roundtrip = from_half(to_half(original));
    printf("%f %f\n", original, roundtrip);
    return 0;
}
#endif
Clarify the need for FP16 implementation. 2023-12-27 18:54:36 +01:00			`/* Conversion from floats to FP16 and the other way around.`
			`* This is useful as in GGUF files we have both FP16 tensors`
			`* and quantized blocks where half-precisions floats are used`
			`* to store the scaling factor (delta) and other parameters.`
			`*`
			`* This code comes originally from:`
FP16 added. Split-mixtral improved. 2023-12-27 15:13:42 +01:00			`* https://github.com/Maratyszcza/FP16/blob/master/include/fp16/fp16.h`
			`*`
			`* The original code is MIT licensed. */`

Clarify the need for FP16 implementation. 2023-12-27 18:54:36 +01:00			`#include <stdint.h>`
			`#include <math.h>`

FP16 added. Split-mixtral improved. 2023-12-27 15:13:42 +01:00			`static inline float fp32_from_bits(uint32_t w) {`
			`union {`
			`uint32_t as_bits;`
			`float as_value;`
			`} fp32;`
			`fp32.as_bits = w;`
			`return fp32.as_value;`
			`}`

			`static inline uint32_t fp32_to_bits(float f) {`
			`union {`
			`float as_value;`
			`uint32_t as_bits;`
			`} fp32;`
			`fp32.as_value = f;`
			`return fp32.as_bits;`
			`}`

			`float from_half(uint16_t h) {`
			`const uint32_t w = (uint32_t) h << 16;`
			`const uint32_t sign = w & UINT32_C(0x80000000);`
			`const uint32_t two_w = w + w;`

			`const uint32_t exp_offset = UINT32_C(0xE0) << 23;`
			`#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) \|\| defined(__GNUC__) && !defined(__STRICT_ANSI__)`
			`const float exp_scale = 0x1.0p-112f;`
			`#else`
			`const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));`
			`#endif`
			`const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;`

			`const uint32_t magic_mask = UINT32_C(126) << 23;`
			`const float magic_bias = 0.5f;`
			`const float denormalized_value = fp32_from_bits((two_w >> 17) \| magic_mask) - magic_bias;`

			`const uint32_t denormalized_cutoff = UINT32_C(1) << 27;`
			`const uint32_t result = sign \|`
			`(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));`
			`return fp32_from_bits(result);`
			`}`

			`uint16_t to_half(float f) {`
			`#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) \|\| defined(__GNUC__) && !defined(__STRICT_ANSI__)`
			`const float scale_to_inf = 0x1.0p+112f;`
			`const float scale_to_zero = 0x1.0p-110f;`
			`#else`
			`const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));`
			`const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));`
			`#endif`
			`float base = (fabsf(f) * scale_to_inf) * scale_to_zero;`

			`const uint32_t w = fp32_to_bits(f);`
			`const uint32_t shl1_w = w + w;`
			`const uint32_t sign = w & UINT32_C(0x80000000);`
			`uint32_t bias = shl1_w & UINT32_C(0xFF000000);`
			`if (bias < UINT32_C(0x71000000)) {`
			`bias = UINT32_C(0x71000000);`
			`}`

			`base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;`
			`const uint32_t bits = fp32_to_bits(base);`
			`const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);`
			`const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);`
			`const uint32_t nonsign = exp_bits + mantissa_bits;`
			`return (sign >> 16) \| (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);`
			`}`

			`#ifdef TEST_MAIN`
			`#include <stdio.h>`
			`int main(void) {`
			`float f = 1.2345;`
			`uint16_t half = to_half(f);`
			`float f2 = from_half(half);`
			`printf("%f %f\n", f, f2);`
			`return 0;`
			`}`
			`#endif`