/* gguf-tools/bf16.h -- bfloat16 (brain16) <-> float32 conversion helpers. */

#ifndef BF16_h
#define BF16_h
#include <stdint.h>
/**
 * Converts brain16 to float32.
 *
 * The bfloat16 floating point format has the following structure:
 *
 *       +-sign
 *       |
 *       |   +-exponent
 *       |   |
 *       |   |      +-mantissa
 *       |   |      |
 *       |+--+---++-+---+
 *     0b0000000000000000 brain16
 *
 * Since bf16 has the same number of exponent bits as a 32bit float,
 * encoding and decoding numbers becomes relatively straightforward.
 *
 *       +-sign
 *       |
 *       |   +-exponent
 *       |   |
 *       |   |      +-mantissa
 *       |   |      |
 *       |+--+---++-+-------------------+
 *     0b00000000000000000000000000000000 IEEE binary32
 *
 * For comparison, the standard fp16 format has fewer exponent bits.
 *
 *       +-sign
 *       |
 *       |  +-exponent
 *       |  |
 *       |  |    +-mantissa
 *       |  |    |
 *       |+-+-++-+------+
 *     0b0000000000000000 IEEE binary16
 *
 * @see IEEE 754-2008
 */
static inline float from_brain(uint16_t h) {
    /* Overlay a float and a 32-bit integer so the bit pattern written
     * through one member can be read back through the other. */
    union {
        float f;
        uint32_t i;
    } word;

    /* The 16 input bits become the upper half of the 32-bit pattern;
     * the lower 16 bits are left as zero. */
    word.i = h;
    word.i <<= 16;

    return word.f;
}
/**
 * Converts float32 to brain16.
 *
 * The result is the upper 16 bits of the float's bit pattern, rounded to
 * nearest with ties going to the value whose lowest kept bit is zero.
 * NaN inputs are truncated instead of rounded, and bit 6 of the result
 * (the top mantissa bit of brain16) is set so the output is a quiet NaN.
 *
 * @param s value to convert
 * @return the 16-bit brain16 encoding of s
 */
static inline uint16_t to_brain(float s) {
    union {
        float f;
        uint32_t i;
    } pun;
    uint32_t bits;
    uint32_t lsb;

    pun.f = s;
    bits = pun.i;

    /* Ignoring the sign bit, anything up to and including the infinity
     * pattern is not a NaN and takes the rounding path. */
    if ((bits & 0x7fffffff) <= 0x7f800000) {
        /* Lowest bit that survives the truncation; adding it on top of
         * 0x7fff makes exact halfway cases round to even. */
        lsb = (bits >> 16) & 1;
        bits += 0x7fff + lsb;
        return (uint16_t)(bits >> 16);
    }

    /* NaN: keep the top half as-is and force the quiet bit. */
    return (uint16_t)((bits >> 16) | 64);
}
#endif