mirror of
https://github.com/antirez/gguf-tools.git
synced 2025-09-17 02:28:07 +08:00
71 lines
1.6 KiB
C
71 lines
1.6 KiB
C
![]() |
#ifndef BF16_h
|
||
|
#define BF16_h
|
||
|
#include <stdint.h>
|
||
|
|
||
|
/**
 * Widens a brain16 (bfloat16) bit pattern to a 32-bit float.
 *
 * bfloat16 packs one sign bit, eight exponent bits and seven mantissa
 * bits into 16 bits:
 *
 *       ┌sign
 *       │
 *       │   ┌exponent
 *       │   │
 *       │   │      ┌mantissa
 *       │   │      │
 *       │┌──┴───┐┌─┴───┐
 *     0b0000000000000000 brain16
 *
 * The exponent field is exactly as wide as the one used by a 32-bit
 * float, so a brain16 value is simply the upper half of the matching
 * binary32 bit pattern:
 *
 *       ┌sign
 *       │
 *       │   ┌exponent
 *       │   │
 *       │   │      ┌mantissa
 *       │   │      │
 *       │┌──┴───┐┌─┴───────────────────┐
 *     0b00000000000000000000000000000000 IEEE binary32
 *
 * The standard fp16 format, by contrast, has a narrower exponent:
 *
 *       ┌sign
 *       │
 *       │  ┌exponent
 *       │  │
 *       │  │    ┌mantissa
 *       │  │    │
 *       │┌─┴─┐┌─┴──────┐
 *     0b0000000000000000 IEEE binary16
 *
 * @param h  raw brain16 bits
 * @return   the float whose upper 16 bits are h and whose lower 16
 *           bits are zero
 *
 * @see IEEE 754-2008
 */
static inline float from_brain(uint16_t h) {
    /* Place the 16 bits in the high half of a 32-bit word and
     * reinterpret that word as a float through the union. */
    union {
        uint32_t bits;
        float value;
    } pun = { .bits = (uint32_t)h << 16 };

    return pun.value;
}
|
||
|
|
||
|
/**
 * Narrows a 32-bit float to brain16 (bfloat16) bits.
 *
 * Finite values and infinities are rounded to nearest, with ties going
 * to the result whose lowest bit is even. NaN inputs keep their upper
 * 16 bits and additionally get the quiet bit set.
 *
 * @param s  value to convert
 * @return   raw brain16 bits
 */
static inline uint16_t to_brain(float s) {
    union {
        float f;
        uint32_t i;
    } bits;
    bits.f = s;

    /* Sign bit stripped: anything above the infinity pattern is a NaN. */
    const uint32_t magnitude = bits.i & 0x7fffffff;

    if (magnitude <= 0x7f800000) {
        /* Adding 0x7fff plus the lowest kept bit makes an exact tie
         * round towards the even result before the low half is dropped. */
        const uint32_t lsb = (bits.i >> 16) & 1;
        const uint32_t rounded = bits.i + (0x7fff + lsb);
        return (uint16_t)(rounded >> 16);
    }

    /* NaN: truncate, and set bit 6 so the result is a quiet NaN even if
     * every surviving mantissa bit was zero. */
    return (uint16_t)((bits.i >> 16) | 64);
}
|
||
|
|
||
|
#endif
|