mirror of
https://github.com/antirez/gguf-tools.git
synced 2025-09-18 21:08:07 +08:00
8
bf16.h
8
bf16.h
@@ -52,10 +52,6 @@ static inline float from_brain(uint16_t h) {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts float32 to brain16.
|
* Converts float32 to brain16.
|
||||||
*
|
|
||||||
* This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
|
|
||||||
* Subnormals shall be flushed to zero, and NANs will be quiet.
|
|
||||||
* This code should vectorize nicely if using modern compilers.
|
|
||||||
*/
|
*/
|
||||||
static inline uint16_t to_brain(float s) {
|
static inline uint16_t to_brain(float s) {
|
||||||
uint16_t h;
|
uint16_t h;
|
||||||
@@ -68,10 +64,6 @@ static inline uint16_t to_brain(float s) {
|
|||||||
h = (u.i >> 16) | 64; /* force to quiet */
|
h = (u.i >> 16) | 64; /* force to quiet */
|
||||||
return h;
|
return h;
|
||||||
}
|
}
|
||||||
if (!(u.i & 0x7f800000)) { /* subnormal */
|
|
||||||
h = (u.i & 0x80000000) >> 16; /* flush to zero */
|
|
||||||
return h;
|
|
||||||
}
|
|
||||||
return (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
return (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user