mirror of
https://github.com/antirez/gguf-tools.git
synced 2025-09-18 20:59:37 +08:00
8
bf16.h
8
bf16.h
@@ -52,10 +52,6 @@ static inline float from_brain(uint16_t h) {
|
||||
|
||||
/**
|
||||
* Converts float32 to brain16.
|
||||
*
|
||||
* This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
|
||||
* Subnormals are flushed to zero, and NaNs are quieted.
|
||||
* This code should vectorize nicely with modern compilers.
|
||||
*/
|
||||
static inline uint16_t to_brain(float s) {
|
||||
uint16_t h;
|
||||
@@ -68,10 +64,6 @@ static inline uint16_t to_brain(float s) {
|
||||
h = (u.i >> 16) | 64; /* force to quiet */
|
||||
return h;
|
||||
}
|
||||
if (!(u.i & 0x7f800000)) { /* subnormal */
|
||||
h = (u.i & 0x80000000) >> 16; /* flush to zero */
|
||||
return h;
|
||||
}
|
||||
return (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user