52 uint32_t x_sign_32 = in.u & uint32_t(0x80000000);
53 uint16_t x_sign_16 = (x_sign_32 >> 16);
59 float_bits_fp16 inf_scale, zero_scale, magic_bits;
62 uint32_t x_expo_32 = in.u & uint32_t(0x7f800000);
63 uint32_t max_expo_32 = uint32_t(0x38800000);
64 x_expo_32 = x_expo_32 < max_expo_32 ? max_expo_32 : x_expo_32;
65 x_expo_32 += uint32_t(15) << 23;
68 inf_scale.u = uint32_t(0x77800000);
69 zero_scale.u = uint32_t(0x08800000);
72 magic_bits.u = x_expo_32;
73 magic_bits.f += (std::abs(x) * inf_scale.f) * zero_scale.f;
76 uint32_t x_expo_16 = ((magic_bits.u >> 13) & uint32_t(0x7c00));
79 uint32_t x_mant_16 = magic_bits.u & uint32_t(0x0fff);
82 bits_ = (x_sign_16 | uint16_t(x_expo_16 + x_mant_16));
87 operator float()
const {
94 uint32_t x_sign_32 = (
bits_ << 16) & uint32_t(0x80000000);
95 uint32_t base = (
bits_ << 16);
96 uint32_t two_base = base + base;
98 uint32_t denorm_max = 1u << 27;
99 if (two_base < denorm_max) {
100 out.u = uint32_t(126) << 23;
101 out.u |= (two_base >> 17);
104 out.u = uint32_t(0xE0) << 23;
105 out.u += (two_base >> 4);
106 float out_unscaled = out.f;
107 out.u = uint32_t(0x7800000);
108 out.f *= out_unscaled;