From b1f32c4088336e1a3a366516af21cb7165bfd9d8 Mon Sep 17 00:00:00 2001
From: antirez
Date: Wed, 3 Jan 2024 21:02:17 +0100
Subject: [PATCH] Quantization functions refactoring.

---
 gguflib.c | 362 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 193 insertions(+), 169 deletions(-)

diff --git a/gguflib.c b/gguflib.c
index b51d630..30a2fba 100644
--- a/gguflib.c
+++ b/gguflib.c
@@ -500,6 +500,195 @@ int gguf_append_tensor_data(gguf_ctx *ctx, void *tensor, uint64_t tensor_size) {
 
 /* ============================ GGUF dequantization ========================= */
 
+/* Q8_0 blocks dequantization to floats.
+ * 'y' is supposed to have enough space for 'count' weights. */
+void gguf_q8_0_to_float(void *weights_data, float *y, uint64_t count) {
+    struct gguf_tensor_type_features *tf =
+        gguf_get_tensor_type_features(GGUF_TYPE_Q8_0);
+
+    /* Very simple layout: |16 bit scale|32 x 8bit weights|
+     * Each weight is scale * quantized_weight[0..31] */
+    int8_t *block = weights_data;
+    uint64_t i = 0; // i-th weight to dequantize.
+    while(i < count) {
+        /* For each block get the scale and convert all the
+         * weights in the block. */
+        float scale = from_half(*((uint16_t*)block));
+        for (uint32_t j = 0; j < tf->items_per_block; j++) {
+            y[i++] = block[j+2] * scale; // j+2 to skip the scale bytes.
+            if (i == count) break;
+        }
+        block += tf->bytes_per_block; // Go to the next block.
+    }
+}
+
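A Q8_0 block is therefore 34 bytes (tf->bytes_per_block): a 2-byte FP16 scale followed by 32 signed 8-bit quants (tf->items_per_block). As a quick sanity check of the layout, here is a standalone sketch, independent of gguflib; the only assumption is the IEEE 754 half-precision constant 0x3800 = 0.5, which from_half() would decode the same way:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    int main(void) {
        /* Hand-build one Q8_0 block: |FP16 scale|32 x int8 quants|. */
        uint8_t block[34];
        uint16_t half_scale = 0x3800;   /* 0.5 in FP16. */
        memcpy(block, &half_scale, 2);
        int8_t *q = (int8_t*)block + 2;
        for (int j = 0; j < 32; j++) q[j] = (int8_t)(j - 16);

        /* Dequantize exactly as the loop above does: w = scale * q. */
        float scale = 0.5f;             /* What from_half(half_scale) returns. */
        for (int j = 0; j < 32; j++) printf("%g ", q[j] * scale);
        printf("\n");                   /* Prints -8 -7.5 ... 7.5 */
        return 0;
    }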
+/* Q4_K blocks dequantization to floats.
+ * 'y' is supposed to have enough space for 'count' weights. */
+void gguf_q4_k_to_float(void *weights_data, float *y, uint64_t count) {
+    uint8_t *block = weights_data;
+    uint64_t i = 0; // i-th weight to dequantize.
+    while(i < count) {
+        /* Q4_K super-blocks have 256 total weights, split into 8
+         * sub-blocks of 32 weights. Each sub-block has its own scale and
+         * min, so there are 16 total values for scales/mins, and the
+         * scales/mins are themselves quantized (6 bits each) using two
+         * different scales: scale_of_scales and scale_of_mins, stored as
+         * two FP16 values at the start of the super-block, so:
+         *
+         * |FP16 s_of_scales|
+         * |FP16 s_of_mins  |
+         * |16 x 6-bit d,m values, one pair per sub-block of 32 elements|
+         * |256 x 4-bit weights|
+         *
+         * Each quantized weight 'q' is restored as:
+         *
+         * w = q * scale - min;
+         */
+        float scales_scale = from_half(*((uint16_t*)block));
+        float mins_scale = from_half(*((uint16_t*)(block+2)));
+        block += 4;
+
+        /* Extract the 16 x 6-bit scales-mins values. The encoding
+         * of those values is odd, for performance reasons:
+         *
+         * dddddddd dddddddd dddddddd dddddddd mmmmmmmm mmmmmmmm
+         * 44000000|55111111|66222222|77333333|44000000|55111111
+         *
+         * mmmmmmmm mmmmmmmm mmmmdddd mmmmdddd mmmmdddd mmmmdddd
+         * 66222222|77333333|44444444|55555555|66666666|77777777
+         *
+         * In the above diagram you can see the 12 bytes and the
+         * scales/mins 6-bit encodings. */
+
+        /* Scale scales/mins. */
+        float scales[8], mins[8];
+        for (int j = 0; j < 8; j++) {
+            uint8_t d,m;
+            if (j < 4) {
+                d = block[j] & 63;
+                m = block[j+4] & 63;
+            } else {
+                d = (block[j+4] & 0xF) | ((block[j-4] >> 6) << 4);
+                m = (block[j+4] >> 4) | ((block[j-0] >> 6) << 4);
+            }
+            scales[j] = d * scales_scale;
+            mins[j] = m * mins_scale;
+        }
+        block += 12; // Seek 4-bit weights start.
+
+        /* Finally we can extract the 256 weights.
+         * We process two sub-blocks at a time, because each run of
+         * 32 bytes stores 64 weights like this:
+         * the first 32 weights (first sub-block) are the lower 4
+         * bits of each byte, the next 32 weights (second sub-block)
+         * are the higher 4 bits of each byte. */
+        for (uint32_t b = 0; b < 8; b += 2) {
+            float scale = scales[b];
+            float min = mins[b];
+            /* First sub-block: lower bits. */
+            for (uint32_t j = 0; j < 32; j++) {
+                uint8_t w = block[j] & 0xf;
+                y[i++] = w * scale - min;
+                if (i == count) return;
+            }
+            /* Second sub-block: higher bits, with its own scale/min. */
+            scale = scales[b+1];
+            min = mins[b+1];
+            for (uint32_t j = 0; j < 32; j++) {
+                uint8_t w = block[j] >> 4;
+                y[i++] = w * scale - min;
+                if (i == count) return;
+            }
+            block += 32; // Skip the two processed sub-blocks.
+        }
+    }
+}
+
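The 12-byte scales/mins encoding is easier to see from the packing side. Below is a hypothetical inverse of the extraction loop above; pack_q4k_scales is an illustrative name, not a gguflib function, and inputs are assumed to already be 6-bit values (< 64):

    #include <stdint.h>

    /* Pack 8 six-bit (d,m) pairs into the 12-byte Q4_K layout:
     * bytes 0..3:  low 6 bits of d[0..3], top 2 bits = bits 4-5 of d[4..7]
     * bytes 4..7:  low 6 bits of m[0..3], top 2 bits = bits 4-5 of m[4..7]
     * bytes 8..11: low nibble = bits 0-3 of d[4..7],
     *              high nibble = bits 0-3 of m[4..7]. */
    void pack_q4k_scales(const uint8_t d[8], const uint8_t m[8], uint8_t out[12]) {
        for (int j = 0; j < 4; j++) {
            out[j]   = (d[j] & 63) | ((d[j+4] >> 4) << 6);
            out[j+4] = (m[j] & 63) | ((m[j+4] >> 4) << 6);
            out[j+8] = (d[j+4] & 0xF) | ((m[j+4] & 0xF) << 4);
        }
    }

Running the extraction loop of gguf_q4_k_to_float() on out[] recovers the original d[j], m[j] for every j.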
+/* Q6_K blocks dequantization to floats.
+ * 'y' is supposed to have enough space for 'count' weights. */
+void gguf_q6_k_to_float(void *weights_data, float *y, uint64_t count) {
+    uint8_t *block = weights_data;
+    uint64_t i = 0; // i-th weight to dequantize.
+    while(i < count) {
+        /* Q6_K super-blocks have 256 total weights, split into 16
+         * sub-blocks of 16 elements. There are no mins, just scales.
+         * Each sub-block has a block-specific scale quantized at 8 bits,
+         * to be multiplied by a single FP16 main scale-of-scales:
+         *
+         * |128 bytes of lower 4 bits of quants|
+         * |64 bytes of higher 2 bits of quants|
+         * |16 bytes of 8-bit block scales|
+         * |A single FP16 value: the scale of the scales above|
+         *
+         * Let's call "L" the lower 4 bits array (128 bytes)
+         * and "H" the higher 2 bits array (64 bytes).
+         *
+         * Values are logically encoded in two 128-weight clusters,
+         * where the first cluster is the first 64 bytes of "L" and
+         * the first 32 bytes of "H".
+         *
+         * Lower 4 bits of the i-th weight, for i from 0 to 63, are
+         * stored in the lower nibble of L[i], while lower 4 bits of
+         * the i-th weight, for i from 64 to 127, are stored in the
+         * higher nibble of L[i-64]:
+         *
+         * L = |64640000|65650101|66660202|...
+         *
+         * So this actually is: w_low = (L[i%64] >> (i/64*4)) & 15
+         *
+         * H = |96643200|97653301|98663402|...
+         *
+         * Higher 2 bits of the i-th weight are arranged like this:
+         *
+         * From 0 to 31, bits 0,1 of H[i]
+         * From 32 to 63, bits 2,3 of H[i-32]
+         * From 64 to 95, bits 4,5 of H[i-64]
+         * From 96 to 127, bits 6,7 of H[i-96]
+         *
+         * So this actually is: w_high = ((H[i%32] >> (i/32*2)) & 3) << 4
+         *
+         * The same is true for the next 128-weight cluster, but
+         * everything is relative to the second half of H and L.
+         *
+         * Finally, we need to extract the scale from the 16 block
+         * scales array. Scales are just sequential, so the i-th
+         * weight uses scales[i/16].
+         *
+         * Important: in Q6_K the 6-bit quants are wisely stored as
+         * unsigned integers + 32, so that there is no need to do sign
+         * bit extension to convert the 6-bit value into an 8-bit value.
+         * Instead the values from -32 to 31 are remapped into the 0-63
+         * range (by just adding 32).
+         */
+        float super_scale = from_half(*((uint16_t*)(block+128+64+16)));
+        uint8_t *L = block;
+        uint8_t *H = block+128;
+        int8_t *scales = (int8_t*)block+128+64;
+        for (int cluster = 0; cluster < 2; cluster++) {
+            for (uint64_t j = 0; j < 128; j++) {
+                y[i] = (super_scale * scales[j/16]) *
+                       ((int8_t)
+                        ((((L[j%64] >> (j/64*4)) & 0xF) |
+                         (((H[j%32] >> (j/32*2)) & 3) << 4)))-32);
+                i++;
+                if (i == count) return;
+            }
+            L += 64;
+            H += 32;
+            scales += 8;
+        }
+        block += 128+64+16+2; // Go to the next block.
+    }
+}
+
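Condensing the L/H index arithmetic above into one place, a small illustrative helper (not part of gguflib; q6k_get_quant is a hypothetical name) that recovers the raw, still-biased 6-bit quant of the i-th weight in a 256-weight super-block:

    #include <stdint.h>

    /* i in 0..255. L points to the 128-byte low-nibble array, H to the
     * 64-byte high-bits array of one Q6_K super-block. */
    static uint8_t q6k_get_quant(const uint8_t *L, const uint8_t *H, int i) {
        int cluster = i / 128;  /* Two 128-weight clusters per super-block. */
        int j = i % 128;        /* Position inside the cluster. */
        L += cluster * 64;      /* The second cluster reads the second */
        H += cluster * 32;      /* half of both arrays. */
        uint8_t lo = (L[j % 64] >> (j / 64 * 4)) & 0xF;
        uint8_t hi = (H[j % 32] >> (j / 32 * 2)) & 3;
        return lo | (hi << 4);  /* 0..63, biased by +32. */
    }

The dequantized weight is then super_scale * scales[i/16] * (q6k_get_quant(L, H, i) - 32), matching the inner loop above.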
+/* FP16 blocks dequantization to floats.
+ * 'y' is supposed to have enough space for 'count' weights. */
+void gguf_f16_to_float(void *weights_data, float *y, uint64_t count) {
+    uint64_t i = 0; // i-th weight to dequantize.
+    uint16_t *w16 = weights_data;
+    while(i < count) {
+        y[i] = from_half(w16[i]);
+        i++;
+    }
+}
+
 /* Convert the specified tensor (quantized or not) into an array of
  * floats. The array is allocated with malloc(). If the tensor is already
  * in FP32 floats format, it is just memcpy()-ed to the destination array.
@@ -507,182 +696,17 @@ int gguf_append_tensor_data(gguf_ctx *ctx, void *tensor, uint64_t tensor_size) {
  * On OOM, NULL is returned. If the tensor format is not yet supported,
  * NULL is returned as well, but errno is set to EINVAL. */
 float *gguf_tensor_to_float(gguf_tensor *tensor) {
-    struct gguf_tensor_type_features *tf =
-        gguf_get_tensor_type_features(tensor->type);
-    uint64_t block_size = tf->bytes_per_block;
     float *f = malloc(tensor->num_weights*sizeof(float));
+    if (f == NULL) return NULL; // OOM: return NULL as documented above.
     if (tensor->type == GGUF_TYPE_F32) {
         memcpy(f, tensor->weights_data, tensor->num_weights*sizeof(float));
     } else if (tensor->type == GGUF_TYPE_F16) {
-        uint64_t i = 0; // i-th weight to dequantize.
-        uint16_t *w16 = (uint16_t*) tensor->weights_data;
-        while(i < tensor->num_weights) {
-            f[i] = from_half(w16[i]);
-            i++;
-        }
+        gguf_f16_to_float(tensor->weights_data, f, tensor->num_weights);
     } else if (tensor->type == GGUF_TYPE_Q8_0) {
-        /* Very simple layout: |16 bit scale|32 x 8bit weights|
-         * Each weight is scale * quantized_weight[0..31] */
-        int8_t *block = (int8_t*)tensor->weights_data;
-        uint64_t i = 0; // i-th weight to dequantize.
-        while(i < tensor->num_weights) {
-            /* For each block get the scale and convert all the
-             * weights in the block. */
-            float scale = from_half(*((uint16_t*)block));
-            for (uint32_t j = 0; j < tf->items_per_block; j++) {
-                f[i++] = block[j+2] * scale; // j+2 to skip the scale bytes.
-                if (i == tensor->num_weights) break;
-            }
-            block += block_size; // Go to the next block.
-        }
+        gguf_q8_0_to_float(tensor->weights_data, f, tensor->num_weights);
     } else if (tensor->type == GGUF_TYPE_Q4_K) {
-        uint8_t *block = (uint8_t*)tensor->weights_data;
-        uint64_t i = 0; // i-th weight to dequantize.
-        while(i < tensor->num_weights) {
-            /* Q4_K super-blocks have 256 total weights, split into 8
-             * sub-blocks of 32 weights. Each sub-block has its own scale
-             * and min, so there are 16 total values for scales/mins, and
-             * the scales/mins are themselves quantized (6 bits each)
-             * using two different scales: scale_of_scales and
-             * scale_of_mins, stored as two FP16 values at the start of
-             * the super-block, so:
-             *
-             * |FP16 s_of_scales|
-             * |FP16 s_of_mins  |
-             * |16 x 6-bit d,m values, one pair per sub-block of 32 elements|
-             * |256 x 4-bit weights|
-             *
-             * Each quantized weight 'q' is restored as:
-             *
-             * w = q * scale - min;
-             */
-            float scales_scale = from_half(*((uint16_t*)block));
-            float mins_scale = from_half(*((uint16_t*)(block+2)));
-            block += 4;
-
-            /* Extract the 16 x 6-bit scales-mins values. The encoding
-             * of those values is odd, for performance reasons:
-             *
-             * dddddddd dddddddd dddddddd dddddddd mmmmmmmm mmmmmmmm
-             * 44000000|55111111|66222222|77333333|44000000|55111111
-             *
-             * mmmmmmmm mmmmmmmm mmmmdddd mmmmdddd mmmmdddd mmmmdddd
-             * 66222222|77333333|44444444|55555555|66666666|77777777
-             *
-             * In the above diagram you can see the 12 bytes and the
-             * scales/mins 6-bit encodings. */
-
-            /* Scale scales/mins. */
-            float scales[8], mins[8];
-            for (int j = 0; j < 8; j++) {
-                uint8_t d,m;
-                if (j < 4) {
-                    d = block[j] & 63;
-                    m = block[j+4] & 63;
-                } else {
-                    d = (block[j+4] & 0xF) | ((block[j-4] >> 6) << 4);
-                    m = (block[j+4] >> 4) | ((block[j-0] >> 6) << 4);
-                }
-                scales[j] = d * scales_scale;
-                mins[j] = m * mins_scale;
-            }
-            block += 12; // Seek 4-bit weights start.
-
-            /* Finally we can extract the 256 weights.
-             * We process two sub-blocks at a time, because each run of
-             * 32 bytes stores 64 weights like this:
-             * the first 32 weights (first sub-block) are the lower 4
-             * bits of each byte, the next 32 weights (second sub-block)
-             * are the higher 4 bits of each byte. */
-            for (uint32_t b = 0; b < 8; b += 2) {
-                float scale = scales[b];
-                float min = mins[b];
-                /* First sub-block: lower bits. */
-                for (uint32_t j = 0; j < 32; j++) {
-                    uint8_t w = block[j] & 0xf;
-                    f[i++] = w * scale - min;
-                    if (i == tensor->num_weights) return f;
-                }
-                /* Second sub-block: higher bits, with its own scale/min. */
-                scale = scales[b+1];
-                min = mins[b+1];
-                for (uint32_t j = 0; j < 32; j++) {
-                    uint8_t w = block[j] >> 4;
-                    f[i++] = w * scale - min;
-                    if (i == tensor->num_weights) return f;
-                }
-                block += 32; // Skip the two processed sub-blocks.
-            }
-        }
+        gguf_q4_k_to_float(tensor->weights_data, f, tensor->num_weights);
     } else if (tensor->type == GGUF_TYPE_Q6_K) {
-        uint8_t *block = (uint8_t*)tensor->weights_data;
-        uint64_t i = 0; // i-th weight to dequantize.
-        while(i < tensor->num_weights) {
-            /* Q6_K super-blocks have 256 total weights, split into 16
-             * sub-blocks of 16 elements. There are no mins, just scales.
-             * Each sub-block has a block-specific scale quantized at 8
-             * bits, to be multiplied by a single FP16 main
-             * scale-of-scales:
-             *
-             * |128 bytes of lower 4 bits of quants|
-             * |64 bytes of higher 2 bits of quants|
-             * |16 bytes of 8-bit block scales|
-             * |A single FP16 value: the scale of the scales above|
-             *
-             * Let's call "L" the lower 4 bits array (128 bytes)
-             * and "H" the higher 2 bits array (64 bytes).
-             *
-             * Values are logically encoded in two 128-weight clusters,
-             * where the first cluster is the first 64 bytes of "L" and
-             * the first 32 bytes of "H".
-             *
-             * Lower 4 bits of the i-th weight, for i from 0 to 63, are
-             * stored in the lower nibble of L[i], while lower 4 bits of
-             * the i-th weight, for i from 64 to 127, are stored in the
-             * higher nibble of L[i-64]:
-             *
-             * L = |64640000|65650101|66660202|...
-             *
-             * So this actually is: w_low = (L[i%64] >> (i/64*4)) & 15
-             *
-             * H = |96643200|97653301|98663402|...
-             *
-             * Higher 2 bits of the i-th weight are arranged like this:
-             *
-             * From 0 to 31, bits 0,1 of H[i]
-             * From 32 to 63, bits 2,3 of H[i-32]
-             * From 64 to 95, bits 4,5 of H[i-64]
-             * From 96 to 127, bits 6,7 of H[i-96]
-             *
-             * So this actually is: w_high = ((H[i%32] >> (i/32*2)) & 3) << 4
-             *
-             * The same is true for the next 128-weight cluster, but
-             * everything is relative to the second half of H and L.
-             *
-             * Finally, we need to extract the scale from the 16 block
-             * scales array. Scales are just sequential, so the i-th
-             * weight uses scales[i/16].
-             *
-             * Important: in Q6_K the 6-bit quants are wisely stored as
-             * unsigned integers + 32, so that there is no need to do
-             * sign bit extension to convert the 6-bit value into an
-             * 8-bit value. Instead the values from -32 to 31 are
-             * remapped into the 0-63 range (by just adding 32).
-             */
-            float super_scale = from_half(*((uint16_t*)(block+128+64+16)));
-            uint8_t *L = block;
-            uint8_t *H = block+128;
-            int8_t *scales = (int8_t*)block+128+64;
-            for (int cluster = 0; cluster < 2; cluster++) {
-                for (uint64_t j = 0; j < 128; j++) {
-                    f[i] = (super_scale * scales[j/16]) *
-                           ((int8_t)
-                            ((((L[j%64] >> (j/64*4)) & 0xF) |
-                             (((H[j%32] >> (j/32*2)) & 3) << 4)))-32);
-                    i++;
-                    if (i == tensor->num_weights) return f;
-                }
-                L += 64;
-                H += 32;
-                scales += 8;
-            }
-            block += 128+64+16+2; // Go to the next block.
-        }
+        gguf_q6_k_to_float(tensor->weights_data, f, tensor->num_weights);
     } else {
         errno = EINVAL;
         return NULL;
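With the refactoring in place, callers keep going through the unchanged gguf_tensor_to_float() entry point. A minimal caller sketch, assuming only what the function's comment documents (NULL on OOM; NULL plus errno set to EINVAL for unsupported types) and leaving the lookup of the gguf_tensor itself to gguflib's file-scanning API; tensor_to_f32_or_die is an illustrative name:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include "gguflib.h"

    /* Dequantize 'tensor' to a malloc()ed array of num_weights floats,
     * aborting with a readable error when that is not possible. */
    float *tensor_to_f32_or_die(gguf_tensor *tensor) {
        errno = 0;
        float *f = gguf_tensor_to_float(tensor);
        if (f == NULL) {
            if (errno == EINVAL)
                fprintf(stderr, "Unsupported tensor type %u\n",
                        (unsigned) tensor->type);
            else
                fprintf(stderr, "Out of memory\n");
            exit(1);
        }
        return f; /* Caller must free() it. */
    }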