3 and 6 bit quantization (#1613)

* Support 3 and 6 bit quantization
This commit is contained in:
Alex Barron
2024-11-22 10:22:13 -08:00
committed by GitHub
parent 0c5eea226b
commit c79f6a4a8c
12 changed files with 633 additions and 419 deletions

View File

@@ -3683,7 +3683,7 @@ std::tuple<array, array, array> quantize(
int group_size /* = 64 */,
int bits /* = 4 */,
StreamOrDevice s /* = {} */) {
return fast::affine_quantize(w, group_size, bits);
return fast::affine_quantize(w, group_size, bits, s);
}
array dequantize(