Fix quantization of all 0s (#1028)

This commit is contained in:
Angelos Katharopoulos
2024-04-24 00:40:42 -07:00
committed by GitHub
parent d0dbfe0b97
commit ec8578d41a
2 changed files with 12 additions and 1 deletion

View File

@@ -3274,7 +3274,10 @@ std::tuple<array, array, array> quantize(
reshape(w, {w.shape(0), w.shape(1) / group_size, group_size}, s);
array w_max = max(packed_w, /* axis= */ -1, /* keepdims= */ true, s);
array w_min = min(packed_w, /* axis= */ -1, /* keepdims= */ true, s);
array delta = divide(subtract(w_max, w_min, s), array(n_bins, w.dtype()), s);
array delta = maximum(
divide(subtract(w_max, w_min, s), array(n_bins, w.dtype()), s),
array(1e-7, w.dtype()),
s);
array scales = squeeze(delta, -1, s);
array biases = squeeze(w_min, -1, s);