Batched Quantized Matmul + Fast Small QMV (#1503)

* add fast qmv for small dims

* fix test

* batched cpu

* add batched template param (see the sketch after this list)

* refactor metal quantized.cpp
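
The "batched template param" bullet appears to refer to threading a compile-time batch flag through the quantized kernels so the non-batched path pays no offset arithmetic. Below is a minimal sketch of that pattern in plain C++; the name `qmv_sketch` and its signature are illustrative assumptions, not the kernels touched by this commit:

```cpp
#include <cstdio>
#include <vector>

// Hypothetical sketch: a boolean template parameter selects at compile time
// whether per-batch offsets are applied; when batched == false the offsets
// fold to zero and the single-matrix path carries no extra indexing cost.
template <bool batched>
void qmv_sketch(const std::vector<float>& w, const std::vector<float>& x,
                std::vector<float>& y, int rows, int cols, int batch) {
  const int w_off = batched ? batch * rows * cols : 0;
  const int x_off = batched ? batch * cols : 0;
  const int y_off = batched ? batch * rows : 0;
  for (int r = 0; r < rows; ++r) {
    float acc = 0.0f;
    for (int c = 0; c < cols; ++c)
      acc += w[w_off + r * cols + c] * x[x_off + c];
    y[y_off + r] = acc;
  }
}

int main() {
  std::vector<float> w = {1, 2, 3, 4}, x = {1, 1}, y(2);
  qmv_sketch<false>(w, x, y, /*rows=*/2, /*cols=*/2, /*batch=*/0);
  std::printf("%g %g\n", y[0], y[1]);  // prints: 3 7
}
```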
Author: Alex Barron
Date: 2024-10-21 16:23:17 -07:00
Committed by: GitHub
Parent: 58a855682c
Commit: d15fa13daf
9 changed files with 866 additions and 761 deletions

@@ -725,15 +725,6 @@ affine_quantize(const array& w, int group_size, int bits, StreamOrDevice s_) {
-  int el_per_int = 32 / bits;
-  if (w.shape(-1) < 32 * el_per_int) {
-    std::ostringstream msg;
-    msg << "[quantize] The feature dimension (2nd dimension of the matrix) is "
-        << "too small for quantization. We support >=512 for 2 bits, "
-        << ">= 256 for 4 bits and >= 128 for 8 bits. The provided matrix has "
-        << "shape " << w.shape() << ".";
-    throw std::invalid_argument(msg.str());
-  }
   auto fallback = [group_size, bits, el_per_int, s](
                       const std::vector<array>& inputs) -> std::vector<array> {
     auto& w = inputs[0];
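
The deleted guard enforced a minimum feature dimension of 32 * el_per_int, where el_per_int = 32 / bits is the number of quantized values packed into one 32-bit integer; the fast small-dim QMV path added by this commit is presumably what makes the restriction unnecessary. A standalone sketch of that threshold arithmetic (plain C++, not the MLX API):

```cpp
#include <cstdio>
#include <initializer_list>

int main() {
  // el_per_int = 32 / bits: quantized values packed into one 32-bit word.
  for (int bits : {2, 4, 8}) {
    int el_per_int = 32 / bits;         // 16, 8, 4
    int old_min_dim = 32 * el_per_int;  // 512, 256, 128: the removed cutoff
    std::printf("bits=%d  el_per_int=%d  old minimum feature dim=%d\n",
                bits, el_per_int, old_min_dim);
  }
}
```

The printed cutoffs (512 for 2 bits, 256 for 4 bits, 128 for 8 bits) are exactly the values quoted in the removed error message.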