	GGUF: Avoid dequantization when format is compatible (#426)
* GGUF: Don't dequantize q4_1
* Fix weight order. First in low bits
* Add unpacking for q4_0
* Don't dequantize q8_0
* rebase quants and split file
* don't quantize every weight
* reapply patch
* error handling

Co-authored-by: Awni Hannun <awni@apple.com>
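For reference, the three GGUF quantization formats this change keeps packed all use 32-weight blocks. The byte layouts below are restated from the comments in gguf_quants.cpp; the struct names are illustrative only (gguflib hands the data over as raw bytes):

```cpp
#include <cstdint>

// Illustrative block layouts only; these names are not from gguflib or MLX.
struct BlockQ4_0 {      // 18 bytes per 32 weights
  uint16_t scale_f16;   // fp16 scale
  uint8_t nibbles[16];  // 32 x 4-bit weights, two per byte
};

struct BlockQ4_1 {      // 20 bytes per 32 weights
  uint16_t scale_f16;   // fp16 scale
  uint16_t bias_f16;    // fp16 bias
  uint8_t nibbles[16];  // 32 x 4-bit weights, two per byte
};

struct BlockQ8_0 {      // 34 bytes per 32 weights
  uint16_t scale_f16;   // fp16 scale
  int8_t weights[32];   // 32 x 8-bit weights
};
```

MLX's affine scheme recovers values as scale * q + bias, which is why the symmetric formats get a synthetic bias in the extraction code below: -8 * scale for Q4_0 and -128 * scale for Q8_0.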
		| @@ -4,6 +4,7 @@ target_sources( | ||||
|   ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp | ||||
|   ${CMAKE_CURRENT_SOURCE_DIR}/safetensor.cpp | ||||
|   ${CMAKE_CURRENT_SOURCE_DIR}/gguf.cpp | ||||
|   ${CMAKE_CURRENT_SOURCE_DIR}/gguf_quants.cpp | ||||
| ) | ||||
|  | ||||
| MESSAGE(STATUS "Downloading json") | ||||
|   | ||||
| @@ -1,17 +1,10 @@ | ||||
| // Copyright © 2023 Apple Inc. | ||||
| // Copyright © 2023-2024 Apple Inc. | ||||
|  | ||||
| #include <cstdint> | ||||
| #include <cstring> | ||||
| #include <numeric> | ||||
|  | ||||
| #include "mlx/io.h" | ||||
| #include "mlx/primitives.h" | ||||
| #include "mlx/transforms.h" | ||||
| #include "mlx/utils.h" | ||||
|  | ||||
| extern "C" { | ||||
| #include <gguflib.h> | ||||
| } | ||||
| #include <mlx/io/gguf.h> | ||||
|  | ||||
| namespace mlx::core { | ||||
|  | ||||
| @@ -52,7 +45,16 @@ std::optional<Dtype> gguf_type_to_dtype(const uint32_t& gguf_type) { | ||||
|   } | ||||
| } | ||||
|  | ||||
| std::pair<allocator::Buffer, Dtype> extract_tensor_data(gguf_tensor* tensor) { | ||||
| std::vector<int> get_shape(const gguf_tensor& tensor) { | ||||
|   std::vector<int> shape; | ||||
|   // The dimension order in GGML is the reverse of the order used in MLX. | ||||
|   for (int i = tensor.ndim - 1; i >= 0; i--) { | ||||
|     shape.push_back(tensor.dim[i]); | ||||
|   } | ||||
|   return shape; | ||||
| } | ||||
|  | ||||
| std::tuple<allocator::Buffer, Dtype> extract_tensor_data(gguf_tensor* tensor) { | ||||
|   std::optional<Dtype> equivalent_dtype = gguf_type_to_dtype(tensor->type); | ||||
|   // If there's an equivalent type, we can simply copy. | ||||
|   if (equivalent_dtype.has_value()) { | ||||
| @@ -203,16 +205,27 @@ std::unordered_map<std::string, MetaData> load_metadata(gguf_ctx* ctx) { | ||||
| std::unordered_map<std::string, array> load_arrays(gguf_ctx* ctx) { | ||||
|   std::unordered_map<std::string, array> array_map; | ||||
|   gguf_tensor tensor; | ||||
|   while (gguf_get_tensor(ctx, &tensor)) { | ||||
|     std::vector<int> shape; | ||||
|     // The dimension order in GGML is the reverse of the order used in MLX. | ||||
|     for (int i = tensor.ndim - 1; i >= 0; i--) { | ||||
|       shape.push_back(tensor.dim[i]); | ||||
|  | ||||
|   auto check_insert = [](auto inserted) { | ||||
|     if (!inserted.second) { | ||||
|       std::ostringstream msg; | ||||
|       msg << "[load_gguf] Duplicate parameter name " << inserted.first->first | ||||
|           << ". This can happen when loading quantized tensors."; | ||||
|       throw std::runtime_error(msg.str()); | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   while (gguf_get_tensor(ctx, &tensor)) { | ||||
|     if (tensor.type == GGUF_TYPE_Q4_0 || tensor.type == GGUF_TYPE_Q4_1 || | ||||
|         tensor.type == GGUF_TYPE_Q8_0) { | ||||
|       gguf_load_quantized(array_map, tensor); | ||||
|     } else { | ||||
|       std::string name = std::string(tensor.name, tensor.namelen); | ||||
|  | ||||
|       const auto& [data, dtype] = extract_tensor_data(&tensor); | ||||
|       array loaded_array = array(data, get_shape(tensor), dtype); | ||||
|       array_map.insert({name, loaded_array}); | ||||
|     } | ||||
|     const auto& [data, dtype] = extract_tensor_data(&tensor); | ||||
|     array loaded_array = array(data, shape, dtype); | ||||
|     std::string name = std::string(tensor.name, tensor.namelen); | ||||
|     array_map.insert({name, loaded_array}); | ||||
|   } | ||||
|   return array_map; | ||||
| } | ||||
|   | ||||
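With this dispatch, a quantized tensor such as foo.weight ends up as three map entries: a packed uint32 foo.weight plus float16 foo.scales and foo.biases, one pair per 32-weight group. A minimal sketch of expanding one 4-bit group by hand, assuming little-endian uint32 words packed low bits first as above (the helper name is hypothetical):

```cpp
#include <cstdint>
#include <vector>

// Hypothetical helper: expand one 32-weight group of a loaded Q4_0/Q4_1
// tensor. `packed` holds 8 uint32 words with eight 4-bit weights each,
// lowest bits first; `scale` and `bias` are the per-group values promoted
// from float16. Recovery follows value = scale * q + bias.
std::vector<float> dequantize_group_4bit(
    const uint32_t packed[8],
    float scale,
    float bias) {
  std::vector<float> out;
  out.reserve(32);
  for (int w = 0; w < 8; ++w) {
    for (int shift = 0; shift < 32; shift += 4) {
      uint32_t q = (packed[w] >> shift) & 0xF;
      out.push_back(scale * static_cast<float>(q) + bias);
    }
  }
  return out;
}
```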
							
								
								
									
mlx/io/gguf.h (new file, 20 lines)
							| @@ -0,0 +1,20 @@ | ||||
| // Copyright © 2023-2024 Apple Inc. | ||||
| #pragma once | ||||
|  | ||||
| #include "mlx/io.h" | ||||
| #include "mlx/primitives.h" | ||||
| #include "mlx/transforms.h" | ||||
| #include "mlx/utils.h" | ||||
|  | ||||
| extern "C" { | ||||
| #include <gguflib.h> | ||||
| } | ||||
|  | ||||
| namespace mlx::core { | ||||
|  | ||||
| std::vector<int> get_shape(const gguf_tensor& tensor); | ||||
| void gguf_load_quantized( | ||||
|     std::unordered_map<std::string, array>& a, | ||||
|     const gguf_tensor& tensor); | ||||
|  | ||||
| } // namespace mlx::core | ||||
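The header exists so both gguf.cpp and gguf_quants.cpp can share get_shape, which reverses GGML's dimension order into MLX's. A toy illustration of that reversal, using a stand-in struct rather than the real gguf_tensor from gguflib:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for the (ndim, dim) fields of gguflib's gguf_tensor; the real
// struct comes from <gguflib.h>.
struct FakeGgufTensor {
  uint32_t ndim;
  uint64_t dim[4];
};

int main() {
  FakeGgufTensor t{2, {4096, 32000, 0, 0}}; // GGML order is reversed vs. MLX
  std::vector<int> shape;
  for (int i = static_cast<int>(t.ndim) - 1; i >= 0; --i) {
    shape.push_back(static_cast<int>(t.dim[i]));
  }
  std::cout << shape[0] << " x " << shape[1] << std::endl; // 32000 x 4096
}
```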
							
								
								
									
mlx/io/gguf_quants.cpp (new file, 158 lines)
							| @@ -0,0 +1,158 @@ | ||||
| // Copyright © 2023-2024 Apple Inc. | ||||
|  | ||||
| #include <cstdint> | ||||
| #include <cstring> | ||||
|  | ||||
| #include <mlx/io/gguf.h> | ||||
|  | ||||
| namespace mlx::core { | ||||
|  | ||||
| // Unpacks the 32 4-bit weights of one block into 16 output bytes, two | ||||
| // weights per byte with the first of each pair in the low bits. Values are | ||||
| // accumulated with +=, so dst is expected to be zero-initialized. | ||||
| void unpack_32_4(uint8_t* data, int8_t* dst) { | ||||
|   for (int64_t j = 0; j < 16; ++j) { | ||||
|     uint8_t x = (data[j + 2] & 0x0F); // j+2 to skip scale bytes. | ||||
|     if (j % 2 != 0) { | ||||
|       x <<= 4; | ||||
|     } | ||||
|     dst[j / 2] += x; | ||||
|   } | ||||
|   // Last 16 weights are in the higher bits | ||||
|   for (int64_t j = 0; j < 16; ++j) { | ||||
|     uint8_t x = (data[j + 2] >> 4); | ||||
|     if (j % 2 != 0) { | ||||
|       x <<= 4; | ||||
|     } | ||||
|     dst[8 + j / 2] += x; | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Extracts (weight, scales, biases) from Q4_0 tensors. | ||||
| // Data layout is: |16 bit scale|32 x 4bit weights|. | ||||
| void extract_q4_0_data( | ||||
|     const gguf_tensor& tensor, | ||||
|     array& weights_arr, | ||||
|     array& scales_arr, | ||||
|     array& biases_arr) { | ||||
|   const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights | ||||
|   auto data = static_cast<uint8_t*>(tensor.weights_data); | ||||
|   auto weights = weights_arr.data<int8_t>(); | ||||
|   auto scales = scales_arr.data<float16_t>(); | ||||
|   auto biases = biases_arr.data<float16_t>(); | ||||
|   for (int64_t i = 0; i < scales_arr.size(); i++) { | ||||
|     scales[i] = *((float16_t*)data); | ||||
|     biases[i] = -8 * scales[i]; | ||||
|     unpack_32_4(data, weights); | ||||
|     weights += 16; | ||||
|     data += bytes_per_block; | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Extracts (weight, scales, biases) from Q4_1 tensors. | ||||
| // Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|. | ||||
| void extract_q4_1_data( | ||||
|     const gguf_tensor& tensor, | ||||
|     array& weights_arr, | ||||
|     array& scales_arr, | ||||
|     array& biases_arr) { | ||||
|   const uint64_t bytes_per_block = | ||||
|       20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights | ||||
|   auto data = static_cast<uint8_t*>(tensor.weights_data); | ||||
|   auto weights = weights_arr.data<int8_t>(); | ||||
|   auto scales = scales_arr.data<float16_t>(); | ||||
|   auto biases = biases_arr.data<float16_t>(); | ||||
|   for (int64_t i = 0; i < scales_arr.size(); i++) { | ||||
|     scales[i] = *((float16_t*)data); | ||||
|     biases[i] = *((float16_t*)(data) + 1); | ||||
|     unpack_32_4(data, weights); | ||||
|     weights += 16; | ||||
|     data += bytes_per_block; | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Extracts (weight, scales, biases) from Q8_0 tensors. | ||||
| // Data layout is: |16 bit scale|32 x 8bit weights|. | ||||
| void extract_q8_0_data( | ||||
|     const gguf_tensor& tensor, | ||||
|     array& weights_arr, | ||||
|     array& scales_arr, | ||||
|     array& biases_arr) { | ||||
|   const uint64_t weights_per_block = 32; | ||||
|   const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights | ||||
|   auto data = static_cast<uint8_t*>(tensor.weights_data); | ||||
|   auto weights = weights_arr.data<int8_t>(); | ||||
|   auto scales = scales_arr.data<float16_t>(); | ||||
|   auto biases = biases_arr.data<float16_t>(); | ||||
|   for (int64_t i = 0; i < scales_arr.size(); i++) { | ||||
|     uint8_t* block_data = data + i * bytes_per_block; | ||||
|     scales[i] = *((float16_t*)block_data); | ||||
|     biases[i] = -128 * scales[i]; | ||||
|     for (int64_t j = 0; j < weights_per_block; ++j) { | ||||
|       uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. | ||||
|       // Original data is int8_t; flip the sign bit to shift the range to | ||||
|       // [0, 255] and compensate with a bias of -128 * scale. | ||||
|       x ^= 1 << 7; | ||||
|       weights[i * weights_per_block + j] = x; | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| void gguf_load_quantized( | ||||
|     std::unordered_map<std::string, array>& a, | ||||
|     const gguf_tensor& tensor) { | ||||
|   uint64_t weights_per_byte; | ||||
|   if (tensor.type == GGUF_TYPE_Q4_0 || tensor.type == GGUF_TYPE_Q4_1) { | ||||
|     weights_per_byte = 2; | ||||
|   } else { // tensor.type == GGUF_TYPE_Q8_0 | ||||
|     weights_per_byte = 1; | ||||
|   } | ||||
|  | ||||
|   std::string name = std::string(tensor.name, tensor.namelen); | ||||
|   std::vector<int> shape = get_shape(tensor); | ||||
|   const uint64_t weights_per_block = 32; | ||||
|   if (shape[shape.size() - 1] % weights_per_block != 0) { | ||||
|     std::ostringstream msg; | ||||
|     msg << "[load_gguf] tensor " << name | ||||
|         << "has incompatible last dim shape: " << shape[shape.size() - 1]; | ||||
|     throw std::runtime_error(msg.str()); | ||||
|   } | ||||
|   const uint64_t num_blocks = tensor.num_weights / weights_per_block; | ||||
|  | ||||
|   std::vector<int> weights_shape = shape; | ||||
|   weights_shape.back() /= (weights_per_byte * 4); | ||||
|  | ||||
|   array weights(std::move(weights_shape), uint32, nullptr, {}); | ||||
|   weights.set_data(allocator::malloc(weights.nbytes())); | ||||
|  | ||||
|   // For scales and bias | ||||
|   shape[shape.size() - 1] = shape[shape.size() - 1] / weights_per_block; | ||||
|   array scales(shape, float16, nullptr, {}); | ||||
|   array biases(std::move(shape), float16, nullptr, {}); | ||||
|   scales.set_data(allocator::malloc(scales.nbytes())); | ||||
|   biases.set_data(allocator::malloc(biases.nbytes())); | ||||
|  | ||||
|   if (tensor.type == GGUF_TYPE_Q4_0) { | ||||
|     extract_q4_0_data(tensor, weights, scales, biases); | ||||
|   } else if (tensor.type == GGUF_TYPE_Q4_1) { | ||||
|     extract_q4_1_data(tensor, weights, scales, biases); | ||||
|   } else if (tensor.type == GGUF_TYPE_Q8_0) { | ||||
|     extract_q8_0_data(tensor, weights, scales, biases); | ||||
|   } | ||||
|  | ||||
|   a.insert({name, weights}); | ||||
|  | ||||
|   auto check_insert = [](auto inserted) { | ||||
|     if (!inserted.second) { | ||||
|       std::ostringstream msg; | ||||
|       msg << "[load_gguf] Duplicate parameter name " << inserted.first->first | ||||
|           << ". This can happen when loading quantized tensors."; | ||||
|       throw std::runtime_error(msg.str()); | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   const std::string weight_suffix = ".weight"; | ||||
|   const std::string name_prefix = | ||||
|       name.substr(0, name.length() - weight_suffix.length()); | ||||
|   check_insert(a.insert({name_prefix + ".scales", scales})); | ||||
|   check_insert(a.insert({name_prefix + ".biases", biases})); | ||||
| } | ||||
|  | ||||
| } // namespace mlx::core | ||||
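Putting it together: for every quantized parameter, the loaded map now carries prefix.weight (packed uint32), prefix.scales, and prefix.biases instead of a dequantized float tensor. A hedged sketch of reconstructing a dense array on the MLX side, assuming the mlx::core::dequantize op takes (w, scales, biases, group_size, bits); GGUF blocks are 32 weights, so group_size is 32, with bits 4 for Q4_0/Q4_1 and 8 for Q8_0. The parameter prefix here is made up:

```cpp
#include <string>
#include <unordered_map>

#include "mlx/mlx.h"

using namespace mlx::core;

// Sketch only: rebuild a dense weight from the three arrays produced by
// gguf_load_quantized. Assumes dequantize(w, scales, biases, group_size,
// bits) as declared in mlx/ops.h.
array dense_weight(
    const std::unordered_map<std::string, array>& params,
    const std::string& prefix) { // e.g. "blk.0.attn_q" (illustrative)
  const auto& w = params.at(prefix + ".weight");
  const auto& scales = params.at(prefix + ".scales");
  const auto& biases = params.at(prefix + ".biases");
  // Use bits = 8 instead for tensors that came from Q8_0 data.
  return dequantize(w, scales, biases, /* group_size */ 32, /* bits */ 4);
}
```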
Authored by Juarez Bochi