Add support for quantized Phi-3-mini-4k-instruct GGUF weights (#717)
* Support for 4-bit quantized phi-3 GGUF weights
* Added a link to the 4-bit quantized model
* Removed some prints
* Corrected comments
* Removed a redundant print, since the final branch already warns when quantization is None
parent 5513c4e57d
commit 7c0962f4e2
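Below is a minimal usage sketch for the newly supported weights. It relies only on the `load(gguf_file: str, repo: str = None)` signature visible in the diff; the module name `models` and the `(model, tokenizer)` return value are assumptions about the surrounding gguf_llm example, not guarantees from this commit.

```python
# Minimal sketch, not a verified invocation of the example code.
# Assumptions: the diff's load() lives in models.py and returns a
# (model, tokenizer) pair; only the load() signature itself is taken
# from the diff below.
import models

model, tokenizer = models.load(
    gguf_file="phi-3-mini-4k-instruct.Q4_0.gguf",
    repo="Jaward/phi-3-mini-4k-instruct.Q4_0.gguf",
)
# With this commit, loading a Q4_0 file prints "4 bits quantized model"
# plus a model-info block (context length, vocab size, hidden size,
# layer and head counts).
```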
@@ -47,6 +47,10 @@ Models that have been tested and work include:
 - [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF),
   for quantized models use:
   - `tinyllama-1.1b-chat-v1.0.Q8_0.gguf`
   - `tinyllama-1.1b-chat-v1.0.Q4_0.gguf`
 
+- [Jaward/phi-3-mini-4k-instruct.Q4_0.gguf](https://huggingface.co/Jaward/phi-3-mini-4k-instruct.Q4_0.gguf),
+  for 4 bits quantized phi-3-mini-4k-instruct use:
+  - `phi-3-mini-4k-instruct.Q4_0.gguf`
+
 [^1]: For more information on GGUF see [the documentation](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md).
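If you want the GGUF file locally before calling the example's `load`, here is a minimal sketch using the Hugging Face Hub client. The repo and file names come from the README entry above; installing `huggingface_hub` and handing the resulting local path to `load` are assumptions, not part of this diff.

```python
# Sketch: fetch the 4-bit Phi-3 GGUF listed in the README entry above.
# Requires `pip install huggingface_hub`; not part of this commit.
from huggingface_hub import hf_hub_download

gguf_path = hf_hub_download(
    repo_id="Jaward/phi-3-mini-4k-instruct.Q4_0.gguf",
    filename="phi-3-mini-4k-instruct.Q4_0.gguf",
)
print(gguf_path)  # local path you can pass to the example's load()
```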
@@ -18,6 +18,7 @@ class ModelArgs:
     num_attention_heads: int
     rms_norm_eps: float
     vocab_size: int
+    context_length: int
     num_key_value_heads: int = None
     rope_theta: float = 10000
     rope_traditional: bool = False
@@ -157,6 +158,16 @@ class LlamaModel(nn.Module):
             TransformerBlock(args=args) for _ in range(args.num_hidden_layers)
         ]
         self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
+        # model info
+        print(
+            f"Model info\n"
+            f"==========\n"
+            f"Context length: {args.context_length}\n"
+            f"Vocab size: {args.vocab_size}\n"
+            f"Hidden size: {args.hidden_size}\n"
+            f"Num layers: {args.num_hidden_layers}\n"
+            f"Num attention heads: {args.num_attention_heads}\n"
+        )
 
     def __call__(
         self,
@@ -196,6 +207,7 @@ class Model(nn.Module):
 
 def get_config(metadata: dict):
     output = {
+        "context_length": metadata["llama.context_length"],
         "hidden_size": metadata["llama.embedding_length"],
         "num_hidden_layers": metadata["llama.block_count"],
         "num_attention_heads": metadata["llama.attention.head_count"],
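To make the new metadata wiring concrete, here is an illustrative sketch of the GGUF-key-to-config mapping that `get_config` now performs. The key names mirror the hunk above, while the numeric values are placeholders rather than values read from a real Phi-3 file.

```python
# Illustrative only: key names mirror get_config() above; the numbers are
# placeholders, not real Phi-3 metadata.
metadata = {
    "llama.context_length": 4096,      # newly consumed by this commit
    "llama.embedding_length": 3072,
    "llama.block_count": 32,
    "llama.attention.head_count": 32,
}

config = {
    "context_length": metadata["llama.context_length"],   # new field
    "hidden_size": metadata["llama.embedding_length"],
    "num_hidden_layers": metadata["llama.block_count"],
    "num_attention_heads": metadata["llama.attention.head_count"],
}
print(config["context_length"])  # -> 4096
```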
@@ -269,9 +281,12 @@ def load(gguf_file: str, repo: str = None):
     elif gguf_ft == 2 or gguf_ft == 3:
         # MOSTLY_Q4_0 or MOSTLY_Q4_1
         quantization = {"group_size": 32, "bits": 4}
+        # print bits value
+        print(f"{quantization['bits']} bits quantized model")
     elif gguf_ft == 7:
         # MOSTLY_Q8_0 = 7
         quantization = {"group_size": 32, "bits": 8}
+        print(f"{quantization['bits']} bits quantized model")
     else:
         quantization = None
         print("[WARNING] Using unsupported GGUF quantization. Casting to float16.")