From 7c0962f4e2274b3057c1887c64fbacf23122cb52 Mon Sep 17 00:00:00 2001
From: Jaward Sesay
Date: Tue, 30 Apr 2024 11:11:32 +0800
Subject: [PATCH] Add support for quantized Phi-3-mini-4k-instruct GGUF weights (#717)

* Support for Phi-3 4-bit quantized GGUF weights

* Added link to the 4-bit quantized model

* Removed some prints

* Added correct comment

* Added correct comment

* Removed print, since the last condition already prints a warning when
  quantization is None
---
 llms/gguf_llm/README.md |  6 +++++-
 llms/gguf_llm/models.py | 15 +++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/llms/gguf_llm/README.md b/llms/gguf_llm/README.md
index 6ea25cf1..1228f4c6 100644
--- a/llms/gguf_llm/README.md
+++ b/llms/gguf_llm/README.md
@@ -47,6 +47,10 @@ Models that have been tested and work include:
 - [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF),
   for quantized models use:
   - `tinyllama-1.1b-chat-v1.0.Q8_0.gguf`
-  - `tinyllama-1.1b-chat-v1.0.Q4_0.gguf`
+  - `tinyllama-1.1b-chat-v1.0.Q4_0.gguf`
+
+- [Jaward/phi-3-mini-4k-instruct.Q4_0.gguf](https://huggingface.co/Jaward/phi-3-mini-4k-instruct.Q4_0.gguf),
+  for the 4-bit quantized phi-3-mini-4k-instruct use:
+  - `phi-3-mini-4k-instruct.Q4_0.gguf`
 [^1]: For more information on GGUF see
 [the documentation](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md).
diff --git a/llms/gguf_llm/models.py b/llms/gguf_llm/models.py
index cc9b3f0e..3b0afc65 100644
--- a/llms/gguf_llm/models.py
+++ b/llms/gguf_llm/models.py
@@ -18,6 +18,7 @@ class ModelArgs:
     num_attention_heads: int
     rms_norm_eps: float
     vocab_size: int
+    context_length: int
     num_key_value_heads: int = None
     rope_theta: float = 10000
     rope_traditional: bool = False
@@ -157,6 +158,16 @@ class LlamaModel(nn.Module):
             TransformerBlock(args=args) for _ in range(args.num_hidden_layers)
         ]
         self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
+        # model info
+        print(
+            f"Model info\n"
+            f"==========\n"
+            f"Context length: {args.context_length}\n"
+            f"Vocab size: {args.vocab_size}\n"
+            f"Hidden size: {args.hidden_size}\n"
+            f"Num layers: {args.num_hidden_layers}\n"
+            f"Num attention heads: {args.num_attention_heads}\n"
+        )
 
     def __call__(
         self,
@@ -196,6 +207,7 @@ class Model(nn.Module):
 
 def get_config(metadata: dict):
     output = {
+        "context_length": metadata["llama.context_length"],
         "hidden_size": metadata["llama.embedding_length"],
         "num_hidden_layers": metadata["llama.block_count"],
         "num_attention_heads": metadata["llama.attention.head_count"],
@@ -269,9 +281,12 @@ def load(gguf_file: str, repo: str = None):
     elif gguf_ft == 2 or gguf_ft == 3:
         # MOSTLY_Q4_0 or MOSTLY_Q4_1
         quantization = {"group_size": 32, "bits": 4}
+        # print bits value
+        print(f"{quantization['bits']} bits quantized model")
     elif gguf_ft == 7:
         # MOSTLY_Q8_0 = 7
         quantization = {"group_size": 32, "bits": 8}
+        print(f"{quantization['bits']} bits quantized model")
     else:
         quantization = None
         print("[WARNING] Using unsupported GGUF quantization. Casting to float16.")
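
A minimal usage sketch for the new model entry, based on the `load(gguf_file: str, repo: str = None)` signature shown in the diff. The repo and file names are the ones listed in the README change above; that `load` fetches the file from the Hugging Face repo and returns a `(model, tokenizer)` pair is assumed from the rest of the example and is not shown in this patch.

```python
# Sketch: exercise the loader with the newly listed 4-bit Phi-3 GGUF weights.
# Run from llms/gguf_llm/ so that models.py is importable.
import models

# A Q4_0 file takes the gguf_ft == 2/3 branch above, so load() prints
# "4 bits quantized model", and LlamaModel prints the model info block
# (context length, vocab size, hidden size, layer and head counts).
model, tokenizer = models.load(
    "phi-3-mini-4k-instruct.Q4_0.gguf",         # GGUF weight file
    "Jaward/phi-3-mini-4k-instruct.Q4_0.gguf",  # Hugging Face repo (assumed download source)
)
```

With an unsupported GGUF file type, the `else` branch above instead falls back to float16 and prints a warning.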