diff --git a/llms/gguf_llm/generate.py b/llms/gguf_llm/generate.py
index 7215aa48..db327cda 100644
--- a/llms/gguf_llm/generate.py
+++ b/llms/gguf_llm/generate.py
@@ -40,7 +40,7 @@ def generate(
     if len(tokens) == 0:
         print("No tokens generated for this prompt")
         return
-    prompt_tps = prompt.size / prompt_time
+    prompt_tps = len(prompt) / prompt_time
     gen_tps = (len(tokens) - 1) / gen_time
     print(f"Prompt: {prompt_tps:.3f} tokens-per-sec")
     print(f"Generation: {gen_tps:.3f} tokens-per-sec")
diff --git a/llms/gguf_llm/models.py b/llms/gguf_llm/models.py
index 3b0afc65..9e1f9666 100644
--- a/llms/gguf_llm/models.py
+++ b/llms/gguf_llm/models.py
@@ -19,10 +19,10 @@ class ModelArgs:
     rms_norm_eps: float
     vocab_size: int
     context_length: int
-    num_key_value_heads: int = None
+    num_key_value_heads: Optional[int] = None
     rope_theta: float = 10000
     rope_traditional: bool = False
-    model_type: str = None
+    model_type: Optional[str] = None
     rope_scaling: Optional[Dict[str, Union[float, str]]] = None
 
     def __post_init__(self):
@@ -54,7 +54,7 @@ class Attention(nn.Module):
 
         dim = args.hidden_size
         self.n_heads = n_heads = args.num_attention_heads
-        self.n_kv_heads = n_kv_heads = args.num_key_value_heads
+        self.n_kv_heads = n_kv_heads = args.num_key_value_heads or n_heads
 
         self.repeats = n_heads // n_kv_heads
 
@@ -66,7 +66,7 @@
         self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
         self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=False)
         rope_scale = (
-            1 / args.rope_scaling["factor"]
+            1 / float(args.rope_scaling["factor"])
             if args.rope_scaling is not None and args.rope_scaling["type"] == "linear"
             else 1
         )
@@ -254,7 +254,7 @@ def translate_weight_names(name):
     return name
 
 
-def load(gguf_file: str, repo: str = None):
+def load(gguf_file: str, repo: Optional[str] = None):
     # If the gguf_file exists, try to load model from it.
     # Otherwise try to download and cache from the HF repo
     if not Path(gguf_file).exists():
diff --git a/llms/llama/convert.py b/llms/llama/convert.py
index 04c10a5f..9c05c8b7 100644
--- a/llms/llama/convert.py
+++ b/llms/llama/convert.py
@@ -7,7 +7,7 @@ import glob
 import json
 import shutil
 from pathlib import Path
-
+from typing import Dict
 import mlx.core as mx
 import mlx.nn as nn
 import torch
@@ -149,7 +149,8 @@ def quantize(weights, config, args):
 def make_shards(weights: dict, max_file_size_gibibyte: int = 15):
     max_file_size_bytes = max_file_size_gibibyte << 30
     shards = []
-    shard, shard_size = {}, 0
+    shard: Dict[str, mx.array] = {}
+    shard_size = 0
     for k, v in weights.items():
         if shard_size + v.nbytes > max_file_size_bytes:
             shards.append(shard)
diff --git a/llms/mixtral/mixtral.py b/llms/mixtral/mixtral.py
index 4b45d066..8520b87c 100644
--- a/llms/mixtral/mixtral.py
+++ b/llms/mixtral/mixtral.py
@@ -23,7 +23,7 @@ class ModelArgs:
     n_kv_heads: int
     norm_eps: float
     vocab_size: int
-    moe: dict = None
+    moe: Optional[dict] = None
 
 
 class Attention(nn.Module):
@@ -91,6 +91,9 @@ class FeedForward(nn.Module):
 class MOEFeedForward(nn.Module):
     def __init__(self, args: ModelArgs):
         super().__init__()
+
+        if args.moe is None:
+            raise ValueError("args.moe must not be None for MOEFeedForward")
         self.num_experts = args.moe["num_experts"]
         self.num_experts_per_tok = args.moe["num_experts_per_tok"]