diff --git a/gguf-tools.c b/gguf-tools.c
index 5f6872c..b4ef6ab 100644
--- a/gguf-tools.c
+++ b/gguf-tools.c
@@ -9,6 +9,11 @@
 #include "sds.h"
 #include "fp16.h"
 
+/* Global options that can be used for all the subcommands. */
+struct {
+    int verbose;        // --verbose option
+} Opt = {0};
+
 /* ========================== Utility functions ============================ */
 
 /* Glob-style pattern matching. Return 1 on match, 0 otherwise. */
@@ -170,6 +175,8 @@ void gguf_tools_show(const char *filename) {
     return;
 }
 
+/* ======================= 'split-mixtral' subcommand ======================= */
+
 /* Read a Mixtral MoE model and creates a new non-MoE GGUF file based
  * on the weights of the experts with IDs in the array of 'experts_id'.
  * The array must contain 32 integers, one for each layer. */
@@ -312,6 +319,8 @@ void gguf_tools_split_mixtral(int *experts_id, const char *mixtral_filename, con
     exit(0);
 }
 
+/* ====================== 'inspect-weights' subcommand ====================== */
+
 void gguf_tools_inspect_weights(const char *filename, const char *tname, uint64_t count) {
     gguf_ctx *ctx = gguf_init(filename);
     if (ctx == NULL) {
@@ -362,7 +371,7 @@ void gguf_tools_inspect_weights(const char *filename, const char *tname, uint64_
 /* ======================= Main and CLI options parsing ===================== */
 
 void gguf_tools_usage(const char *progname) {
-    printf("Usage: %s [options...]\n"
+    printf("Usage: %s <subcommand> [arguments...] [options...]\n"
            "Subcommands:\n"
            "  show <filename> -- show GGUF model keys and tensors.\n"
            "  inspect-tensor <filename> <tensor-name> [count] -- show tensor weights.\n"
diff --git a/gguflib.c b/gguflib.c
index 396a078..ade0b6c 100644
--- a/gguflib.c
+++ b/gguflib.c
@@ -511,9 +511,11 @@ float *gguf_tensor_to_float(gguf_tensor *tensor) {
         gguf_get_tensor_type_features(tensor->type);
     uint64_t block_size = tf->bytes_per_block;
     float *f = malloc(tensor->num_weights*sizeof(float));
-    if (tensor->type == GUFF_TYPE_Q8_0) {
+    if (tensor->type == GGUF_TYPE_Q8_0) {
+        /* Very simple layout: |16 bit delta|32 x 8bit weights|
+         * Each weight is delta * quantized_weight[0..31]. */
         int8_t *block = (int8_t*)tensor->weights_data;
-        uint64_t i = 0;
+        uint64_t i = 0; // i-th weight to dequantize.
         while(i < tensor->num_weights) {
             /* For each block get the delta and convert all the
              * weights in the block. */
@@ -524,6 +526,79 @@ float *gguf_tensor_to_float(gguf_tensor *tensor) {
             }
             block += block_size; // Go to the next block.
         }
+    } else if (tensor->type == GGUF_TYPE_Q4_K) {
+        uint8_t *block = (uint8_t*)tensor->weights_data;
+        uint64_t i = 0; // i-th weight to dequantize.
+        while(i < tensor->num_weights) {
+            /* Q4_K super-blocks have 256 total weights, split into 8
+             * sub-blocks of 32 elements. Each sub-block has its own
+             * delta and min, so there are 16 such values in total, but
+             * the deltas/mins are themselves quantized (6 bits each)
+             * using two different deltas: delta_of_deltas and
+             * delta_of_mins, two FP16 values at the start of the
+             * super-block, so:
+             *
+             * |FP16 d_of_deltas |
+             * |FP16 d_of_mins |
+             * |16 x 6-bit integers: the d,m pairs, one per sub-block |
+             * |256 x 4-bit weights|
+             */
+            float deltas_delta = from_half(*((uint16_t*)block));
+            float mins_delta = from_half(*((uint16_t*)(block+2)));
+            block += 4;
+
+            /* Extract the 16 x 6-bit delta/min pairs. The
+             * encoding of those values is odd, for performance reasons:
+             *
+             * dddddddd dddddddd dddddddd dddddddd mmmmmmmm mmmmmmmm
+             * 44000000|55111111|66222222|77333333|44000000|55111111
+             *
+             * mmmmmmmm mmmmmmmm mmmmdddd mmmmdddd mmmmdddd mmmmdddd
+             * 66222222|77333333|44444444|55555555|66666666|77777777
+             *
+             * The diagram above shows the 12 bytes and how the 6-bit
+             * deltas/mins are spread across them. */
+
+            /* Scale deltas/mins. */
+            float deltas[8], mins[8];
+            for (int j = 0; j < 8; j++) {
+                uint8_t d,m;
+                if (j < 4) {
+                    d = block[j] & 63;
+                    m = block[j+4] & 63;
+                } else {
+                    d = (block[j+4] & 0xF) | ((block[j-4] >> 6) << 4);
+                    m = (block[j+4] >> 4) | ((block[j-0] >> 6) << 4);
+                }
+                deltas[j] = d * deltas_delta;
+                mins[j] = m * mins_delta;
+            }
+            block += 12; // Seek 4-bit weights start.
+
+            /* Finally we can extract the 256 weights.
+             * We process two sub-blocks at a time, since each run of
+             * 32 bytes stores 64 weights: the first 32 weights (first
+             * sub-block) are the lower 4 bits of each byte, the second
+             * 32 weights (second sub-block) are the higher 4 bits of
+             * each byte. */
+            for (uint32_t b = 0; b < 8; b += 2) {
+                /* First set: lower bits, delta/min of the first
+                 * sub-block. */
+                for (uint32_t j = 0; j < 32; j++) {
+                    uint8_t w = block[j] & 0xf;
+                    f[i++] = w * deltas[b] - mins[b];
+                    if (i == tensor->num_weights) return f;
+                }
+                /* Second set: higher bits, delta/min of the second
+                 * sub-block. */
+                for (uint32_t j = 0; j < 32; j++) {
+                    uint8_t w = block[j] >> 4;
+                    f[i++] = w * deltas[b+1] - mins[b+1];
+                    if (i == tensor->num_weights) return f;
+                }
+                block += 32; // Skip the two processed sub-blocks.
+            }
+        }
     } else {
         errno = EINVAL;
         return NULL;
diff --git a/gguflib.h b/gguflib.h
index 815ae88..7e8a9b5 100644
--- a/gguflib.h
+++ b/gguflib.h
@@ -13,27 +13,27 @@
 /* ============================ Enums and structures ======================== */
 
 enum gguf_tensor_type {
-    GUFF_TYPE_F32 = 0,
-    GUFF_TYPE_F16 = 1,
-    GUFF_TYPE_Q4_0 = 2,
-    GUFF_TYPE_Q4_1 = 3,
-    // GUFF_TYPE_Q4_2 = 4, support has been removed
-    // GUFF_TYPE_Q4_3 (5) support has been removed
-    GUFF_TYPE_Q5_0 = 6,
-    GUFF_TYPE_Q5_1 = 7,
-    GUFF_TYPE_Q8_0 = 8,
-    GUFF_TYPE_Q8_1 = 9,
+    GGUF_TYPE_F32 = 0,
+    GGUF_TYPE_F16 = 1,
+    GGUF_TYPE_Q4_0 = 2,
+    GGUF_TYPE_Q4_1 = 3,
+    // GGUF_TYPE_Q4_2 = 4, support has been removed
+    // GGUF_TYPE_Q4_3 (5) support has been removed
+    GGUF_TYPE_Q5_0 = 6,
+    GGUF_TYPE_Q5_1 = 7,
+    GGUF_TYPE_Q8_0 = 8,
+    GGUF_TYPE_Q8_1 = 9,
     // k-quantizations
-    GUFF_TYPE_Q2_K = 10,
-    GUFF_TYPE_Q3_K = 11,
-    GUFF_TYPE_Q4_K = 12,
-    GUFF_TYPE_Q5_K = 13,
-    GUFF_TYPE_Q6_K = 14,
-    GUFF_TYPE_Q8_K = 15,
-    GUFF_TYPE_I8,
-    GUFF_TYPE_I16,
-    GUFF_TYPE_I32,
-    GUFF_TYPE_COUNT,
+    GGUF_TYPE_Q2_K = 10,
+    GGUF_TYPE_Q3_K = 11,
+    GGUF_TYPE_Q4_K = 12,
+    GGUF_TYPE_Q5_K = 13,
+    GGUF_TYPE_Q6_K = 14,
+    GGUF_TYPE_Q8_K = 15,
+    GGUF_TYPE_I8,
+    GGUF_TYPE_I16,
+    GGUF_TYPE_I32,
+    GGUF_TYPE_COUNT,
 };
 
 enum gguf_value_type {
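
Not part of the patch: the least obvious piece of the change is the 12-byte, 6-bit packing of the Q4_K deltas/mins, so here is a minimal self-contained C sketch that round-trips exactly that layout. The helper names pack_pair() and unpack_pair() are invented for illustration; unpack_pair() reuses the extraction expressions from the patch verbatim, while pack_pair() is the assumed inverse, written to match the ASCII diagram in the patch (the same scheme llama.cpp uses for Q4_K scales).

/* q4k_pairs.c (hypothetical filename): round-trip test for the 6-bit
 * delta/min packing used by Q4_K super-blocks. */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* Store the j-th (0..7) 6-bit delta/min pair into the 12-byte area 's'.
 * Pairs 0..3 occupy the low 6 bits of bytes 0..3 (deltas) and 4..7
 * (mins); pairs 4..7 occupy the nibbles of bytes 8..11 plus the top 2
 * bits of bytes 0..7. Must be called with j increasing, since j >= 4
 * ORs bits into bytes written by earlier iterations. */
static void pack_pair(uint8_t *s, int j, uint8_t d, uint8_t m) {
    if (j < 4) {
        s[j] = d;
        s[j+4] = m;
    } else {
        s[j+4] = (d & 0xF) | ((m & 0xF) << 4); /* Low nibbles. */
        s[j-4] |= (d >> 4) << 6;               /* High 2 bits of delta. */
        s[j]   |= (m >> 4) << 6;               /* High 2 bits of min. */
    }
}

/* Extract the j-th pair: same expressions as gguf_tensor_to_float(). */
static void unpack_pair(const uint8_t *s, int j, uint8_t *d, uint8_t *m) {
    if (j < 4) {
        *d = s[j] & 63;
        *m = s[j+4] & 63;
    } else {
        *d = (s[j+4] & 0xF) | ((s[j-4] >> 6) << 4);
        *m = (s[j+4] >> 4) | ((s[j] >> 6) << 4);
    }
}

int main(void) {
    uint8_t s[12] = {0};
    /* Arbitrary 6-bit (0..63) test values. */
    uint8_t deltas[8] = {0, 7, 21, 33, 42, 50, 61, 63};
    uint8_t mins[8]   = {63, 1, 15, 60, 5, 48, 30, 9};
    for (int j = 0; j < 8; j++) pack_pair(s, j, deltas[j], mins[j]);
    for (int j = 0; j < 8; j++) {
        uint8_t d, m;
        unpack_pair(s, j, &d, &m);
        assert(d == deltas[j] && m == mins[j]);
        printf("pair %d: delta=%2u min=%2u\n", j, d, m);
    }
    return 0;
}

It compiles standalone with any C99 compiler; the asserts verify that all eight 6-bit pairs survive the round trip, and the byte roles match the "44000000|55111111|..." diagram: each pair is recoverable with at most two loads, a shift and a mask, which is the performance reason the patch comment alludes to.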