emb2redis utility added.

Author: antirez
Date: 2025-08-28 16:35:01 +02:00
parent 8fa6eb6523
commit a3257ff3cb
4 changed files with 382 additions and 0 deletions

utils/.gitignore (new file)

@@ -0,0 +1 @@
hiredis

utils/Makefile (new file)

@@ -0,0 +1,11 @@
all: emb2redis

hiredis/libhiredis.a:
	@echo "Please, enter the hiredis directory and build it."
	@exit 1

emb2redis: emb2redis.c hiredis/libhiredis.a
	$(CC) emb2redis.c ../gguflib.c ../sds.c ../fp16.c hiredis/libhiredis.a -o emb2redis -O2 -Wall -W

clean:
	rm -f emb2redis

utils/README.md (new file)

@@ -0,0 +1,25 @@
This is a simple program that extracts the token embeddings from an LLM in
GGUF format and adds them to a [Redis vector set](https://github.com/redis/redis/tree/unstable/modules/vector-sets). Once the embeddings are in
Redis, you can easily check which embeddings are most similar to a given one,
an operation that helps build a mental model of the token embedding space
the LLM learned during training (spoiler: it is quite different from
word2vec and the like: certain words are often near unexpected words,
likely because we can't fully appreciate how the models use all the
components of the embedding during inference in the Transformer blocks).
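
Internally, each token embedding is added with a single `VADD` call, passing
the vector as a raw blob of 4-byte floats and the token string as the element
name. The command shape, as built by `emb2redis.c`, is:

    VADD <key> FP32 <raw-float-blob> <token-string>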

To compile the program, stay in this directory and run the following:

1. `git clone https://github.com/redis/hiredis`
2. `cd hiredis; make`
3. `cd ..`
4. `make`

Then do something like:

    ./emb2redis my_llm.gguf llm_embeddings_key -h 127.0.0.1 -p 6379

At the end, try something like this:

    redis-cli VSIM llm_embeddings_key ELE "banana"
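
Once the import completes, the program itself suggests a few more queries to
explore the space (`COUNT` limits the number of returned elements,
`WITHSCORES` also reports the similarity scores, `VCARD` returns how many
elements were added):

    redis-cli VSIM llm_embeddings_key ELE "apple" COUNT 10
    redis-cli VSIM llm_embeddings_key ELE "python" WITHSCORES
    redis-cli VCARD llm_embeddings_key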

utils/emb2redis.c (new file)

@@ -0,0 +1,345 @@
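/* emb2redis: load the token embeddings of an LLM in GGUF format into a
 * Redis vector set, adding one element per vocabulary token with VADD,
 * so that the embedding space can be explored with VSIM. */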
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <errno.h>
#include <math.h>
#include "../gguflib.h"
#include "hiredis/hiredis.h"
uint64_t tokens_processed = 0;
/* Free the token strings array. */
void free_token_strings(char **tokens, uint64_t num_tokens) {
if (!tokens) return;
for (uint64_t i = 0; i < num_tokens; i++) {
free(tokens[i]);
}
free(tokens);
}
/* Load all token strings into an array, so that we can lookup
* the token by its ID in constant time.
* Returns an array of token strings that must be freed by the caller. */
char **load_token_strings(gguf_ctx *ctx, uint64_t *num_tokens_out) {
gguf_key key;
char **tokens = NULL;
*num_tokens_out = 0;
gguf_rewind(ctx);
/* Look for tokenizer.ggml.tokens array. */
const char *tensorname = "tokenizer.ggml.tokens";
size_t tensorname_len = strlen(tensorname);
while (gguf_get_key(ctx, &key)) {
if (key.namelen == tensorname_len &&
memcmp(key.name, tensorname, key.namelen) == 0 &&
key.type == GGUF_VALUE_TYPE_ARRAY)
{
/* Found the tokens array. */
uint32_t etype = key.val->array.type;
uint64_t len = key.val->array.len;
if (etype != GGUF_VALUE_TYPE_STRING) {
fprintf(stderr,
"Unexpected token type in array (not string)\n");
return NULL;
}
printf("Found %llu tokens in vocabulary\n",
(unsigned long long)len);
/* Allocate array for all token strings. */
tokens = calloc(len, sizeof(char*));
if (!tokens) {
fprintf(stderr, "Failed to allocate token array.\n");
return NULL;
}
/* Skip array header. */
ctx->off += 4 + 8; /* 4 for type, 8 for length. */
/* Load all token strings. */
for (uint64_t j = 0; j < len; j++) {
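                /* On disk each GGUF string is a 64-bit length followed
                 * by the raw bytes, with no NUL terminator: that's why
                 * we advance by 8 + str->len after copying it. */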
struct gguf_string *str =
(struct gguf_string*)(ctx->data + ctx->off);
/* Allocate and copy token string. */
tokens[j] = malloc(str->len+1);
if (!tokens[j]) {
fprintf(stderr, "Failed to allocate token string\n");
/* Free already allocated tokens and return error. */
free_token_strings(tokens,j);
return NULL;
}
memcpy(tokens[j], str->string, str->len);
tokens[j][str->len] = '\0';
/* Move to next string. */
ctx->off += 8 + str->len;
/* Show progress. */
if ((j + 1) % 10000 == 0) {
printf(" Loaded %llu / %llu token strings...\n",
(unsigned long long)(j + 1),
(unsigned long long)len);
}
}
*num_tokens_out = len;
printf("Successfully loaded all %llu token strings\n",
(unsigned long long)len);
return tokens;
} else {
/* Skip this key-value pair. */
gguf_do_with_value(ctx, key.type, key.val, NULL, 0, 0, NULL);
}
}
fprintf(stderr, "Could not find tokenizer.ggml.tokens array\n");
return NULL;
}
/* Process the token embeddings tensor and add to Redis */
int process_token_embeddings(gguf_ctx *ctx, redisContext *rctx,
const char *key_name, char **token_strings,
uint64_t num_token_strings) {
gguf_tensor tensor;
int found = 0;
/* Skip all key-value pairs to get to tensors. */
gguf_skip_key_values_section(ctx);
/* Look for token_embd.weight tensor. */
const char *tensorname = "token_embd.weight";
size_t tensorname_len = strlen(tensorname);
while (gguf_get_tensor(ctx, &tensor)) {
if (tensor.namelen == tensorname_len &&
memcmp(tensor.name, tensorname, tensor.namelen) == 0)
{
found = 1;
break;
}
}
if (!found) {
fprintf(stderr, "Could not find token_embd.weight tensor\n");
return 0;
}
printf("\nFound token embeddings tensor:\n");
printf(" Type: %s\n", gguf_get_tensor_type_name(tensor.type));
printf(" Dimensions: [%llu, %llu]\n",
(unsigned long long)tensor.dim[0],
(unsigned long long)tensor.dim[1]);
printf(" Total tokens: %llu\n", (unsigned long long)tensor.dim[1]);
printf(" Embedding dimension: %llu\n", (unsigned long long)tensor.dim[0]);
uint64_t emb_dim = tensor.dim[0];
uint64_t num_tokens = tensor.dim[1];
/* Verify that we have matching number of tokens. */
if (num_tokens != num_token_strings) {
fprintf(stderr, "Warning: Mismatch between embedding tokens (%llu) and vocabulary (%llu)\n",
(unsigned long long)num_tokens,
(unsigned long long)num_token_strings);
/* Use the minimum to be safe */
if (num_tokens > num_token_strings) {
num_tokens = num_token_strings;
}
}
    /* Convert the tensor to floats if needed: there are files where
     * the embeddings are quantized as well. */
printf("Converting tensor to float format...\n");
float *embeddings = gguf_tensor_to_float(&tensor);
if (!embeddings) {
if (errno == EINVAL) {
fprintf(stderr, "Unsupported tensor type for conversion: %s\n",
gguf_get_tensor_type_name(tensor.type));
} else {
fprintf(stderr, "Out of memory converting tensor\n");
}
return 0;
}
printf("\nAdding %llu tokens to Redis key '%s'...\n",
(unsigned long long)num_tokens, key_name);
/* Process each token. */
for (uint64_t token_id = 0; token_id < num_tokens; token_id++) {
/* Get the token string. */
char *token_str = token_strings[token_id];
/* Get the embedding vector for this token. */
float *token_emb = embeddings + (token_id * emb_dim);
/* Build VADD command: VADD key FP32 vector element */
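        /* Argument lengths are passed explicitly since the FP32 blob is
         * binary data that may contain zero bytes, so it can't be
         * treated as a C string. */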
const char *argv[5];
size_t arglen[5];
argv[0] = "VADD";
arglen[0] = 4;
argv[1] = key_name;
arglen[1] = strlen(key_name);
argv[2] = "FP32";
arglen[2] = 4;
argv[3] = (char*)token_emb;
arglen[3] = emb_dim * sizeof(float);
argv[4] = token_str;
arglen[4] = strlen(token_str);
/* Execute the command. */
redisReply *reply = redisCommandArgv(rctx, 5, argv, arglen);
if (!reply) {
fprintf(stderr, "Error executing VADD for token %llu: %s\n",
(unsigned long long)token_id, rctx->errstr);
free(embeddings);
return 0;
}
if (reply->type == REDIS_REPLY_ERROR) {
fprintf(stderr, "VADD error for token %llu (%s): %s\n",
(unsigned long long)token_id, token_str, reply->str);
freeReplyObject(reply);
free(embeddings);
return 0;
}
freeReplyObject(reply);
tokens_processed++;
/* Progress indicator every 10000 tokens. */
if (tokens_processed % 10000 == 0) {
printf(" Processed %llu / %llu tokens (%.1f%%)\n",
(unsigned long long)tokens_processed,
(unsigned long long)num_tokens,
(double)tokens_processed * 100.0 / num_tokens);
}
}
printf("\nSuccessfully added all %llu tokens to Redis\n",
(unsigned long long)num_tokens);
free(embeddings);
return 1;
}
/* Print usage information */
void usage(const char *progname) {
fprintf(stderr, "Usage: %s <gguf-file> <redis-key> [options]\n", progname);
fprintf(stderr, "Options:\n");
fprintf(stderr, " -h <host> Redis host (default: 127.0.0.1)\n");
fprintf(stderr, " -p <port> Redis port (default: 6379)\n");
fprintf(stderr, "\nExample:\n");
fprintf(stderr, " %s model.gguf llm_embeddings\n", progname);
fprintf(stderr, " %s model.gguf llm_embeddings -h localhost -p 6379\n", progname);
}
int main(int argc, char **argv) {
char *gguf_file = NULL;
char *redis_key = NULL;
char *redis_host = "127.0.0.1";
int redis_port = 6379;
char **token_strings = NULL;
uint64_t num_token_strings = 0;
/* Parse command line arguments */
if (argc < 3) {
usage(argv[0]);
return 1;
}
gguf_file = argv[1];
redis_key = argv[2];
/* Parse optional arguments. */
for (int i = 3; i < argc; i++) {
if (strcmp(argv[i], "-h") == 0 && i + 1 < argc) {
redis_host = argv[++i];
} else if (strcmp(argv[i], "-p") == 0 && i + 1 < argc) {
redis_port = atoi(argv[++i]);
} else {
fprintf(stderr, "Unknown option: %s\n", argv[i]);
usage(argv[0]);
return 1;
}
}
printf("==============================================\n");
printf("GGUF to Redis Vector Set Importer\n");
printf("==============================================\n");
printf("GGUF file: %s\n", gguf_file);
printf("Redis target: %s:%d\n", redis_host, redis_port);
printf("Vector set key: %s\n", redis_key);
printf("==============================================\n\n");
/* Open GGUF file. */
printf("Loading GGUF file...\n");
gguf_ctx *ctx = gguf_open(gguf_file);
if (!ctx) {
fprintf(stderr, "Failed to open GGUF file: %s\n", strerror(errno));
return 1;
}
printf("GGUF file loaded successfully (version %d)\n", ctx->header->version);
printf(" Key-value pairs: %llu\n", (unsigned long long)ctx->header->metadata_kv_count);
printf(" Tensors: %llu\n\n", (unsigned long long)ctx->header->tensor_count);
/* First, load all token strings into memory. */
printf("Loading vocabulary tokens...\n");
token_strings = load_token_strings(ctx, &num_token_strings);
if (!token_strings) {
fprintf(stderr, "Failed to load token strings\n");
gguf_close(ctx);
return 1;
}
/* Connect to Redis. */
printf("\nConnecting to Redis...\n");
redisContext *rctx = redisConnect(redis_host, redis_port);
if (!rctx || rctx->err) {
if (rctx) {
fprintf(stderr, "Redis connection error: %s\n", rctx->errstr);
redisFree(rctx);
} else {
fprintf(stderr, "Cannot allocate redis context\n");
}
free_token_strings(token_strings, num_token_strings);
gguf_close(ctx);
return 1;
}
printf("Connected to Redis successfully\n");
    /* Process the embeddings, adding them to Redis. */
if (!process_token_embeddings(ctx, rctx, redis_key,
token_strings, num_token_strings)) {
fprintf(stderr, "Failed to process token embeddings\n");
redisFree(rctx);
free_token_strings(token_strings, num_token_strings);
gguf_close(ctx);
return 1;
}
/* Cleanup and reporting. */
free_token_strings(token_strings, num_token_strings);
redisFree(rctx);
gguf_close(ctx);
printf("\n==============================================\n");
printf("Import completed successfully!\n");
printf("Total tokens added: %llu\n", (unsigned long long)tokens_processed);
printf("==============================================\n\n");
printf("You can now use VSIM to find similar tokens:\n");
printf(" VSIM %s ELE \"apple\" COUNT 10\n", redis_key);
printf(" VSIM %s ELE \"python\" WITHSCORES\n", redis_key);
printf(" VCARD %s # Check total count\n", redis_key);
return 0;
}