From a3257ff3cb8aed8b60ba3243c70b85a17491d7d6 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 28 Aug 2025 16:35:01 +0200 Subject: [PATCH] emb2redis utility added. --- utils/.gitignore | 1 + utils/Makefile | 11 ++ utils/README.md | 25 ++++ utils/emb2redis.c | 345 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 382 insertions(+) create mode 100644 utils/.gitignore create mode 100644 utils/Makefile create mode 100644 utils/README.md create mode 100644 utils/emb2redis.c diff --git a/utils/.gitignore b/utils/.gitignore new file mode 100644 index 0000000..01e7805 --- /dev/null +++ b/utils/.gitignore @@ -0,0 +1 @@ +hiredis diff --git a/utils/Makefile b/utils/Makefile new file mode 100644 index 0000000..3001076 --- /dev/null +++ b/utils/Makefile @@ -0,0 +1,11 @@ +all: emb2redis + +hiredis/libhiredis.a: + @echo "Please, enter the hiredis directory and build it." + @exit 1 + +emb2redis: emb2redis.c hiredis/libhiredis.a + $(CC) emb2redis.c ../gguflib.c ../sds.c ../fp16.c hiredis/libhiredis.a -o emb2redis -O2 -Wall -W + +clean: + rm -f emb2redis diff --git a/utils/README.md b/utils/README.md new file mode 100644 index 0000000..dee02e4 --- /dev/null +++ b/utils/README.md @@ -0,0 +1,25 @@ +This is a simple program that gets the token embeddings from an LLM in +GGUF format, and adds it into a [Redis vector set](https://github.com/redis/redis/tree/unstable/modules/vector-sets). After the embeddings are added into +Redis you can easily check what are the embeddings more similar to others, an +operation that allows to build some mental model about the tokens embedding +space the LLM learned during training (spoiler: it is quite different than +than word2vec or alike: often certain words are near to unexpected words: +that's likely due to the fact we can't fully appreciate how +the models use all the components of the embedding in the Transformer blocks +inference). + +To compile the program, stay in this directory and perform the following: + +1. git clone https://github.com/redis/hiredis +2. cd hiredis; make +3. cd .. +4. make + +Then do something like: + + ./emb2redis my_llm.gguf llm_embeddings_key -h 127.0.0.1 -p 6379 + +At the end, try something like this: + + redis-cli VSIM llm_embeddings_keys ELE "banana" + diff --git a/utils/emb2redis.c b/utils/emb2redis.c new file mode 100644 index 0000000..03a3feb --- /dev/null +++ b/utils/emb2redis.c @@ -0,0 +1,345 @@ +#include +#include +#include +#include +#include +#include + +#include "../gguflib.h" +#include "hiredis/hiredis.h" + +uint64_t tokens_processed = 0; + +/* Free the token strings array. */ +void free_token_strings(char **tokens, uint64_t num_tokens) { + if (!tokens) return; + for (uint64_t i = 0; i < num_tokens; i++) { + free(tokens[i]); + } + free(tokens); +} + +/* Load all token strings into an array, so that we can lookup + * the token by its ID in constant time. + * Returns an array of token strings that must be freed by the caller. */ +char **load_token_strings(gguf_ctx *ctx, uint64_t *num_tokens_out) { + gguf_key key; + char **tokens = NULL; + *num_tokens_out = 0; + + gguf_rewind(ctx); + + /* Look for tokenizer.ggml.tokens array. */ + const char *tensorname = "tokenizer.ggml.tokens"; + size_t tensorname_len = strlen(tensorname); + while (gguf_get_key(ctx, &key)) { + if (key.namelen == tensorname_len && + memcmp(key.name, tensorname, key.namelen) == 0 && + key.type == GGUF_VALUE_TYPE_ARRAY) + { + /* Found the tokens array. */ + uint32_t etype = key.val->array.type; + uint64_t len = key.val->array.len; + + if (etype != GGUF_VALUE_TYPE_STRING) { + fprintf(stderr, + "Unexpected token type in array (not string)\n"); + return NULL; + } + + printf("Found %llu tokens in vocabulary\n", + (unsigned long long)len); + + /* Allocate array for all token strings. */ + tokens = calloc(len, sizeof(char*)); + if (!tokens) { + fprintf(stderr, "Failed to allocate token array.\n"); + return NULL; + } + + /* Skip array header. */ + ctx->off += 4 + 8; /* 4 for type, 8 for length. */ + + /* Load all token strings. */ + for (uint64_t j = 0; j < len; j++) { + struct gguf_string *str = + (struct gguf_string*)(ctx->data + ctx->off); + + /* Allocate and copy token string. */ + tokens[j] = malloc(str->len+1); + if (!tokens[j]) { + fprintf(stderr, "Failed to allocate token string\n"); + /* Free already allocated tokens and return error. */ + free_token_strings(tokens,j); + return NULL; + } + + memcpy(tokens[j], str->string, str->len); + tokens[j][str->len] = '\0'; + + /* Move to next string. */ + ctx->off += 8 + str->len; + + /* Show progress. */ + if ((j + 1) % 10000 == 0) { + printf(" Loaded %llu / %llu token strings...\n", + (unsigned long long)(j + 1), + (unsigned long long)len); + } + } + + *num_tokens_out = len; + printf("Successfully loaded all %llu token strings\n", + (unsigned long long)len); + return tokens; + } else { + /* Skip this key-value pair. */ + gguf_do_with_value(ctx, key.type, key.val, NULL, 0, 0, NULL); + } + } + + fprintf(stderr, "Could not find tokenizer.ggml.tokens array\n"); + return NULL; +} + +/* Process the token embeddings tensor and add to Redis */ +int process_token_embeddings(gguf_ctx *ctx, redisContext *rctx, + const char *key_name, char **token_strings, + uint64_t num_token_strings) { + gguf_tensor tensor; + int found = 0; + + /* Skip all key-value pairs to get to tensors. */ + gguf_skip_key_values_section(ctx); + + /* Look for token_embd.weight tensor. */ + const char *tensorname = "token_embd.weight"; + size_t tensorname_len = strlen(tensorname); + while (gguf_get_tensor(ctx, &tensor)) { + if (tensor.namelen == tensorname_len && + memcmp(tensor.name, tensorname, tensor.namelen) == 0) + { + found = 1; + break; + } + } + + if (!found) { + fprintf(stderr, "Could not find token_embd.weight tensor\n"); + return 0; + } + + printf("\nFound token embeddings tensor:\n"); + printf(" Type: %s\n", gguf_get_tensor_type_name(tensor.type)); + printf(" Dimensions: [%llu, %llu]\n", + (unsigned long long)tensor.dim[0], + (unsigned long long)tensor.dim[1]); + printf(" Total tokens: %llu\n", (unsigned long long)tensor.dim[1]); + printf(" Embedding dimension: %llu\n", (unsigned long long)tensor.dim[0]); + + uint64_t emb_dim = tensor.dim[0]; + uint64_t num_tokens = tensor.dim[1]; + + /* Verify that we have matching number of tokens. */ + if (num_tokens != num_token_strings) { + fprintf(stderr, "Warning: Mismatch between embedding tokens (%llu) and vocabulary (%llu)\n", + (unsigned long long)num_tokens, + (unsigned long long)num_token_strings); + /* Use the minimum to be safe */ + if (num_tokens > num_token_strings) { + num_tokens = num_token_strings; + } + } + + /* Convert tensor to float if needed, there are files where the + * embeddings are also quantized. */ + printf("Converting tensor to float format...\n"); + float *embeddings = gguf_tensor_to_float(&tensor); + if (!embeddings) { + if (errno == EINVAL) { + fprintf(stderr, "Unsupported tensor type for conversion: %s\n", + gguf_get_tensor_type_name(tensor.type)); + } else { + fprintf(stderr, "Out of memory converting tensor\n"); + } + return 0; + } + + printf("\nAdding %llu tokens to Redis key '%s'...\n", + (unsigned long long)num_tokens, key_name); + + /* Process each token. */ + for (uint64_t token_id = 0; token_id < num_tokens; token_id++) { + /* Get the token string. */ + char *token_str = token_strings[token_id]; + + /* Get the embedding vector for this token. */ + float *token_emb = embeddings + (token_id * emb_dim); + + /* Build VADD command: VADD key FP32 vector element */ + const char *argv[5]; + size_t arglen[5]; + + argv[0] = "VADD"; + arglen[0] = 4; + + argv[1] = key_name; + arglen[1] = strlen(key_name); + + argv[2] = "FP32"; + arglen[2] = 4; + + argv[3] = (char*)token_emb; + arglen[3] = emb_dim * sizeof(float); + + argv[4] = token_str; + arglen[4] = strlen(token_str); + + /* Execute the command. */ + redisReply *reply = redisCommandArgv(rctx, 5, argv, arglen); + if (!reply) { + fprintf(stderr, "Error executing VADD for token %llu: %s\n", + (unsigned long long)token_id, rctx->errstr); + free(embeddings); + return 0; + } + + if (reply->type == REDIS_REPLY_ERROR) { + fprintf(stderr, "VADD error for token %llu (%s): %s\n", + (unsigned long long)token_id, token_str, reply->str); + freeReplyObject(reply); + free(embeddings); + return 0; + } + + freeReplyObject(reply); + tokens_processed++; + + /* Progress indicator every 10000 tokens. */ + if (tokens_processed % 10000 == 0) { + printf(" Processed %llu / %llu tokens (%.1f%%)\n", + (unsigned long long)tokens_processed, + (unsigned long long)num_tokens, + (double)tokens_processed * 100.0 / num_tokens); + } + } + + printf("\nSuccessfully added all %llu tokens to Redis\n", + (unsigned long long)num_tokens); + + free(embeddings); + return 1; +} + +/* Print usage information */ +void usage(const char *progname) { + fprintf(stderr, "Usage: %s [options]\n", progname); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -h Redis host (default: 127.0.0.1)\n"); + fprintf(stderr, " -p Redis port (default: 6379)\n"); + fprintf(stderr, "\nExample:\n"); + fprintf(stderr, " %s model.gguf llm_embeddings\n", progname); + fprintf(stderr, " %s model.gguf llm_embeddings -h localhost -p 6379\n", progname); +} + +int main(int argc, char **argv) { + char *gguf_file = NULL; + char *redis_key = NULL; + char *redis_host = "127.0.0.1"; + int redis_port = 6379; + char **token_strings = NULL; + uint64_t num_token_strings = 0; + + /* Parse command line arguments */ + if (argc < 3) { + usage(argv[0]); + return 1; + } + + gguf_file = argv[1]; + redis_key = argv[2]; + + /* Parse optional arguments. */ + for (int i = 3; i < argc; i++) { + if (strcmp(argv[i], "-h") == 0 && i + 1 < argc) { + redis_host = argv[++i]; + } else if (strcmp(argv[i], "-p") == 0 && i + 1 < argc) { + redis_port = atoi(argv[++i]); + } else { + fprintf(stderr, "Unknown option: %s\n", argv[i]); + usage(argv[0]); + return 1; + } + } + + printf("==============================================\n"); + printf("GGUF to Redis Vector Set Importer\n"); + printf("==============================================\n"); + printf("GGUF file: %s\n", gguf_file); + printf("Redis target: %s:%d\n", redis_host, redis_port); + printf("Vector set key: %s\n", redis_key); + printf("==============================================\n\n"); + + /* Open GGUF file. */ + printf("Loading GGUF file...\n"); + gguf_ctx *ctx = gguf_open(gguf_file); + if (!ctx) { + fprintf(stderr, "Failed to open GGUF file: %s\n", strerror(errno)); + return 1; + } + + printf("GGUF file loaded successfully (version %d)\n", ctx->header->version); + printf(" Key-value pairs: %llu\n", (unsigned long long)ctx->header->metadata_kv_count); + printf(" Tensors: %llu\n\n", (unsigned long long)ctx->header->tensor_count); + + /* First, load all token strings into memory. */ + printf("Loading vocabulary tokens...\n"); + token_strings = load_token_strings(ctx, &num_token_strings); + if (!token_strings) { + fprintf(stderr, "Failed to load token strings\n"); + gguf_close(ctx); + return 1; + } + + /* Connect to Redis. */ + printf("\nConnecting to Redis...\n"); + redisContext *rctx = redisConnect(redis_host, redis_port); + if (!rctx || rctx->err) { + if (rctx) { + fprintf(stderr, "Redis connection error: %s\n", rctx->errstr); + redisFree(rctx); + } else { + fprintf(stderr, "Cannot allocate redis context\n"); + } + free_token_strings(token_strings, num_token_strings); + gguf_close(ctx); + return 1; + } + + printf("Connected to Redis successfully\n"); + + /* Process the embeddings, adding it to Redis. */ + if (!process_token_embeddings(ctx, rctx, redis_key, + token_strings, num_token_strings)) { + fprintf(stderr, "Failed to process token embeddings\n"); + redisFree(rctx); + free_token_strings(token_strings, num_token_strings); + gguf_close(ctx); + return 1; + } + + /* Cleanup and reporting. */ + free_token_strings(token_strings, num_token_strings); + redisFree(rctx); + gguf_close(ctx); + + printf("\n==============================================\n"); + printf("Import completed successfully!\n"); + printf("Total tokens added: %llu\n", (unsigned long long)tokens_processed); + printf("==============================================\n\n"); + printf("You can now use VSIM to find similar tokens:\n"); + printf(" VSIM %s ELE \"apple\" COUNT 10\n", redis_key); + printf(" VSIM %s ELE \"python\" WITHSCORES\n", redis_key); + printf(" VCARD %s # Check total count\n", redis_key); + return 0; +}