mirror of
https://github.com/antirez/gguf-tools.git
synced 2025-09-16 09:38:08 +08:00
emb2redis utility added.
This commit is contained in:
1
utils/.gitignore
vendored
Normal file
1
utils/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
hiredis
|
11
utils/Makefile
Normal file
11
utils/Makefile
Normal file
@@ -0,0 +1,11 @@
|
||||
# Build the emb2redis utility. hiredis must be cloned and built by hand
# inside this directory first (see README.md).

# 'all' and 'clean' are commands, not files: never let a stray file with
# the same name make them look up to date.
.PHONY: all clean

all: emb2redis

# hiredis is not built automatically: stop with a hint if the static
# library is missing.
hiredis/libhiredis.a:
	@echo "Please, enter the hiredis directory and build it."
	@exit 1

# Every source compiled below is listed as a prerequisite so that editing
# any of them triggers a rebuild.
emb2redis: emb2redis.c ../gguflib.c ../gguflib.h ../sds.c ../fp16.c hiredis/libhiredis.a
	$(CC) emb2redis.c ../gguflib.c ../sds.c ../fp16.c hiredis/libhiredis.a -o emb2redis -O2 -Wall -W

clean:
	rm -f emb2redis
|
25
utils/README.md
Normal file
25
utils/README.md
Normal file
@@ -0,0 +1,25 @@
|
||||
This is a simple program that gets the token embeddings from an LLM in
|
||||
GGUF format, and adds them into a [Redis vector set](https://github.com/redis/redis/tree/unstable/modules/vector-sets). After the embeddings are added into
|
||||
Redis you can easily check what are the embeddings more similar to others, an
|
||||
operation that allows to build some mental model about the tokens embedding
|
||||
space the LLM learned during training (spoiler: it is quite different than
|
||||
word2vec or alike: often certain words are near to unexpected words:
|
||||
that's likely due to the fact we can't fully appreciate how
|
||||
the models use all the components of the embedding in the Transformer blocks
|
||||
inference).
|
||||
|
||||
To compile the program, stay in this directory and perform the following:
|
||||
|
||||
1. git clone https://github.com/redis/hiredis
|
||||
2. cd hiredis; make
|
||||
3. cd ..
|
||||
4. make
|
||||
|
||||
Then do something like:
|
||||
|
||||
./emb2redis my_llm.gguf llm_embeddings_key -h 127.0.0.1 -p 6379
|
||||
|
||||
At the end, try something like this:
|
||||
|
||||
redis-cli VSIM llm_embeddings_key ELE "banana"
|
||||
|
345
utils/emb2redis.c
Normal file
345
utils/emb2redis.c
Normal file
@@ -0,0 +1,345 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <errno.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "../gguflib.h"
|
||||
#include "hiredis/hiredis.h"
|
||||
|
||||
uint64_t tokens_processed = 0;
|
||||
|
||||
/* Release a token table: first each of the 'num_tokens' strings it holds,
 * then the table itself. A NULL table is accepted and ignored. */
void free_token_strings(char **tokens, uint64_t num_tokens) {
    if (tokens == NULL) return;

    char **end = tokens + num_tokens;
    for (char **cur = tokens; cur != end; cur++) free(*cur);
    free(tokens);
}
|
||||
|
||||
/* Load all token strings into an array, so that we can lookup
|
||||
* the token by its ID in constant time.
|
||||
* Returns an array of token strings that must be freed by the caller. */
|
||||
char **load_token_strings(gguf_ctx *ctx, uint64_t *num_tokens_out) {
|
||||
gguf_key key;
|
||||
char **tokens = NULL;
|
||||
*num_tokens_out = 0;
|
||||
|
||||
gguf_rewind(ctx);
|
||||
|
||||
/* Look for tokenizer.ggml.tokens array. */
|
||||
const char *tensorname = "tokenizer.ggml.tokens";
|
||||
size_t tensorname_len = strlen(tensorname);
|
||||
while (gguf_get_key(ctx, &key)) {
|
||||
if (key.namelen == tensorname_len &&
|
||||
memcmp(key.name, tensorname, key.namelen) == 0 &&
|
||||
key.type == GGUF_VALUE_TYPE_ARRAY)
|
||||
{
|
||||
/* Found the tokens array. */
|
||||
uint32_t etype = key.val->array.type;
|
||||
uint64_t len = key.val->array.len;
|
||||
|
||||
if (etype != GGUF_VALUE_TYPE_STRING) {
|
||||
fprintf(stderr,
|
||||
"Unexpected token type in array (not string)\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
printf("Found %llu tokens in vocabulary\n",
|
||||
(unsigned long long)len);
|
||||
|
||||
/* Allocate array for all token strings. */
|
||||
tokens = calloc(len, sizeof(char*));
|
||||
if (!tokens) {
|
||||
fprintf(stderr, "Failed to allocate token array.\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Skip array header. */
|
||||
ctx->off += 4 + 8; /* 4 for type, 8 for length. */
|
||||
|
||||
/* Load all token strings. */
|
||||
for (uint64_t j = 0; j < len; j++) {
|
||||
struct gguf_string *str =
|
||||
(struct gguf_string*)(ctx->data + ctx->off);
|
||||
|
||||
/* Allocate and copy token string. */
|
||||
tokens[j] = malloc(str->len+1);
|
||||
if (!tokens[j]) {
|
||||
fprintf(stderr, "Failed to allocate token string\n");
|
||||
/* Free already allocated tokens and return error. */
|
||||
free_token_strings(tokens,j);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
memcpy(tokens[j], str->string, str->len);
|
||||
tokens[j][str->len] = '\0';
|
||||
|
||||
/* Move to next string. */
|
||||
ctx->off += 8 + str->len;
|
||||
|
||||
/* Show progress. */
|
||||
if ((j + 1) % 10000 == 0) {
|
||||
printf(" Loaded %llu / %llu token strings...\n",
|
||||
(unsigned long long)(j + 1),
|
||||
(unsigned long long)len);
|
||||
}
|
||||
}
|
||||
|
||||
*num_tokens_out = len;
|
||||
printf("Successfully loaded all %llu token strings\n",
|
||||
(unsigned long long)len);
|
||||
return tokens;
|
||||
} else {
|
||||
/* Skip this key-value pair. */
|
||||
gguf_do_with_value(ctx, key.type, key.val, NULL, 0, 0, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "Could not find tokenizer.ggml.tokens array\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Process the token embeddings tensor and add to Redis */
|
||||
int process_token_embeddings(gguf_ctx *ctx, redisContext *rctx,
|
||||
const char *key_name, char **token_strings,
|
||||
uint64_t num_token_strings) {
|
||||
gguf_tensor tensor;
|
||||
int found = 0;
|
||||
|
||||
/* Skip all key-value pairs to get to tensors. */
|
||||
gguf_skip_key_values_section(ctx);
|
||||
|
||||
/* Look for token_embd.weight tensor. */
|
||||
const char *tensorname = "token_embd.weight";
|
||||
size_t tensorname_len = strlen(tensorname);
|
||||
while (gguf_get_tensor(ctx, &tensor)) {
|
||||
if (tensor.namelen == tensorname_len &&
|
||||
memcmp(tensor.name, tensorname, tensor.namelen) == 0)
|
||||
{
|
||||
found = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
fprintf(stderr, "Could not find token_embd.weight tensor\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
printf("\nFound token embeddings tensor:\n");
|
||||
printf(" Type: %s\n", gguf_get_tensor_type_name(tensor.type));
|
||||
printf(" Dimensions: [%llu, %llu]\n",
|
||||
(unsigned long long)tensor.dim[0],
|
||||
(unsigned long long)tensor.dim[1]);
|
||||
printf(" Total tokens: %llu\n", (unsigned long long)tensor.dim[1]);
|
||||
printf(" Embedding dimension: %llu\n", (unsigned long long)tensor.dim[0]);
|
||||
|
||||
uint64_t emb_dim = tensor.dim[0];
|
||||
uint64_t num_tokens = tensor.dim[1];
|
||||
|
||||
/* Verify that we have matching number of tokens. */
|
||||
if (num_tokens != num_token_strings) {
|
||||
fprintf(stderr, "Warning: Mismatch between embedding tokens (%llu) and vocabulary (%llu)\n",
|
||||
(unsigned long long)num_tokens,
|
||||
(unsigned long long)num_token_strings);
|
||||
/* Use the minimum to be safe */
|
||||
if (num_tokens > num_token_strings) {
|
||||
num_tokens = num_token_strings;
|
||||
}
|
||||
}
|
||||
|
||||
/* Convert tensor to float if needed, there are files where the
|
||||
* embeddings are also quantized. */
|
||||
printf("Converting tensor to float format...\n");
|
||||
float *embeddings = gguf_tensor_to_float(&tensor);
|
||||
if (!embeddings) {
|
||||
if (errno == EINVAL) {
|
||||
fprintf(stderr, "Unsupported tensor type for conversion: %s\n",
|
||||
gguf_get_tensor_type_name(tensor.type));
|
||||
} else {
|
||||
fprintf(stderr, "Out of memory converting tensor\n");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
printf("\nAdding %llu tokens to Redis key '%s'...\n",
|
||||
(unsigned long long)num_tokens, key_name);
|
||||
|
||||
/* Process each token. */
|
||||
for (uint64_t token_id = 0; token_id < num_tokens; token_id++) {
|
||||
/* Get the token string. */
|
||||
char *token_str = token_strings[token_id];
|
||||
|
||||
/* Get the embedding vector for this token. */
|
||||
float *token_emb = embeddings + (token_id * emb_dim);
|
||||
|
||||
/* Build VADD command: VADD key FP32 vector element */
|
||||
const char *argv[5];
|
||||
size_t arglen[5];
|
||||
|
||||
argv[0] = "VADD";
|
||||
arglen[0] = 4;
|
||||
|
||||
argv[1] = key_name;
|
||||
arglen[1] = strlen(key_name);
|
||||
|
||||
argv[2] = "FP32";
|
||||
arglen[2] = 4;
|
||||
|
||||
argv[3] = (char*)token_emb;
|
||||
arglen[3] = emb_dim * sizeof(float);
|
||||
|
||||
argv[4] = token_str;
|
||||
arglen[4] = strlen(token_str);
|
||||
|
||||
/* Execute the command. */
|
||||
redisReply *reply = redisCommandArgv(rctx, 5, argv, arglen);
|
||||
if (!reply) {
|
||||
fprintf(stderr, "Error executing VADD for token %llu: %s\n",
|
||||
(unsigned long long)token_id, rctx->errstr);
|
||||
free(embeddings);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (reply->type == REDIS_REPLY_ERROR) {
|
||||
fprintf(stderr, "VADD error for token %llu (%s): %s\n",
|
||||
(unsigned long long)token_id, token_str, reply->str);
|
||||
freeReplyObject(reply);
|
||||
free(embeddings);
|
||||
return 0;
|
||||
}
|
||||
|
||||
freeReplyObject(reply);
|
||||
tokens_processed++;
|
||||
|
||||
/* Progress indicator every 10000 tokens. */
|
||||
if (tokens_processed % 10000 == 0) {
|
||||
printf(" Processed %llu / %llu tokens (%.1f%%)\n",
|
||||
(unsigned long long)tokens_processed,
|
||||
(unsigned long long)num_tokens,
|
||||
(double)tokens_processed * 100.0 / num_tokens);
|
||||
}
|
||||
}
|
||||
|
||||
printf("\nSuccessfully added all %llu tokens to Redis\n",
|
||||
(unsigned long long)num_tokens);
|
||||
|
||||
free(embeddings);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Write the command line help to stderr. 'progname' is the name shown in
 * the synopsis and in the two example invocations. */
void usage(const char *progname) {
    FILE *out = stderr;

    fprintf(out, "Usage: %s <gguf-file> <redis-key> [options]\n", progname);
    fputs("Options:\n", out);
    fputs(" -h <host> Redis host (default: 127.0.0.1)\n", out);
    fputs(" -p <port> Redis port (default: 6379)\n", out);
    fputs("\nExample:\n", out);
    fprintf(out, " %s model.gguf llm_embeddings\n", progname);
    fprintf(out, " %s model.gguf llm_embeddings -h localhost -p 6379\n", progname);
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
char *gguf_file = NULL;
|
||||
char *redis_key = NULL;
|
||||
char *redis_host = "127.0.0.1";
|
||||
int redis_port = 6379;
|
||||
char **token_strings = NULL;
|
||||
uint64_t num_token_strings = 0;
|
||||
|
||||
/* Parse command line arguments */
|
||||
if (argc < 3) {
|
||||
usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
gguf_file = argv[1];
|
||||
redis_key = argv[2];
|
||||
|
||||
/* Parse optional arguments. */
|
||||
for (int i = 3; i < argc; i++) {
|
||||
if (strcmp(argv[i], "-h") == 0 && i + 1 < argc) {
|
||||
redis_host = argv[++i];
|
||||
} else if (strcmp(argv[i], "-p") == 0 && i + 1 < argc) {
|
||||
redis_port = atoi(argv[++i]);
|
||||
} else {
|
||||
fprintf(stderr, "Unknown option: %s\n", argv[i]);
|
||||
usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("==============================================\n");
|
||||
printf("GGUF to Redis Vector Set Importer\n");
|
||||
printf("==============================================\n");
|
||||
printf("GGUF file: %s\n", gguf_file);
|
||||
printf("Redis target: %s:%d\n", redis_host, redis_port);
|
||||
printf("Vector set key: %s\n", redis_key);
|
||||
printf("==============================================\n\n");
|
||||
|
||||
/* Open GGUF file. */
|
||||
printf("Loading GGUF file...\n");
|
||||
gguf_ctx *ctx = gguf_open(gguf_file);
|
||||
if (!ctx) {
|
||||
fprintf(stderr, "Failed to open GGUF file: %s\n", strerror(errno));
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("GGUF file loaded successfully (version %d)\n", ctx->header->version);
|
||||
printf(" Key-value pairs: %llu\n", (unsigned long long)ctx->header->metadata_kv_count);
|
||||
printf(" Tensors: %llu\n\n", (unsigned long long)ctx->header->tensor_count);
|
||||
|
||||
/* First, load all token strings into memory. */
|
||||
printf("Loading vocabulary tokens...\n");
|
||||
token_strings = load_token_strings(ctx, &num_token_strings);
|
||||
if (!token_strings) {
|
||||
fprintf(stderr, "Failed to load token strings\n");
|
||||
gguf_close(ctx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Connect to Redis. */
|
||||
printf("\nConnecting to Redis...\n");
|
||||
redisContext *rctx = redisConnect(redis_host, redis_port);
|
||||
if (!rctx || rctx->err) {
|
||||
if (rctx) {
|
||||
fprintf(stderr, "Redis connection error: %s\n", rctx->errstr);
|
||||
redisFree(rctx);
|
||||
} else {
|
||||
fprintf(stderr, "Cannot allocate redis context\n");
|
||||
}
|
||||
free_token_strings(token_strings, num_token_strings);
|
||||
gguf_close(ctx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("Connected to Redis successfully\n");
|
||||
|
||||
/* Process the embeddings, adding it to Redis. */
|
||||
if (!process_token_embeddings(ctx, rctx, redis_key,
|
||||
token_strings, num_token_strings)) {
|
||||
fprintf(stderr, "Failed to process token embeddings\n");
|
||||
redisFree(rctx);
|
||||
free_token_strings(token_strings, num_token_strings);
|
||||
gguf_close(ctx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Cleanup and reporting. */
|
||||
free_token_strings(token_strings, num_token_strings);
|
||||
redisFree(rctx);
|
||||
gguf_close(ctx);
|
||||
|
||||
printf("\n==============================================\n");
|
||||
printf("Import completed successfully!\n");
|
||||
printf("Total tokens added: %llu\n", (unsigned long long)tokens_processed);
|
||||
printf("==============================================\n\n");
|
||||
printf("You can now use VSIM to find similar tokens:\n");
|
||||
printf(" VSIM %s ELE \"apple\" COUNT 10\n", redis_key);
|
||||
printf(" VSIM %s ELE \"python\" WITHSCORES\n", redis_key);
|
||||
printf(" VCARD %s # Check total count\n", redis_key);
|
||||
return 0;
|
||||
}
|
Reference in New Issue
Block a user