From 0340113e02dbb3b29c656132232ca24835cd4f79 Mon Sep 17 00:00:00 2001
From: ZHAOKAI WANG
Date: Thu, 1 Feb 2024 11:27:29 +0800
Subject: [PATCH] BUG FIX: Decoding results in garbled text when multiple
 tokens represent a single character (e.g., Chinese). (#398)

* Decoding results in garbled text when multiple tokens represent a single
  character (e.g., Chinese).

* Decoding results in garbled text when multiple tokens represent a single
  character (e.g., Chinese).
---
 lora/lora.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lora/lora.py b/lora/lora.py
index 9efe8893..d0ff032b 100644
--- a/lora/lora.py
+++ b/lora/lora.py
@@ -292,8 +292,9 @@ def generate(model, prompt, tokenizer, args):
         tokens.append(token.item())
 
         s = tokenizer.decode(tokens)
-        print(s[skip:], end="", flush=True)
-        skip = len(s)
+        if len(s) - skip > 1:
+            print(s[skip:-1], end="", flush=True)
+            skip = len(s) - 1
     print(tokenizer.decode(tokens)[skip:], flush=True)
     print("=" * 10)
     if len(tokens) == 0:
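
Reviewer note (illustration only, not part of the patch): the sketch below shows the
hold-back-one-character idea the diff applies. It does not use the project's real
tokenizer; ByteTokenizer is a hypothetical stand-in that splits text into raw UTF-8
bytes, so a single Chinese character spans several tokens and an intermediate decode
ends in U+FFFD, which is the garbling the old code streamed to the terminal.

class ByteTokenizer:
    """Hypothetical tokenizer: one token per UTF-8 byte."""

    def encode(self, text):
        return list(text.encode("utf-8"))

    def decode(self, tokens):
        # An incomplete trailing byte sequence decodes to U+FFFD, which is
        # exactly what used to leak into the streamed output.
        return bytes(tokens).decode("utf-8", errors="replace")


def stream_decode(token_ids, tokenizer):
    """Yield printable chunks, holding back the last decoded character
    until it can no longer change (the logic this patch introduces)."""
    tokens, skip = [], 0
    for tok in token_ids:
        tokens.append(tok)
        s = tokenizer.decode(tokens)
        # Emit only up to the second-to-last character: the final one may
        # still be a partially decoded multi-token character.
        if len(s) - skip > 1:
            yield s[skip:-1]
            skip = len(s) - 1
    # Flush whatever remains once generation is finished.
    yield tokenizer.decode(tokens)[skip:]


if __name__ == "__main__":
    tok = ByteTokenizer()
    ids = tok.encode("你好, world")
    print("".join(stream_decode(ids, tok)))  # prints "你好, world" with no U+FFFD

The trade-off of this approach is that output trails the generated text by at most
one character, which is invisible in practice but guarantees that no half-decoded
character is ever printed.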