From 8fe9539af76075405b2c3071ba9657aa921d749d Mon Sep 17 00:00:00 2001
From: Awni Hannun
Date: Sun, 27 Oct 2024 15:06:07 -0700
Subject: [PATCH] Fix detokenizer space match for quote (#1072)

* fix + test

* remove transformer flax/torch warning

* format
---
 llms/mlx_lm/__init__.py        | 5 +++++
 llms/mlx_lm/tokenizer_utils.py | 2 +-
 llms/tests/test_tokenizers.py  | 3 +++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/llms/mlx_lm/__init__.py b/llms/mlx_lm/__init__.py
index 502c78e5..538be927 100644
--- a/llms/mlx_lm/__init__.py
+++ b/llms/mlx_lm/__init__.py
@@ -1,4 +1,9 @@
 # Copyright © 2023-2024 Apple Inc.
 
+import os
+
 from ._version import __version__
+
+os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
+
 from .utils import convert, generate, load, stream_generate
diff --git a/llms/mlx_lm/tokenizer_utils.py b/llms/mlx_lm/tokenizer_utils.py
index 78ec2ff8..0cbc3b9b 100644
--- a/llms/mlx_lm/tokenizer_utils.py
+++ b/llms/mlx_lm/tokenizer_utils.py
@@ -169,7 +169,7 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
     """
 
     _byte_decoder = None
-    _space_matches = (".", "?", "!", ",", "'", "n't", "'m", "'s", "'ve", "'re")
+    _space_matches = (".", "?", "!", ",", "n't", "'m", "'s", "'ve", "'re")
 
     def __init__(self, tokenizer):
 
diff --git a/llms/tests/test_tokenizers.py b/llms/tests/test_tokenizers.py
index 7b4828b1..03445c1f 100644
--- a/llms/tests/test_tokenizers.py
+++ b/llms/tests/test_tokenizers.py
@@ -51,6 +51,9 @@ class TestTokenizers(unittest.TestCase):
         tokens = tokenizer.encode("3 3")
         check(tokens)
 
+        tokens = tokenizer.encode("import 'package:flutter/material.dart';")
+        check(tokens)
+
     def test_tokenizers(self):
         tokenizer_repos = [
             ("mlx-community/Qwen1.5-0.5B-Chat-4bit", BPEStreamingDetokenizer),