mirror of
				https://github.com/ml-explore/mlx-examples.git
				synced 2025-11-01 03:28:08 +08:00 
			
		
		
		
	BPE stream without space (#1154)
* BPE streaming detokenization without space * version bump
This commit is contained in:
		| @@ -1,3 +1,3 @@ | ||||
| # Copyright © 2023-2024 Apple Inc. | ||||
|  | ||||
| __version__ = "0.20.2" | ||||
| __version__ = "0.20.4" | ||||
|   | ||||
| @@ -195,18 +195,16 @@ class BPEStreamingDetokenizer(StreamingDetokenizer): | ||||
|         self.tokens.append(token) | ||||
|         v = self.tokenmap[token] | ||||
|         is_added = token in self._added_ids | ||||
|         if is_added or self._byte_decoder[v[0]] == 32: | ||||
|             current_text = bytearray( | ||||
|                 self._byte_decoder[c] for c in self._unflushed | ||||
|             ).decode("utf-8", "replace") | ||||
|             self.text += self._maybe_trim_space(current_text) | ||||
|             if is_added: | ||||
|                 self.text += v | ||||
|                 self._unflushed = "" | ||||
|             else: | ||||
|                 self._unflushed = v | ||||
|         else: | ||||
|         if not is_added: | ||||
|             self._unflushed += v | ||||
|         text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode( | ||||
|             "utf-8", "replace" | ||||
|         ) | ||||
|         if is_added: | ||||
|             text += v | ||||
|         if not text.endswith("\ufffd"): | ||||
|             self.text += self._maybe_trim_space(text) | ||||
|             self._unflushed = "" | ||||
|  | ||||
|     def finalize(self): | ||||
|         current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode( | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Awni Hannun
					Awni Hannun