Mirror of https://github.com/ml-explore/mlx-examples.git (synced 2025-11-01 03:28:08 +08:00)
	Replace unicode errors instead of raising exception (#1146)
Author: Angelos Katharopoulos, committed by GitHub
parent 06af3c9b0e
commit 19abf3dcaa
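For context: the change swaps strict UTF-8 decoding for the "replace" error handler. A minimal sketch of the difference, using only standard Python (the byte string below is a made-up example, not data from this repository):

    # Standard-library behavior only; illustrative, not code from this commit.
    broken = b"\xe2\x9c"  # first two bytes of a three-byte UTF-8 sequence, i.e. a truncated character

    # Strict decoding (the old behavior) raises on the incomplete sequence.
    try:
        broken.decode("utf-8")
    except UnicodeDecodeError as exc:
        print("strict decode raised:", exc.reason)

    # errors="replace" (what this commit switches to) substitutes U+FFFD and keeps going.
    print(broken.decode("utf-8", "replace"))  # prints the replacement character "\ufffd"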
@@ -3,8 +3,6 @@ from functools import partial
 
 from transformers import AutoTokenizer
 
-REPLACEMENT_CHAR = "\ufffd"
-
 
 class StreamingDetokenizer:
     """The streaming detokenizer interface so that we can detokenize one token at a time.
@@ -51,11 +49,9 @@ class StreamingDetokenizer:
     def last_segment(self):
         """Return the last segment of readable text since last time this property was accessed."""
         text = self.text
-        if text and text[-1] != REPLACEMENT_CHAR:
-            segment = text[self.offset :]
-            self.offset = len(text)
-            return segment
-        return ""
+        segment = text[self.offset :]
+        self.offset = len(text)
+        return segment
 
 
 class NaiveStreamingDetokenizer(StreamingDetokenizer):
@@ -132,7 +128,7 @@ class SPMStreamingDetokenizer(StreamingDetokenizer):
         self.tokens = []
 
     def _flush(self):
-        text = self._unflushed.replace(self._sep, b" ").decode("utf-8")
+        text = self._unflushed.replace(self._sep, b" ").decode("utf-8", "replace")
         if not self.text and self.trim_space and text and text[0] == " ":
             text = text[1:]
         self.text += text
@@ -202,7 +198,7 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
         if is_added or self._byte_decoder[v[0]] == 32:
             current_text = bytearray(
                 self._byte_decoder[c] for c in self._unflushed
-            ).decode("utf-8")
+            ).decode("utf-8", "replace")
             self.text += self._maybe_trim_space(current_text)
             if is_added:
                 self.text += v
@@ -214,7 +210,8 @@ class BPEStreamingDetokenizer(StreamingDetokenizer):
 
     def finalize(self):
         current_text = bytearray(self._byte_decoder[c] for c in self._unflushed).decode(
-            "utf-8"
+            "utf-8",
+            "replace",
         )
         self.text += self._maybe_trim_space(current_text)
         self._unflushed = ""
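The streaming case this guards against, sketched with a plain byte buffer rather than the detokenizer classes above (buffer contents are hypothetical):

    # Illustration only; not the repository's detokenizer code.
    unflushed = b"ok \xf0\x9f"  # buffer currently ends mid-way through a 4-byte emoji

    # A strict decode of this flush would raise UnicodeDecodeError and abort the stream;
    # with errors="replace" the partial character is emitted as a replacement character
    # and decoding simply continues on the next flush.
    print(unflushed.decode("utf-8", "replace"))  # "ok " followed by "\ufffd"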