diff --git a/tests/test_raw_text.py b/tests/test_raw_text.py index 604326976..d8289fed2 100644 --- a/tests/test_raw_text.py +++ b/tests/test_raw_text.py @@ -174,9 +174,7 @@ def test_raw_text_loader(): ) # Mixed paragraph + Unicode whitespace realistic input - mixed = preprocessor.clean_text( - "Section\u00a01\r\n\r\nBody\ftext\u202Fhere" - ) + mixed = preprocessor.clean_text("Section\u00a01\r\n\r\nBody\ftext\u202fhere") assert mixed == "Section 1\n\nBody text here", ( "Should preserve paragraph breaks and normalize Unicode " "whitespace simultaneously" @@ -197,7 +195,7 @@ def test_raw_text_loader(): # becomes "word1 word2" on a second call. assert preprocessor.clean_text("word1 \u00a9 word2") == "word1 word2" assert preprocessor.clean_text("a \u00e9 b") == "a b" - assert preprocessor.clean_text("prefix \U0001F600 suffix") == "prefix suffix" + assert preprocessor.clean_text("prefix \U0001f600 suffix") == "prefix suffix" # Stripping a non-ASCII character adjacent to a newline must not # leave a stray leading/trailing space on the neighbouring line. @@ -212,7 +210,7 @@ def test_raw_text_loader(): " messy text \n\n\n ", "Line 1\r\n\r\n\r\nLine 2", "hello\u00a0world", - "Section\u00a01\r\n\r\nBody\ftext\u202Fhere", + "Section\u00a01\r\n\r\nBody\ftext\u202fhere", "word1 \u00a9 word2", "a \u00e9 b", ] diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 8be6bb5a5..2145fa5da 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -95,13 +95,18 @@ def try_fix_tokenizer(tokenizer, prepend = True): if token is None: continue token_id = getattr(tokenizer, token_name + "_id", None) + if token_id is None: + continue # Locate the token's id mapping in the string find_text = f'"id":{token_id},"content":"' - start = tokenizer_string.find(find_text) + len(find_text) - if start == -1: + find_pos = tokenizer_string.find(find_text) + if find_pos == -1: continue + start = find_pos + len(find_text) end = tokenizer_string.find('",', start) + if end == -1: + continue bad_token = tokenizer_string[start:end] # Check if token is the actual same one - if not, edit it