fix: check find() return value before adding offset in try_fix_tokenizer (#4923)

* fix: check find() return value before adding offset in try_fix_tokenizer

The `str.find()` result was checked for -1 only after adding
`len(find_text)`, turning the guard into dead code. When the substring
is absent, `start` becomes `len(find_text) - 1` (a positive number),
so the `if start == -1: continue` never triggers and the subsequent
slice extracts garbage from the tokenizer string.

Split the find and offset into two steps so the -1 check works correctly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Add defensive guards for token_id None and end find() returning -1

- Skip loop iteration early when token_id is None to avoid constructing
  a find_text that can never match valid JSON
- Guard end = tokenizer_string.find('",', start) against -1 to prevent
  silent garbage extraction from malformed tokenizer strings

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Daniel Han <danielhanchen@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Ricardo-M-L 2026-04-09 21:15:46 +08:00 committed by GitHub
parent dc16e0c65b
commit d5525e8bbb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 10 additions and 7 deletions

View file

@@ -174,9 +174,7 @@ def test_raw_text_loader():
)
# Mixed paragraph + Unicode whitespace realistic input
mixed = preprocessor.clean_text(
"Section\u00a01\r\n\r\nBody\ftext\u202Fhere"
)
mixed = preprocessor.clean_text("Section\u00a01\r\n\r\nBody\ftext\u202fhere")
assert mixed == "Section 1\n\nBody text here", (
"Should preserve paragraph breaks and normalize Unicode "
"whitespace simultaneously"
@@ -197,7 +195,7 @@ def test_raw_text_loader():
# becomes "word1 word2" on a second call.
assert preprocessor.clean_text("word1 \u00a9 word2") == "word1 word2"
assert preprocessor.clean_text("a \u00e9 b") == "a b"
assert preprocessor.clean_text("prefix \U0001F600 suffix") == "prefix suffix"
assert preprocessor.clean_text("prefix \U0001f600 suffix") == "prefix suffix"
# Stripping a non-ASCII character adjacent to a newline must not
# leave a stray leading/trailing space on the neighbouring line.
@@ -212,7 +210,7 @@ def test_raw_text_loader():
" messy text \n\n\n ",
"Line 1\r\n\r\n\r\nLine 2",
"hello\u00a0world",
"Section\u00a01\r\n\r\nBody\ftext\u202Fhere",
"Section\u00a01\r\n\r\nBody\ftext\u202fhere",
"word1 \u00a9 word2",
"a \u00e9 b",
]

View file

@@ -95,13 +95,18 @@ def try_fix_tokenizer(tokenizer, prepend = True):
if token is None:
continue
token_id = getattr(tokenizer, token_name + "_id", None)
if token_id is None:
continue
# Locate the token's id mapping in the string
find_text = f'"id":{token_id},"content":"'
start = tokenizer_string.find(find_text) + len(find_text)
if start == -1:
find_pos = tokenizer_string.find(find_text)
if find_pos == -1:
continue
start = find_pos + len(find_text)
end = tokenizer_string.find('",', start)
if end == -1:
continue
bad_token = tokenizer_string[start:end]
# Check if token is the actual same one - if not, edit it