fix: check find() return value before adding offset in try_fix_tokenizer (#4923)

* fix: check find() return value before adding offset in try_fix_tokenizer

The `str.find()` result was checked for -1 only after adding
`len(find_text)`, turning the guard into dead code. When the substring
is absent, `start` becomes `len(find_text) - 1` (a positive number),
so the `if start == -1: continue` never triggers and the subsequent
slice extracts garbage from the tokenizer string.

Split the find and offset into two steps so the -1 check works correctly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Add defensive guards for token_id None and end find() returning -1

- Skip loop iteration early when token_id is None to avoid constructing
  a find_text that can never match valid JSON
- Guard end = tokenizer_string.find('",', start) against -1 to prevent
  silent garbage extraction from malformed tokenizer strings

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Daniel Han <danielhanchen@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Ricardo-M-L 2026-04-09 21:15:46 +08:00 committed by GitHub
parent dc16e0c65b
commit d5525e8bbb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 10 additions and 7 deletions

View file

@@ -174,9 +174,7 @@ def test_raw_text_loader():
)
# Mixed paragraph + Unicode whitespace realistic input
mixed = preprocessor.clean_text(
"Section\u00a01\r\n\r\nBody\ftext\u202Fhere"
)
mixed = preprocessor.clean_text("Section\u00a01\r\n\r\nBody\ftext\u202fhere")
assert mixed == "Section 1\n\nBody text here", (
"Should preserve paragraph breaks and normalize Unicode "
"whitespace simultaneously"
@@ -197,7 +195,7 @@ def test_raw_text_loader():
# becomes "word1 word2" on a second call.
assert preprocessor.clean_text("word1 \u00a9 word2") == "word1 word2"
assert preprocessor.clean_text("a \u00e9 b") == "a b"
assert preprocessor.clean_text("prefix \U0001F600 suffix") == "prefix suffix"
assert preprocessor.clean_text("prefix \U0001f600 suffix") == "prefix suffix"
# Stripping a non-ASCII character adjacent to a newline must not
# leave a stray leading/trailing space on the neighbouring line.
@@ -212,7 +210,7 @@ def test_raw_text_loader():
" messy text \n\n\n ",
"Line 1\r\n\r\n\r\nLine 2",
"hello\u00a0world",
"Section\u00a01\r\n\r\nBody\ftext\u202Fhere",
"Section\u00a01\r\n\r\nBody\ftext\u202fhere",
"word1 \u00a9 word2",
"a \u00e9 b",
]

View file

@@ -95,13 +95,18 @@ def try_fix_tokenizer(tokenizer, prepend = True):
if token is None:
continue
token_id = getattr(tokenizer, token_name + "_id", None)
if token_id is None:
continue
# Locate the token's id mapping in the string
find_text = f'"id":{token_id},"content":"'
start = tokenizer_string.find(find_text) + len(find_text)
if start == -1:
find_pos = tokenizer_string.find(find_text)
if find_pos == -1:
continue
start = find_pos + len(find_text)
end = tokenizer_string.find('",', start)
if end == -1:
continue
bad_token = tokenizer_string[start:end]
# Check if token is the actual same one - if not, edit it