mirror of
https://github.com/unslothai/unsloth
synced 2026-04-21 13:37:39 +00:00
fix: check find() return value before adding offset in try_fix_tokenizer (#4923)
* fix: check find() return value before adding offset in try_fix_tokenizer
The `str.find()` result was checked for -1 only after adding
`len(find_text)`, turning the guard into dead code. When the substring
is absent, `start` becomes `len(find_text) - 1` (a positive number),
so the `if start == -1: continue` never triggers and the subsequent
slice extracts garbage from the tokenizer string.
Split the find and offset into two steps so the -1 check works correctly.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
* Add defensive guards for `token_id` being None and for the end-position `find()` returning -1
- Skip loop iteration early when token_id is None to avoid constructing
a find_text that can never match valid JSON
- Guard end = tokenizer_string.find('",', start) against -1 to prevent
silent garbage extraction from malformed tokenizer strings
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Daniel Han <danielhanchen@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
dc16e0c65b
commit
d5525e8bbb
2 changed files with 10 additions and 7 deletions
|
|
@ -174,9 +174,7 @@ def test_raw_text_loader():
|
|||
)
|
||||
|
||||
# Mixed paragraph + Unicode whitespace realistic input
|
||||
mixed = preprocessor.clean_text(
|
||||
"Section\u00a01\r\n\r\nBody\ftext\u202Fhere"
|
||||
)
|
||||
mixed = preprocessor.clean_text("Section\u00a01\r\n\r\nBody\ftext\u202fhere")
|
||||
assert mixed == "Section 1\n\nBody text here", (
|
||||
"Should preserve paragraph breaks and normalize Unicode "
|
||||
"whitespace simultaneously"
|
||||
|
|
@ -197,7 +195,7 @@ def test_raw_text_loader():
|
|||
# becomes "word1 word2" on a second call.
|
||||
assert preprocessor.clean_text("word1 \u00a9 word2") == "word1 word2"
|
||||
assert preprocessor.clean_text("a \u00e9 b") == "a b"
|
||||
assert preprocessor.clean_text("prefix \U0001F600 suffix") == "prefix suffix"
|
||||
assert preprocessor.clean_text("prefix \U0001f600 suffix") == "prefix suffix"
|
||||
|
||||
# Stripping a non-ASCII character adjacent to a newline must not
|
||||
# leave a stray leading/trailing space on the neighbouring line.
|
||||
|
|
@ -212,7 +210,7 @@ def test_raw_text_loader():
|
|||
" messy text \n\n\n ",
|
||||
"Line 1\r\n\r\n\r\nLine 2",
|
||||
"hello\u00a0world",
|
||||
"Section\u00a01\r\n\r\nBody\ftext\u202Fhere",
|
||||
"Section\u00a01\r\n\r\nBody\ftext\u202fhere",
|
||||
"word1 \u00a9 word2",
|
||||
"a \u00e9 b",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -95,13 +95,18 @@ def try_fix_tokenizer(tokenizer, prepend = True):
|
|||
if token is None:
|
||||
continue
|
||||
token_id = getattr(tokenizer, token_name + "_id", None)
|
||||
if token_id is None:
|
||||
continue
|
||||
|
||||
# Locate the token's id mapping in the string
|
||||
find_text = f'"id":{token_id},"content":"'
|
||||
start = tokenizer_string.find(find_text) + len(find_text)
|
||||
if start == -1:
|
||||
find_pos = tokenizer_string.find(find_text)
|
||||
if find_pos == -1:
|
||||
continue
|
||||
start = find_pos + len(find_text)
|
||||
end = tokenizer_string.find('",', start)
|
||||
if end == -1:
|
||||
continue
|
||||
|
||||
bad_token = tokenizer_string[start:end]
|
||||
# Check if token is the actual same one - if not, edit it
|
||||
|
|
|
|||
Loading…
Reference in a new issue