diff --git a/tests/test_raw_text.py b/tests/test_raw_text.py index 7c7272a55..9f2e8cda4 100644 --- a/tests/test_raw_text.py +++ b/tests/test_raw_text.py @@ -125,8 +125,12 @@ def test_raw_text_loader(): ), "input_ids and attention_mask should have same length" # Verify labels field exists (for causal LM training) - assert "labels" in tokenized_dataset.column_names, "Dataset should have 'labels' column" - assert first_sample["labels"] == first_sample["input_ids"], "labels should match input_ids" + assert ( + "labels" in tokenized_dataset.column_names + ), "Dataset should have 'labels' column" + assert ( + first_sample["labels"] == first_sample["input_ids"] + ), "labels should match input_ids" # Test constructor validation try: diff --git a/unsloth/dataprep/raw_text.py b/unsloth/dataprep/raw_text.py index da64565bb..ba010edab 100644 --- a/unsloth/dataprep/raw_text.py +++ b/unsloth/dataprep/raw_text.py @@ -97,7 +97,11 @@ class RawTextDataLoader: # Labels are same as input_ids for causal LM training labels = [list(ids) for ids in input_ids] return Dataset.from_dict( - {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels} + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + } ) else: # If chunks are text strings (backward compatibility)