# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from pathlib import Path
import tempfile
from unittest.mock import MagicMock, patch

import pandas as pd
import pytest

from data_designer.config.errors import InvalidFileFormatError
from data_designer.config.seed import LocalSeedDatasetReference
from data_designer.engine.model_provider import ModelProvider
from data_designer.engine.secret_resolver import PlaintextResolver
from data_designer.interface.data_designer import DataDesigner
from data_designer.interface.errors import (
    DataDesignerGenerationError,
    DataDesignerProfilingError,
    InvalidBufferValueError,
)


@pytest.fixture
def stub_artifact_path(tmp_path):
    """Temporary directory for artifacts."""
    return tmp_path / "artifacts"


@pytest.fixture
def stub_model_providers():
    """Single-element list holding a stub ModelProvider with placeholder endpoint and API key."""
    return [
        ModelProvider(
            name="stub-model-provider",
            endpoint="https://api.stub-model-provider.com/v1",
            api_key="stub-model-provider-api-key",
        )
    ]


def test_init_with_custom_secret_resolver(stub_artifact_path, stub_model_providers):
    """Test DataDesigner initialization with custom secret resolver."""
    designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
    )
    assert designer is not None


def test_init_with_string_path(stub_artifact_path, stub_model_providers):
    """Test DataDesigner accepts string paths and stores them as Path objects."""
    designer = DataDesigner(artifact_path=str(stub_artifact_path), model_providers=stub_model_providers)
    assert designer is not None
    assert isinstance(designer._artifact_path, Path)


def test_init_with_path_object(stub_artifact_path, stub_model_providers):
    """Test DataDesigner accepts Path objects."""
    designer = DataDesigner(artifact_path=stub_artifact_path, model_providers=stub_model_providers)
    assert designer is not None


def test_make_seed_reference_from_dataframe(stub_dataframe):
    """Test creating seed reference from DataFrame."""
    with tempfile.TemporaryDirectory() as temp_dir:
        file_path = Path(temp_dir) / "seed.parquet"

        ref = DataDesigner.make_seed_reference_from_dataframe(stub_dataframe, file_path=file_path)

        assert isinstance(ref, LocalSeedDatasetReference)
        assert ref.dataset == str(file_path)
        assert file_path.exists()

        # Verify the file contains the correct data
        loaded_df = pd.read_parquet(file_path)
        pd.testing.assert_frame_equal(loaded_df, stub_dataframe)


def test_make_seed_reference_from_dataframe_writes_parquet_format(stub_dataframe):
    """Test that seed reference writes DataFrame as parquet."""
    with tempfile.TemporaryDirectory() as temp_dir:
        file_path = Path(temp_dir) / "seed.parquet"

        DataDesigner.make_seed_reference_from_dataframe(stub_dataframe, file_path=file_path)

        # Verify we can read it back as parquet
        loaded_df = pd.read_parquet(file_path)
        assert len(loaded_df) == len(stub_dataframe)
        assert list(loaded_df.columns) == list(stub_dataframe.columns)


def test_make_seed_reference_from_dataframe_writes_csv_format(stub_dataframe):
    """Test that seed reference writes DataFrame as CSV."""
    with tempfile.TemporaryDirectory() as temp_dir:
        file_path = Path(temp_dir) / "seed.csv"

        ref = DataDesigner.make_seed_reference_from_dataframe(stub_dataframe, file_path=file_path)

        assert isinstance(ref, LocalSeedDatasetReference)
        assert ref.dataset == str(file_path)
        assert file_path.exists()

        # Verify we can read it back as CSV
        loaded_df = pd.read_csv(file_path)
        assert len(loaded_df) == len(stub_dataframe)
        assert list(loaded_df.columns) == list(stub_dataframe.columns)


def test_make_seed_reference_from_dataframe_writes_json_format(stub_dataframe):
    """Test that seed reference writes DataFrame as JSON."""
    with tempfile.TemporaryDirectory() as temp_dir:
        file_path = Path(temp_dir) / "seed.json"

        ref = DataDesigner.make_seed_reference_from_dataframe(stub_dataframe, file_path=file_path)

        assert isinstance(ref, LocalSeedDatasetReference)
        assert ref.dataset == str(file_path)
        assert file_path.exists()

        # Verify we can read it back as JSON; the file is parsed as
        # line-delimited records (lines=True), same as the .jsonl case.
        loaded_df = pd.read_json(file_path, orient="records", lines=True)
        assert len(loaded_df) == len(stub_dataframe)
        assert list(loaded_df.columns) == list(stub_dataframe.columns)


def test_make_seed_reference_from_dataframe_writes_jsonl_format(stub_dataframe):
    """Test that seed reference writes DataFrame as JSONL."""
    with tempfile.TemporaryDirectory() as temp_dir:
        file_path = Path(temp_dir) / "seed.jsonl"

        ref = DataDesigner.make_seed_reference_from_dataframe(stub_dataframe, file_path=file_path)

        assert isinstance(ref, LocalSeedDatasetReference)
        assert ref.dataset == str(file_path)
        assert file_path.exists()

        # Verify we can read it back as JSONL
        loaded_df = pd.read_json(file_path, orient="records", lines=True)
        assert len(loaded_df) == len(stub_dataframe)
        assert list(loaded_df.columns) == list(stub_dataframe.columns)


def test_make_seed_reference_from_dataframe_raises_error_for_invalid_extension(stub_dataframe):
    """Test that make_seed_reference_from_dataframe raises error for invalid file extensions."""
    with tempfile.TemporaryDirectory() as temp_dir:
        # Test with .txt extension
        txt_path = Path(temp_dir) / "seed.txt"
        with pytest.raises(InvalidFileFormatError):
            DataDesigner.make_seed_reference_from_dataframe(stub_dataframe, file_path=txt_path)

        # Test with no extension
        no_ext_path = Path(temp_dir) / "seed"
        with pytest.raises(InvalidFileFormatError):
            DataDesigner.make_seed_reference_from_dataframe(stub_dataframe, file_path=no_ext_path)


def test_make_seed_reference_from_dataframe_accepts_uppercase_extensions(stub_dataframe):
    """Test that make_seed_reference_from_dataframe accepts uppercase file extensions (case insensitive)."""
    with tempfile.TemporaryDirectory() as temp_dir:
        # Test .PARQUET
        parquet_path = Path(temp_dir) / "seed.PARQUET"
        ref = DataDesigner.make_seed_reference_from_dataframe(stub_dataframe, file_path=parquet_path)
        assert isinstance(ref, LocalSeedDatasetReference)
        assert ref.dataset == str(parquet_path)
        assert parquet_path.exists()

        # Test .CSV
        csv_path = Path(temp_dir) / "seed.CSV"
        ref = DataDesigner.make_seed_reference_from_dataframe(stub_dataframe, file_path=csv_path)
        assert isinstance(ref, LocalSeedDatasetReference)
        assert ref.dataset == str(csv_path)
        assert csv_path.exists()

        # Test .JSON
        json_path = Path(temp_dir) / "seed.JSON"
        ref = DataDesigner.make_seed_reference_from_dataframe(stub_dataframe, file_path=json_path)
        assert isinstance(ref, LocalSeedDatasetReference)
        assert ref.dataset == str(json_path)
        assert json_path.exists()

        # Test .JSONL
        jsonl_path = Path(temp_dir) / "seed.JSONL"
        ref = DataDesigner.make_seed_reference_from_dataframe(stub_dataframe, file_path=jsonl_path)
        assert isinstance(ref, LocalSeedDatasetReference)
        assert ref.dataset == str(jsonl_path)
        assert jsonl_path.exists()


def test_make_seed_reference_from_dataframe_overwrites_existing_file(stub_dataframe):
    """Test that make_seed_reference_from_dataframe overwrites existing file."""
    with tempfile.TemporaryDirectory() as temp_dir:
        file_path = Path(temp_dir) / "seed.parquet"

        # Create initial file with different data
        initial_df = pd.DataFrame({"col": [999]})
        initial_df.to_parquet(file_path)

        # Overwrite with new data
        new_df = pd.DataFrame({"col": [1, 2, 3]})
        _ = DataDesigner.make_seed_reference_from_dataframe(new_df, file_path=file_path)

        # Verify the file was overwritten
        loaded_df = pd.read_parquet(file_path)
        pd.testing.assert_frame_equal(loaded_df, new_df)
        assert len(loaded_df) == 3  # Should have 3 rows, not 1


def test_make_seed_reference_from_file_with_string_path():
    """Test creating seed reference from file path string."""
    with tempfile.TemporaryDirectory() as temp_dir:
        file_path = Path(temp_dir) / "dataset.parquet"
        df = pd.DataFrame({"col": [1, 2, 3]})
        df.to_parquet(file_path)

        ref = DataDesigner.make_seed_reference_from_file(str(file_path))

        assert isinstance(ref, LocalSeedDatasetReference)
        assert ref.dataset == str(file_path)


def test_make_seed_reference_from_file_with_path_object(stub_dataframe):
    """Test creating seed reference from Path object."""
    with tempfile.TemporaryDirectory() as temp_dir:
        file_path = Path(temp_dir) / "dataset.parquet"
        stub_dataframe.to_parquet(file_path)

        ref = DataDesigner.make_seed_reference_from_file(file_path)

        assert isinstance(ref, LocalSeedDatasetReference)
        assert ref.dataset == str(file_path)


def test_buffer_size_setting_persists(stub_artifact_path, stub_model_providers):
    """Test that buffer size setting persists across multiple calls."""
    data_designer = DataDesigner(artifact_path=stub_artifact_path, model_providers=stub_model_providers)

    custom_buffer_size = 500
    data_designer.set_buffer_size(custom_buffer_size)
    assert data_designer._buffer_size == custom_buffer_size

    another_buffer_size = 750
    data_designer.set_buffer_size(another_buffer_size)
    assert data_designer._buffer_size == another_buffer_size


def test_set_buffer_size_raises_error_for_invalid_buffer_size(stub_artifact_path, stub_model_providers):
    """Test that set_buffer_size raises error for invalid buffer size."""
    data_designer = DataDesigner(artifact_path=stub_artifact_path, model_providers=stub_model_providers)

    with pytest.raises(InvalidBufferValueError, match="Buffer size must be greater than 0."):
        data_designer.set_buffer_size(0)


def test_multiple_seed_references_can_be_created():
    """Test that multiple seed references can be created from different sources."""
    with tempfile.TemporaryDirectory() as temp_dir:
        # Create seed reference from DataFrame
        df1 = pd.DataFrame({"col": [1, 2, 3]})
        file_path_1 = Path(temp_dir) / "seed1.parquet"
        ref1 = DataDesigner.make_seed_reference_from_dataframe(df1, file_path=file_path_1)

        # Create seed reference from another DataFrame
        df2 = pd.DataFrame({"col": [4, 5, 6]})
        file_path_2 = Path(temp_dir) / "seed2.parquet"
        ref2 = DataDesigner.make_seed_reference_from_dataframe(df2, file_path=file_path_2)

        # Create seed reference from existing file
        ref3 = DataDesigner.make_seed_reference_from_file(file_path_1)

        # ref1 and ref2 point at different files; ref3 reuses ref1's file,
        # so those two must report the same dataset path.
        assert ref1.dataset != ref2.dataset
        assert ref1.dataset == ref3.dataset
        assert all(isinstance(ref, LocalSeedDatasetReference) for ref in [ref1, ref2, ref3])


def test_create_dataset_e2e_using_only_sampler_columns(
    stub_sampler_only_config_builder, stub_artifact_path, stub_model_providers
):
    """Test create end-to-end with the sampler-only config builder: dataset, sample display, and analysis."""
    column_names = [config.name for config in stub_sampler_only_config_builder.get_column_configs()]

    num_records = 3

    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
    )

    results = data_designer.create(stub_sampler_only_config_builder, num_records=num_records)

    df = results.load_dataset()
    assert len(df) == num_records
    assert set(df.columns) == set(column_names)

    # cycle through with no errors (more calls than there are records)
    for _ in range(num_records + 2):
        results.display_sample_record()

    analysis = results.load_analysis()
    assert analysis.target_num_records == num_records

    # display report with no errors
    analysis.to_report()


def test_create_raises_error_when_builder_fails(
    stub_artifact_path, stub_model_providers, stub_sampler_only_config_builder
):
    """Test that create method raises DataDesignerGenerationError when builder.build fails."""
    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
    )

    with patch.object(data_designer, "_create_dataset_builder") as mock_builder_method:
        mock_builder = MagicMock()
        mock_builder.build.side_effect = RuntimeError("Builder failed")
        mock_builder_method.return_value = mock_builder

        with pytest.raises(DataDesignerGenerationError, match="🛑 Error generating dataset: Builder failed"):
            data_designer.create(stub_sampler_only_config_builder, num_records=3)


def test_create_raises_error_when_profiler_fails(
    stub_artifact_path, stub_model_providers, stub_sampler_only_config_builder
):
    """Test that create method raises DataDesignerProfilingError when profiler.profile_dataset fails."""
    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
    )

    with (
        patch.object(data_designer, "_create_dataset_builder") as mock_builder_method,
        patch.object(data_designer, "_create_dataset_profiler") as mock_profiler_method,
    ):
        # Mock builder to succeed
        mock_builder = MagicMock()
        mock_builder.build.return_value = None
        mock_builder.artifact_storage.load_dataset_with_dropped_columns.return_value = pd.DataFrame({"col": [1, 2, 3]})
        mock_builder_method.return_value = mock_builder

        # Mock profiler to fail
        mock_profiler = MagicMock()
        mock_profiler.profile_dataset.side_effect = ValueError("Profiler failed")
        mock_profiler_method.return_value = mock_profiler

        with pytest.raises(DataDesignerProfilingError, match="🛑 Error profiling dataset: Profiler failed"):
            data_designer.create(stub_sampler_only_config_builder, num_records=3)


def test_preview_raises_error_when_builder_fails(
    stub_artifact_path, stub_model_providers, stub_sampler_only_config_builder
):
    """Test that preview method raises DataDesignerGenerationError when builder.build_preview fails."""
    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
    )

    with patch.object(data_designer, "_create_dataset_builder") as mock_builder_method:
        mock_builder = MagicMock()
        mock_builder.build_preview.side_effect = RuntimeError("Builder preview failed")
        mock_builder_method.return_value = mock_builder

        with pytest.raises(
            DataDesignerGenerationError, match="🛑 Error generating preview dataset: Builder preview failed"
        ):
            data_designer.preview(stub_sampler_only_config_builder, num_records=3)


def test_preview_raises_error_when_profiler_fails(
    stub_artifact_path, stub_model_providers, stub_sampler_only_config_builder
):
    """Test that preview method raises DataDesignerProfilingError when profiler.profile_dataset fails."""
    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
    )

    with (
        patch.object(data_designer, "_create_dataset_builder") as mock_builder_method,
        patch.object(data_designer, "_create_dataset_profiler") as mock_profiler_method,
    ):
        # Mock builder to succeed
        mock_builder = MagicMock()
        mock_builder.build_preview.return_value = pd.DataFrame({"col": [1, 2, 3]})
        mock_builder.drop_columns_if_needed.return_value = pd.DataFrame({"col": [1, 2, 3]})
        mock_builder_method.return_value = mock_builder

        # Mock profiler to fail
        mock_profiler = MagicMock()
        mock_profiler.profile_dataset.side_effect = ValueError("Profiler failed in preview")
        mock_profiler_method.return_value = mock_profiler

        with pytest.raises(
            DataDesignerProfilingError, match="🛑 Error profiling preview dataset: Profiler failed in preview"
        ):
            data_designer.preview(stub_sampler_only_config_builder, num_records=3)